#! /usr/bin/gawk -f

# Counts the number of Voynichese text characters in an EVT-like file.
#
# Input handling:
#   - Lines starting with "#" are ignored.
#   - Empty lines contribute nothing.
#   - A line starting with "<" is taken to begin with a 19-column location
#     field; that field is validated and excluded from the count.
#   - Any other line is counted from its first column.
# The text of each line is passed through cleanup() before being measured.
#
# Output: the total count, printed once to stderr and once to stdout.

BEGIN { nBytes = 0 }

# Comment lines are never counted.
/^#/ { next }

function cleanup(txt)
{
  # Removes crud from text and returns what remains.
  # (txt is a by-value copy, so the caller's string is untouched.)

  # We discard "%", "!" and blanks.
  gsub(/[% !]/, "", txt);

  # We discard "{}"-comments:
  gsub(/\{[^}]*\}/, "", txt);

  # We choose arbitrarily the first of alternative transcriptions,
  # i.e. "[a|b|c]" becomes "a": drop "[", then "|...]", then any
  # remaining "]" (from groups that had no alternatives).
  gsub(/\[/, "", txt);
  gsub(/\|[^\]]*\]/, "", txt);
  gsub(/\]/, "", txt);

  return txt
}

function bad_location(loc)
{
  # Returns 1 if loc (the location field with trailing blanks already
  # removed) is not a single "<...>" unit, 0 otherwise.
  #
  # NOTE(review): this is a deliberately loose shape check.  The test it
  # replaces, `loc !~ /^$/`, was true for every located line (loc always
  # begins with "<"), so every such line was reported as bad.  Tighten the
  # pattern if the exact locator grammar of the input files is known.
  return (loc !~ /^<[^<>]+>$/)
}

/./ {
  if (substr($0, 1, 1) == "<") {
    # Located line: the first 19 columns hold the location field.
    skip = 19;
    loc = substr($0, 1, 19);
    gsub(/ *$/, "", loc);
    if (bad_location(loc)) {
      # Diagnostic only; the line is still processed below.
      printf "line %d, bad location \"%s\"\n", NR, loc > "/dev/stderr"
    }
  } else {
    skip = 0;
  }

  # Nothing left after the location field: no text to count.
  if (skip >= length($0)) next;

  txt = cleanup(substr($0, 1 + skip))
  nBytes += length(txt)
  next
}

END {
  # The total goes to both streams, so it is visible on the terminal
  # even when stdout is redirected or piped.
  print nBytes > "/dev/stderr"
  print nBytes
}