#! /usr/bin/gawk -f
# Last edited on 2004-11-19 02:56:31 by stolfi

# Reads an interlinear file with "[..|..|..]" groups and
# unfolds such lines into multiple lines.

BEGIN {
  # Original transcriber codes: the first alternative of a group keeps
  # the code of the line it came from, so newcode[c,1] is just c.
  oldcodes = "CFTLKRJU";
  for (i = 1; i <= length(oldcodes); i++)
    { c = substr(oldcodes, i, 1); newcode[c, 1] = c; }

  # New transcriber codes: newcode[c,k] is the code that replaces c
  # on the k-th line unfolded from a line whose code is c.
  newcode["C", 2] = "D";
  newcode["F", 2] = "G";
  newcode["F", 3] = "H";
  newcode["J", 2] = "I";
  newcode["K", 2] = "Q";
  newcode["L", 2] = "M";
}

function first_alt(txt)
{
  # Removes from a comment-free Voynich string
  # all group alternatives but the first.
  #
  # Step 1 cuts everything from the first "|" of a group up to its "]";
  # steps 2 and 3 then delete every remaining bracket.
  gsub(/\|[^\]]*\]/, "]", txt);
  gsub(/\[/, "", txt);
  gsub(/\]/, "", txt);
  return txt;
}

function other_alts(txt)
{
  # Removes from a comment-free Voynich string
  # the first alternative from every group, leaving the rest.
  # If the group reduces to one alternative, removes the brackets.
  gsub(/\[[^\|\]]*\|/, "[", txt);
  txt = gensub(/\[([^\]\|]*)\]/, "\\1", "g", txt);
  return txt;
}

function leading_comment_len(txt)
{
  # Private helper. "txt" is expected to start with "{".  Returns the
  # length of the '{...}' comment at the start of "txt".  If there is no
  # closing brace, writes a diagnostic to stderr and returns 0 (a real
  # comment is at least 2 characters long, so 0 is unambiguous).
  if (match(txt, /^[{][^}]*[}]/)) { return RLENGTH; }
  printf "line %d, missing '}'\n", NR > "/dev/stderr";
  return 0;
}

function remove_bangs(old,   neu, n, i, chunk)
{
  # Removes "!" characters outside the '{}' comments, while
  # preserving the text inside the '{}' comments.
  # Returns the resulting string.  An unterminated comment is reported
  # on stderr and copied verbatim up to the end of the string.
  neu = "";
  while (length(old) > 0)
    { i = index(old, "{");
      if (i == 1)
        { # Leading comment: copy it unchanged.
          n = leading_comment_len(old);
          if (n == 0) { n = length(old); }
          neu = (neu substr(old, 1, n));
          old = substr(old, n + 1);
        }
      else
        { # Plain text up to the next "{" (or to the end if there is none).
          if (i == 0) { i = length(old) + 1; }
          chunk = gensub(/[!]/, "", "g", substr(old, 1, i - 1));
          neu = (neu chunk);
          old = substr(old, i);
        }
    }
  return neu;
}

function extract_choice(old, fst,   neu, n, i, chunk)
{
  # Processes the parts of "old" outside the '{}' comments, while
  # preserving the text inside the '{}' comments.
  # Returns either first alternative (fst=1) or all but first (fst=0).
  # An unterminated comment is reported on stderr and copied verbatim
  # up to the end of the string.
  neu = "";
  while (length(old) > 0)
    { i = index(old, "{");
      if (i == 1)
        { # Leading comment: copy it unchanged.
          n = leading_comment_len(old);
          if (n == 0) { n = length(old); }
          neu = (neu substr(old, 1, n));
          old = substr(old, n + 1);
        }
      else
        { # Plain text up to the next "{" (or to the end if there is none).
          if (i == 0) { i = length(old) + 1; }
          chunk = substr(old, 1, i - 1);
          old = substr(old, i);
          if (fst)
            { chunk = first_alt(chunk); }
          else
            { chunk = other_alts(chunk); }
          neu = (neu chunk);
        }
    }
  return neu;
}

function has_alts(txt,   n, i, chunk)
{
  # Returns 1 if "txt" contains a "[" outside the '{}' comments, else 0.
  # An unterminated comment is reported on stderr and the remaining
  # text is not examined.
  while (length(txt) > 0)
    { i = index(txt, "{");
      if (i == 1)
        { # Leading comment: skip it.
          n = leading_comment_len(txt);
          if (n == 0) { n = length(txt); }
          txt = substr(txt, n + 1);
        }
      else
        { if (i == 0) { i = length(txt) + 1; }
          chunk = substr(txt, 1, i - 1);
          txt = substr(txt, i);
          if (index(chunk, "[") != 0) return 1;
        }
    }
  return 0;
}

function fix_loc(loc, k,   c, d)
{
  # Replaces the transcriber's code "x" in "loc" by "newcode[x,k]".
  # The code is the capital letter found in the first ";X>" of "loc".
  # Returns "loc" unchanged when k is 1, when "loc" is empty, or when
  # it has no ";X>".  If newcode has no entry for (x,k), reports the
  # problem on stderr and substitutes "?".
  if ((k == 1) || (loc == "")) return loc;
  match(loc, /;[A-Z]>/);
  if (RSTART == 0) { return loc; }
  c = substr(loc, RSTART + 1, 1);
  if ((c, k) in newcode)
    { d = newcode[c, k]; }
  else
    { printf "error: no %dth replacement for code %s\n", k, c > "/dev/stderr";
      d = "?";
    }
  return (substr(loc, 1, RSTART) d substr(loc, RSTART + 2));
}

function unfold_line(loc, msg,   k, rest)
{
  # Given a line with location prefix "loc" (possibly empty) and
  # Voynich text "msg" (possibly with '{}' comments), prints one or
  # more lines, choosing successive alternatives in every group.  For
  # the second and successive lines, each transcriber's code "x" in
  # "loc" is replaced by "newcode[x,k]".
  msg = remove_bangs(msg);
  k = 1;
  while (has_alts(msg))
    { rest = extract_choice(msg, 0);
      if (rest == msg)
        { # A "[" with no matching "]" can never be reduced; without
          # this check the loop would print the same line forever.
          printf "line %d, unmatched '['\n", NR > "/dev/stderr";
          break;
        }
      print (fix_loc(loc, k) extract_choice(msg, 1));
      msg = rest;
      k++;
    }
  print (fix_loc(loc, k) msg);
}

# Lines matching the next three patterns are copied through unchanged.
/^ *$/ { print; next }

/^ *#/ { print; next; }

/^<[^>]*> *$/ { print; next }

/^