#! /usr/bin/gawk -f
# Last edited on 2001-01-15 03:18:12 by stolfi

# Reads records of the form COUNT WORD, where WORD is partitioned
# into elements by braces {}.
#
# For each record, the script partitions the WORD into an alternating
# sequence of the form OKOKOKO...KO, where each O is a string of zero
# or more elements consisting only of the specified GLYPH letters,
# and each K is a single element containing at least one non-GLYPH
# letter.
#
# Then, for each O-string, outputs COUNT {LEFT}<STRING>{RIGHT}, where
# STRING is that string (deprived of braces and surrounded by <>), and
# {LEFT} and {RIGHT} are the previous and next K elements,
# respectively.  The script provides a dummy K-element "{_}"
# before the first O-slot and after the last O-slot.
#
# Lines starting with "#" are copied to the output unchanged.
#
# The "glyphs" variable (set with -v) must be a non-empty string of
# ASCII letters and digits.

BEGIN {
  abort = -1;
  usage = ( "extract-glyph-strings \\\n" \
    " -v glyphs=GLYPHS \\\n" \
    " < INFILE > OUTFILE" \
  );

  if (glyphs == "") { arg_error(("must define \"glyphs\"")); }
  if (glyphs !~ /^[a-zA-Z0-9]+$/) { arg_error(("bad value for \"glyphs\"")); }

  # Bracket expression matching any single GLYPH letter:
  gpat = ( "[" glyphs "]" );

  # Two adjacent elements that BOTH consist only of GLYPH letters.
  # Braces are written as [{] and [}] so that they can never be
  # taken for interval-expression delimiters.
  merge_pat = ( "[{](" gpat "+)[}][{](" gpat "+)[}]" );

  # A single element consisting only of GLYPH letters:
  slot_pat = ( "[{](" gpat "+)[}]" );

  # A well-formed {LEFT}<STRING>{RIGHT} triple:
  triple_pat = ( "^[{][^{}]+[}][<]" gpat "*[>][{][^{}]+[}]$" );
}

# Safety net: stop as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Comment lines are passed through untouched.
/^#/ { print; next; }

/./ {
  # Insert word-start and word-stop (K-like) markers.  Since "_" is
  # never a valid GLYPH letter, these are always K elements.
  w = ("{_}" $2 "{_}");

  # Merge consecutive GLYPHS-only elements together.  Only complete
  # elements are matched, so a K element that merely ends or starts
  # with a GLYPH letter is never fused with its neighbour.  Matches
  # of one pass cannot overlap, so a run of N elements may need
  # several passes; each pass removes at least one element, hence
  # the loop terminates.
  while (w ~ merge_pat) { w = gensub(merge_pat, "{\\1\\2}", "g", w); }

  # Replace the outer braces of GLYPHS-only elements by <>
  w = gensub(slot_pat, "<\\1>", "g", w);

  # Mark empty GLYPHS-slots (two adjacent K elements) with <>
  w = gensub(/[}][{]/, "}<>{", "g", w);

  # Duplicate K elements (with space separator) so that each one is
  # shared by the triple on its left and the triple on its right
  w = gensub(/[{]([^{}]*)[}]/, "{\\1} {\\1}", "g", w);

  # Now split into KOK triples.  The first and last fields are the
  # unpaired halves of the dummy "{_}" markers.
  n = split(w, wf);
  if (wf[1] != "{_}") { data_error(("bad karma \"" wf[1] "\"")); }
  if (wf[n] != "{_}") { data_error(("bad karma \"" wf[n] "\"")); }
  for (i = 2; i <= n-1; i++)
    { if (wf[i] !~ triple_pat) { data_error(("bad karma \"" wf[i] "\"")); }
      print $1, wf[i];
    }
}

# Reports a malformed input record (with its line number) on stderr
# and terminates the script with exit status 1.
function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports a bad command-line setting on stderr, followed by the
# usage summary, and terminates the script with exit status 1.
function arg_error(msg)
{
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit 1;
}