#! /usr/bin/gawk -f
# Last edited on 1998-07-14 23:08:31 by stolfi

# Usage: replace-signif-digraph DIG SYM < INFILE > OUTFILE
#
# Reads a file produced by extract-signif-chars.
# Replaces every occurrence of the digraph DIG as
# consecutive significant chars (class 3) by the single character SYM.
# Ignores decoration (class 0) but is frustrated by word
# and paragraph breaks (class 1 and 2).
#
# Each input record starts with a one-character class code:
#   0    decoration; the rest of the record is the decoration text.
#   1,2  break; copied through unchanged after flushing pending output.
#   3    significant character; the record must be exactly 2 chars long.
# Any other non-empty record is a fatal error.  Empty records match no
# rule and are therefore dropped without comment.
#
# A summary line and all diagnostics are written to "/dev/stderr".

BEGIN {
  # "abort" is the pending exit status; negative means "no error so far".
  abort = -1;
  usage = "replace-signif-digraph DIG SYM < INFILE > OUTFILE";

  if (ARGC != 3) { error(("bad args - usage: " usage)); }

  # The digraph to look for, split into its two characters.
  dig = ARGV[1];
  if (length(dig) != 2) { error(("bad digraph - usage: " usage)); }
  diga = substr(dig, 1, 1);
  digb = substr(dig, 2, 1);

  # The single character that replaces the digraph.
  repl = ARGV[2];
  if (length(repl) != 1) { error(("bad replacement - usage: " usage)); }

  # Hide the arguments so awk does not try to open them as input files.
  ARGC = 1;

  # If "siga" is not empty, it is the previous signif char in the current word.
  # In that case "deco" is the decoration after "siga".
  # If "siga" is empty, "deco" must be empty too.
  siga = "";
  deco = "";
  replacements = 0;
}

/^[0]/ {
  # Decoration: pass it through if nothing is pending, otherwise
  # append it to the decoration held back behind "siga".
  if (abort >= 0) { exit abort; }
  if (siga == "") {
    print;
  } else {
    deco = (deco substr($0, 2));
  }
  next;
}

/^[12]/ {
  # Break: flush buffers and restart.
  if (abort >= 0) { exit abort; }
  flush_buffers();
  print;
  next;
}

/^[3]/ {
  # Significant character: try to combine with previous one, if any.
  if (abort >= 0) { exit abort; }
  if (length($0) != 2) {
    error(("line " NR ": wrong length in class \"3\" record"));
  }
  sigb = substr($0, 2, 1);

  if (siga == "") {
    # Nothing pending: hold this character back until we see what follows.
    siga = sigb;
    next;
  }

  if ((siga == diga) && (sigb == digb)) {
    # The pending character and this one form the digraph:
    # replace both by the new symbol.
    printf "3%s\n", repl;
    replacements++;
    siga = "";
    if (deco != "") {
      # Decoration that sat between the two characters cannot stay in
      # place.  Emit it right after the replacement symbol and clear it,
      # so that "deco" is empty whenever "siga" is empty; otherwise it
      # would leak out after some later, unrelated character (or be
      # lost at end of input).
      warning(("line " NR ": decoration squeezed over"));
      printf "0%s\n", deco;
      deco = "";
    }
  } else {
    # No match: release the pending character (and its decoration)
    # and hold the current one back instead.
    flush_buffers();
    siga = sigb;
  }
  next;
}

/./ {
  # Any other non-empty record has an unknown class code.
  if (abort >= 0) { exit abort; }
  error(("line " NR ": invalid character class"));
}

END {
  # After an error, skip the final flush and the summary.
  if (abort >= 0) { exit abort; }
  flush_buffers();
  printf " replaced %d instances\n", replacements > "/dev/stderr";
}

function flush_buffers() {
  # Writes out the saved character and its decoration, if any,
  # and resets both buffers.
  if (siga != "") {
    printf "3%s\n", siga;
    siga = "";
    if (deco != "") {
      printf "0%s\n", deco;
      deco = "";
    }
  }
}

function error(msg) {
  # Prints "msg" to "/dev/stderr", records the failure in "abort"
  # (so that the END rule does nothing more) and exits with status 1.
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function warning(msg) {
  # Prints a non-fatal diagnostic to "/dev/stderr".
  printf " (warning) %s\n", msg >> "/dev/stderr";
}