#! /usr/bin/gawk -f
# Last edited on 1998-07-14 23:41:54 by stolfi

# Extracts tuples from the output of "extract-signif-chars"
#
# NOTE(review): this script was recovered from a copy in which all line
# breaks had been collapsed and two spans of text (each running from a
# "<" to the following ">") had been lost.  Everything marked
# RECONSTRUCTED below was rebuilt from the header comment and the
# surviving fragments; verify it against the original file.

BEGIN{
  usage = ( \
    "compute-cond-tuple-info \\\n" \
    " -v order=ORDER \\\n" \
    " [ -v filler=CHAR ] \\\n" \
    " [ -v lowercase=BOOL ] \\\n" \
    " < SIGFILE > TUPFILE" \
  );

  # The file SIGFILE must have been created by "extract-signif-chars"
  # This script writes to standard output the n-tuples of consecutive
  # significant characters read from SIGFILE, where n=ORDER.
  # The "decoration" records in SIGFILE are ignored.
  #
  # If "lowercase" is true the significant characters are converted to
  # lower case.
  #
  # The word breaks in SIGFILE are replaced by a single instance of the
  # "filler" character (which must be printable and non-blank).  The
  # paragraph breaks in SIGFILE are replaced by ORDER-1 consecutive
  # fillers.

  # "abort" stays negative until error() records an exit status.
  abort = -1;

  check_options();
  init_tup();
}

# Records starting with "0": no tuple output (the rule only skips them).
/^[0]/{
  if (abort >= 0) { exit(abort); }
  next;
}

# Records starting with "1": contribute exactly one filler character.
/^[1]/{
  if (abort >= 0) { exit(abort); }
  push_char(filler);
  next;
}

# Records starting with "2": contribute ORDER-1 filler characters.
# RECONSTRUCTED: the loop bound and body were lost after "for (i=1;i";
# they are restored from the header comment ("ORDER-1 consecutive fillers").
/^[2]/{
  if (abort >= 0) { exit(abort); }
  for (i=1;i<order;i++) { push_char(filler); }
  next;
}

# RECONSTRUCTED: only "if (abort >= 0) { exit(abort); } for (i=1;i"
# survived of the rule that followed.
# NOTE(review): this assumes significant-character records start with "3"
# and carry their character(s) immediately after that digit -- confirm
# against the output format of "extract-signif-chars".
/^[3]/{
  if (abort >= 0) { exit(abort); }
  for (i=1;i<length($0);i++) {
    c = substr($0, i+1, 1);
    # Characters outside the 0..255 table pass through unchanged.
    push_char((c in map) ? map[c] : c);
  }
  next;
}

# RECONSTRUCTED: resets the tuple context to ORDER-1 fillers, i.e. the
# same state that a paragraph break leaves behind.
function init_tup(   k)
{
  tup = "";
  for (k=1;k<order;k++) { tup = (tup filler); }
}

# RECONSTRUCTED: appends character "ch" to the ORDER-1 characters of
# context kept in the global "tup", prints the resulting ORDER-tuple, and
# then drops the oldest character so that "tup" is ready for the next call.
# NOTE(review): one tuple per output line is an assumption -- confirm.
function push_char(ch)
{
  tup = (tup ch);
  print tup;
  tup = substr(tup, 2);
}

# Validates the command-line variables "order", "filler" and "lowercase",
# supplies their defaults, and builds the global character table "map"
# (identity for codes 0..255, folded to lower case when "lowercase" > 0).
# Local variables are declared so that the globals used by the record
# rules (notably "i") are not disturbed.
function check_options(   i, c, uc, lc, ucs, lcs)
{
  # RECONSTRUCTED: only "20)) {" survived of this condition; the lower
  # bound of 1 is an assumption.  An unset "order" fails this test too.
  if ((order < 1) || (order > 20)) { error("funny \"order\""); }

  if (filler == "") { filler = "_"; }
  if (length(filler) != 1) { error(("the \"filler\" should be a single char")); }

  # --- lowercase mapping ----------------------------------------------
  split("", map);   # portable idiom for emptying an array
  for (i=0;i<256;i++) { c = sprintf("%c", i); map[c] = c; }

  if (lowercase == "") { lowercase = 0; }
  if (lowercase > 0)
    { ucs = "ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ";
      lcs = "abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ";
      # The two strings are parallel: position i of "ucs" maps to
      # position i of "lcs".
      for (i=1;i<=length(ucs);i++)
        { uc = substr(ucs,i,1); lc = substr(lcs,i,1);
          map[uc] = lc;
        }
    }
}

# Prints "msg" on standard error, records exit status 1 in "abort" (which
# the record rules test before doing any work), and terminates with
# status 1.
function error(msg)
{
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}