#! /usr/bin/gawk -f
# Usage:
#
#   cat INFILE \
#     | make-word-location-map \
#         [-v nblocks=NBLOCKS] \
#         [-v totOnly={0|1}] \
#         [-v omitSingles={0|1}] \
#     > RAWMAP
#
# This script reads from standard input a list of phrase
# occurrence records of the form
#
#   PNUM FNUM UNIT LINE TRANS START LENGTH POS STRING OBS LANG BLOCK PATT TAG
#   1    2    3    4    5     6     7      8   9      10  11   12    13   14
#
# where
#
#   PNUM is the sequential page number, e.g. "125";
#
#   FNUM is the folio-based page number, e.g. "f86v2";
#
#   UNIT is the text unit within the page, e.g. "S1" or "P";
#
#   LINE is the line number within the UNIT, e.g. "0a" or "23";
#
#   TRANS is the transcriber's code, e.g. "F" for Friedman;
#
#   START is not used;
#
#   LENGTH is not used;
#
#   POS is not used;
#
#   STRING is a string occurring in the text, e.g. "qokeedy.dy";
#
#   OBS is an arbitrary non-empty comment string (without embedded blanks);
#
#   LANG is the language in Currier's sense ("A", "B", or "?");
#
#   BLOCK is the index of a block (map column), counting from 0;
#
#   PATT is an abstraction of STRING, e.g. "oteedodo";
#
#   TAG is "+" for a "special" occurrence, "-" for an "ordinary" one.
#
# The file should be sorted by PATT, TAG, and STRING.
# Moreover, if two records have the same STRING they should
# have the same PATT too.
#
# The script prints a table of the form
#
#   TOTCT XXX...XXX PATT STRING TAG PNUM LOC OBS LANG
#
# where
#
#   TOTCT is a total number of occurrences for this line;
#
#   XXX...XXX are occurrence counts for each block;
#
#   PATT is a string pattern from the input file;
#
#   STRING is a string from the input file;
#
#   TAG is the corresponding TAG, or "=" for total lines;
#
#   PNUM is the sequential page number, or "-";
#
#   LOC is the full location code "FNUM.UNIT.LINE;TRANS", or "-";
#
#   OBS is the same as in the input file;
#
#   LANG is the same as in the input file.
#
# The TOTCT field is printed in fixed format "%6d ", the
# remaining fields in free format, separated by spaces.
#
# The script prints one of these lines for each "special" occurrence
# (TAG != "-"). All ordinary occurrences (TAG == "-") of the same
# STRING are combined into one line, whose PNUM, LOC and OBS
# are taken from the first such line.
#
# Also, all ordinary occurrences with the same PATT are added and
# printed as a separate pattern-total line, whose STRING, PNUM, LOC
# and OBS are taken from the most popular STRING of that PATT.
#
# If "totOnly" is 1 then only the pattern-total lines above are printed.
# If "totOnly" is 0 then a pattern-total line is printed only if it
# combines two or more different STRINGs.
#
# If "omitSingles" is 1 then PATTs that occur only once as
# ordinary strings are entirely omitted from the output.
#
# On any input error the script writes a message to stderr and
# terminates with exit status 1 without printing the pending buffers.

# NOTE(review): the copy of this script that this version was rebuilt from
# had been collapsed onto a few lines and had lost every span of text lying
# between a "<" and the following ">" (the loop bodies after "i<nblocks",
# the heads of dump_str_buffer/dump_pat_buffer and of the main rule, and any
# BEGIN rule).  The pieces marked RECONSTRUCTED below were re-derived from
# the header documentation and from the surviving code; confirm each of them
# against a pristine copy of the script before relying on the output format.

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current input record number,
  # sets the global "abort" flag (checked by the END rule) and terminates
  # the program with a nonzero exit status.
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function print_line(pa, st, tg, tt, ct, pn, lc, ob, lg,    i)
{
  # Prints pattern "pa", string "st", tag "tg", total count "tt",
  # per-block counts "ct[0..nblocks-1]", p-number "pn", location "lc",
  # obs "ob", language "lg". Also updates "last_pat_printed".
  # The parameter "i" is a local loop index, not to be passed by callers.
  printf "%6d", tt;
  # RECONSTRUCTED: everything after "for (i=0; i" was lost.
  # NOTE(review): the rendering of the per-block counts ("XXX...XXX" in the
  # header) is an assumption: one space-separated decimal count per block.
  for (i = 0; i < nblocks; i++) { printf " %d", ct[i]; }
  # Field order follows the output table documented in the header.
  printf " %s %s %s %s %s %s %s\n", pa, st, tg, pn, lc, ob, lg;
  last_pat_printed = pa;
}

function dump_str_buffer(    i)
{
  # Flushes the data accumulated for the current string "strn" (if any):
  # updates the "most popular string" record of the current pattern,
  # prints the string line when the options allow it, and clears the
  # per-string accumulators.  Reads the global "pat_is_single", which the
  # caller must have set for the current pattern.
  if (strn != "")
    {
      # Remember the most frequent string of this pattern.
      # RECONSTRUCTED: only the tail "... pat_max_tt)) {" of this condition
      # survived.  NOTE(review): restricting the candidates to ordinary
      # strings is an assumption based on the header text.
      if ((! str_is_special) && (str_tt > pat_max_tt))
        {
          pat_st = strn;
          pat_pn = str_pn; pat_lc = str_lc;
          pat_ob = str_ob; pat_lg = str_lg;
          pat_max_tt = str_tt;
        }

      # Print the string data
      if ((! totOnly) && ((! omitSingles) || str_is_special || (! pat_is_single)))
        {
          # A blank line separates the output of consecutive patterns.
          if ((patt != last_pat_printed) && (last_pat_printed != "")) { printf "\n"; }
          print_line(patt, strn, str_tg, str_tt, str_ct, str_pn, str_lc, str_ob, str_lg);
        }
    }

  # Clear string accumulators:
  strn = "";
  # RECONSTRUCTED: loop body and the reset of "str_tt".
  for (i = 0; i < nblocks; i++) { str_ct[i] = 0; }
  str_tt = 0;
}

function dump_pat_buffer(    i)
{
  # Prints the pattern-total line for the current pattern "patt" (if the
  # options call for one) and clears the per-pattern accumulators.
  # RECONSTRUCTED: only the tail "... 1))) {" of the original condition
  # survived.  NOTE(review): the condition below implements the header text:
  #   - nothing is printed for a pattern without ordinary occurrences;
  #   - with omitSingles, a pattern with one ordinary occurrence is skipped;
  #   - with totOnly every remaining pattern gets a total line, otherwise
  #     only patterns that combine two or more distinct ordinary strings.
  if ((patt != "") && (pat_tt > 0) && ((! omitSingles) || (pat_tt != 1)) && (totOnly || (pat_ns > 1)))
    {
      # The "~" suffix marks the string as a representative of its pattern.
      print_line(patt, (pat_st "~"), "=", pat_tt, pat_ct, pat_pn, pat_lc, pat_ob, pat_lg);
    }

  # Clear pattern accumulators:
  patt = "";
  # RECONSTRUCTED: loop body and the resets that follow it.
  for (i = 0; i < nblocks; i++) { pat_ct[i] = 0; }
  pat_tt = 0;
  pat_ns = 0;
  pat_max_tt = 0;
  pat_st = "";
}

# RECONSTRUCTED: no BEGIN rule survived in the damaged source.
BEGIN {
  abort = 0;
  # NOTE(review): the usage text lists "nblocks" as optional, but its
  # default is not visible in the surviving source; rather than invent one,
  # this version requires it explicitly.
  if ((nblocks !~ /^[0-9]+$/) || (nblocks + 0 < 1))
    { error("must define nblocks as a positive integer"); }
  nblocks += 0;
  last_pat_printed = "";
  strn = ""; patt = "";
  str_is_special = 0; pat_is_single = 0;
  dump_str_buffer();   # with strn == "" this only zeroes the accumulators
  dump_pat_buffer();   # likewise for the pattern accumulators
}

# Main rule, one occurrence record per input line.
# RECONSTRUCTED: the head of this rule (up to the block-number range test)
# was lost; the field numbers are those documented in the header.
(NF > 0) {
  if (NF < 14) { error("bad record: expected 14 fields, found " NF); }
  pn = $1;
  lc = ($2 "." $3 "." $4 ";" $5);   # LOC = "FNUM.UNIT.LINE;TRANS"
  st = $9; ob = $10; lg = $11;
  bn = $12; pa = $13; tg = $14;

  if ((bn !~ /^[0-9]+$/) || (bn >= nblocks)) { error("bad block number = " bn); }
  bn += 0;   # normalize, so that e.g. "03" and "3" index the same counter

  # Start a new string buffer when the pattern or string changes, or when
  # either the buffered or the incoming occurrence is special (special
  # occurrences are never merged).
  if ((pa != patt) || (st != strn) || str_is_special || (tg != "-"))
    {
      # Decide whether the current pattern has a single ordinary occurrence:
      pat_is_single = ((pat_tt == 1) && (patt != pa));
      # Print string data, if any, and reset counters:
      dump_str_buffer();
      # If pattern changed, print pattern data too, and reset counters:
      if (pa != patt) { dump_pat_buffer(); patt = pa; }
      strn = st;
      str_tg = tg; str_pn = pn; str_lc = lc; str_ob = ob; str_lg = lg;
      str_is_special = (str_tg != "-");
      # Count the distinct ordinary strings of this pattern.
      if (! str_is_special) { pat_ns++; }
    }

  # Tally this occurrence:
  str_ct[bn]++; str_tt++;
  # Only ordinary occurrences contribute to the pattern totals.
  if (! str_is_special) { pat_ct[bn]++; pat_tt++; }
}

END {
  # After an error, stop without flushing; a bare "exit" keeps the exit
  # status already set by error().
  if (abort) { exit; }
  # The last pattern ends at end-of-input, so its "single occurrence"
  # status must be decided here rather than by the main rule.
  pat_is_single = (pat_tt == 1);
  dump_str_buffer();
  dump_pat_buffer();
}