#! /usr/bin/gawk -f
# Last edited on 1999-07-20 02:59:44 by stolfi

# Usage:
#
#   cat INFILE \
#     | make-word-parag-map \
#         -v nParags=NPARAGS \
#         [-v totOnly={0|1}] \
#         [-v omitSingles={0|1}] \
#     > RAWMAP
#
# This script reads from standard input a list of word
# occurrence records of the form
#
#   PATT STRING TAG PARAG
#   1    2      3   4
#
# where
#
#   PATT   is a string pattern (equivalence class), e.g. "oteedodo";
#
#   STRING is the string as it occurred in the text, e.g. "qokeedy.dy";
#
#   TAG    is an occurrence type code, e.g. "+" for label, "-" for text;
#
#   PARAG  is the sequential paragraph number, e.g. "125"; it must be
#          an unsigned decimal integer in the range 1..NPARAGS.
#
# Empty input lines are ignored.
#
# The file should be sorted by PATT, STRING, and TAG.
# Moreover, if two records have the same STRING they should
# have the same PATT too.
#
# The script counts all occurrences of the same STRING with the same
# TAG, per paragraph, and prints for it a line of the form
#
#   TOTCT XXX...XXX PATT STRING TAG
#
# where
#
#   TOTCT     is the total number of occurrences for STRING and TAG;
#
#   XXX...XXX are the occurrence counts for each paragraph;
#
#   PATT      is the corresponding pattern from the input file;
#
#   STRING    is the string from the input file;
#
#   TAG       is the input TAG.
#
# Also, all occurrences with the same PATT are added and
# printed as a separate pattern-total line with TAG "=".
# The STRING is then the most popular STRING of this PATT,
# with "~" appended.
#
# Lines belonging to different PATTs are separated by a blank line.
#
# If "totOnly" is 1 then only the pattern-total lines above are printed.
# If "totOnly" is 0 then a pattern-total line is printed only if it
# combines two or more different STRINGs.
#
# If "omitSingles" is 1 then the STRING line of a PATT that occurs
# only once in the whole input is not printed.  STRING lines with
# TOTCT = 1 that belong to a PATT with more occurrences are still
# printed, and pattern-total lines are not affected by this option.

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current record number,
  # and terminates the script with exit status 1.  The global "abort"
  # is set so that the END rule (which still runs after "exit") knows
  # it must not print anything.
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function print_line(pa, st, tg, tt, ct,    i)
{
  # Prints pattern "pa", string "st", tag "tg", total count "tt",
  # per-paragraph counts "ct[1..nParags]".  A blank line is printed
  # first if "pa" differs from the pattern of the previously printed
  # line.  Also updates "last_pat_printed".
  if ((pa != last_pat_printed) && (last_pat_printed != "")) { printf "\n"; }
  printf "%6d", tt;
  for (i = 1; i <= nParags; i++) { printf " %2d", ct[i]; }
  printf " %s %s %s", pa, st, tg;
  printf "\n";
  last_pat_printed = pa;
}

# We have a buffer for the current string, and one for the current pattern.
# "patt" is the current pattern, "pat_XX" are its attributes.
# "strn" is the current string, "str_XX" are its attributes.

function dump_str_buffer(    i)
{
  # Prints the current string buffer (if any) and clears its accumulators.
  # The caller must set "pat_is_single" before calling.
  if (strn != "")
    {
      # Remember the most popular string for the current pattern:
      if (str_tt > pat_max_tt) { pat_st = strn; pat_max_tt = str_tt; }
      # Print the string data:
      if ((! totOnly) && ((! omitSingles) || (! pat_is_single)))
        { print_line(patt, strn, str_tg, str_tt, str_ct); }
    }
  # Clear string accumulators:
  strn = "";
  for (i = 1; i <= nParags; i++) { str_ct[i] = 0; }
  str_tt = 0;
}

function dump_pat_buffer(    i)
{
  # Prints the pattern-total line for the current pattern buffer
  # (when applicable) and clears its accumulators.
  if ((pat_tt != 0) && (totOnly || (pat_ns > 1)))
    { print_line(patt, (pat_st "~"), "=", pat_tt, pat_ct); }
  patt = "";
  pat_st = "";
  for (i = 1; i <= nParags; i++) { pat_ct[i] = 0; }
  pat_tt = 0;
  pat_ns = 0;
  pat_max_tt = 0;
}

BEGIN {
  abort = -1;
  # Force "nParags" to a number, so that a missing, non-numeric or
  # non-positive value is rejected here instead of turning the loop
  # bounds below into string comparisons.
  nParags += 0;
  if (nParags < 1) error("must specify \"-v nParags=NNNN\"");
  strn = ""; split("", str_ct); str_tt = 0;
  patt = ""; split("", pat_ct); pat_tt = 0; pat_ns = 0; pat_max_tt = 0;
  last_pat_printed = "";
}

/./ {
  if (abort >= 0) exit abort;
  if (NF != 4) error("wrong number of fields");
  # At this point we still have "current" data that hasn't been printed.
  pa = $1; st = $2; tg = $3; pn = $4;
  pi = pn + 0;
  # The pattern is anchored at both ends so that values with trailing
  # garbage (e.g. "12abc") are rejected rather than tallied as 12.
  if ((pn !~ /^[0-9]+$/) || (pi <= 0) || (pi > nParags))
    { error("bad parag number = " pn); }

  # These must be evaluated before the buffers are dumped, because
  # dumping resets "patt" and "strn".
  new_pat = (pa != patt);
  new_str = (new_pat || (st != strn));

  if (new_str || (tg != str_tg))
    {
      # Decide whether the pattern being closed has a single occurrence:
      pat_is_single = ((pat_tt == 1) && new_pat);
      # Print string data, if any, and reset counters:
      dump_str_buffer();
      # If pattern changed, print pattern data too, and reset counters:
      if (new_pat) { dump_pat_buffer(); patt = pa; }
      strn = st; str_tg = tg;
      # "pat_ns" counts distinct STRINGs, so a mere TAG change
      # within the same STRING does not increment it.
      if (new_str) { pat_ns++; }
    }

  # Tally this occurrence:
  str_ct[pi]++; str_tt++;
  pat_ct[pi]++; pat_tt++;
}

END {
  if (abort >= 0) exit abort;
  # The last pattern is closed by end-of-input rather than by a pattern
  # change, so "pat_is_single" must be recomputed here; otherwise it
  # would still hold the value decided for some earlier record.
  pat_is_single = (pat_tt == 1);
  dump_str_buffer();
  dump_pat_buffer();
}