#! /usr/bin/gawk -f
# Last edited on 2001-01-18 02:38:46 by stolfi

BEGIN {
  abort = -1;
  usage = ( "count-ngrams -f parse-elem-list.gawk \\\n" \
    "  [ -v maxOrder=MAXORDER ] \\\n" \
    "  [ -v elemList='EL1,EL2,...ELN' ] \\\n" \
    "  [ -v spaceMarker=STRING ] \\\n" \
    "  [ -v showBadWords=BOOL ] \\\n" \
    "  < INFILE > OUTFILE" \
  );

  # Assumes the input is a text, one word per line, in plain EVA, with
  # capitalized ligatures and with elements marked off by {}.  The
  # program inserts a `spaceMarker' element in braces at every word
  # boundary (including the beginning and end of the file), and then
  # writes out several records of the form
  #
  #   COUNT NGRAM
  #
  # where NGRAM is a string of 1 to MAXORDER consecutive elements from
  # the text.
  #
  # One record is written for each NGRAM that does not overlap an
  # invalid word.  A word is considered valid only if all its elements
  # are in the specified list.

  if (maxOrder == "") { arg_error("must specify \"maxOrder\""); }

  if (spaceMarker == "") { spaceMarker = "_"; }
  spaceMarkerWBraces = ( "{" spaceMarker "}" );

  if (showBadWords == "") { showBadWords = 0; }

  if ((elemList == "") == (elemTable == ""))
    { arg_error("must define exactly one of \"elemList\" and \"elemTable\""); }

  split("", elem);
  split("", elemIndex);
  split("", elemClass);
  if (elemList != "")
    { nelems = parse_explicit_elems(elemList,elem,elemIndex,elemClass); }
  else
    { nelems = load_elems_from_file(elemTable,elem,elemIndex,elemClass); }

  # N-gram counts, indexed by the capitalized and bracketed ngram:
  split("", ngramCt);

  ntoken = 0;
  ntokenBad = 0;

  # The last valid elements that were processed are `buf[ibuf + k]'
  # for `k = 0..nbuf-1', where indices are taken modulo `maxOrder'
  # and `nbuf' is strictly less than `maxOrder'.
  split("", buf);

  # Account for the initial space:
  flush_buffer();
}

function flush_buffer() {
  # Called at the beginning of the file, and whenever a bad word is
  # found.  Empties the buffer and pretends a word space was just seen.
  ibuf = 0;
  if (maxOrder > 1)
    { buf[0] = spaceMarkerWBraces; nbuf = 1; }
  else
    { nbuf = 0; }
  ngramCt[spaceMarkerWBraces]++;
}

(abort >= 0) { exit abort; }

/^ *([#]|$)/ { next; }

/./ {
  ntoken++;
  if (NF != 1) { data_error("bad line format"); }
  token = $1;
  if (token !~ /^[{}a-zA-Z0-9?]+$/) { data_error("bad word"); }
  tk = token;
  # Split the word into its bracketed elements:
  gsub(/[}][{]/, "} {", tk);
  ne = split(tk, tkel, " ");
  # Append a space element to the word:
  tkel[ne+1] = spaceMarkerWBraces;
  # Element consistency check:
  tokenIsBad = 0;
  for (i = 1; i <= ne; i++)
    { e = tkel[i];
      if (e !~ /^[{][a-zA-Z0-9?]+[}]$/)
        { data_error(("badly formed elem \"" e "\"")); }
      gsub(/[{}]/, "", e);
      if (! (e in elemIndex)) { tokenIsBad = 1; }
    }
  if (tokenIsBad)
    { if (showBadWords) { printf "! %s\n", token > "/dev/stderr"; }
      flush_buffer();
      ntokenBad++;
    }
  else
    { # Enumerate the ngrams that end in this word or in its final space:
      for (i = 1; i <= ne+1; i++) { push_elem_and_tally_ngrams(tkel[i]); }
    }
  next;
}

function push_elem_and_tally_ngrams(ei,   j,g,r) {
  # Add the new element `ei' to the buffer and enumerate the ngrams
  # that end with it.
  j = (ibuf + nbuf) % maxOrder;
  buf[j] = ei;
  nbuf++;
  if (nbuf > maxOrder) { prog_error("bad nbuf"); }
  g = "";
  for (r = 0; r < nbuf; r++)
    { er = buf[(j + maxOrder - r) % maxOrder];
      g = (er g);
      # printf "%s\n", g > "/dev/stderr";
      ngramCt[g]++;
    }
  if (nbuf == maxOrder) { ibuf = (ibuf + 1) % maxOrder; nbuf--; }
}

END {
  if (abort >= 0) { exit abort; }
  printf "%d tokens read\n", ntoken > "/dev/stderr";
  printf "%d bad tokens found\n", ntokenBad > "/dev/stderr";
  for (g in ngramCt)
    { ct = ngramCt[g];
      printf "%7d %s\n", ct, g;
    }
}

function arg_error(msg) {
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 1;
}

function data_error(msg) {
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}

function prog_error(msg) {
  printf "***prog error: %s\n", msg > "/dev/stderr";
  abort = 1; exit 1;
}
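
# Example (illustrative only; the element names, file names, and
# maxOrder value below are hypothetical, and the elemList syntax is
# assumed to be the comma-separated form shown in the usage string,
# as parsed by parse_explicit_elems from parse-elem-list.gawk):
#
#   count-ngrams -f parse-elem-list.gawk \
#     -v maxOrder=2 -v elemList='ch,e,dy' \
#     < words.txt > counts.txt
#
# For an input consisting of the single word "{ch}{e}{dy}", the output
# should contain one record for each of the 1- and 2-grams
#
#   {_}  {ch}  {e}  {dy}  {_}{ch}  {ch}{e}  {e}{dy}  {dy}{_}
#
# each with count 1, except "{_}" with count 2: once for the initial
# space accounted for by flush_buffer in the BEGIN block, and once for
# the word-final space appended to the token.  The records come out in
# whatever order the `for (g in ngramCt)' loop enumerates them.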