#! /bin/gawk -f
# Last edited on 2004-02-26 18:15:01 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "cat INFILE.txt \\\n" \
    " | roman-code-words -f roman-code-fns.gawk \\\n" \
    " [ -v table=TBLNAME ] \\\n" \
    " [ -v honorCase=BOOL ] \\\n" \
    " [ -v maxNumWords=NUMWDS [ -v maxCode=MAXCODE ] ] \\\n" \
    " > OUTFILE.rcd" \
  );

  # Each line of the input must contain zero or more words separated
  # by spaces.  Maps each plain-word in the input to a numeric code,
  # then encodes the latter with a Roman-like number system, and
  # writes the resulting code-word.
  #
  # The file TBLNAME, which defaults to empty, should contain
  # a list of PLAINWORD CODEWORD pairs, one per line; it is used to map
  # plain-words to code-words directly.  In any case, each input word
  # that is not found in the table gets assigned the next available
  # code number, and this assignment is entered in the table.
  #
  # By default, the valid numeric codes are sequential, starting from 1.
  # However, if "maxNumWords" is specified, the valid numeric codes
  # are spaced further apart so that NUMWDS of them will span the
  # range [1..MAXNUMCODE] of valid numeric codes.  The value
  # of MAXNUMCODE defaults to the maximum allowed by the encoding,
  # but may be set explicitly by giving the "maxCode" parameter
  # (an *encoded* number).
  #
  # If {honorCase} is set (default), the script will attempt to copy
  # the capitalization of the input words into the code.  I.e. if the
  # input word is "Mars", and "foo" is the next available code word,
  # then "foo" is assigned to "mars" and "Foo" to "Mars".  This rule
  # also applies to pairs pre-defined in the initial table.  Otherwise
  # "Mars" and "mars" are considered distinct words and get distinct
  # codes.
  #
  # In any case, the completed encoding table will be written at the end
  # of the output text, as a bunch of "#"-comments, bracketed by
  # the heading "# BEGIN DICTIONARY" and "# END DICTIONARY".
  #
  # On an argument or table error the helper functions set {abort}
  # and call "exit"; the END rule checks {abort} so that no
  # statistics or dictionary are written in that case.

  if (honorCase == "") { honorCase = 1; }

  # The renc_* functions and variables come from the companion file
  # given with "-f" (see {usage}).
  renc_init();
  printf "maximum code = %s (%d)\n",
    renc_max_code, renc_max_num > "/dev/stderr";

  if (maxNumWords == "") {
    code_increment = 1;
  } else {
    printf "assuming max %d distinct words\n", maxNumWords > "/dev/stderr";
    if (maxCode == "") {
      maxNumCode = renc_max_num;
    } else {
      maxNumCode = renc_decode(maxCode);
      if (maxNumCode > renc_max_num) {
        arg_error(("given \"maxCode\" is too big"));
      }
    }
    printf "adjusting increment for max code = %s (%d)\n",
      maxCode, maxNumCode > "/dev/stderr";
    # Reject zero, negative or non-numeric values before dividing;
    # otherwise gawk would die with a division-by-zero fatal error.
    if ((maxNumWords + 0) < 1) {
      arg_error(("given \"maxNumWords\" must be at least 1"));
    }
    if (maxNumWords > maxNumCode) {
      arg_error(("given \"maxNumWords\" is too big"));
    }
    code_increment = maxNumCode/maxNumWords;
  }
  printf "code increment = %d\n", code_increment > "/dev/stderr";

  # {dic} maps each plain-word to its code-word; start with it empty.
  split("", dic);
  num_distinct_words = 0;
  max_assigned_num_code = 0;
  max_assigned_code = "";

  if (table != "") { load_table(table); }
}

# Stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Blank lines and lines starting with "#" or "@" (after optional
# spaces) are copied to the output unchanged.
/^ *([#@]|$)/ { print; next; }

# Any other line: replace each field by its code-word.  Assigning to
# the fields makes awk rebuild the record with single output separators.
// {
  for (i = 1; i <= NF; i++) {
    wd = $(i);
    $(i) = encode_word(wd);
  }
  print;
  next;
}

# The following is OK for texts with Latin alphabet:
#
#   // {
#     lin = $0; res = ""; w = "";
#     m = length(lin);
#     for (i = 1; i <= m; i++)
#       { c = substr(lin,i,1);
#         if (is_alpha(c))
#           { w = (w c); }
#         else
#           { if (w != "") { res = (res encode_word(w)); w = ""; }
#             res = (res c);
#           }
#       }
#     if (w != "") { res = (res encode_word(w)); w = ""; }
#     print res;
#   }
#
#   function is_alpha(c)
#   {
#     c = tolower(c);
#     return \
#       ((c >= "a") && (c <= "z")) || \
#       ((c >= "ß") && (c <= "ö")) || \
#       ((c >= "ø") && (c <= "ÿ"));
#   }

END {
  # An "exit" executed in BEGIN or in a main rule still runs this END
  # rule, so bail out here when the exit was caused by an error;
  # otherwise a failed run would still emit an (empty) dictionary.
  if (abort >= 0) { exit abort; }

  printf "%d distinct words seen\n", num_distinct_words > "/dev/stderr";
  printf "maximum code used = %s (%d)\n",
    max_assigned_code, max_assigned_num_code > "/dev/stderr";

  # Dump the final table as "#"-comments after the encoded text.
  printf "# BEGIN DICTIONARY\n";
  for (w in dic) { printf "# %s %s\n", w, dic[w]; }
  printf "# END DICTIONARY\n";
}
function encode_word(word,   w,num,code,pfx,sfx)
{
  # Returns the code-word for the plain-word {word}, assigning a new
  # numeric code (and recording it in {dic}) if the word is not yet
  # in the table.  An empty {word} is returned unchanged.
  #
  # Reads the globals {honorCase} and {code_increment}; updates {dic},
  # {num_distinct_words}, {max_assigned_num_code}, {max_assigned_code}.

  # Empty is not a word:
  if (word == "") { return word; }

  # Lookup word in table (by its lowercase form when honoring case):
  w = (honorCase ? tolower(word) : word);
  if (w in dic) {
    code = dic[w];
  } else {
    num_distinct_words++;
    # {code_increment} may be fractional; truncate to an integer code.
    num = int(max_assigned_num_code + code_increment);
    # NOTE(review): prog_error is not defined in this file; presumably
    # it is supplied by the companion functions file -- confirm.
    if (num <= max_assigned_num_code) { prog_error(("num code bug")); }
    code = renc_encode(num);
    max_assigned_num_code = num;
    max_assigned_code = code;
    dic[w] = code;
  }

  if (honorCase && (word != w)) {
    # The original {word} was not in lowercase.
    # Try to preserve its capitalization in the code, if possible.
    # Note that some codes have non-alpha prefixes.
    if (word ~ /[A-Z].*[A-Z]/) {
      # Two or more capitals -- assume all-caps:
      code = toupper(code);
    } else {
      # Assume initial-caps: capitalize the first lowercase letter,
      # leaving any prefix before it untouched.
      if (match(code, /[a-z]/)) {
        pfx = substr(code, 1, RSTART-1);
        sfx = substr(code, RSTART);
        code = (pfx toupper(substr(sfx,1,1)) substr(sfx,2));
      }
    }
    # We assign the modified code to {word} too, for
    # the benefit of dumber programs who may want to use our table.
    dic[word] = code;
  }
  return code;
}

function load_table(fname,   ntbl,nbadtbl,nlin,lin,fld,nfld,word,code,num,status)
{
  # Loads the mapping table {dic} from file {fname}, which must have
  # one "PLAINWORD CODEWORD" pair per line.  Blank lines and
  # "#"-comments are ignored.  When {honorCase} is set, both fields
  # are folded to lowercase before being stored.
  #
  # Also updates {max_assigned_num_code} and {max_assigned_code} to the
  # largest valid code found in the table.  Aborts the program (via
  # tbl_error or arg_error) on a malformed or repeated entry, on a
  # read error, or if the file is empty.

  max_assigned_num_code = 0;
  ntbl = 0; nbadtbl = 0; nlin = 0;

  # Keep getline's own result: >0 line read, 0 end of file, <0 error.
  while ((status = (getline lin < fname)) > 0) {
    nlin++;
    if (! match(lin, /^[ \011]*([#]|$)/)) {
      # Strip any trailing comment, then expect exactly two fields.
      gsub(/[#].*$/, "", lin);
      nfld = split(lin, fld, " ");
      if (nfld != 2) {
        tbl_error(fname, nlin, ("bad table entry = \"" lin "\""));
      }
      word = fld[1]; code = fld[2];
      if (honorCase) { word = tolower(word); code = tolower(code); }
      if (word in dic) {
        tbl_error(fname, nlin, ("repeated plainword = \"" lin "\""));
      }
      dic[word] = code;
      # The code below treats -1 as "could not decode"; presumably the
      # second argument of renc_decode is the value returned for an
      # invalid code -- confirm against its definition.
      num = renc_decode(code,-1);
      if (num == -1) {
        nbadtbl++;
      } else if (num > max_assigned_num_code) {
        # Keep the encoded form in step with the numeric maximum so
        # that the final report shows a matching pair.
        max_assigned_num_code = num;
        max_assigned_code = code;
      }
      ntbl++;
    }
  }
  # A negative status means the file could not be opened or read.
  # Testing getline's result is reliable; the initial value of ERRNO
  # differs between gawk versions, so it is used only for the message.
  if (status < 0) { tbl_error(fname, nlin, ERRNO); }
  close(fname);

  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }

  printf "loaded %6d pairs from %s\n", ntbl, fname > "/dev/stderr";
  if (nbadtbl > 0) {
    printf "!!! warning - file %s contains %d invalid word codes\n",
      fname, nbadtbl > "/dev/stderr";
  }
}

function arg_error(msg)
{
  # Reports a command-line argument error {msg} plus the usage text
  # on stderr, flags the failure in {abort}, and exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function tbl_error(f,n,msg)
{
  # Reports an error {msg} at line {n} of table file {f} on stderr,
  # flags the failure in {abort}, and exits with status 1.
  printf "file %s, line %d: %s\n", f, n, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error {msg} at the current input line (FNR) on stderr,
  # flags the failure in {abort}, and exits with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}