#! /usr/bin/gawk -f
# Last edited on 2005-01-13 03:32:10 by stolfi

# Reads a file in the EVMT or JS-interlinear format. Outputs one
# record per text line, containing the line locator (without "<>"s
# and without transcriber codes) and the first {txlen} chars of the
# line.
#
# If {clean} is TRUE (default), removes all fillers, comments,
# []-groups, weirdo codes, etc. If the {prefix} and/or {suffix}
# strings are given, they are concatenated with the text. All these
# options take effect before the text is truncated.
#
# If there are multiple transcriptions of the same line, outputs
# only the most reliable one. The valid codes, in order of INCREASING
# reliability, are specified through the {trans} variable; the code "*"
# stands for a line that carries no transcriber code. If {require}
# is TRUE (default), there must be at least one valid transcription for
# every locator that appears in the input file.

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    " [ -v trans=CODES ] [ -v require=BOOL ] \\\n" \
    " [ -v clean=BOOL ] [ -v prefix=STR ] [ -vsuffix=STR ] \\\n" \
    " [ -v txlen=NUM ] \\\n" \
    " < INFILE \\\n" \
    " > LOCLIST" \
  );

  # Defaults for the options that were not given with "-v":
  if (trans == "") { trans = "*"; }
  # This used to assign to {default} (a reserved word in gawk), so
  # {require} never received its documented default of TRUE.
  if (require == "") { require = 1; }
  if (txlen == "") { txlen = 20; }
  if (clean == "") { clean = 1; }
  if (prefix == "") { prefix = ""; }
  if (suffix == "") { suffix = ""; }

  # Locator whose best transcription is currently being accumulated
  # ("" means none yet); {orel} and {otxt} hold the reliability and
  # text of the best candidate seen so far for it.
  oloc = "";
}

# Once an error has been flagged, stop processing input.
(abort >= 0) { exit abort; }

# Text lines: those that start with a locator such as "<f1r.3;H>".
/^[<][a-z][0-9]+[rv][0-9]*[.]/ {
  # Get line locator, clean it up:
  loc = $1;
  gsub(/[<>]/, "", loc);

  # Extract the transcriber's code {trc} ("*" means "none").
  if (match(loc, /[;][A-Za-z]$/)) {
    trc = substr(loc, RSTART+1, 1);
    loc = substr(loc, 1, RSTART-1);
  } else {
    trc = "*";
  }

  # Find its reliability {rel}: the 1-based position of {trc} in
  # {trans}, or 0 if the code is not listed there.
  rel = index(trans, trc);

  # Get sample text {txt}, clean it up, trim it:
  txt = $0;

  # Remove locator (the pattern is anchored, so one substitution suffices):
  sub(/^[<][^<>]*[>] */, "", txt);

  if (clean) {
    # Remove {}-comments, innermost first, repeating until none is
    # left so that nested {}'s are removed at any depth:
    while (gsub(/[{][^{}]*[}]/, "", txt) > 0) { }
    # Remove plumes:
    gsub(/[\']/, "", txt);
    # Remove spaces, fillers, line and parag markers, ()-markers:
    gsub(/[-!,.\/= ()]/, "", txt);
    # Gabriel-style weirdoes:
    gsub(/[&][0-9]+[;]/, "*", txt);
    # Rene-style weirdoes:
    gsub(/[$][0-9][0-9][0-9]/, "*", txt);
    # Keep only the first alternative in "[|]" groups:
    gsub(/[[]/, "", txt);
    gsub(/[|][a-z\'*|]*[]]/, "", txt);
  }

  # Trim text to the required length:
  txt = substr((prefix txt suffix), 1, txlen);

  # A new locator starts: emit the previous one and reset the state.
  if (loc != oloc) {
    flush_loc();
    oloc = loc;
    orel = -1;
  }
  # Keep this version only if it is strictly more reliable than the
  # best one seen so far for this locator.
  if (rel > orel) {
    orel = rel;
    otxt = txt;
  }
}

END {
  if (abort >= 0) { exit abort; }
  # Emit the last pending locator, if any.
  flush_loc();
}

function flush_loc()
{
  # Outputs the pending locator {oloc} with its best text {otxt}.
  # If none of its versions had a code listed in {trans} ({orel} is 0),
  # the locator is reported as an error when {require} is true, and
  # silently skipped otherwise. Does nothing if no locator is pending.
  if (oloc != "") {
    if (orel == 0) {
      if (require) {
        data_error(("no valid transcription for \"" oloc "\""));
      }
    } else {
      print oloc, otxt;
    }
  }
}

function arg_error(msg)
{
  # Prints {msg} and the usage text to stderr, then aborts with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg)
{
  # Prints {msg} to stderr with the current input line number and
  # contents, then aborts with status 1.
  # NOTE(review): when called from flush_loc, FNR and $0 are those of
  # the line being read at that moment, not of the offending locator.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
  abort = 1;
  exit 1
}