#! /usr/bin/gawk -f
# Last edited on 1999-01-06 06:45:18 by stolfi

# Translates a pattern occurrence index file from one encoding to another.
# Usage: $0 MAPFILE < INFILE > OUTFILE
#
# The INFILE should have records of the form
#
#   LOC LOCPOS GLOBPOS PAT OBS
#
# meaning that pattern PAT occurred in line LOC,
# displaced LOCPOS characters from the beginning of the line,
# and GLOBPOS characters from the beginning of the whole text.
# The OBS is an optional comment about this occurrence; only a
# single whitespace-delimited field (field 5) is carried over.
# Empty input lines are ignored.
#
# The MAPFILE must contain entries in the format
#   OLDPAT NEWPAT NEWOBS
# meaning that any occurrence of pattern OLDPAT listed in INFILE
# should be replaced by an occurrence of NEWPAT.
# If the NEWOBS field is present, it replaces the OBS field from
# INFILE (for that alternative only).  Blank MAPFILE lines are ignored.
#
# There may be multiple lines with the same OLDPAT; one output
# line will be written for each.
#
# Exit status is 1 on a usage error, an unreadable MAPFILE, or an
# INFILE pattern that has no entry in MAPFILE.

# Prints MSG to stderr and terminates the program with status 1.
# The message is prefixed with the current input line number once
# input records are being read (NR > 0); in BEGIN there is no
# current line, so no prefix is printed.
function error(msg)
{
  if (NR > 0)
    { printf "line %d: %s\n", NR, msg > "/dev/stderr" }
  else
    { printf "%s\n", msg > "/dev/stderr" }
  abort = 1
  exit 1
}

# Loads MAPFILE into three tables:
#   nalts[OLDPAT]   = number of alternatives listed for OLDPAT
#   mpat[OLDPAT,k]  = NEWPAT of the k-th alternative (k = 1..nalts)
#   mobs[OLDPAT,k]  = NEWOBS of the k-th alternative, if one was given
BEGIN {
  abort = 0
  usage = (ARGV[0] " MAPFILE < OCCFILE")
  if (ARGC != 2) { error(("usage: " usage)); }
  mfile = ARGV[1];
  split("", mpat)
  split("", mobs)
  split("", nalts)
  # Keep getline's return value: 0 is end of file, -1 is a read/open error.
  while ((status = (getline e < mfile)) > 0) {
    # Skip blank lines so they do not register a mapping for pattern "".
    if (split(e, fld) == 0) { continue }
    opat = fld[1]
    npat = fld[2]
    if (opat in nalts)
      { nalts[opat]++ }
    else
      { nalts[opat] = 1 }
    k = nalts[opat]
    mpat[opat,k] = npat
    if (3 in fld) { mobs[opat,k] = fld[3] }
  }
  # Test the getline status rather than ERRNO: ERRNO's "no error" value
  # is not "0" on every gawk version, so comparing against "0" can
  # report a failure after a perfectly good read.
  if (status < 0) { error((mfile ": " ERRNO)); }
  close(mfile);
  # Drop MAPFILE from the argument list so the main rule reads stdin.
  ARGC = 1
}

# For each non-empty occurrence record, writes one output record per
# alternative NEWPAT registered for the record's pattern.
/./ {
  if (abort) exit 1;
  loc = $1
  lpos = $2
  gpos = $3
  opat = $4
  # error() adds the "line N: " prefix and exits with status 1.
  if (!(opat in nalts)) { error(("unmatched pattern \"" opat "\"")) }
  n = nalts[opat];
  for (k = 1; k <= n; k++)
    {
      # Choose the comment for this alternative without modifying $5,
      # so a NEWOBS from one alternative cannot leak into the next.
      if ((opat,k) in mobs)
        { obs = mobs[opat,k]; hasobs = 1 }
      else if (NF >= 5)
        { obs = $5; hasobs = 1 }
      else
        { hasobs = 0 }
      if (hasobs)
        { print loc, lpos, gpos, mpat[opat,k], obs }
      else
        { print loc, lpos, gpos, mpat[opat,k] }
    }
  next
}