#! /usr/bin/gawk -f
# Last edited on 1998-12-25 06:51:57 by stolfi

# Splits a multipage EVT-format file into one file per text unit.
#
# Usage:
#   split-evt-into-units -v outdir=DIR -v style=STYLE < FILE > PAGELIST
#
# Each unit must be preceded by a header line of the form
# "## <PAGE.UNIT> ..." or "## <PAGE> ...", where PAGE is a page
# f-number (e.g. "f86v3").  UNIT is one capital letter for the EVMT
# style, or a letter optionally followed by one letter or digit for
# the STOLFI style.
#
# Output:
#   * "DIR/<PAGE.UNIT>" (or "DIR/<PAGE>"): the header line, the "#"
#     comment lines and the text lines belonging to that unit.  Lines
#     are APPENDED, so a unit whose header occurs more than once is
#     accumulated in one file, and files left over in DIR from an
#     earlier run are extended rather than truncated.
#   * "DIR/junk": every line that was rejected (a diagnostic is also
#     written to stderr for each one).
#   * stdout: the name of the unit file, printed once per valid header
#     line, in the order encountered.
#
# Attempts to work with either locator style: Stolfi's
# "<f86v3.L1.1;H>" (page.unit.line;letter) or EVMT's "<f86v3.L1;H>"
# (page.unit+line;letter).
#
# Blank lines are silently discarded.

BEGIN {
  abort = -1;
  usage = "split-evt-into-units -v outdir=DIR -v style=STYLE < FILE > PAGELIST";

  if (outdir == "") { arg_error("must define \"outdir\""); }
  if (style == "") { arg_error("must define \"style\""); }
  if ((style != "EVMT") && (style != "STOLFI")) { arg_error("invalid \"style\""); }

  file = "";                   # Name of the current unit file ("" = none yet).
  junk = ( outdir "/junk" );   # Where rejected lines are dumped.
  unit = "";                   # Locator of the current unit, without "<" and ">".
}

function pline(lin)
{
  # Appends the line "lin" to the current unit "file".  If no unit
  # header has been accepted yet, reports the problem and sends the
  # line to the "junk" file instead.
  if (file == "")
    {
      format_error("line with no unit");
      print lin > junk;
    }
  else
    {
      print lin >> file;
    }
}

# Safety net: stop reading input as soon as an error routine has set "abort".
// {
  if (abort >= 0) { exit abort; }
}

# Unit header line: switches the output to a new unit file.
/^##/ {
  if (style == "EVMT")
    {
      mtc = match($0, /[<]f[0-9]+[rv]?[0-9]?(|[.][A-Z])[>]/);
    }
  else if (style == "STOLFI")
    {
      mtc = match($0, /[<]f[0-9]+[rv]?[0-9]?(|[.][A-Za-z][A-Za-z0-9]?)[>]/);
    }
  else
    {
      program_error("bad style");
    }

  if (mtc == 0)
    {
      # The current "file" and "unit" are deliberately left unchanged.
      format_error("bad format of unit header");
      print $0 > junk;
    }
  else
    {
      if (file != "") { close(file); }
      # Drop the enclosing "<" and ">" from the matched locator.
      unit = substr($0, RSTART+1, RLENGTH-2);
      file = (outdir "/" unit);
      printf "%s\n", file;
      pline($0);
    }
  next;
}

# Ordinary comment line: copied to the current unit file.
/^#/ {
  pline($0);
  next;
}

# Blank line: ignored.
/^ *$/ {
  next;
}

# Anything else must be a text line that starts with a locator whose
# page-and-unit prefix equals the current "unit".
// {
  un = "";
  if (style == "EVMT")
    {
      # Prefix "<PAGE.U"; the remainder must be "NNN;X>".
      mtc = match($0, /^[<]f[0-9]+[rv]?[0-9]?[.][A-Z]/);
      if (mtc != 0)
        {
          un = substr($0, RSTART+1, RLENGTH-1);   # Strip the leading "<".
          # The argument is evaluated with the RLENGTH of the match
          # above, before this second match() overwrites it.
          mtc = match(substr($0,RLENGTH+1), /^[0-9]+[;][A-Z][>]/);
        }
    }
  else if (style == "STOLFI")
    {
      # Prefix "<PAGE.UNIT."; the remainder must be "NNN[a-z];X>".
      mtc = match($0, /^[<]f[0-9]+[rv]?[0-9]?[.][A-Za-z][A-Za-z0-9]?[.]/);
      if (mtc != 0)
        {
          un = substr($0, RSTART+1, RLENGTH-2);   # Strip "<" and the trailing ".".
          mtc = match(substr($0,RLENGTH+1), /^[0-9]+[a-z]?[;][A-Z][>]/);
        }
    }
  else
    {
      program_error("bad style");
    }

  if (mtc == 0)
    {
      format_error("bad format");
      print $0 > junk;
    }
  else if (un != unit)
    {
      format_error("wrong unit");
      print $0 > junk;
    }
  else
    {
      pline($0);
    }
  next;
}

# No further catch-all rule is needed: the rule above consumes every
# record that reaches it (it always ends in "next" or exits).

END {
  if (file != "") { close(file); }
  close(junk);
}

function format_error(msg)
{
  # Reports a problem with the current input line on stderr; does not abort.
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg >> "/dev/stderr";
}

function arg_error(msg)
{
  # Reports a bad command-line setting plus the usage string, then
  # terminates with exit status 1.
  printf "%s\n", msg >> "/dev/stderr";
  printf "usage: %s\n", usage >> "/dev/stderr";
  abort = 1;
  exit abort;
}

function program_error(msg)
{
  # Reports an internal inconsistency and terminates with exit status 1.
  printf "file %s, line %d: prog error %s\n", FILENAME, FNR, msg >> "/dev/stderr";
  abort = 1;
  exit abort;
}