#! /usr/bin/gawk -f
# Last edited on 2004-09-26 10:38:26 by stolfi

# Tabulates, for each distinct word of the input, its number of
# occurrences and the positions of its first and last occurrence.

BEGIN {
  # {abort} is negative while no error has been detected; the error
  # functions below set it to the desired exit status.
  abort = -1;
  usage = ( ARGV[0] " < INPUT.wds > OUTPUT.flo" );

  # Input is a list of tokens, one per line, non-empty and with
  # no embedded spaces.  Output has one line for each distinct word,
  # in the format
  #
  #   "{COUNT} {FIRST} {LAST} {DIFF} {WORD}"
  #
  # where {COUNT} is the number of occurrences, {FIRST} and
  # {LAST} are the positions of the first and last occurrence
  # (counting from 0), and {DIFF} is {LAST-FIRST}.
  #
  # The order of the output lines is whatever order the
  # "for (wd in fo)" traversal in the END rule happens to produce.

  # Make sure that the tables exist as (empty) arrays:
  split("", fo);  # fo[wd] = position of first occurrence of {wd}.
  split("", lo);  # lo[wd] = position of last occurrence of {wd}.
  split("", ct);  # ct[wd] = number of occurrences of {wd}.
}

# Stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Main rule: tally the word found on the current line.
{
  # A line with no fields (empty, or consisting only of blanks) has no
  # word on it.  Testing {NF} rather than matching /./ keeps blank-only
  # lines from being tallied as an empty word.
  if (NF == 0) { data_error("bad word"); }

  # Only the first field is used as the word.
  wd = $1;
  if (! (wd in ct)) { ct[wd] = 0; fo[wd] = FNR-1; }
  lo[wd] = FNR-1;
  ct[wd]++;
  next
}

END {
  # The "exit" inside the error functions still runs this rule, so the
  # error status must be re-issued here and the output suppressed.
  if (abort >= 0) { exit abort; }
  for (wd in fo) {
    printf "%7d %7d %7d %7d %s\n", ct[wd], fo[wd], lo[wd], lo[wd]-fo[wd], wd;
  }
}

function arg_error(msg)
{
  # Reports a command line error {msg} and the usage string on
  # "/dev/stderr", then terminates with exit status 1.
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input data error {msg} on "/dev/stderr", tagged with the
  # current file name and line number, then terminates with exit status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}