#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:41:44 by stolfi

BEGIN {
  abort = -1;
  usage = "compute-freq-diffs [-normalize] FILEA FILEB";
  #
  # where FILEA and FILEB are names of files, and each line in them
  # has the format COUNT COORD LABEL, where COUNT is an integer, COORD
  # is a real number, and LABEL is any word.  (The LABELs must be
  # sorted and must match in both files.)  The COORD field of the ith
  # line of FILEA is interpreted as the ith coordinate A[i] of a
  # vector in some high-dimensional space; and ditto for FILEB.
  #
  # After reading both files, prints to stdout another list of
  # COORD LABEL pairs, where the COORD for each LABEL is the signed
  # difference A[i] - B[i].  Lines are emitted in the order of gawk's
  # "for (w in d)" traversal, which is not necessarily sorted.
  #
  # If "-normalize" is specified, the numbers are scaled
  # so that the largest has absolute value 1.0000.  (If every
  # difference is zero, the numbers are printed unscaled.)

  # Consume leading options; shiftarg() drops ARGV[1] each time.
  while ((ARGC > 1) && (substr(ARGV[1],1,1) == "-")) {
    if (ARGV[1] == "-normalize") {
      normalize = 1;
      shiftarg();
    } else {
      error(("bad option " ARGV[1] " - usage: " usage));
    }
  }
  if (ARGC != 3) { error(("ARGC = " ARGC " - usage: " usage)); }

  apt = ARGV[1];
  if (apt == "") { error(("usage: " usage)); }
  bpt = ARGV[2];
  if (bpt == "") { error(("usage: " usage)); }

  # Pass 1: load the coordinates of FILEA, indexed by label.
  N = 0;
  while ((rc = (getline < apt)) > 0) {
    N++;
    if (NF != 3) { error((apt ", line " N ": bad format")); }
    w = $3;
    d[w] = $2;
  }
  # getline yields -1 on an open/read failure and 0 at end of file.
  # Testing the return value (rather than ERRNO against "0") stays
  # correct whatever ERRNO's initial value is in this gawk version.
  if (rc < 0) { error((apt ": " ERRNO)); }
  close(apt);

  # Pass 2: subtract the coordinates of FILEB, label by label.
  N = 0;
  while ((rc = (getline < bpt)) > 0) {
    N++;
    if (NF != 3) { error((bpt ", line " N ": bad format")); }
    w = $3;
    d[w] = d[w] - $2;
  }
  if (rc < 0) { error((bpt ": " ERRNO)); }
  close(bpt);

  # Choose the scale factor: the largest absolute difference when
  # normalizing, 1 otherwise.
  if (normalize) {
    maxd = 0;
    for (w in d) {
      da = d[w];
      da = ( da < 0 ? -da : da );
      if (da > maxd) { maxd = da; }
    }
    # All differences zero (or no data): avoid a fatal division by zero.
    if (maxd == 0) { maxd = 1.0; }
  } else {
    maxd = 1.0;
  }

  for (w in d) {
    printf "%+7.5f %s\n", d[w]/maxd, w;
  }
}

# Prints msg on standard error, records the failure in the global
# "abort", and terminates the program with exit status 1.
function error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit(1);
}

# NOTE(review): the rest of shiftarg lies beyond the reviewed text; the fragment below is left exactly as found.
function shiftarg( i) { for(i=1;i