#! /usr/bin/gawk -f
# Usage: $0 -v TOTAL=NNN < joinfile > output.cmp
#
# This script is used internally by compare-freqs.
#
# Input: lines in the format "word ct1 fr1 ct2 fr2 ... ctn frn",
# where the "ct"s are counts and the "fr"s are frequencies between 0 and 1.
# A line starting with "#" is treated as a column-header line: its fields
# 2..NF are echoed as column titles between a "TOTAL" and a "WORD" column,
# followed by a row of dashes.
#
# Output: for each non-empty data line,
#   " ctt frt ct1 fr1 ct2 fr2 ... ctn frn word"
# where "ctt" is the sum of the counts on that line and "frt" is ctt/TOTAL.
#
# The command line parameter TOTAL is the denominator used to compute "frt".
# It must be defined; it must also be a nonzero number as soon as the input
# contains a data line.
#
# WARNING: the word should not contain any spaces.

BEGIN {
  # `exit` in BEGIN stops the program before any input is read.
  if (TOTAL == "") {
    printf "must define TOTAL\n" > "/dev/stderr";
    exit 1
  }
}

# Header line: print the column titles, then an underline row of dashes.
# The dash string is longer than needed; "%11.11s" truncates it to 11 chars.
/^#/ {
  printf "##%11.11s", "TOTAL";
  for (i = 2; i <= NF; i++) printf " %-11.11s", $(i);
  printf " %-11.11s", "WORD";
  printf "\n";

  printf "# %11.11s", "--------------------------";
  for (i = 2; i <= NF; i++) printf " %-11.11s", "--------------------------";
  printf " %11.11s", "--------------------------";
  printf "\n";
  next;
}

# Data line: total the counts (even-numbered fields), print the total count
# and total frequency, then every (count, freq) pair, then the word.
/./ {
  # A zero or non-numeric TOTAL would make the division below abort with a
  # fatal "division by zero" error; fail with a clear message instead.
  # Checked here rather than in BEGIN so that header-only input still works.
  if (TOTAL + 0 == 0) {
    printf "TOTAL must be a nonzero number (got \"%s\")\n", TOTAL > "/dev/stderr";
    exit 1
  }

  TOTCT = 0;
  for (i = 2; i <= NF; i += 2) { TOTCT += $(i); }

  printf " %5d %5.3f", TOTCT, (TOTCT/TOTAL);
  for (i = 2; i <= NF; i += 2) printf " %5d %5.3f", $(i), $(i+1);
  printf " %s\n", $1;
  next
}