#! /usr/bin/gawk -f
# Last edited on 1999-07-17 15:48:14 by stolfi

# Reads a sequence of records of the form
#
#   CTA FRA CTB FRB WORD
#
# where CTA, CTB are counts of occurrences of WORD in each language,
# and FRA, FRB are the corresponding estimated frequencies.
#
# Outputs for each record a line
#
#   CTA FRA CTB FRB PROB WORD
#
# where PROB is the probability of an occurrence of the
# WORD being an instance of language A.

# The rule fires on every input line that contains at least one character;
# completely empty lines are skipped.
/./ {
  # Bias both frequencies by the same small constant before taking the
  # ratio (presumably so that the denominator cannot be zero when both
  # FRA and FRB are 0, in which case PROB comes out as 0.5).
  fa = $2 + 0.0001;
  fb = $4 + 0.0001;

  # Share of the combined (biased) frequency that belongs to language A.
  x = fa / (fa + fb);

  # The first four fields are echoed from the input unchanged; only PROB
  # is computed.  WORD is taken from field 5 only.
  printf "%7d %6.4f %7d %6.4f %6.4f %s\n", $1, $2, $3, $4, x, $5;
}

# Alternative formula (uses the counts, each biased by 1):
#   na = $1 + 1; nb = $3 + 1;
#   x = (fa*na - fb*nb)/(fa*na + fb*nb);
# Result is in [ -1 .. +1 ]