#! /usr/bin/gawk -f
# Last edited on 1999-07-17 15:48:14 by stolfi

# Reads a sequence of records of the form CTA FRA CTB FRB WORD
# where CTA, CTB are counts of occurrences of WORD in each language,
# FRA, FRB are the corresponding estimated frequencies.
# Outputs for each record a line CTA FRA CTB FRB PROB WORD,
# where PROB is the probability of an occurrence of the 
# WORD being an instance of language A.

# Main rule: runs for every non-empty input line.
#   $1 = CTA, $2 = FRA, $3 = CTB, $4 = FRB, $5 = WORD
# Echoes the four input numbers, then PROB = fa/(fa + fb), then WORD.
/./{
  # Offset both frequencies by 0.0001 so that the denominator is
  # non-zero even when FRA and FRB are both zero (PROB is then 0.5).
  fa = $2 + 0.0001;
  fb = $4 + 0.0001;
  x = fa / (fa + fb);
  printf "%7d %6.4f %7d %6.4f %6.4f %s\n", $1, $2, $3, $4, x, $5;
}

# Alternative formula (not used), which weights each frequency by its
# incremented count na = CTA + 1, nb = CTB + 1:
#   x = (fa*na - fb*nb)/(fa*na + fb*nb);
# Result is in [ -1 .. +1 ]