#! /usr/bin/gawk -f
# Last edited on 2013-03-18 22:56:51 by stolfilocal

BEGIN {
  USAGE = "sort-segs-by-words.gawk -v charFile={CHARFILE} -v wordFile={WORDSFILE} \\\n";
  
  # Reads two files {CHARFILE} and {WORDSFILE} written by {SegmentTruth.java}.
  # Writes to standard output a file in the same format, with one line
  # for each character, where the field {ICH} is the same as in
  # {CHARFILE} but {IWD} is the sequence number of the best-matching
  # word from {WORDSFILE}.
  # 
  # Characters are sorted by word, and left to right within each word. A
  # blank line is written after each word except the last. Characters
  # that do not match any word are reported on {stderr} and discarded;
  # words that are not matched by any character are reported on {stderr}.
  # Per-word size statistics are also written to {stderr}.
  
  # Both variables are mandatory; {arg_error} prints the message and exits.
  if (charFile == "") { arg_error("must define \"charFile\""); }
  if (wordFile == "") { arg_error("must define \"wordFile\""); }

  # Data read from character file (indexed {0..nchars-1}):
  split("", ch_iname); # Image name.
  split("", ch_level); # Image level.
  split("", ch_seq);   # Segment number assigned by {SegmentTruth.java}.
  split("", ch_xmin);  # Min X of bounding box.
  split("", ch_ymin);  # Min Y of bounding box.
  split("", ch_w);     # X size of bounding box.
  split("", ch_h);     # Y size of bounding box.
  split("", ch_rmin);  # Min ellipse radius.
  split("", ch_rmax);  # Max ellipse radius.
  split("", ch_val);   # Character value.

  # Data read from word file (indexed {0..nwords-1}):
  split("", wd_iname); # Image name.
  split("", wd_level); # Image level. 
  split("", wd_seq);   # Segment number assigned by {SegmentTruth.java}.
  split("", wd_xmin);  # Min X of bounding box.
  split("", wd_ymin);  # Min Y of bounding box.
  split("", wd_w);     # X size of bounding box.
  split("", wd_h);     # Y size of bounding box.
  split("", wd_rmin);  # Min ellipse radius.
  split("", wd_rmax);  # Max ellipse radius.
  split("", wd_val);   # Word value.
  
  # Read the files:
  nchars = read_file(charFile, ch_iname, ch_level, ch_seq, ch_xmin, ch_ymin, ch_w, ch_h, ch_rmin, ch_rmax, ch_val); 
  nwords = read_file(wordFile, wd_iname, wd_level, wd_seq, wd_xmin, wd_ymin, wd_w, wd_h, wd_rmin, wd_rmax, wd_val); 

  # Computed character data (indexed {0..nchars-1}): 
  split("", ch_iwd);   # Index of the word assigned by this program, or -1 if none.
  
  # Computed word data (indexed {0..nwords-1}): 
  split("", wd_nch);   # Number of characters assigned to this word.
  for (iwd = 0; iwd < nwords; iwd++) { wd_nch[iwd] = 0; }
  
  # Pair characters and words: each character goes to the word with
  # the highest box-match score, provided the score exceeds a tiny
  # positive threshold (so that zero overlap means "no match").
  for (ich = 0; ich < nchars; ich++)
    { 
      mcw_best = 0.00001;
      iwd_best = -1;
      for (iwd = 0; iwd < nwords; iwd++)
        {
          mcw = compare_char_word_boxes(ich, iwd);
          if (mcw > mcw_best)
            { mcw_best = mcw; iwd_best = iwd; }
        }
        
      ch_iwd[ich] = iwd_best;
        
      if (iwd_best < 0) 
        { match_warning(("character " ich " (seq \"" ch_seq[ich] "\") does not match any word")); }
      else
        { wd_nch[iwd_best]++; }
    }
    
  # Check for unmatched words: 
  for (iwd = 0; iwd < nwords; iwd++) 
    { if (wd_nch[iwd] == 0)
        { match_warning(("word " iwd " (seq \"" wd_seq[iwd] "\") does not match any character")); }
    }
  
  # Index sort characters by word and X position (insertion sort):
  split("", ch_ich); # Indexed by {0..nchars-1}, gives indices of characters in sorted order.
  for (ich = 0; ich < nchars; ich++)
    { jch = ich;
      # Shift up the already-sorted entries that must come after {ich}:
      while((jch > 0) && (compare_chars(ch_ich[jch-1],ich) > 0)) 
        { ch_ich[jch] = ch_ich[jch-1];
          jch--;
        }
      ch_ich[jch] = ich;
    }
      
  # Write out the matched characters, grouped by word:
  swd_prev = -1; # Sequence number of the word being written, or -1 before the first.
  for (kch = 0; kch < nchars; kch++)
    {
      ich = ch_ich[kch];
      iwd = ch_iwd[ich];
      if (iwd >= 0)
        { swd = wd_seq[iwd];
          if (swd != swd_prev)
            { # Starting a new word: close the previous one, if any.
              if (swd_prev >= 0) { print_word_stats(swd_prev);  printf "\n"; }
              init_word_stats();
            }
          printf \
            "%30s %02d  %5d %5d  %5d %5d  %5d %5d  %7.1f %7.1f %s\n", \
            ch_iname[ich], ch_level[ich], ch_seq[ich], swd, \
            ch_xmin[ich], ch_ymin[ich], ch_w[ich], ch_h[ich], \
            ch_rmin[ich], ch_rmax[ich], ch_val[ich];
          fflush("/dev/stdout");
          swd_prev = swd;
          accum_word_stats(ch_w[ich], ch_h[ich]);
        }
    }
  # Statistics of the last word (no trailing blank line):
  if (swd_prev >= 0) { print_word_stats(swd_prev); }
}


function init_word_stats()
{
  # Resets the global per-word statistics accumulators. Each table is
  # kept in decreasing order and indexed {0..wst_nchars-1}.
  wst_nchars = 0;

  delete wst_w; # Letter widths.
  delete wst_h; # Letter heights.
  delete wst_a; # Letter aspects (height/width).
}

function accum_word_stats(w,h,  aspect)
{
  # Adds one character of width {w} and height {h} to the global
  # per-word tables {wst_w,wst_h,wst_a}, keeping each in decreasing
  # order, and then counts it in {wst_nchars}.
  insert_word_stat(wst_w, wst_nchars, w);
  insert_word_stat(wst_h, wst_nchars, h);
  aspect = h/w;
  insert_word_stat(wst_a, wst_nchars, aspect);
  wst_nchars = wst_nchars + 1;
}

function insert_word_stat(ztb,n,z,  pos)
{
  # Inserts {z} into {ztb[0..n-1]}, which is assumed to be sorted in
  # decreasing order, so that {ztb[0..n]} is sorted too. The caller is
  # responsible for incrementing {n}.
  for (pos = n; pos > 0; pos--)
    { # Stop at the first entry (scanning downwards) that is not smaller than {z}:
      if (!(ztb[pos-1] < z)) { break; }
      ztb[pos] = ztb[pos-1];
    }
  ztb[pos] = z;
}

function print_word_stats(swd,   lo_w,hi_w,lo_h,hi_h,lo_a,hi_a)
{
  # Writes to {stderr} one summary line for the word with sequence
  # number {swd}, using the global tables {wst_w,wst_h,wst_a} and the
  # count {wst_nchars}: the character count, then the range of each
  # dimension, then the outliers (if any) of each dimension.
  printf("word %5s: %3d chars", swd, wst_nchars) > "/dev/stderr";

  # The tables are in decreasing order, so entry 0 is the maximum; the
  # minimum is picked with outlier rejection.
  lo_w = word_stats_pick_min(wst_w, wst_nchars);
  hi_w = wst_w[0];
  lo_h = word_stats_pick_min(wst_h, wst_nchars);
  hi_h = wst_h[0];
  lo_a = word_stats_pick_min(wst_a, wst_nchars);
  hi_a = wst_a[0];

  print_word_dim_stats("width",  lo_w, hi_w);
  print_word_dim_stats("height", lo_h, hi_h);
  print_word_dim_stats("aspect", lo_a, hi_a);

  print_word_dim_outliers("width",  wst_w, wst_nchars, lo_w);
  print_word_dim_outliers("height", wst_h, wst_nchars, lo_h);
  print_word_dim_outliers("aspect", wst_a, wst_nchars, lo_a);

  printf("\n") > "/dev/stderr";
}

function print_word_dim_stats(name,zmin,zmax,  ratio)
{
  # Prints to {stderr} the range {[zmin _ zmax]} of the dimension
  # called {name}, followed by the ratio {zmax/zmin}.
  ratio = zmax/zmin;
  printf(" %s range = [ %5.1f _ %5.1f ] %5.1f", name, zmin, zmax, ratio) > "/dev/stderr";
}

function print_word_dim_outliers(name,ztb,n,zmin,  idx,lead)
{
  # Prints to {stderr} the entries of {ztb[0..n-1]} that are smaller
  # than the chosen minimum {zmin}, smallest first, as a parenthesized
  # comma-separated list. Assumes the table is in decreasing order, so
  # the outliers are a suffix of it. Prints nothing if there are none.
  if (!(ztb[n-1] < zmin)) { return; }

  printf(" %s outliers = ", name) > "/dev/stderr";
  lead = "(";
  for (idx = n-1; (idx >= 0) && (ztb[idx] < zmin); idx--)
    { printf("%s%.1f", lead, ztb[idx]) > "/dev/stderr";
      lead = ",";
    }
  printf(")") > "/dev/stderr";
}

function word_stats_pick_min(ztb,n,  q,j)
{
  # Picks the smallest entry of {ztb[0..n-1]} (sorted in decreasing
  # order), but tries to reject outliers: starting at the entry at
  # index {int(0.75*n)}, it walks towards smaller entries only while
  # they are at least 70% of that starting entry.
  q = int(n*0.75); # Index of the 0.25 lower fractile.
  for (j = (q >= n ? n-1 : q); j < n-1; j++)
    { if (!(ztb[j+1] >= 0.70*ztb[q])) { break; } }
  return ztb[j];
}

function read_file(fname,iname,level,seq,xmin,ymin,w,h,rmin,rmax,val,  lin,ntbl,nlin,fld,nfld,tmp,st)
{
  # Reads a file created by {SegmentTruth.java}.
  # Ignores #-comment lines and blank lines; a trailing #-comment
  # after the 11th field is also ignored.
  # Stores the segments sequentially in the tables {iname[0..n-1],...,val[0..n-1]}.
  # Returns the number {n} of segments read.
  # Any malformed line or read error is reported through {tbl_error},
  # and an empty or missing file through {arg_error}; both exit.
  ntbl = 0;
  nlin = 0;
  # Keep the {getline} status: 1 = got a line, 0 = end of file, -1 = error.
  while ((st = (getline lin < fname)) > 0) { 
    nlin++;
    # Turn tabs into blanks, then erase blank and comment-only lines:
    gsub(/[\011]/, " ", lin);
    gsub(/^[ ]*([\#].*|)$/, "", lin);
    if (lin != "")
      { nfld = split(lin, fld, " ");
        # Drop a trailing comment that starts right after field 11:
        if ((nfld >= 12) && (fld[12] ~ /^[\#]/)) { nfld = 11; }
        if (nfld != 11) { tbl_error(fname, nlin, ("bad table entry = \"" lin "\"")); }
        # Fill the fields:
        iname[ntbl] = fld[1];
        level[ntbl] = check_num(fname, nlin, 2, fld[2],0,0); # Only level 0 is accepted.
        seq[ntbl] = check_num(fname, nlin, 3, fld[3],0,999999);
        # Fields 3 and 4 must carry the same sequence number:
        tmp = check_num(fname, nlin, 4, fld[4],0,999999);
        if (tmp != seq[ntbl]) { tbl_error(fname, nlin, ("inconsistent seq fields = \"" fld[3] "\" \"" fld[4] "\"")); }
        xmin[ntbl] = check_num(fname, nlin, 5, fld[5],0,999999);
        ymin[ntbl] = check_num(fname, nlin, 6, fld[6],0,999999);
        w[ntbl] = check_num(fname, nlin, 7, fld[7],1,999999);
        h[ntbl] = check_num(fname, nlin, 8, fld[8],1,999999);
        rmin[ntbl] = check_num(fname, nlin, 9, fld[9],0,999999);
        rmax[ntbl] = check_num(fname, nlin, 10, fld[10],0,999999);
        val[ntbl] = fld[11];
        ntbl++;
      }
  }
  # Detect read failures from the {getline} status rather than from the
  # contents of {ERRNO}, whose value when no error has occurred is not
  # the same in all gawk versions.
  if (st < 0) { tbl_error(fname, nlin, ERRNO); }
  close(fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "read %6d lines and %6d segments from %s\n", nlin, ntbl, fname > "/dev/stderr";
  return ntbl;
}

function check_num(fname,nlin,ifld,fld,fmin,fmax,  val)
{
  # Checks whether {fld} is a decimal number in {[fmin _ fmax]}.
  # If OK, returns the numeric value of {fld}. Otherwise reports the
  # problem for field number {ifld} of line {nlin} of file {fname}
  # through {tbl_error}, which exits.
  #
  # Accepted syntax: optional sign, then digits with an optional
  # fraction ("12", "12.", "12.5") or a fraction alone (".5"). At
  # least one digit is required, so a bare "." is rejected.
  if (!(fld ~ /^[-+]?([0-9]+([.][0-9]*)?|[.][0-9]+)$/))
    { tbl_error(fname, nlin, ("field " ifld " is not a valid number = \"" fld "\"")); }
  val = fld + 0;
  if ((val < fmin) || (val > fmax))
    { tbl_error(fname, nlin, ("field " ifld " = " val " out of range [" fmin " _ " fmax "]")); }
  return val;
}

function compare_char_word_boxes(ich,iwd,  a_int,a_cbx,a_wbx,fcw,fwc)
{
  # Scores how well the box {{ch_xmin,ch_ymin,ch_w,ch_h}[ich]} of a
  # character fits the box {{wd_xmin,wd_ymin,wd_w,wd_h}[iwd]} of a word.
  # The result is the fraction of the character box area that lies
  # inside the word box, reduced by up to 1% when the character box
  # covers only a small part of the word box.
  
  # Area of the intersection of the two boxes:
  a_int = size_inter(ch_xmin[ich], ch_w[ich], wd_xmin[iwd], wd_w[iwd]) \
    * size_inter(ch_ymin[ich], ch_h[ich], wd_ymin[iwd], wd_h[iwd]);
  
  # Areas of the character and word boxes:
  a_cbx = ch_w[ich]*ch_h[ich];
  a_wbx = wd_w[iwd]*wd_h[iwd];
  
  # Fractions of each box covered by the intersection; the small
  # constant guards against a zero denominator:
  fcw = (a_int + 0.0)/(a_cbx + 0.00001);
  fwc = (a_int + 0.0)/(a_wbx + 0.00001);
  
  # The character fraction dominates; the word fraction is a tie-breaker:
  return fcw * (1 - 0.01*(1 - fwc));
}

function size_inter(a_zmin,a_zsize,b_zmin,b_zsize,  lo,hi,a_end,b_end)
{
  # Returns the length of the overlap of the intervals
  # {[a_zmin _ a_zmin+a_zsize]} and {[b_zmin _ b_zmin+b_zsize]},
  # or 0 if they are disjoint or merely touch.
  a_end = a_zmin + a_zsize;
  b_end = b_zmin + b_zsize;
  lo = (a_zmin > b_zmin) ? a_zmin : b_zmin; # Larger of the two low ends.
  hi = (a_end < b_end) ? a_end : b_end;     # Smaller of the two high ends.
  return (lo < hi) ? hi - lo : 0;
}

function compare_chars(ach,bch,  a_mid,b_mid)
{
  # Ordering function for the characters with indices {ach} and {bch}.
  # Returns -1, 0, or +1 if {ach} must come before, together with, or
  # after {bch}. The primary key is the assigned word index {ch_iwd};
  # ties are broken by the X coordinate of the bounding box center.
  
  if (ch_iwd[ach] != ch_iwd[bch])
    { return (ch_iwd[ach] < ch_iwd[bch]) ? -1 : +1; }

  a_mid = ch_xmin[ach]+0.5*ch_w[ach];
  b_mid = ch_xmin[bch]+0.5*ch_w[bch];
  if (a_mid == b_mid) { return 0; }
  return (a_mid < b_mid) ? -1 : +1;
}

function match_warning(msg)
{
  # Writes {msg} to {stderr} as a non-fatal warning, prefixed with "!! ".
  printf("!! %s\n", msg) > "/dev/stderr";
}

function arg_error(msg)
{ 
  # Reports a command-line argument error: writes {msg} and the usage
  # string {USAGE} (set at the start of the BEGIN rule) to {stderr},
  # then terminates the program with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1; # Global flag; not read anywhere in this file.
  exit 1
}

function tbl_error(f,n,msg)
{ 
  # Reports a fatal problem at line {n} of input file {f}: writes
  # "{f}:{n}: {msg}" to {stderr} and terminates with status 1.
  printf("%s:%d: %s\n", f, n, msg) > "/dev/stderr";
  abort = 1; # Global flag; not read anywhere in this file.
  exit(1);
}

function data_error(msg)
{ 
  # Reports a fatal problem in the current main-input record: writes
  # "{FILENAME}:{FNR}: {msg}" to {stderr} and terminates with status 1.
  printf("%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1; # Global flag; not read anywhere in this file.
  exit(1);
}