#! /usr/bin/gawk -f
# Last edited on 2013-03-18 22:56:51 by stolfilocal

BEGIN {
  USAGE = "sort-segs-by-words.gawk -v charFile={CHARFILE} -v wordFile={WORDSFILE} > {OUTFILE}";

  # Reads two files {CHARFILE} and {WORDSFILE} written by {SegmentTruth.java}.
  # Writes to standard output a file in the same format with one line for
  # each character, where the field {ICH} is the same as in {CHARFILE} but
  # {IWD} is the number of the best-matching word from {WORDSFILE}.
  #
  # Characters are sorted by word and left to right within each word. A
  # blank line is written between consecutive words. Flags and discards
  # characters that do not match any word, and flags words that are not
  # matched by any character. Per-word statistics and all warnings go to
  # standard error.

  if (charFile == "") { arg_error("must define \"charFile\""); }
  if (wordFile == "") { arg_error("must define \"wordFile\""); }

  # Data read from character file (indexed from 0 to {nchars-1}):
  split("", ch_iname);  # Image name.
  split("", ch_level);  # Image level.
  split("", ch_seq);    # Segment number assigned by {SegmentTruth.java}.
  split("", ch_xmin);   # Min X of bounding box.
  split("", ch_ymin);   # Min Y of bounding box.
  split("", ch_w);      # X size of bounding box.
  split("", ch_h);      # Y size of bounding box.
  split("", ch_rmin);   # Min ellipse radius.
  split("", ch_rmax);   # Max ellipse radius.
  split("", ch_val);    # Character value.

  # Data on words (indexed from 0 to {nwords-1}):
  split("", wd_iname);  # Image name.
  split("", wd_level);  # Image level.
  split("", wd_seq);    # Segment number assigned by {SegmentTruth.java}.
  split("", wd_xmin);   # Min X of bounding box.
  split("", wd_ymin);   # Min Y of bounding box.
  split("", wd_w);      # X size of bounding box.
  split("", wd_h);      # Y size of bounding box.
  split("", wd_rmin);   # Min ellipse radius.
  split("", wd_rmax);   # Max ellipse radius.
  split("", wd_val);    # Word value.

  # Read the files:
  nchars = read_file(charFile, ch_iname, ch_level, ch_seq, ch_xmin, ch_ymin, ch_w, ch_h, ch_rmin, ch_rmax, ch_val);
  nwords = read_file(wordFile, wd_iname, wd_level, wd_seq, wd_xmin, wd_ymin, wd_w, wd_h, wd_rmin, wd_rmax, wd_val);

  # Computed character data (indexed from 0 to {nchars-1}):
  split("", ch_iwd);    # Word index assigned by this program, or -1 if unmatched.

  # Computed word data (indexed from 0 to {nwords-1}):
  split("", wd_nch);    # Number of characters assigned to this word.
  for (iwd = 0; iwd < nwords; iwd++) { wd_nch[iwd] = 0; }

  # Pair characters and words. A word is accepted only if its match
  # score exceeds the tiny threshold {0.00001}, i.e. the boxes overlap:
  for (ich = 0; ich < nchars; ich++) {
    mcw_best = 0.00001;
    iwd_best = -1;
    for (iwd = 0; iwd < nwords; iwd++) {
      mcw = compare_char_word_boxes(ich, iwd);
      if (mcw > mcw_best) { mcw_best = mcw; iwd_best = iwd; }
    }
    ch_iwd[ich] = iwd_best;
    if (iwd_best < 0) {
      match_warning(("character " ich " (seq \"" ch_seq[ich] "\") does not match any word"));
    } else {
      wd_nch[iwd_best]++;
    }
  }

  # Check for unmatched words:
  for (iwd = 0; iwd < nwords; iwd++) {
    if (wd_nch[iwd] == 0) {
      match_warning(("word " iwd " (seq \"" wd_seq[iwd] "\") does not match any character"));
    }
  }

  # Index sort characters by word and X position (insertion sort):
  split("", ch_ich);    # Indexed by {0..nchars-1}, gives indices of characters in sorted order.
  for (ich = 0; ich < nchars; ich++) {
    jch = ich;
    while ((jch > 0) && (compare_chars(ch_ich[jch-1], ich) > 0)) {
      ch_ich[jch] = ch_ich[jch-1];
      jch--;
    }
    ch_ich[jch] = ich;
  }

  # Write out. Unmatched characters (word index -1) sort first and are skipped:
  swd_prev = -1;
  for (kch = 0; kch < nchars; kch++) {
    ich = ch_ich[kch];
    iwd = ch_iwd[ich];
    if (iwd >= 0) {
      swd = wd_seq[iwd];
      if (swd != swd_prev) {
        # Starting a new word: close the previous one, if any.
        if (swd_prev >= 0) { print_word_stats(swd_prev); printf "\n"; }
        init_word_stats();
      }
      printf "%30s %02d %5d %5d %5d %5d %5d %5d %7.1f %7.1f %s\n",
        ch_iname[ich], ch_level[ich], ch_seq[ich], swd,
        ch_xmin[ich], ch_ymin[ich], ch_w[ich], ch_h[ich],
        ch_rmin[ich], ch_rmax[ich], ch_val[ich];
      fflush("/dev/stdout");
      swd_prev = swd;
      accum_word_stats(ch_w[ich], ch_h[ich]);
    }
  }
  if (swd_prev >= 0) { print_word_stats(swd_prev); }
}

function init_word_stats() {
  # Resets the global per-word statistics tables.
  split("", wst_w);  # Letter widths in decreasing order, indexed {0..wst_nchars-1}.
  split("", wst_h);  # Letter heights in decreasing order, indexed {0..wst_nchars-1}.
  split("", wst_a);  # Letter aspects (h/w) in decreasing order, indexed {0..wst_nchars-1}.
  wst_nchars = 0;
}

function accum_word_stats(w,h) {
  # Adds a letter of width {w} and height {h} to the per-word statistics
  # tables. Requires {w != 0}; {read_file} enforces {w >= 1}.
  insert_word_stat(wst_w, wst_nchars, w);
  insert_word_stat(wst_h, wst_nchars, h);
  insert_word_stat(wst_a, wst_nchars, h/w);
  wst_nchars++;
}

function insert_word_stat(ztb,n,z,   k) {
  # Inserts {z} in table {ztb[0..n-1]}, keeping it in decreasing order.
  # Does not increment {n}.
  k = n;
  while ((k > 0) && (ztb[k-1] < z)) { ztb[k] = ztb[k-1]; k--; }
  ztb[k] = z;
}

function print_word_stats(swd,   wmax,hmax,amax,wmin,hmin,amin) {
  # Prints to {stderr} one line summarizing the letters accumulated for
  # the word with sequence number {swd}: count, width/height/aspect
  # ranges, and any outliers below the chosen minima.
  printf "word %5s: %3d chars", swd, wst_nchars > "/dev/stderr";
  wmin = word_stats_pick_min(wst_w, wst_nchars); wmax = wst_w[0];
  hmin = word_stats_pick_min(wst_h, wst_nchars); hmax = wst_h[0];
  amin = word_stats_pick_min(wst_a, wst_nchars); amax = wst_a[0];
  print_word_dim_stats("width", wmin, wmax);
  print_word_dim_stats("height", hmin, hmax);
  print_word_dim_stats("aspect", amin, amax);
  print_word_dim_outliers("width", wst_w, wst_nchars, wmin);
  print_word_dim_outliers("height", wst_h, wst_nchars, hmin);
  print_word_dim_outliers("aspect", wst_a, wst_nchars, amin);
  printf "\n" > "/dev/stderr";
}

function print_word_dim_stats(name,zmin,zmax) {
  # Prints to {stderr} the range {[zmin _ zmax]} of the dimension called
  # {name}, followed by the ratio {zmax/zmin}. Requires {zmin != 0}.
  printf " %s range = [ %5.1f _ %5.1f ] %5.1f", name, zmin, zmax, zmax/zmin > "/dev/stderr";
}

function print_word_dim_outliers(name,ztb,n,zmin,   k,sep) {
  # Prints to {stderr} the outliers in {ztb[0..n-1]}, that is, the
  # entries smaller than {zmin}, assuming the table is in decreasing
  # order. Prints nothing if there are none.
  if ((n > 0) && (ztb[n-1] < zmin)) {
    printf " %s outliers = ", name > "/dev/stderr";
    k = n-1;
    sep = "(";
    while ((k >= 0) && (ztb[k] < zmin)) {
      printf "%s%.1f", sep, ztb[k] > "/dev/stderr";
      k--;
      sep = ",";
    }
    printf ")" > "/dev/stderr";
  }
}

function word_stats_pick_min(ztb,n,   k0,k) {
  # Picks the smallest entry in {ztb[0..n-1]} (sorted in decreasing
  # order) but tries to reject outliers. Expects {n >= 1}.
  k0 = int(n*0.75);  # Index of the 0.25 lower fractile.
  # Accept anything that is at least 70% of the lower quartile:
  k = k0;
  if (k >= n) { k = n-1; }
  while ((k < n-1) && (ztb[k+1] >= 0.70*ztb[k0])) { k++; }
  return ztb[k];
}

function read_file(fname,iname,level,seq,xmin,ymin,w,h,rmin,rmax,val,   lin,ntbl,nlin,fld,nfld,tmp,res) {
  # Reads a file created by {SegmentTruth.java}.
  # Ignores #-comment lines and blank lines; a trailing #-comment is
  # tolerated if it starts at field 12.
  # Stores the segments sequentially in the tables {iname[0..n-1],...,val[0..n-1]}.
  # Returns the number {n} of segments read.
  # Aborts the program on a read error, an empty file, or a malformed line.
  ntbl = 0;
  nlin = 0;
  while ((res = (getline lin < fname)) > 0) {
    nlin++;
    gsub(/[\011]/, " ", lin);            # Tabs become blanks.
    gsub(/^[ ]*([#].*|)$/, "", lin);     # Erase blank and comment-only lines.
    if (lin != "") {
      nfld = split(lin, fld, " ");
      if ((nfld >= 12) && (fld[12] ~ /^[#]/)) { nfld = 11; }
      if (nfld != 11) { tbl_error(fname, nlin, ("bad table entry = \"" lin "\"")); }
      # Fill the fields:
      iname[ntbl] = fld[1];
      level[ntbl] = check_num(fname, nlin, 2, fld[2], 0, 0);
      seq[ntbl] = check_num(fname, nlin, 3, fld[3], 0, 999999);
      # Fields 3 and 4 must carry the same segment number:
      tmp = check_num(fname, nlin, 4, fld[4], 0, 999999);
      if (tmp != seq[ntbl]) {
        tbl_error(fname, nlin, ("inconsistent seq fields = \"" fld[3] "\" \"" fld[4] "\""));
      }
      xmin[ntbl] = check_num(fname, nlin, 5, fld[5], 0, 999999);
      ymin[ntbl] = check_num(fname, nlin, 6, fld[6], 0, 999999);
      w[ntbl] = check_num(fname, nlin, 7, fld[7], 1, 999999);
      h[ntbl] = check_num(fname, nlin, 8, fld[8], 1, 999999);
      rmin[ntbl] = check_num(fname, nlin, 9, fld[9], 0, 999999);
      rmax[ntbl] = check_num(fname, nlin, 10, fld[10], 0, 999999);
      val[ntbl] = fld[11];
      ntbl++;
    }
  }
  # {getline} returns 0 at end of file and a negative value on error.
  # Test that status directly: the no-error value of {ERRNO} is not
  # reliably "0" (current gawk leaves it as the empty string).
  if (res < 0) { tbl_error(fname, nlin, ERRNO); }
  close(fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "read %6d lines and %6d segments from %s\n", nlin, ntbl, fname > "/dev/stderr";
  return ntbl;
}

function check_num(fname,nlin,ifld,fld,fmin,fmax,   val) {
  # Checks whether {fld} is a decimal number in {[fmin _ fmax]}.
  # If OK, returns the numeric value of {fld}; otherwise aborts with a
  # message that cites file {fname}, line {nlin} and field index {ifld}.
  # At least one digit is required, so "." and "" are rejected.
  if (!(fld ~ /^[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+)$/)) {
    tbl_error(fname, nlin, ("field " ifld " is not a valid number = \"" fld "\""));
  }
  val = fld + 0;
  if ((val < fmin) || (val > fmax)) {
    tbl_error(fname, nlin, ("field " ifld " = " val " out of range [" fmin " _ " fmax "]"));
  }
  return val;
}

function compare_char_word_boxes(ich,iwd,   w_cbx,h_cbx,w_wbx,h_wbx,w_int,h_int,fcw,fwc) {
  # Compares box {{ch_xmin,ch_ymin,ch_w,ch_h}[ich]}
  # with box {{wd_xmin,wd_ymin,wd_w,wd_h}[iwd]}.
  # Returns the fraction of the area of the character box that is contained
  # in the word box, with penalty if the word box is much larger.

  # Size of character box:
  w_cbx = ch_w[ich];
  h_cbx = ch_h[ich];

  # Size of word box:
  w_wbx = wd_w[iwd];
  h_wbx = wd_h[iwd];

  # Size of intersection of word and character boxes:
  w_int = size_inter(ch_xmin[ich], ch_w[ich], wd_xmin[iwd], wd_w[iwd]);
  h_int = size_inter(ch_ymin[ich], ch_h[ich], wd_ymin[iwd], wd_h[iwd]);

  # Fraction of char box that is contained in word box
  # (the small constant keeps the denominator nonzero):
  fcw = (w_int*h_int + 0.0)/(w_cbx*h_cbx + 0.00001);

  # Fraction of word box that is contained in char box:
  fwc = (w_int*h_int + 0.0)/(w_wbx*h_wbx + 0.00001);

  # Return {fcw} but with a small penalty (at most 1%) for low {fwc}:
  return fcw * (1 - 0.01*(1 - fwc));
}

function size_inter(a_zmin,a_zsize,b_zmin,b_zsize,   a_zmax,b_zmax,zmin,zmax) {
  # Returns the size of the intersection of the intervals
  # {[a_zmin _ a_zmin+a_zsize]} and {[b_zmin _ b_zmin+b_zsize]},
  # or 0 if they are disjoint or merely touch.
  a_zmax = a_zmin + a_zsize;
  b_zmax = b_zmin + b_zsize;
  if (a_zmin > b_zmin) { zmin = a_zmin; } else { zmin = b_zmin; }
  if (a_zmax < b_zmax) { zmax = a_zmax; } else { zmax = b_zmax; }
  if (zmin >= zmax) { return 0; }
  return zmax - zmin;
}

function compare_chars(ach,bch,   ax,bx) {
  # Compares characters with indices {ach} and {bch}, first by assigned
  # word index {ch_iwd}, then by the X coordinate of the box center.
  # Returns -1, 0, or +1 if {ach} must come before, together, or after {bch}.
  if (ch_iwd[ach] < ch_iwd[bch]) { return -1; }
  if (ch_iwd[ach] > ch_iwd[bch]) { return +1; }
  ax = ch_xmin[ach] + 0.5*ch_w[ach];
  bx = ch_xmin[bch] + 0.5*ch_w[bch];
  if (ax < bx) { return -1; }
  if (ax > bx) { return +1; }
  return 0;
}

function match_warning(msg) {
  # Prints a non-fatal warning {msg} to {stderr}.
  printf "!! %s\n", msg > "/dev/stderr";
}

function arg_error(msg) {
  # Prints {msg} and the usage line to {stderr}, then exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit 1
}

function tbl_error(f,n,msg) {
  # Prints {msg} tagged with file {f} and line {n} to {stderr}, then
  # exits with status 1.
  printf "%s:%d: %s\n", f, n, msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg) {
  # Prints {msg} tagged with the current {FILENAME} and {FNR} to
  # {stderr}, then exits with status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}