#! /usr/bin/gawk -f # Last edited on 2000-09-21 16:39:18 by stolfi BEGIN { abort = -1; # Reads the run-length and line-length statistics, # writes the expected run-length frequencies. # Assumes the input contains records of the form # COUNT STRING # where STRING is a run of all zeros or all ones, # or a string of dots "." indicating a line length; # COUNT is the number of STRING occurrences seen. # Computes the character probabilities for "0" and "1", # and writes the relative run length frequencies of 0- and 1- # runs, for a random set of strings with those character frequencies # and those line lengths. maxlen = 1; maxlinlen = 1; split("", s); split("", nlines); totlines = 0; totchars = 0; } (abort >= 0) { exit abort; } ($2 ~ /^[.]+$/) { len = length($2); ct = $1 + 0; nlines[len] += ct; totlines += ct; totchars += ct * len; if (len > maxlinlen) { maxlinlen = len; } next; } ($2 ~ /^[0]+$/) { tally($1 + 0, $2); next; } ($2 ~ /^[1]+$/) { tally($1 + 0, $2); next; } /./ { data_error("bad run"); } function tally(ct, run, c, m) { c = substr(run,1,1); m = length(run); s[c] += ct * m; if (maxlen < m) { maxlen = m; } } END { if (abort >= 0) { exit abort; } # Print line length distribution, for checking: printf "line length distribution:\n"; for (len = 0; len <= maxlinlen; len++) { printf " %3d %5d\n", len, nlines[len]; } printf "total lines = %d\n", totlines; printf "\n"; # Compute character probabilities: stot = s["0"]+s["1"]; if (stot != totchars) { prog_error(("inconsistency stot = " stot " totchars = " totchars)); } if (stot == 0) { stot = 1; } p0 = s["0"]/stot; p1 = s["1"]/stot; printf "s0 = %5d p0 = %7.5f\n", s["0"], p0; printf "s1 = %5d p1 = %7.5f\n", s["1"], p1; printf "\n"; # Computes expected counts x0[k], x1[k] of runs of length k split("", x0); split("", x1); for (len1 in nlines) { m = nlines[len1]; # Fucked-up gawk semantics: must add 0 to "len" to force numeric # comparison in the for loop test "i <= len"! len = len1 + 0; # printf "len = %d nlines = %d\n", len, m; # Compute and tally the expected number of runs of length k # in a line of length len. split("", r0); r0[1] = p0; split("", r1); r1[1] = p1; for (i = 2; i <= len; i++) { # now r0[j], for 1 <= j < i, is the probability that # a line with i-1 characters ends with a run of i-j 0's. # Ditto for r1[j]. # Consider what happens if the ith character is # different from the last one: r0[i] = 0; r1[i] = 0; for (j = 1; j < i; j++) { p01 = r0[j]*p1; x0[i-j] = x0[i-j] + m*p01; r1[i] += p01; # printf "run 0^%02d: + %8.4f\n", i-j, m*p01; p10 = r1[j]*p0; x1[i-j] = x1[i-j] + m*p10; r0[i] += p10; # printf "run 1^%02d: + %8.4f\n", i-j, m*p10; } # Now consider what happens if the ith character is the same: for (j = 1; j < i; j++) { r0[j] *= p0; r1[j] *= p1; } } # Now tally the last runs on the line: for (j = 1; j <= len; j++) { x0[len+1-j] += m*r0[j]; x1[len+1-j] += m*r1[j]; } } # Compute total expected run counts: totruns = 0; for (k in x0) { totruns += x0[k] + x1[k]; } if (totruns == 0) { totruns = 1; } # Now write the run frequencies: t0 = 0; t1 = 0; z0 = 0; z1 = 0; for(i = 1; i <= maxlen; i++) { f0 = x0[i]/totruns; f1 = x1[i]/totruns; printf "%2d %8.2f %7.5f %8.2f %7.5f\n", i, x0[i], f0, x1[i], f1; t0 += f0; t1 += f1; z0 += i*f0; z1 += i*f1; } printf "\n"; printf " %7.5f %7.5f\n", t0, t1; ztot = z0 + z1; if (ztot == 0) { ztot = 1; } z0 /= ztot; z1 /= ztot; printf " %7.5f %7.5f\n", z0, z1; } function data_error(msg) { printf "file %s, line %s: %s\n", FILENAME, FNR, msg > "/dev/stderr"; abort = 1; exit abort; } function prog_error(msg) { printf "%s\n", msg > "/dev/stderr"; abort = 1; exit abort; }