#! /usr/bin/gawk -f # Last edited on 2004-03-01 00:21:38 by stolfi # A library of functions for generating synthetic words # from a given sample. function mky_make_word(wd,pr,n,ord,avglen,maxlen, \ word,mw,i,st,c,frac,alt,apr,debug \ ) { # Tries to make up a new string {word} by a (very slow) Shannon # monkey of order {ord}. Uses words {wd[0..n-1]} as the sample, with # probabilities proportional to {pr[0..n-1]}. The length of the word # should be biased towards {avglen}, and will not exceed {maxlen}. # Returns the string {word}, or "**FAIL**" if the trial failed. debug = 0; word = ""; mw = 0; st = ""; # State of Markov chain frac = 0; # Fraction of word that has been built so far split("", alt); # Alternative characters, or "" for end-of-word. split("", apr); # Probability weight of alternative chars. if (debug) { printf "monkey started....\n" > "/dev/stderr"; } c = mky_choose_next_char(wd,pr,n,st,ord,frac,alt,apr,debug); if (debug) { printf " -> {%s}\n", c > "/dev/stderr"; } while (c != "") { if ((c == "**FAIL**") || (mw >= maxlen)) { if (debug) { printf "**FAIL**\n" > "/dev/stderr"; } return "**FAIL**"; } # Append {c} to the word, update its length: word = (word c); mw = length(word); # Remember only the last {ord} characters: st = substr(word, mw-ord+1, ord); # Re-estimate the fraction {frac} frac = (mw < avglen ? \ 0.5*mw/avglen : \ 0.5 + 0.5*(mw - avglen)/(maxlen - avglen) \ ); if (debug) { printf "word = {%s}\n", word > "/dev/stderr"; } c = mky_choose_next_char(wd,pr,n,st,ord,frac,alt,apr,debug); if (debug) { printf " -> {%s}\n", c > "/dev/stderr"; } } if (debug) { printf "word = {%s}\n", word > "/dev/stderr"; } return word; } function mky_choose_next_char(wd,pr,n,st,ord,frac,alt,apr,debug, \ ms,na,i,wdi,mi,pri,jmax,j,prj,tp,px,na1,k \ ) { # Select the next character to append, given the string {st} that # consists of the last {ord} characters already generated, and the # approximate relative displacement {frac} of the character within # the word. Returns "" to signal `terminate the word', and "**FAIL**" to # indicate that the {st} does not occur in the sample. # The alternatives for the next character are chosen by finding the # string {st} in one of the words {dic[0..n-1]}, and looking at the # next character. Arguments {alt} and {apr} are two working arrays. if (debug) { printf " %s: ", st > "/dev/stderr"; } # If {length(st) < ord} then ms = length(st); # Gather the alternatives and their probs in {alt,apr}: na = 0; for (i = 0; i < n; i++) { wdi = wd[i]; mi = length(wdi); pri = pr[i]; # If {ms < ord}, the search is anchored as word-start: jmax = (ms < ord ? 0 : mi-ms); # Scan the word for {st}: for (j = 0; j <= jmax; j++) { if (st == substr(wdi,j+1,ms)) { prj = 1.0/(1 + mky_abs(j+ms - frac*mi)); alt[na] = substr(wdi,1+j+ms,1); apr[na] = pri*prj; na++; } } } # If {st} doesn't match anywhere, abort: if (na == 0) { return "**FAIL**"; } if (debug) { for (k = 0; k < na; k++) { printf "{%s}", alt[k] > "/dev/stderr"; } } # Select an alternative {alt[k]} with probability proportional to {apr[k]} tp = 0; for (k = 0; k < na; k++) { tp += apr[k]; } px = tp*rand(); tp = 0; for (k = 0; k < na; k++) { tp += apr[k]; if (tp >= px) { return alt[k]; } } # Just in case... return alt[na-1]; } function mky_abs(x) { return (x < 0 ? -x : x); }