#! /bin/bash
# Last edited on 2023-05-15 17:05:02 by stolfi

cmd="${0##*/}"
usage="${cmd} [-noylabels] [-color] [-const {CONST}] [-format {FORMAT}] [-show {BOOL}] {ASMPSEC} {ATITLE} {BSMPSEC} {BTITLE} ... {ONAME}"

# Exit on first error:
set -e

# Generates a comparative Zipf plot for one or more languages.
#
# Options:
#   -noylabels        sets the Y label format {ylabelfmt} to the empty string
#                     (presumably suppressing the Y tick labels -- confirm
#                     against the gnuplot preamble).
#   -color            draws every data set with the same point type and a
#                     distinct RGB line color instead of distinct point types.
#   -const {CONST}    specifies the constant for the ideal Zipf plot; when
#                     omitted it is computed from the largest input file.
#   -format {FORMAT}  output format: "svg" (default), "eps" or "png".
#   -show {BOOL}      if nonzero, opens the finished plot in a viewer.
#
# Input files must be named "{dir}/{lang}/{book}/{sec}/gud.wfr" and
# must have records of the form
#   {count} {freq} {word}
# Uses the {freq} column to plot.
#
# The output file is "fig/{ONAME}.{FORMAT}", that is, one of
# "fig/{ONAME}.svg" "fig/{ONAME}.eps" "fig/{ONAME}.png".
#
# Assumes that there are files "{dir}/{lang}/{book}/{sec}/whole.tlw" with the
# full list of words of the original text.  See "../../Notes/101/Note-101.txt"
#
# Also writes "fig/{ONAME}.stats" with statistics of the files.
#
# Also writes "fig/{ONAME}.wik" with a description of the plot for Wikimedia.

# Option defaults:
ylabelfmt="%g"
color=0
format="svg"
const=""
show=0

# Consume leading arguments that start with "-".  Only a leading dash marks
# an option, so a positional argument that merely contains "/-" is left alone.
while [[ $# -gt 0 && "$1" == -* ]]; do
  case "$1" in
    -noylabels)
      ylabelfmt=""; shift ;;
    -color)
      color=1; shift ;;
    -const | -format | -show)
      # These options take a value; complain specifically if it is missing.
      if [[ $# -lt 2 ]]; then
        echo "option \"$1\" requires a value" 1>&2
        echo "usage: ${usage}" 1>&2; exit 1
      fi
      case "$1" in
        -const)  const="$2" ;;
        -format) format="$2" ;;
        -show)   show="$2" ;;
      esac
      shift 2 ;;
    *)
      echo "bad option \"$1\"" 1>&2
      echo "usage: ${usage}" 1>&2; exit 1 ;;
  esac
done

# At least one {SMPSEC} {TITLE} pair plus {ONAME} must remain:
if [[ $# -lt 3 ]]; then
  echo "insufficient args \"$1\" - usage: ${usage}" 1>&2; exit 1
fi

# Parse file/title pairs:
smpsec=()  # {lang}/{book}/{sec} items, from command line.
title=()   # Respective titles for plot key, from command line.
wfrFile=() # Respective plot input files, derived from {smpsec}.

# What remains on the command line must be one or more {SMPSEC} {TITLE}
# pairs followed by {ONAME}, i.e. an odd number of arguments.  Check this
# before touching any file so that a dangling argument is reported as such.
if (( $# % 2 == 0 )); then
  echo "unpaired plot arg or missing output prefix - usage: ${usage}" 1>&2; exit 1
fi

i=0
while [[ $# -gt 1 ]]; do
  smpsec[$i]="$1"; shift
  title[$i]="$1"; shift
  echo " ${smpsec[$i]} \"${title[$i]}\"" 1>&2
  wfrFile[$i]="dat/${smpsec[$i]}/gud.wfr"
  if [[ ! -r "${wfrFile[$i]}" ]]; then
    echo "** ${wfrFile[$i]} does not exist" 1>&2; exit 1
  fi
  i=$(( i + 1 ))
done
nfiles=${i}

# Parse output filename prefix (the single argument left by the loop):
oname="$1"; shift

# Prefix for the temporary gnuplot command and plot files:
tmp="/tmp/$$"

# Determine the max distinct words {Nmax} and max key length {Lmax}:
i=0
Nmax=0 # Max number of distinct words (lines of any ".wfr" file).
Lmax=0 # Max title length.
while [[ $i -lt ${nfiles} ]]; do
  N=$(wc -l < "${wfrFile[$i]}")
  if [[ ${N} -gt ${Nmax} ]]; then Nmax=${N}; fi
  L="${#title[$i]}"
  if [[ ${L} -gt ${Lmax} ]]; then Lmax=${L}; fi
  i=$(( i + 1 ))
done

# Compute the constant ${const} for the ideal Zipf law, unless given by
# "-const".  It is the reciprocal of the {Nmax}-th harmonic number, so that
# the ideal frequencies {const/k} for {k} in {1..Nmax} add up to 1.
if [[ "/${const}" == "/" ]]; then
  if [[ ${Nmax} -le 0 ]]; then
    # The harmonic sum would be zero and {1/s} a division by zero in "bc".
    echo "** cannot compute {const}: no words in the input files" 1>&2; exit 1
  fi
  echo " computing {const} from {Nmax = ${Nmax}}" 1>&2
  const=$(echo "s=0; for (k=1; k <= ${Nmax}; k++) { s = s + 1/k; }; 1/s" | bc -lq)
  # echo " const = '${const}' (2)" 1>&2
  const=$(printf "%.4f" "${const}")
fi
echo " ideal plot constant = '${const}'" 1>&2

# Generate the wikimedia description and statistics file.
# Both are removed first because everything below appends to them.
statFile="fig/${oname}.stats"
capFile="fig/${oname}.wik"
rm -fv "${statFile}" "${capFile}" 2>&1 | sed -e 's:^: :g' 1>&2

printf "Zipf law plot (frequency as function of frequency rank) for various texts.\n\n" >> "${capFile}"
printf "The languages, texts and the frequency files are:\n\n" >> "${capFile}"

# State for the per-sample loop that follows:
prevsmpsec="NONE/NONE/NONE"
i=0
tfFlag=0 # Set to 1 if any files were truncated/filtered.
# ----------------------------------------------------------------------
# Statistics, caption and plot generation.  In order, this section:
#
#  1. For each sample {smpsec[i]}, runs "compute-smpsec-gud-stats.sh" and
#     requires exactly three output words, stored as {nTksWhole},
#     {nTksTrunc} and {nWdsTrunc}; sets {tfFlag} when the first two differ;
#     appends a line to {statFile}; and appends the output of
#     "create-wikimedia-description.sh" to {capFile}.
#  2. Appends {Nmax} and {const} to {statFile}, finishes the caption text
#     in {capFile} (wording depends on {tfFlag}), and echoes both files
#     to stderr.
#  3. Aborts if there are more samples than entries in {colors}.
#  4. Chooses point types, sizes, key and terminal settings according to
#     {format} ("svg", "eps" or "png"); any other value exits with status 1.
#  5. Builds the gnuplot command file "${tmp}.gnuplot" with one
#     "linespoints" plot item per sample (rank {$0+1} against column 2),
#     runs gnuplot on it, installs the result as "fig/${oname}.${format}"
#     ("png" output is resized with "convert"), optionally shows it when
#     {show} is nonzero, and removes the temporary files.
#
# NOTE(review): the statement beginning "cat > ${gplFile} <5000 ? ..." is
# damaged: the here-document that should hold the common gnuplot preamble,
# and the start of the following "printf", appear to have been lost (it
# looks as if the text from "<<" up to the next ">" was stripped).  Restore
# it from the original source before use; {termspec}, {sizespec},
# {marginspec}, {keyspacing}, {keywidth}, {dashtype}, {ylabelfmt} and
# {tmpPlotFile} are set here but not used in the surviving text, so they
# were presumably consumed by that preamble.
# NOTE(review): the line breaks of this section were lost too; as laid out
# here each "#" comments out the rest of its physical line.
# NOTE(review): {ptypes} has 10 entries but {colors} has 11, so an 11th
# sample passes the color check yet has no point type -- confirm intent.
# NOTE(review): the final "invalid plot output format" branch does not
# exit; it looks unreachable, since the same test exits earlier.
# ----------------------------------------------------------------------
while [[ $i -lt ${nfiles} ]]; do # Compute good words statistics: stats=( `compute-smpsec-gud-stats.sh "${smpsec[$i]}"` ) if [[ ${#stats[@]} -ne 3 ]]; then echo "** bad stats = (${stats[*]})" 1>&2; exit 1; fi nTksWhole="${stats[0]}" nTksTrunc="${stats[1]}" nWdsTrunc="${stats[2]}" if [[ ${nTksWhole} -ne ${nTksTrunc} ]]; then tfFlag=1; fi # Append statistics to the plot statistics file: printf "%s:" "${smpsec[$i]}" >> ${statFile} printf " whole %6d words, truncated to %6d words" "${nTksWhole}" "${nTksTrunc}" >> ${statFile} printf ", ''N'' = %6d distinct" "${nWdsTrunc}" >> ${statFile} printf " (%s)\n" "${title[$i]}" >> ${statFile} # Append this {smpsec} to Wikimedia description file create-wikimedia-description.sh \ "${prevsmpsec}" \ "${smpsec[$i]}" \ "${nTksWhole}" "${nTksTrunc}" "${nWdsTrunc}" \ >> ${capFile} prevsmpsec="${smpsec[$i]}" i=$(( $i + 1 )) done printf "Nmax = %d const = %.4f\n" "${Nmax}" "${const}" >> ${statFile} printf " --- ${statFile} ------------------------------\n" 1>&2 cat ${statFile} | sed -e 's:^: :g' 1>&2 printf " ----------------------------------------------------\n" 1>&2 # Determine whether any files were truncated/filtered: tfNoteA="" tfNoteB=" extracted" if [[ ${tfFlag} -ne 0 ]]; then tfNoteA=", before truncation/filtering," tfNoteB=" truncated/filtered" fi printf "The word frequency files '*/*/*/gud.wfr' are available at the [https://www.ic.unicamp.br/~stolfi/EXPORT/projects/voynich/Notes/tr-stats/dat/ UNICAMP website]. The original annotated full texts${tfNoteA} are in the companion files */*/org/main.src. 
The${tfNoteB} texts -- one word per line, without punctuation -- are in */*/*/gud.tlw.\n" >> ${capFile} printf " --- ${capFile} ------------------------------\n" 1>&2 cat ${capFile} | sed -e 's:^: :g' 1>&2 printf " ----------------------------------------------------\n" 1>&2 colors=( \ ff2200 0033ff 006633 ee8800 aa5500 \ 55aa00 008877 8800ff dd0044 ff55ff \ 777777 \ ) if [[ ${#colors[@]} -lt ${#smpsec[@]} ]]; then echo "** ran out of line colors" 1>&2; exit 1; fi # Generate the plot commands file: if [[ "/${format}" == "/svg" ]]; then if [[ ${color} -ne 0 ]]; then ptypes=( 7 7 7 7 7 7 7 7 7 7 ) else ptypes=( 4 2 3 1 6 7 8 1 2 4 ) fi pointsize="0.30" linewidth="2.00" dashtype="(10,5)" keyspacing="0.8" # Gnuplot SVG computes the key width wrong. We must compensate: keywidth=`echo "w = ${Lmax}/3.0; 2-w" | bc -lq` keywidth=`printf "%.1f" "${keywidth}"` termspec="set term svg size 610,600 dynamic font 'Helvetica,18' noenhanced;" # sizespec="set size ratio -1;" # Does not work? sizespec="set size 1.05,1.00;" marginspec="set lmargin 6; set rmargin 4; set tmargin 0.5; set bmargin 2.0;" elif [[ "/${format}" == "/eps" ]]; then if [[ ${color} -ne 0 ]]; then # Gnuplot's linetype -> color mapping (bleech) for color PS output # 0 = black # 1 = red # 2 = light (BAD!) # 3 = blue # 4 = magenta # 5 = cyan (BAD!) # 6 = yellow (BAD!) # 7 = black # 8 = orange # 9 = gray (BAD!) # 10,11,12,... = 1,2,3,... colorspec="color solid" ptypes=( 7 7 7 7 7 7 7 7 7 7 ) else colorspec="mono" ptypes=( 4 2 3 1 6 7 8 1 2 4 ) fi pointsize="1.0" linewidth="2.50" dashtype="(4,2)" keyspacing="1.1" keywidth="0.0" termspec1="postscript eps ${colorspec} font 'Helvetica,32'" termspec2="linewidth 1.5 background '#ffffff' noenhanced;" termspec="set term ${termspec1} ${termspec2};" # sizespec="set size ratio -1;" # Does not work. 
sizespec="set size 1.5,1.85;" marginspec="set lmargin 7; set rmargin 4; set tmargin 0.5; set bmargin 2.0;" elif [[ "/${format}" == "/png" ]]; then if [[ ${color} -ne 0 ]]; then ptypes=( 7 7 7 7 7 7 7 7 7 7 ) else ptypes=( 7 6 5 4 1 3 2 8 9 6 ) fi pointsize="1.7" linewidth="1.5" dashtype="(4,2)" keyspacing="1.1" keywidth="0.0" termspec="set term png font arial 40 noenhanced linewidth 3.0 size 1950,1800;" marginspec="set lmargin 7; set rmargin 4; set tmargin 0.5; set bmargin 2.0;" # sizespec="set size ratio -1;" # Does not work. sizespec="set size 1.00,1.00;" else echo "invalid plot output format" 1>&2; exit 1 fi # Common part of plot file: gplFile="${tmp}.gnuplot" tmpPlotFile="${tmp}.${format}" outPlotFile="fig/${oname}.${format}" cat > ${gplFile} <5000 ? 0/0 : const/x)) notitle' >> ${gplFile} printf ' with lines ls 20 lw 1.8 lc rgb "#885500" ' >> ${gplFile} # generate plot commands sep="," i=0 while [[ $i -lt ${nfiles} ]]; do printf '%s \\\n "%s" using ($0+1):2 title "%s"' \ "${sep}" "${wfrFile[$i]}" "${title[$i]}" >> ${gplFile} printf ' \\\n with linespoints ls %d lw %s pt %d ps %s' \ "$(( $i + 1 ))" "${linewidth}" "${ptypes[$i]}" "${pointsize}" >> ${gplFile} if [[ ${color} -ne 0 ]]; then printf " lc rgb '#%s'" "${colors[$i]}" >> ${gplFile} fi sep="," i=$(( $i + 1 )) done printf '\n' >> ${gplFile} printf 'quit\n' >> ${gplFile} export GDFONTPATH=. gnuplot `cat gnuplot-X11-options.sh` < ${gplFile} if [[ "/${format}" == "/svg" ]]; then mv ${tmpPlotFile} ${outPlotFile} if [[ ${show} -ne 0 ]]; then time inkview ${outPlotFile} fi /bin/rm -f ${gplFile} elif [[ "/${format}" == "/eps" ]]; then mv ${tmpPlotFile} ${outPlotFile} if [[ ${show} -ne 0 ]]; then evince ${outPlotFile} fi /bin/rm -f ${gplFile} elif [[ "/${format}" == "/png" ]]; then convert ${tmpPlotFile} -resize '600x' ${outPlotFile} if [[ ${show} -ne 0 ]]; then display ${outPlotFile} fi /bin/rm -f ${tmpPlotFile} ${gplFile} else echo "invalid plot output format" 1>&2 fi