# Last edited on 2004-02-17 15:14:22 by stolfi

# Creates the VMS samples.
#
# Usage: "make data", "make export" or "make clean".  Each of these
# goals re-invokes this same makefile with ACTION set on the command
# line; the rules guarded by the ACTION conditional below do the work.

MAKEFILE := vms-samples.make
# MAKERULES := 
MAKERULES := ${MAKEFILE}

# NOTE(review): LANG is also the POSIX locale variable.  If LANG is set
# in the caller's environment, make passes this value ("voyn") to every
# recipe shell, so locale-sensitive tools (sort, gawk) will presumably
# fall back to the C locale -- confirm that this is intended.
LANG := voyn

.PHONY: all data export clean

# Trap spurious "make"s:
all: data

# "make data" creates all sample files in "dat":
data: dat/${LANG} exp/${LANG}
	${MAKE} -R -f ${MAKEFILE} ACTION=data everything

# "make export" exports TeX tables and figures from "dat" to "exp":
export:
	${MAKE} -R -f ${MAKEFILE} ACTION=export everything

# Remove derived files from "dat":
clean:
	${MAKE} -R -f ${MAKEFILE} ACTION=clean everything

# Create directories for derived files and exported data.
# "-p" also creates the parent ("dat", "exp") when it is missing and
# does not fail when the directory already exists.
dat/${LANG} exp/${LANG}:
	mkdir -p $@

# Create a table mapping unit ID to unit type, from
# the table that describes the various units of the VMS:
UTYPE_TBL := unit-to-type.tbl
UNIT_TBL := work/L16+H-eva/unit16e6.txt

# Fields of ${UNIT_TBL} are ":"-separated; fields 2 and 6 are copied
# out of every non-empty line.
${UTYPE_TBL}: ${UNIT_TBL}
	cat $< \
	  | gawk -v FS=":" '/./{print $$2,$$6}' \
	  > $@

######################################################################
# Rules for a given ${ACTION} ("data", "export", "clean")
#
# The default below is overridden by ACTION=... on the command line of
# the recursive invocations; a plain top-level "make" skips these rules.
ACTION := ACTION.IS.UNDEFINED
ifneq "${ACTION}" "ACTION.IS.UNDEFINED"

# The various "books" (actually views of the same book, the VMS).
# The "sectioned" ones are worth analyzing separately by section.
#
SECTIONED_BOOKS := maj prs lab
UNSECTIONED_BOOKS := tak ini fin mid

# All three "ev-pos-*" rules are defined whatever ${ACTION} is, so all
# of them are declared phony.
.PHONY: everything ev-recurse ev-pos-data ev-pos-export ev-pos-clean

# Section tags, read once from "sections.tags" when the makefile is parsed.
VMS_SECTIONS := ${shell cat sections.tags}

# Process every book, then run the action-specific wrap-up.
everything: ev-recurse ev-pos-${ACTION}

# One sub-make per book.  Sectioned books get the full section list,
# unsectioned books get an empty one.  "|| exit $$?" aborts the loop as
# soon as a sub-make fails; without it the shell would report only the
# status of the last iteration and earlier failures would go unnoticed.
ev-recurse:
	for book in ${SECTIONED_BOOKS}; do \
	  ${MAKE} -R -f ${MAKEFILE} \
	    BOOK=$$book \
	    SECTIONS="${VMS_SECTIONS}" \
	    ACTION=${ACTION} single-book || exit $$?; \
	done
	for book in ${UNSECTIONED_BOOKS}; do \
	  ${MAKE} -R -f ${MAKEFILE} \
	    BOOK=$$book \
	    SECTIONS="" \
	    ACTION=${ACTION} single-book || exit $$?; \
	done

# For each section, print the sum of column 1 of the "raw.wfr" files of
# "prs" and "lab" together, and then of "maj", on consecutive lines.
ev-pos-data:
	for sec in ${VMS_SECTIONS}; do \
	  printf "\n%-32s" "${LANG}/{prs,lab}/$$sec/raw.wfr: "; \
	  cat ${foreach B,prs lab,dat/${LANG}/${B}/$$sec/raw.wfr} \
	    | gawk '/./{t += $$1;} END{print t}' ; \
	  printf "%-32s" "${LANG}/maj/$$sec/raw.wfr: " ; \
	  cat dat/${LANG}/maj/$$sec/raw.wfr \
	    | gawk '/./{t += $$1;} END{print t}' ; \
	done

# Nothing to do after the per-book export and clean passes.
ev-pos-export:

ev-pos-clean:

######################################################################
# Rules for given ${ACTION} and ${BOOK} where
#   ${BOOK} = book to make ("prs", "lab", "maj", etc.)
#
BOOK := BOOK.IS.UNDEFINED
ifneq "${BOOK}" "BOOK.IS.UNDEFINED"

# Sample name, used as a sub-directory of "dat" and "exp".
SMP := ${LANG}/${BOOK}

# "-p" creates missing parents and tolerates an existing directory.
dat/${SMP} exp/${SMP}:
	mkdir -p $@

#----------------------------------------------------------------------
# Target files for this book
#
# List of sections for this book:
BOOK_SECTION_LIST := ${SMP}/sections.tags
#
# Ditto, excluding sections of dubious type:
BOOK_SECTION_OK_LIST := ${SMP}/sections-ok.tags
#----------------------------------------------------------------------

######################################################################
# Rules for given ${ACTION}, ${BOOK}, and ${SECTIONS} where
#   ${SECTIONS} = blank-separated list of section samples to
#     create for that book, excluding "tot.1".
#
# An empty SECTIONS given on the command line still enables these rules;
# only the default value below disables them.
SECTIONS := SECTIONS.IS.UNDEFINED
ifneq "${SECTIONS}" "SECTIONS.IS.UNDEFINED"

#----------------------------------------------------------------------
# Target files for this book and section-set
#
# Counts and summaries (minus file extension)
BOOK_RGB_CTS := ${SMP}/raw-gud-bad-tw-counts
BOOK_SUMM := ${SMP}/raw-gud-bad-tw-summary
#----------------------------------------------------------------------

.PHONY: single-book sb-recurse
.PHONY: sb-pre-data sb-pre-export sb-pre-clean
.PHONY: sb-pos-data sb-pos-export sb-pos-clean
.PHONY: sb-show-sizes sb-check-sections-total

# NOTE(review): the three steps rely on make processing prerequisites
# left to right, so this rule is not safe under "make -j".
single-book: sb-pre-${ACTION} sb-recurse sb-pos-${ACTION}

# One sub-make per section, plus the whole-book sample "tot.1".
# "|| exit $$?" stops at the first failing sub-make; without it only
# the status of the last iteration would reach make.
sb-recurse: dat/${SMP} exp/${SMP}
	for sec in ${SECTIONS} tot.1; do \
	  ${MAKE} -R -f ${MAKEFILE} \
	    BOOK=${BOOK} \
	    SEC=$$sec \
	    ACTION=${ACTION} \
	    single-section || exit $$?; \
	done

sb-pre-data: dat/${BOOK_SECTION_LIST} dat/${BOOK_SECTION_OK_LIST}

sb-pos-data: sb-show-sizes sb-check-sections-total \
    dat/${BOOK_RGB_CTS}.tex \
    dat/${BOOK_SUMM}.tex

# Report sizes of various files:
sb-show-sizes:
	@dicio-wc ${foreach S,${SECTIONS} tot.1,dat/${SMP}/${S}/raw.evt}
	@dicio-wc ${foreach S,${SECTIONS} tot.1,dat/${SMP}/${S}/raw.tlw}
	@dicio-wc ${foreach S,${SECTIONS} tot.1,dat/${SMP}/${S}/raw.wfr} \
	  | gawk '/./{ printf " %8s %s\n", $$1,$$4;}'
	@dicio-wc ${foreach S,${SECTIONS} tot.1,dat/${SMP}/${S}/gud.wfr} \
	  | gawk '/./{ printf " %8s %s\n", $$1,$$4;}'
	@dicio-wc ${foreach S,${SECTIONS} tot.1,dat/${SMP}/${S}/bad.wfr} \
	  | gawk '/./{ printf " %8s %s\n", $$1,$$4;}'

# Check whether counts add up: prints the sum of column 1 over the
# per-section "raw.wfr" files, then the same sum for "tot.1".
# "/dev/null" keeps "cat" from reading stdin when ${SECTIONS} is empty.
sb-check-sections-total:
	@printf "\n%-24s" "dat/${SMP}/*/raw.wfr: "
	@cat ${foreach S,${SECTIONS},dat/${SMP}/${S}/raw.wfr} /dev/null \
	  | gawk '/./{t += $$1;} END{print t}'
	@printf "%-24s" "dat/${SMP}/tot.1/raw.wfr: "
	@cat dat/${SMP}/tot.1/raw.wfr \
	  | gawk '/./{t += $$1;} END{print t}'

# Export TEX files for the technical report:
sb-pre-export:

sb-pos-export:
	update-paper-include dat/${BOOK_RGB_CTS}.tex exp/${BOOK_RGB_CTS}.tex
	update-paper-include dat/${BOOK_SUMM}.tex exp/${BOOK_SUMM}.tex

sb-pre-clean:

# Remove every per-book file that the rules below create.
# (Plain "rm -f": the sub-makes run with -R, so ${RM} is not defined.)
sb-pos-clean:
	-rm -f dat/${BOOK_SECTION_LIST} dat/${BOOK_SECTION_OK_LIST}
	-rm -f dat/${BOOK_RGB_CTS}.txt dat/${BOOK_RGB_CTS}.tex
	-rm -f dat/${BOOK_SUMM}.tex

# Create list of sections for this book, one tag per line.
# The directory is an order-only prerequisite: it must exist, but its
# timestamp (which changes whenever a file is added to it) must not
# force the list to be rewritten.
dat/${BOOK_SECTION_LIST}: ${MAKERULES} | dat/${SMP}
	echo "${SECTIONS}" \
	  | tr ' ' '\012' \
	  > $@

# Create list of "OK" sections for this book
# (drops tags that start with "unk" or "xxx"):
dat/${BOOK_SECTION_OK_LIST}: dat/${BOOK_SECTION_LIST}
	cat dat/${BOOK_SECTION_LIST} \
	  | grep -E -v '^(unk|xxx)' \
	  > $@

# Count tokens and words per section, output as text files:
# NOTE(review): the per-section files that the counting script reads
# are not listed as prerequisites here -- confirm that the directory
# prerequisite is an adequate stand-in.
dat/${BOOK_RGB_CTS}.txt: ${MAKERULES} dat/${SMP} \
    count-raw-gud-bad-toks-wrds
	@echo " "
	@echo " Good/bad statistics for ${SMP}:"
	@echo " "
	count-raw-gud-bad-toks-wrds \
	    dat/${SMP} ${SECTIONS} / tot.1 \
	  > dat/${BOOK_RGB_CTS}.txt
	cat dat/${BOOK_RGB_CTS}.txt \
	  | sed -e 's:/::g' -e 's/^/ /'

# Format the counts as TeX source files:
dat/${BOOK_RGB_CTS}.tex: ${MAKERULES} dat/${BOOK_RGB_CTS}.txt \
    tex-format-raw-gud-bad-counts
	@echo " "
	@echo " dat/${BOOK_RGB_CTS}.txt -> dat/${BOOK_RGB_CTS}.tex"
	@echo " "
	cat dat/${BOOK_RGB_CTS}.txt \
	  | tex-format-raw-gud-bad-counts \
	  > dat/${BOOK_RGB_CTS}.tex

# Format a summary of the counts as a TeX file:
dat/${BOOK_SUMM}.tex: ${MAKERULES} dat/${BOOK_RGB_CTS}.txt \
    tex-format-raw-gud-bad-summary
	@echo " "
	@echo " dat/${BOOK_RGB_CTS}.txt -> dat/${BOOK_SUMM}.tex"
	@echo " "
	cat dat/${BOOK_RGB_CTS}.txt \
	  | tex-format-raw-gud-bad-summary \
	      -v smp=${LANG}${BOOK} \
	  > dat/${BOOK_SUMM}.tex

endif
# End of ${ACTION}/${BOOK}/${SECTIONS} rules
######################################################################

######################################################################
# Rules for given ${ACTION}/${BOOK}/${SEC} where
#   ${SEC} is a specific section (possibly "tot.1").
#
SEC := SEC.IS.UNDEFINED
ifneq "${SEC}" "SEC.IS.UNDEFINED"

# Sample-and-section name, used as a sub-directory of "dat" and "exp".
SMPSEC := ${SMP}/${SEC}

# "-p" creates missing parents and tolerates an existing directory.
dat/${SMPSEC} exp/${SMPSEC}:
	mkdir -p $@

PRS_UTYPES := parags,starred-parags,circular-lines,circular-text,radial-lines,titles
LAB_UTYPES := labels,words

# Define ${SOURCE_EVT} and ${TRANS_TAG}:
ifeq "${BOOK}" "tak"
  ifeq "${SEC}" "tot.1"
    SOURCE_EVT := work/L16+H-eva/text16e6.evt
  else
    SOURCE_EVT := SOURCE_EVT.NOT.DEFINED
  endif
  TRANS_TAG := H
else
  ifeq "${SEC}" "tot.1"
    SOURCE_EVT := work/Notes/045/only-m.evt
  else
    SOURCE_EVT := work/Notes/045/sections-m/${SEC}.evt
  endif
  TRANS_TAG := A
endif

# Define ${UTYPES} and ${LINE_SEL}:
#   UTYPES   = unit types to keep;
#   OWN_EVT  = YES if the book gets its own "raw.evt", NO if it
#              re-uses the one of the "prs" book;
#   LINE_SEL = extra options for "words-from-evt".
ifeq "${BOOK}" "tak"
  UTYPES := ${PRS_UTYPES},${LAB_UTYPES}
  OWN_EVT := YES
  LINE_SEL :=
endif
ifeq "${BOOK}" "maj"
  UTYPES := ${PRS_UTYPES},${LAB_UTYPES}
  OWN_EVT := YES
  LINE_SEL :=
endif
ifeq "${BOOK}" "prs"
  UTYPES := ${PRS_UTYPES}
  OWN_EVT := YES
  LINE_SEL :=
endif
ifeq "${BOOK}" "lab"
  UTYPES := ${LAB_UTYPES}
  OWN_EVT := YES
  LINE_SEL :=
endif
ifeq "${BOOK}" "ini"
  UTYPES := ${PRS_UTYPES}
  OWN_EVT := NO
  LINE_SEL := -v omitMedial=1 -v omitFinal=1
endif
ifeq "${BOOK}" "mid"
  UTYPES := ${PRS_UTYPES}
  OWN_EVT := NO
  LINE_SEL := -v omitInitial=1 -v omitFinal=1
endif
ifeq "${BOOK}" "fin"
  UTYPES := ${PRS_UTYPES}
  OWN_EVT := NO
  LINE_SEL := -v omitInitial=1 -v omitMedial=1
endif

ifeq "${OWN_EVT}" "YES"

# Create a private copy of the EVT file, with specified
# units and section, converting all weirdos to basic
# EVA chars, or to "*" if impossible.
RAW_EVT := dat/${SMPSEC}/raw.evt

# The output directory is an order-only prerequisite, so this rule also
# works when it is the direct goal of a sub-make (see "make-prs").
${RAW_EVT}: ${SOURCE_EVT} ${MAKERULES} \
    ./basify-weirdos ./select-units ${UTYPE_TBL} \
    | dat/${SMPSEC}
	@echo "${SOURCE_EVT} -> ${RAW_EVT}"
	cat ${SOURCE_EVT} \
	  | grep -E -v '[;][^'"${TRANS_TAG}"'][>]' \
	  | sed -e 's/[&][*!][*!][*!][*!;]/*!!!!/g' \
	  | ./basify-weirdos \
	  | ./select-units \
	      -v types="${UTYPES}" \
	      -v table=${UTYPE_TBL} \
	  > ${RAW_EVT}

else

# Use the EVT file previously created for the "prs" book,
# and this same section. Assumes that the "prs"
# EVT file contains the same units as ${BOOK} should.
# NOTE(review): "make-prs" is phony, so ${RAW_EVT} is always considered
# out of date here and the files derived from it are remade on each run.
.PHONY: make-prs

RAW_EVT := dat/${LANG}/prs/${SEC}/raw.evt

${RAW_EVT}: make-prs

make-prs:
	${MAKE} -R -f ${MAKEFILE} ACTION=data \
	  BOOK=prs SEC=${SEC} ${RAW_EVT}

endif

#----------------------------------------------------------------------
# The following section updates the following files:
#
# Raw words file, where each entry has the format {TYPE LOC WORD}
#   The {TYPE} field is "a"=alpha, "p"=punct, "s"=symbol
#   The {LOC} field has the format {"{" FNUM "}{" UNIT "}{" LINE "}"}
RAW_TLW := dat/${SMPSEC}/raw.tlw
#
# The "gud"/"bad" subsets of ${RAW_TLW}, defined by {select-gud-bad-voyn-words}:
GUD_TLW := dat/${SMPSEC}/gud.tlw
BAD_TLW := dat/${SMPSEC}/bad.tlw
#
# The corresponding word counts and frequencies:
RAW_WFR := dat/${SMPSEC}/raw.wfr
GUD_WFR := dat/${SMPSEC}/gud.wfr
BAD_WFR := dat/${SMPSEC}/bad.wfr
#----------------------------------------------------------------------

# Define the list of files to create:
DERIVED_FILES := \
  ${RAW_TLW} ${GUD_TLW} ${BAD_TLW} \
  ${RAW_WFR} ${GUD_WFR} ${BAD_WFR}

# The private EVT copy is a derived file too, but only when this book
# owns it.  The test must expand ${OWN_EVT}: comparing the bare word
# "OWN_EVT" with "YES" is never true, which kept "raw.evt" out of the
# list and therefore out of "clean".
ifeq "${OWN_EVT}" "YES"
  DERIVED_FILES := ${RAW_EVT} ${DERIVED_FILES}
endif

# All three "ss-*" rules are defined whatever ${ACTION} is.
.PHONY: single-section ss-data ss-export ss-clean

single-section: dat/${SMPSEC} exp/${SMPSEC} ss-${ACTION}

ss-data: ${DERIVED_FILES}

# Nothing is exported at the section level.
ss-export:

ss-clean:
	-rm -f ${DERIVED_FILES}

# Rule to extract raw token stream with locations from EVT file.
# The gawk filter turns each input line into "TYPE LOC WORD":
#   a blank line becomes the marker "# =";
#   field 1 is wrapped in braces, with "." and ";" turned into "}{";
#   lines containing "*" or "?" get type "s", all others type "a".
${RAW_TLW}: ${RAW_EVT} ${MAKERULES} \
    ./words-from-evt
	@echo "${RAW_EVT} -> ${RAW_TLW}"
	cat ${RAW_EVT} \
	  | ./words-from-evt \
	      -v showParags=1 \
	      ${LINE_SEL} \
	      -v showLocation=1 \
	  | gawk \
	      ' BEGIN { c = "{f0}{P0}{0}"; } \
	        /^ *$$/{ print "# ="; next; } \
	        /./{ c = ( "{" $$1 "}" ); gsub(/[.;]/, "}{", c); } \
	        /[*?]/{ print "s", c, $$2; next; } \
	        /./{ print "a", c, $$2; next; } \
	      ' \
	  > ${RAW_TLW}

# Rule to extract the good words:
${GUD_TLW}: ${RAW_TLW} ${MAKERULES} \
    ./select-gud-bad-voyn-words
	@echo "${RAW_TLW} -> ${GUD_TLW}"
	cat ${RAW_TLW} \
	  | ./select-gud-bad-voyn-words \
	      -v field=3 \
	      -v smp=${SMP} \
	      -v sec=${SEC} \
	      -v writeGud=1 -v writeBad=0 \
	  > ${GUD_TLW}

# Rule to extract the bad words:
${BAD_TLW}: ${RAW_TLW} ${MAKERULES} \
    ./select-gud-bad-voyn-words
	@echo "${RAW_TLW} -> ${BAD_TLW}"
	cat ${RAW_TLW} \
	  | ./select-gud-bad-voyn-words \
	      -v field=3 \
	      -v smp=${SMP} \
	      -v sec=${SEC} \
	      -v writeGud=0 -v writeBad=1 \
	  > ${BAD_TLW}

# Rule to count word occurrences and compute their rel. frequencies:
# comment and blank lines are dropped, field 3 (the word) is counted,
# and the counts are sorted by decreasing count, then by word.
# The sort keys use the standard "-k" form; the older "+0 -1nr +1 -2"
# spelling is rejected by current sort implementations.
%.wfr: %.tlw ${MAKERULES} \
    ./compute-freqs
	@echo "$*.tlw -> $*.wfr"
	cat $*.tlw \
	  | grep -E -v '^ *([\#]|$$)' \
	  | gawk '/./ { print $$3; }' \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | ./compute-freqs \
	  > $*.wfr

endif
# End ${ACTION}/${BOOK}/${SEC} rules
######################################################################

endif
# End ${ACTION}/${BOOK} rules
######################################################################

endif
# End ${ACTION} rules
######################################################################