# Last edited on 2004-02-17 15:14:22 by stolfi
# Creates the VMS samples:

# Name of this makefile. Every recursive ${MAKE} call below re-reads it
# through "-f ${MAKEFILE}".
MAKEFILE := vms-samples.make
# ${MAKERULES} is listed as a prerequisite of generated files, so that they
# become out of date whenever the rules change. The commented-out empty
# definition below is the alternative that disables this.
# MAKERULES := 
MAKERULES := ${MAKEFILE}

# Sample-set name; used as the subdirectory of "dat" and "exp".
# NOTE(review): LANG is also the POSIX locale environment variable. If LANG
# is already in the caller's environment, make passes this value ("voyn")
# to every recipe shell -- confirm that the effect on the commands is intended.
LANG := voyn

.PHONY: all data export clean

# Default goal: a bare "make" is treated as "make data".
all: data

# "make data": make sure the per-language directories exist, then re-enter
# this makefile with ACTION=data to create all sample files in "dat".
data: dat/$(LANG) exp/$(LANG)
	$(MAKE) -R -f $(MAKEFILE) ACTION=data everything

# "make export": re-enter with ACTION=export to export TeX tables and
# figures from "dat" to "exp".
export:
	$(MAKE) -R -f $(MAKEFILE) ACTION=export everything

# "make clean": re-enter with ACTION=clean to remove derived files from "dat".
clean:
	$(MAKE) -R -f $(MAKEFILE) ACTION=clean everything

# Create directories for derived files and exported data.
# "mkdir -p" also creates a missing parent directory ("dat" or "exp")
# and does not fail if the directory already exists.
dat/${LANG}: ; mkdir -p $@
exp/${LANG}: ; mkdir -p $@

# Table mapping unit ID to unit type, derived from the table that
# describes the various units of the VMS. The source table is read as
# ":"-separated records; fields 2 and 6 of every non-empty line are
# written out, separated by a blank.
UNIT_TBL := work/L16+H-eva/unit16e6.txt
UTYPE_TBL := unit-to-type.tbl

$(UTYPE_TBL): $(UNIT_TBL)
	cat $< \
          | gawk -v FS=":" '/./{print $$2,$$6}' \
          > $@

######################################################################
# Rules for a given ${ACTION} ("data", "export", "clean")
#
# ACTION is supplied on the command line of the recursive ${MAKE} calls;
# a command-line value overrides the placeholder assigned below. When no
# ACTION is given, the placeholder stays in effect and everything up to
# the matching "endif" is skipped.
ACTION := ACTION.IS.UNDEFINED
ifneq "${ACTION}" "ACTION.IS.UNDEFINED"

# The "books" are views of the same book, the VMS. Sectioned books are
# worth analyzing separately by section; unsectioned ones are not.
UNSECTIONED_BOOKS := tak ini fin mid
SECTIONED_BOOKS := maj prs lab

# Section tags, read from "sections.tags" once, when the makefile is parsed.
VMS_SECTIONS := $(shell cat sections.tags)

.PHONY: everything ev-recurse ev-pos-$(ACTION)

# Process every book, then run the wrap-up step of the current action.
everything: ev-recurse ev-pos-$(ACTION)

# Run the current ${ACTION} on every book, one sub-make per book.
# Sectioned books get the full section list; unsectioned books get an
# empty list, so that "single-book" handles only their "tot.1" sample.
# Each sub-make is followed by "|| exit 1": a shell "for" loop otherwise
# reports only the status of its last iteration, so a failure in an
# earlier book would go unnoticed and the build would carry on.
ev-recurse:
	for book in ${SECTIONED_BOOKS}; do \
          ${MAKE} -R -f ${MAKEFILE} \
            BOOK=$$book \
            SECTIONS="${VMS_SECTIONS}" \
            ACTION=${ACTION} single-book \
            || exit 1; \
        done
	for book in ${UNSECTIONED_BOOKS}; do \
          ${MAKE} -R -f ${MAKEFILE} \
            BOOK=$$book \
            SECTIONS="" \
            ACTION=${ACTION} single-book \
            || exit 1; \
        done

# Wrap-up for "data": for every section, print the sum of the first field
# over the "prs" and "lab" raw.wfr files taken together, followed by the
# same sum for the "maj" raw.wfr file.
ev-pos-data:
	for sec in $(VMS_SECTIONS); do \
          printf "\n%-32s" "voyn/{prs,lab}/$$sec/raw.wfr: "; \
          cat $(foreach bk,prs lab,dat/$(LANG)/$(bk)/$$sec/raw.wfr) \
            | gawk '/./{t += $$1;} END{print t}' ; \
          printf "%-32s" "voyn/maj/$$sec/raw.wfr: " ; \
          cat dat/$(LANG)/maj/$$sec/raw.wfr  \
            | gawk '/./{t += $$1;} END{print t}' ; \
        done

# No wrap-up work is done after "export" or "clean". These empty rules
# exist so that the "ev-pos-${ACTION}" prerequisite of "everything"
# resolves for those actions.
ev-pos-export:

ev-pos-clean:

######################################################################
# Rules for given ${ACTION} and ${BOOK} where
#   ${BOOK} = book to make ("prs", "lab", "maj", etc.)
#
# BOOK is supplied on the command line of the recursive ${MAKE} calls;
# without it the placeholder below stays in effect and the rules up to
# the matching "endif" are skipped.
BOOK := BOOK.IS.UNDEFINED
ifneq "${BOOK}" "BOOK.IS.UNDEFINED"
        
# Path of this book's sample set, relative to "dat" or "exp".
SMP := ${LANG}/${BOOK}

# Per-book directories. "mkdir -p" also creates missing parent
# directories and does not fail if the directory already exists.
dat/${SMP}: ; mkdir -p $@
exp/${SMP}: ; mkdir -p $@

#----------------------------------------------------------------------
# Target files for this book.
# Both names are relative paths; the rules that build and remove these
# files prefix them with "dat/".
#
# List of sections for this book:
BOOK_SECTION_LIST := ${SMP}/sections.tags
#
# Ditto, excluding sections of dubious type:
BOOK_SECTION_OK_LIST := ${SMP}/sections-ok.tags
#----------------------------------------------------------------------

######################################################################
# Rules for given ${ACTION}, ${BOOK}, and ${SECTIONS} where
#   ${SECTIONS} = blank-separated list of section samples to
#     create for that book, excluding "tot.1".
#
# SECTIONS is supplied on the command line of the recursive ${MAKE} calls
# and may be empty; an empty value still differs from the placeholder
# below, so these rules are then enabled.
SECTIONS := SECTIONS.IS.UNDEFINED
ifneq "${SECTIONS}" "SECTIONS.IS.UNDEFINED"

#----------------------------------------------------------------------
# Target files for this book and section-set.
# Counts and summaries, named without file extension and without the
# "dat/" or "exp/" prefix that the rules add:
BOOK_SUMM := $(SMP)/raw-gud-bad-tw-summary
BOOK_RGB_CTS := $(SMP)/raw-gud-bad-tw-counts
#----------------------------------------------------------------------

.PHONY: single-book sb-recurse sb-pre-$(ACTION) sb-pos-$(ACTION)
.PHONY: sb-show-sizes sb-check-sections-total

# Entry point for one book: the action's "pre" hook, the per-section
# recursion, and the action's "pos" hook, listed in that order.
single-book: sb-pre-$(ACTION) sb-recurse sb-pos-$(ACTION)

# Run the current ${ACTION} on each section of this book and then on the
# whole-book sample "tot.1", one sub-make per section.
# Each sub-make is followed by "|| exit 1": a shell "for" loop otherwise
# reports only the status of its last iteration, so a failure in an
# earlier section would go unnoticed.
sb-recurse: dat/${SMP} exp/${SMP}
	for sec in ${SECTIONS} tot.1; do \
          ${MAKE} -R -f ${MAKEFILE} \
            BOOK=${BOOK} \
            SEC=$$sec \
            ACTION=${ACTION} \
            single-section \
            || exit 1; \
        done

# "data" hooks for one book. Before the sections are processed the two
# section-list files are brought up to date; afterwards the size reports
# are printed and the TeX count and summary tables are built.
sb-pre-data: dat/$(BOOK_SECTION_LIST) dat/$(BOOK_SECTION_OK_LIST)

sb-pos-data: sb-show-sizes sb-check-sections-total \
  dat/$(BOOK_RGB_CTS).tex dat/$(BOOK_SUMM).tex

# Report the sizes, as given by "dicio-wc", of the raw.evt, raw.tlw,
# raw.wfr, gud.wfr and bad.wfr files of every section and of "tot.1".
# For the three ".wfr" reports only columns 1 and 4 of the "dicio-wc"
# output are shown.
sb-show-sizes:
	@dicio-wc $(foreach sec,$(SECTIONS) tot.1,dat/$(SMP)/$(sec)/raw.evt)
	@dicio-wc $(foreach sec,$(SECTIONS) tot.1,dat/$(SMP)/$(sec)/raw.tlw)
	@dicio-wc $(foreach sec,$(SECTIONS) tot.1,dat/$(SMP)/$(sec)/raw.wfr) \
          | gawk '/./{ printf "    %8s %s\n", $$1,$$4;}' 
	@dicio-wc $(foreach sec,$(SECTIONS) tot.1,dat/$(SMP)/$(sec)/gud.wfr) \
          | gawk '/./{ printf "    %8s %s\n", $$1,$$4;}' 
	@dicio-wc $(foreach sec,$(SECTIONS) tot.1,dat/$(SMP)/$(sec)/bad.wfr) \
          | gawk '/./{ printf "    %8s %s\n", $$1,$$4;}'

# Check whether counts add up: print the sum of the first field over all
# per-section raw.wfr files, then the same sum for the "tot.1" raw.wfr.
# The extra /dev/null argument keeps "cat" from reading standard input
# when the section list is empty.
sb-check-sections-total:
	@printf "\n%-24s" "dat/$(SMP)/*/raw.wfr: "
	@cat $(foreach sec,$(SECTIONS),dat/$(SMP)/$(sec)/raw.wfr) /dev/null \
          | gawk '/./{t += $$1;} END{print t}' 
	@printf "%-24s" "dat/$(SMP)/tot.1/raw.wfr: "
	@cat dat/$(SMP)/tot.1/raw.wfr \
          | gawk '/./{t += $$1;} END{print t}' 
        
# "export" hooks for one book. Nothing is done beforehand; afterwards the
# TeX count table and summary are handed to "update-paper-include" with
# their "dat" path as first argument and their "exp" path as second.
sb-pre-export:

sb-pos-export:
	update-paper-include dat/$(BOOK_RGB_CTS).tex exp/$(BOOK_RGB_CTS).tex
	update-paper-include dat/$(BOOK_SUMM).tex    exp/$(BOOK_SUMM).tex

# "clean" hooks for one book. Nothing is done before the per-section
# cleanup; afterwards the two section-list files are removed.
# NOTE(review): dat/${BOOK_RGB_CTS}.txt, dat/${BOOK_RGB_CTS}.tex and
# dat/${BOOK_SUMM}.tex are not removed here -- confirm whether that is
# intended.
sb-pre-clean:
sb-pos-clean:
	-rm -f dat/${BOOK_SECTION_LIST} dat/${BOOK_SECTION_OK_LIST}

# Create the list of sections for this book: the blank-separated
# ${SECTIONS} value with every blank turned into a newline.
dat/$(BOOK_SECTION_LIST): dat/$(SMP) $(MAKERULES)
	echo "$(SECTIONS)" \
          | tr ' ' '\012' \
          > $@

# Create the list of "OK" sections for this book: the section list minus
# every tag that starts with "unk" or "xxx".
dat/$(BOOK_SECTION_OK_LIST): dat/$(BOOK_SECTION_LIST)
	cat $< \
          | egrep -v '^(unk|xxx)' \
          > $@

# Count tokens and words per section, output as a text file. The counting
# script receives the book directory, the section names, a "/" and
# "tot.1"; its output is saved and then echoed with the "/" characters
# removed and each line indented.
dat/$(BOOK_RGB_CTS).txt: $(MAKERULES) dat/$(SMP) \
            count-raw-gud-bad-toks-wrds
	@echo " "
	@echo "      Good/bad statistics for $(SMP):"
	@echo " "
	count-raw-gud-bad-toks-wrds \
            dat/$(SMP) $(SECTIONS) / tot.1 \
          > $@
	cat $@ \
          | sed -e 's:/::g' -e 's/^/      /' 

# Format the counts as a TeX source file by piping the text table
# through "tex-format-raw-gud-bad-counts".
dat/$(BOOK_RGB_CTS).tex: $(MAKERULES) dat/$(BOOK_RGB_CTS).txt \
            tex-format-raw-gud-bad-counts
	@echo " "
	@echo " dat/$(BOOK_RGB_CTS).txt -> $@"
	@echo " "
	cat dat/$(BOOK_RGB_CTS).txt \
          | tex-format-raw-gud-bad-counts \
          > $@

# Format a summary of the counts as a TeX file by piping the text table
# through "tex-format-raw-gud-bad-summary". The progress message names
# the output with its "dat/" prefix, i.e. the file actually written.
# NOTE(review): the "smp" value is ${LANG}${BOOK} with no "/" in between,
# unlike ${SMP} -- presumably deliberate; confirm against the script.
dat/${BOOK_SUMM}.tex: ${MAKERULES} dat/${BOOK_RGB_CTS}.txt \
	    tex-format-raw-gud-bad-summary
	@echo " "
	@echo " dat/${BOOK_RGB_CTS}.txt -> dat/${BOOK_SUMM}.tex"
	@echo " "
	cat dat/${BOOK_RGB_CTS}.txt \
	  | tex-format-raw-gud-bad-summary \
              -v smp=${LANG}${BOOK} \
	  > dat/${BOOK_SUMM}.tex

endif
# End of ${ACTION}/${BOOK}/${SECTIONS} rules
# (closes the 'ifneq "${SECTIONS}" "SECTIONS.IS.UNDEFINED"' guard)
######################################################################

######################################################################
# Rules for given ${ACTION}/${BOOK}/${SEC} where
#   ${SEC} is a specific section (possibly "tot.1").
#
# SEC is supplied on the command line of the recursive ${MAKE} calls;
# without it the placeholder below stays in effect and the rules up to
# the matching "endif" are skipped.
SEC := SEC.IS.UNDEFINED
ifneq "${SEC}" "SEC.IS.UNDEFINED"
        
# Path of this book/section sample, relative to "dat" or "exp".
SMPSEC := ${SMP}/${SEC}

# Per-section directories. "mkdir -p" also creates missing parent
# directories and does not fail if the directory already exists.
dat/${SMPSEC}: ; mkdir -p $@
exp/${SMPSEC}: ; mkdir -p $@

# Comma-separated unit types that make up the "lab" and "prs" books:
LAB_UTYPES := labels,words
PRS_UTYPES := parags,starred-parags,circular-lines,circular-text,radial-lines,titles

# Define ${SOURCE_EVT} (input EVT file) and ${TRANS_TAG} (the single
# character used by the line filter of the raw.evt rule).
# The "tak" book has a source only for the whole-book sample "tot.1";
# every other book takes "tot.1" from one file and each section from a
# file named after that section.

ifeq "$(BOOK)" "tak"
  TRANS_TAG := H
  ifneq "$(SEC)" "tot.1"
    SOURCE_EVT := SOURCE_EVT.NOT.DEFINED
  else
    SOURCE_EVT := work/L16+H-eva/text16e6.evt
  endif
else
  TRANS_TAG := A
  ifneq "$(SEC)" "tot.1"
    SOURCE_EVT := work/Notes/045/sections-m/$(SEC).evt
  else
    SOURCE_EVT := work/Notes/045/only-m.evt
  endif
endif

# Define, per book:
#   ${UTYPES}   = unit types handed to ./select-units;
#   ${OWN_EVT}  = YES if the book builds its own raw.evt, NO if it
#                 reuses the one made for the "prs" book;
#   ${LINE_SEL} = extra options handed to ./words-from-evt.

ifeq "$(BOOK)" "tak"
  OWN_EVT := YES
  UTYPES := $(PRS_UTYPES),$(LAB_UTYPES)
  LINE_SEL :=
endif

ifeq "$(BOOK)" "maj"
  OWN_EVT := YES
  UTYPES := $(PRS_UTYPES),$(LAB_UTYPES)
  LINE_SEL :=
endif

ifeq "$(BOOK)" "prs"
  OWN_EVT := YES
  UTYPES := $(PRS_UTYPES)
  LINE_SEL :=
endif

ifeq "$(BOOK)" "lab"
  OWN_EVT := YES
  UTYPES := $(LAB_UTYPES)
  LINE_SEL :=
endif

ifeq "$(BOOK)" "ini"
  OWN_EVT := NO
  UTYPES := $(PRS_UTYPES)
  LINE_SEL := -v omitMedial=1 -v omitFinal=1
endif

ifeq "$(BOOK)" "mid"
  OWN_EVT := NO
  UTYPES := $(PRS_UTYPES)
  LINE_SEL := -v omitInitial=1 -v omitFinal=1
endif

ifeq "$(BOOK)" "fin"
  OWN_EVT := NO
  UTYPES := $(PRS_UTYPES)
  LINE_SEL := -v omitInitial=1 -v omitMedial=1
endif

ifeq "$(OWN_EVT)" "YES"

  # This book gets a private copy of the EVT file, restricted to the
  # specified units and section, with all weirdos converted to basic
  # EVA chars, or to "*" if impossible. The pipeline drops every line
  # that contains ";" + one character other than ${TRANS_TAG} + ">",
  # rewrites "&" followed by four "*"/"!" characters (the last may be
  # ";") as "*!!!!", and then runs ./basify-weirdos and ./select-units.

  RAW_EVT := dat/$(SMPSEC)/raw.evt

  $(RAW_EVT): $(SOURCE_EVT) $(MAKERULES) \
              ./basify-weirdos ./select-units $(UTYPE_TBL)
	@echo "$< -> $@"
	cat $< \
          | egrep -v '[;][^'"$(TRANS_TAG)"'][>]' \
          | sed -e 's/[&][*!][*!][*!][*!;]/*!!!!/g' \
          | ./basify-weirdos \
          | ./select-units \
              -v types="$(UTYPES)" \
              -v table=$(UTYPE_TBL) \
          > $@

else

  # This book uses the EVT file previously created for the "prs" book
  # and this same section. Assumes that the "prs" EVT file contains the
  # same units as ${BOOK} should. The file is brought up to date by a
  # sub-make run with BOOK=prs.

  RAW_EVT := dat/$(LANG)/prs/$(SEC)/raw.evt

  .PHONY: make-prs

  $(RAW_EVT): make-prs

  make-prs:
	$(MAKE) -R -f $(MAKEFILE) ACTION=data \
          BOOK=prs SEC=$(SEC) $(RAW_EVT)
endif
        
#----------------------------------------------------------------------
# The rules below maintain the following files, all in "dat/${SMPSEC}":
#
# Raw words file, where each entry has the format {TYPE LOC WORD}
# The {TYPE} field is "a"=alpha, "p"=punct, "s"=symbol
# The {LOC} field has the format {"{" FNUM "}{" UNIT "}{" LINE "}"}
RAW_TLW := dat/${SMPSEC}/raw.tlw
#
# The "gud"/"bad" subsets of ${RAW_TLW}, defined by {select-gud-bad-voyn-words}:
GUD_TLW := dat/${SMPSEC}/gud.tlw
BAD_TLW := dat/${SMPSEC}/bad.tlw
#
# The corresponding word counts and frequencies:
RAW_WFR := dat/${SMPSEC}/raw.wfr
GUD_WFR := dat/${SMPSEC}/gud.wfr
BAD_WFR := dat/${SMPSEC}/bad.wfr
#----------------------------------------------------------------------

# Define the list of files to create for "data" and to delete for "clean".
# The EVT copy is added only when this book builds its own one
# (OWN_EVT = YES); a book that borrows the "prs" copy must not list it,
# or "ss-clean" would delete a file that belongs to the "prs" book.
# The test has to expand ${OWN_EVT}: comparing the bare word "OWN_EVT"
# with "YES" is never true.
DERIVED_FILES := \
  ${RAW_TLW} ${GUD_TLW} ${BAD_TLW} \
  ${RAW_WFR} ${GUD_WFR} ${BAD_WFR}
ifeq "${OWN_EVT}" "YES"
  DERIVED_FILES := ${RAW_EVT} ${DERIVED_FILES}
endif

.PHONY: single-section ss-$(ACTION)

# Entry point for one section of one book: make sure its "dat" and "exp"
# directories exist, then run the hook of the current action.
single-section: dat/$(SMPSEC) exp/$(SMPSEC) ss-$(ACTION)

# "data": bring all derived files up to date.
ss-data: $(DERIVED_FILES)

# "export": nothing to do at section level.
ss-export:

# "clean": delete the derived files.
ss-clean:
	-rm -f $(DERIVED_FILES)

# Rule to extract the raw token stream with locations from the EVT file.
# ./words-from-evt is run with paragraph and location display enabled,
# plus the book's ${LINE_SEL} options. The gawk filter then writes one
# record per input line: "# =" for a blank line, otherwise a type letter
# ("s" if the line contains "*" or "?", else "a"), the first field
# wrapped in braces with every "." or ";" turned into "}{", and the
# second field.
$(RAW_TLW):  $(RAW_EVT) $(MAKERULES) \
              ./words-from-evt
	@echo "$< -> $@"
	cat $< \
          | ./words-from-evt \
              -v showParags=1 \
              $(LINE_SEL) \
              -v showLocation=1 \
          | gawk \
              ' BEGIN { c = "{f0}{P0}{0}"; } \
                /^ *$$/{ print "# ="; next; } \
                /./{ c = ( "{" $$1 "}" );  gsub(/[.;]/, "}{", c); } \
                /[*?]/{ print "s", c, $$2; next; } \
                /./{ print "a", c, $$2; next; } \
              ' \
          > $@

# Rule to extract the good words: run ./select-gud-bad-voyn-words on
# field 3 of the raw token file with writeGud=1 and writeBad=0.
$(GUD_TLW): $(RAW_TLW) $(MAKERULES) \
              ./select-gud-bad-voyn-words
	@echo "$< -> $@"
	cat $< \
          | ./select-gud-bad-voyn-words \
              -v field=3 \
              -v smp=$(SMP) \
              -v sec=$(SEC) \
              -v writeGud=1 -v writeBad=0 \
          > $@

# Rule to extract the bad words: run ./select-gud-bad-voyn-words on
# field 3 of the raw token file with writeGud=0 and writeBad=1.
$(BAD_TLW): $(RAW_TLW) $(MAKERULES) \
              ./select-gud-bad-voyn-words
	@echo "$< -> $@"
	cat $< \
          | ./select-gud-bad-voyn-words \
              -v field=3 \
              -v smp=$(SMP) \
              -v sec=$(SEC) \
              -v writeGud=0 -v writeBad=1 \
          > $@
  
# Rule to count word occurrences and compute their relative frequencies.
# Blank lines and lines starting with "#" are dropped, field 3 (the word)
# is extracted and counted with "sort | uniq -c", and the counts are
# ordered by decreasing count, then by word, before ./compute-freqs.
# The ordering uses "-k" keys: the old "+0 -1nr +1 -2" notation is
# obsolete, and a current "sort" takes "+0" and "+1" for file names.
%.wfr: %.tlw ${MAKERULES} \
              ./compute-freqs
	@echo "$< -> $@"
	cat $< \
          | egrep -v '^ *([\#]|$$)' \
          | gawk '/./ { print $$3; }' \
          | sort | uniq -c | expand \
          | sort -b -k1,1nr -k2,2 \
          | ./compute-freqs \
          > $@

endif
# End ${ACTION}/${BOOK}/${SEC} rules
# (closes the 'ifneq "${SEC}" "SEC.IS.UNDEFINED"' guard)
######################################################################

endif
# End ${ACTION}/${BOOK} rules
# (closes the 'ifneq "${BOOK}" "BOOK.IS.UNDEFINED"' guard)
######################################################################

endif
# End ${ACTION} rules
# (closes the 'ifneq "${ACTION}" "ACTION.IS.UNDEFINED"' guard)
######################################################################