# Makefile for creating a language sample (other than Voynichese)
# Last edited on 2023-05-10 09:03:41 by stolfi

# Name of this makefile.  It is passed to "-f" by the recursive
# invocation in the "make_derived" goal.
MAKEFILE := lang-sample.make
# Listed as a prerequisite of the generated files, so that editing
# the rules in this makefile causes those files to be rebuilt.
MAKERULES := ${MAKEFILE}

# Tool directory under ${STOLFIHOME}.
# NOTE(review): not referenced by any rule visible in this file.
BIN := ${STOLFIHOME}/bin

# Location of the "langbank" project and of its tools.
# NOTE(review): not referenced by any rule visible in this file.
BANKDIR := ${STOLFIHOME}/projects/langbank
BANKBIN := ${BANKDIR}/tools

######################################################################
# Caller must define the following variables:
#  
#   SMP = name for text sample, e.g. "engl/wow".
#   SEC = section tag, e.g. "cos.2" or "tot.1".
#   SIZEOPT = name for the ".tlw" file, "all" or "raw".
#   GUDNUM = number of "good" words to take.
#
# Placeholder values.  A definition given on the "make" command line
# takes precedence over these assignments, so each placeholder survives
# only when the caller did not supply the variable that way.
SMP := SMP.IS.UNDEFINED
SEC := SEC.IS.UNDEFINED
SIZEOPT := SIZEOPT.IS.UNDEFINED
GUDNUM := GUDNUM.IS.UNDEFINED
# Everything from here to the two "endif"s at the end of the file is
# skipped unless both SMP and SIZEOPT were supplied.  SEC and GUDNUM
# are not checked here: if they are missing, their placeholder text is
# used as-is in file names and command arguments.
ifneq "${SMP}" "SMP.IS.UNDEFINED"
ifneq "${SIZEOPT}" "SIZEOPT.IS.UNDEFINED"
# 
#----------------------------------------------------------------------
# This makefile assumes that the following files exist:
# 
# Directory for files related to this sample: ${SMP} is appended to
# the repository root ${SMP_REP}, which is relative to the directory
# where "make" is run.
SMP_REP := dat
SMP_DIR := ${SMP_REP}/${SMP}
# 
# Makefile that creates the source word list.  The "make_source" goal
# runs "make main.wds" inside ${SOURCE_DIR}.
SOURCE_DIR := ${SMP_DIR}/source
SOURCE_MKF := ${SOURCE_DIR}/Makefile
#
# Sample-specific filtering functions and word-mapping table.  Both are
# handed to "./wds-to-tlw" (as "-f" and "-v table=" respectively).
SMP_AWK := ${SMP_DIR}/sample-fns.gawk
SMP_TBL := ${SMP_DIR}/word-map.tbl
#----------------------------------------------------------------------
# 
#----------------------------------------------------------------------
# This makefile ensures the existence of the following files:
# 
# Words from the whole source file. Each line must have the format
# {TYPE WORD} where {TYPE} is one of "#$@aspbn". The file must include
# section ids ({TYPE="$"}) and line nums ({TYPE="@"}).
# This is the file produced by the "make_source" goal.
SOURCE_WDS := ${SOURCE_DIR}/main.wds
# 
# Cleaned token streams for each section, with locators,
# in the format {TYPE LOC WD}.
# The "raw" streams include "*"s but not punctuation.
# The "gud" streams exclude "*"s and possibly other undesirable words.
# The "bad" streams are the complement of the "gud" ones.
# The function {is_good_word} in ${SMP_AWK} defines who is "gud".
# Note that the raw stream's file name is taken from ${SIZEOPT}
# ("all" or "raw"), whereas the other two names are fixed.
RAW_TLW := ${SMP_DIR}/${SEC}/${SIZEOPT}.tlw
GUD_TLW := ${SMP_DIR}/${SEC}/gud.tlw
BAD_TLW := ${SMP_DIR}/${SEC}/bad.tlw
# 
# Word counts and frequencies for each section.
# Each of these is built by the "%.wfr" pattern rule from the ".tlw"
# file with the same stem.  The raw one must therefore be named after
# ${SIZEOPT}, exactly like ${RAW_TLW}: a fixed "raw.wfr" would need a
# "raw.tlw" that no rule produces when SIZEOPT is "all".
RAW_WFR := ${SMP_DIR}/${SEC}/${SIZEOPT}.wfr
GUD_WFR := ${SMP_DIR}/${SEC}/gud.wfr
BAD_WFR := ${SMP_DIR}/${SEC}/bad.wfr
# 
# Text in filled-line format, for display.  Each file is built by the
# "%.wdf" pattern rule from the ".tlw" file with the same stem.
RAW_WDF := ${SMP_DIR}/${SEC}/${SIZEOPT}.wdf
GUD_WDF := ${SMP_DIR}/${SEC}/gud.wdf
BAD_WDF := ${SMP_DIR}/${SEC}/bad.wdf
# 
# Everything that the "derived" goal must bring up to date:
# token streams, word frequencies and filled text, for the
# raw, good and bad variants of section ${SEC}.
DERIVED_FILES := \
  ${RAW_TLW} ${GUD_TLW} ${BAD_TLW} \
  ${RAW_WFR} ${GUD_WFR} ${BAD_WFR} \
  ${RAW_WDF} ${GUD_WDF} ${BAD_WDF}
#----------------------------------------------------------------------

# None of the goals below names a file.  Declaring them phony keeps a
# stray file called "all", "derived", etc. from silently disabling them.
.PHONY: all make_source make_derived derived

# Delete a target whose recipe failed, so that a truncated output file
# is not mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

# Default goal: refresh the source word list, then the derived files.
# NOTE(review): the two steps are ordered only by their position in the
# prerequisite list, which is honoured in a serial run but not with "-j".
all: make_source make_derived

# Delegates the creation of "main.wds" to the sample's own makefile.
make_source: ${SOURCE_MKF}
	cd ${SOURCE_DIR} && ${MAKE} main.wds

# Builds the derived files in a separate "make" run, started after the
# prerequisites listed before it in "all" have been processed.  All four
# caller-supplied variables are passed down explicitly.
make_derived:
	${MAKE} -f ${MAKEFILE} \
	    SMP=${SMP} \
	    SEC=${SEC} \
	    SIZEOPT=${SIZEOPT} \
	    GUDNUM=${GUDNUM} \
	  derived

# Brings every file in ${DERIVED_FILES} up to date.
derived: ${DERIVED_FILES}

# A sample may be split into several partial sections.  When it is,
# the "tot.1" section is not read from the source word list but is
# assembled from the partial sections, so that each of them
# contributes the right number of words.

# Tags of the partial sections, as listed in the sample's
# "sections.tags" file (empty when the file lists none).
PARTIAL_SECS := ${strip ${shell cat ${SMP_DIR}/sections.tags}}

# Tag of the section to be assembled from the partial ones, or a
# value that matches no section when there are no partial sections.
ifneq "/${PARTIAL_SECS}/" "//"
  TOT_SEC := tot.1
else
  TOT_SEC := *NONE*
endif

######################################################################
# The following applies only to sections whose raw tokens are 
# to be extracted directly from the ${SOURCE_WDS} file:
ifneq "${SEC}" "${TOT_SEC}"

# Build the raw token stream ${RAW_TLW} of section ${SEC} by filtering
# the whole-source word list through "./wds-to-tlw".  The filter is
# given the sample's own functions and word table, the sample and
# section names, and the limit ${GUDNUM} on the number of good tokens.

${RAW_TLW}: ${SOURCE_WDS} ${MAKERULES} ${SMP_AWK} ${SMP_TBL} ./wds-to-tlw
	@echo "PARTIAL_SECS = '${PARTIAL_SECS}'"
	@echo "TOT_SEC = '${TOT_SEC}'"
	ls -ld $<
	cat $< \
	  | ./wds-to-tlw \
	      -f ${SMP_AWK} \
	      -v smp=${SMP} \
	      -v sec=${SEC} \
	      -v table=${SMP_TBL} \
	      -v maxGud=${GUDNUM} \
	  > $@

endif
# End of rules for sections extracted directly from the ${SOURCE_WDS} file
######################################################################

######################################################################
# The following applies only for a section that is the union of others:
# 
ifeq "${SEC}" "${TOT_SEC}"

# Directory of each partial section, and the raw token stream in it.
# NOTE(review): the name "raw.tlw" is fixed here, whereas ${RAW_TLW}
# follows ${SIZEOPT}; confirm that this is intended.
# In this branch the file has no rule that creates the partial
# streams, so they must already exist when this section is built.
PARTIAL_SEC_DIRS := ${addprefix ${SMP_DIR}/,${PARTIAL_SECS}}
PARTIAL_TLWS := ${addsuffix /raw.tlw,${PARTIAL_SEC_DIRS}}

# Concatenate the partial streams, in the order given by
# ${PARTIAL_SECS}, and truncate the result.  The gawk program counts
# the records of type "a" after printing them, and stops, without
# printing, at the first record read once that count has reached
# ${GUDNUM}.  The order of the three pattern-action rules matters.
${RAW_TLW}: ${PARTIAL_TLWS} ${MAKERULES}
	@echo "PARTIAL_SECS = '${PARTIAL_SECS}'"
	@echo "TOT_SEC = '${TOT_SEC}'"
	cat ${PARTIAL_TLWS} \
	  | gawk -v maxGud=${GUDNUM} \
              ' BEGIN { na = 0; } \
                (na >= maxGud) { exit 0; } \
                //{ print; } \
                ($$1 == "a") { na++; } \
              ' \
	  > ${RAW_TLW}

endif
# End of code for a section that is the total of others
######################################################################
 
# Good words: the records of the raw stream whose type field is "a".
# ${SMP_AWK} is listed as a prerequisite although the recipe itself
# does not read it.

${GUD_TLW}: ${RAW_TLW} ${MAKERULES} ${SMP_AWK}
	@echo "$< -> $@"
	gawk ' ($$1 == "a") { print } ' < $< > $@

# Bad words: the records of the raw stream whose type field is "s".
# NOTE(review): records of any other type are in neither the good nor
# the bad stream; confirm that "s" alone is the intended complement.
# ${SMP_AWK} is listed as a prerequisite although the recipe itself
# does not read it.

${BAD_TLW}: ${RAW_TLW} ${MAKERULES} ${SMP_AWK}
	@echo "$< -> $@"
	gawk ' ($$1 == "s") { print } ' < $< > $@

# Count word occurrences and compute their relative frequencies.
#
# Rule to compute word frequencies: takes the third field of each
# record of the token stream, counts the occurrences of each distinct
# word, orders the words by decreasing count (ties broken by the word
# itself), and pipes the result through "compute-freqs".
#
# The ordering keys are written with "-k": the old "+POS -POS" form
# ("+0 -1nr +1 -2") is taken as a list of file names by current
# versions of "sort".
#
# NOTE(review): "compute-freqs" is invoked without a directory, so it
# must be reachable through PATH.

%.wfr: %.tlw
	cat $< \
	  | gawk '// { print $$3; }' \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | compute-freqs \
	  > $@
	@echo "The 10 most common words in $<:"
	head -10 $@

# Filled text for display: the third field of every record of the
# token stream, except records that are blank or start with "#",
# is piped through "./format-words-filled" with a width of 72.
# The first 20 and last 3 lines of the result are then shown.

%.wdf: %.tlw  ./format-words-filled
	gawk ' /^ *([\#]|$$)/ { next; } // { print $$3; } ' < $< \
	  | ./format-words-filled -v width=72 \
	  > $@
	@echo "Sample of $<:"
	head -20 $@
	@echo ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."
	tail -3 $@

endif
endif
# End of code for whole makefile
# ======================================================================