# Makefile for creating a language sample (other than Voynichese)
# Last edited on 2023-05-10 09:03:41 by stolfi

MAKEFILE := lang-sample.make
MAKERULES := ${MAKEFILE}

BIN := ${STOLFIHOME}/bin
BANKDIR := ${STOLFIHOME}/projects/langbank
BANKBIN := ${BANKDIR}/tools

# Remove a target whose recipe failed, so that a truncated file written
# through ">" redirection is never mistaken for an up-to-date result.
.DELETE_ON_ERROR:

######################################################################
# Caller must define the following variables:
#
#   SMP     = name for text sample, e.g. "engl/wow".
#   SEC     = section tag, e.g. "cos.2" or "tot.1".
#   SIZEOPT = name for the ".tlw" file, "all" or "raw".
#   GUDNUM  = number of "good" words to take.
#
# The assignments below are only placeholders; values given on the
# "make" command line take precedence over them.

SMP := SMP.IS.UNDEFINED
SEC := SEC.IS.UNDEFINED
SIZEOPT := SIZEOPT.IS.UNDEFINED
GUDNUM := GUDNUM.IS.UNDEFINED

# Everything below is skipped unless both ${SMP} and ${SIZEOPT} were
# supplied by the caller.
ifneq "${SMP}" "SMP.IS.UNDEFINED"
ifneq "${SIZEOPT}" "SIZEOPT.IS.UNDEFINED"
#
#----------------------------------------------------------------------
# This makefile assumes that the following files exist:
#
# Directory for files related to this sample
SMP_REP := dat
SMP_DIR := ${SMP_REP}/${SMP}
#
# Makefile that creates the source word list:
SOURCE_DIR := ${SMP_DIR}/source
SOURCE_MKF := ${SOURCE_DIR}/Makefile
#
# Smp-specific filtering functions and word-mapping table:
SMP_AWK := ${SMP_DIR}/sample-fns.gawk
SMP_TBL := ${SMP_DIR}/word-map.tbl
#----------------------------------------------------------------------
#
#----------------------------------------------------------------------
# This makefile ensures the existence of the following files:
#
# Words from the whole source file. Each line must have the format
# {TYPE WORD} where {TYPE} is one of "#$@aspbn". The file must include
# section ids ({TYPE="$"}) and line nums ({TYPE="@"}).
SOURCE_WDS := ${SOURCE_DIR}/main.wds
#
# Cleaned token streams for each section, with locators,
# in the format {TYPE LOC WD}.
# The "raw" streams include "*"s but not punctuation.
# The "gud" streams exclude "*"s and possibly other undesirable words.
# The "bad" streams are the complement of the "gud" ones.
# The function {is_good_word} in ${SMP_AWK} defines who is "gud".
RAW_TLW := ${SMP_DIR}/${SEC}/${SIZEOPT}.tlw
GUD_TLW := ${SMP_DIR}/${SEC}/gud.tlw
BAD_TLW := ${SMP_DIR}/${SEC}/bad.tlw
#
# Word counts and frequencies for each section.
# NOTE(review): ${RAW_WFR} is always named "raw.wfr", whereas ${RAW_TLW}
# and ${RAW_WDF} follow ${SIZEOPT}.  With SIZEOPT other than "raw" the
# "%.wfr" pattern rule will look for "raw.tlw", which this invocation
# does not build -- confirm that this is intended.
RAW_WFR := ${SMP_DIR}/${SEC}/raw.wfr
GUD_WFR := ${SMP_DIR}/${SEC}/gud.wfr
BAD_WFR := ${SMP_DIR}/${SEC}/bad.wfr
#
# Raw text in filled-line format
RAW_WDF := ${SMP_DIR}/${SEC}/${SIZEOPT}.wdf
GUD_WDF := ${SMP_DIR}/${SEC}/gud.wdf
BAD_WDF := ${SMP_DIR}/${SEC}/bad.wdf
#
DERIVED_FILES := \
  ${RAW_TLW} ${GUD_TLW} ${BAD_TLW} \
  ${RAW_WFR} ${GUD_WFR} ${BAD_WFR} \
  ${RAW_WDF} ${GUD_WDF} ${BAD_WDF}
#----------------------------------------------------------------------

# These targets name actions, not files.
.PHONY: all make_source make_derived derived

# Default goal: build the source word list, then the derived files.
# NOTE(review): there is no dependency between the two steps, so the
# order relies on prerequisites being processed left to right; this
# is not guaranteed under "make -j".
all: make_source make_derived

# Delegate the creation of main.wds to the sample's own Makefile.
make_source: ${SOURCE_MKF}
	cd ${SOURCE_DIR} && ${MAKE} main.wds

# Re-invoke this makefile for the "derived" goal, so that its
# prerequisites are checked only after "make_source" has run.
# All four caller-defined variables are forwarded explicitly.
make_derived:
	${MAKE} -f ${MAKEFILE} \
	  SMP=${SMP} \
	  SEC=${SEC} \
	  SIZEOPT=${SIZEOPT} \
	  GUDNUM=${GUDNUM} \
	  derived

derived: ${DERIVED_FILES}

# If there are sections other than "tot.1", the "tot.1" section
# must be created from them, in order to get the right
# number of words from each section.
PARTIAL_SECS := ${strip ${shell cat ${SMP_DIR}/sections.tags}}
ifeq "/${PARTIAL_SECS}/" "//"
  TOT_SEC := *NONE*
else
  TOT_SEC := tot.1
endif

######################################################################
# The following applies only to sections whose raw tokens are
# to be extracted directly from the ${SOURCE_WDS} file:
ifneq "${SEC}" "${TOT_SEC}"

# Create a partial source file ${RAW_TLW} containing only tokens
# from section ${SEC}, and at most ${GUDNUM} good tokens.
${RAW_TLW}: ${SOURCE_WDS} ${MAKERULES} \
            ${SMP_AWK} \
            ${SMP_TBL} \
            ./wds-to-tlw
	@echo "PARTIAL_SECS = '${PARTIAL_SECS}'"
	@echo "TOT_SEC = '${TOT_SEC}'"
	ls -ld ${SOURCE_WDS}
	cat ${SOURCE_WDS} \
	  | ./wds-to-tlw \
	      -f ${SMP_AWK} \
	      -v smp=${SMP} \
	      -v sec=${SEC} \
	      -v table=${SMP_TBL} \
	      -v maxGud=${GUDNUM} \
	  > $@

endif
# End rules for sections extracted from the TLW file
######################################################################

######################################################################
# The following applies only for a section that is the union of others:
#
ifeq "${SEC}" "${TOT_SEC}"

PARTIAL_SEC_DIRS := ${addprefix ${SMP_DIR}/,${PARTIAL_SECS}}
PARTIAL_TLWS := ${addsuffix /raw.tlw,${PARTIAL_SEC_DIRS}}

# Concatenate the partial sections' "raw.tlw" files, copying lines
# until ${GUDNUM} lines of type "a" have been output.
${RAW_TLW}: ${PARTIAL_TLWS} ${MAKERULES}
	@echo "PARTIAL_SECS = '${PARTIAL_SECS}'"
	@echo "TOT_SEC = '${TOT_SEC}'"
	cat ${PARTIAL_TLWS} \
	  | gawk -v maxGud=${GUDNUM} \
	      ' BEGIN { na = 0; } \
	        (na >= maxGud) { exit 0; } \
	        //{ print; } \
	        ($$1 == "a") { na++; } \
	      ' \
	  > $@

endif
# End of code for a section that is the total of others
######################################################################

# Extract the good words (lines whose first field is "a"):
${GUD_TLW}: ${RAW_TLW} ${MAKERULES} ${SMP_AWK}
	@echo "${RAW_TLW} -> ${GUD_TLW}"
	cat ${RAW_TLW} \
	  | gawk ' ($$1 == "a") { print } ' \
	  > $@

# Extract the bad words (lines whose first field is "s"):
${BAD_TLW}: ${RAW_TLW} ${MAKERULES} ${SMP_AWK}
	@echo "${RAW_TLW} -> ${BAD_TLW}"
	cat ${RAW_TLW} \
	  | gawk ' ($$1 == "s") { print } ' \
	  > $@

# Count word occurrences and compute their rel. frequencies:

# Rule to compute word frequencies: takes field 3 of each line, counts
# the distinct values, and orders them by decreasing count and then by
# word before piping them to "compute-freqs".
%.wfr: %.tlw
	cat $< \
	  | gawk '// { print $$3; }' \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | compute-freqs \
	  > $@
	@echo "The 10 most common words in $<:"
	head -n 10 $@

# Generate filled text for display: field 3 of every line that is
# neither blank nor starting with "#", piped through
# "./format-words-filled" with width 72.
%.wdf: %.tlw ./format-words-filled
	cat $< \
	  | gawk ' /^ *([\#]|$$)/ { next; } // { print $$3; } ' \
	  | ./format-words-filled -v width=72 \
	  > $@
	@echo "Sample of $<:"
	head -n 20 $@
	@echo ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."
	tail -n 3 $@

endif
endif
# End of code for whole makefile
# ======================================================================