#! /usr/bin/python3
# Generates the HTML report "[096] Similarities between the 'languages' of
# Herbal-A and Herbal-B": evidence that the word-frequency distributions of the
# two Herbal sections become similar once each word is reduced to its "root".
#
# The report is written as HTML to stdout via the html_gen / html_report_funcs
# helper modules.

import sys
import re
from pathlib import Path  # BUG FIX: Path was used in main() but never imported

import html_gen as h
from process_funcs import bash
import html_report_funcs as hr

last_edit = "Last edited on 2026-03-14 09:35:08 by stolfi"


def main():
    """Build the report document and emit it as HTML on stdout.

    Returns 0 on success.  All content goes through the document state ``st``
    created by ``h.new_doc`` and is flushed by ``h.output_doc`` at the end.

    NOTE(review): several prose string literals below contain typos carried
    over from the source ("ndications", "intead", "whrereas", "Other that
    those", "replaced a", "assigned is some", "The plot not quite prove").
    They are user-visible report text and are preserved byte-for-byte here;
    fix them deliberately in a separate content pass if desired.
    """
    # (The original declared "global last_edit" here; the variable is only
    # read, so the declaration was a no-op and has been dropped.)
    title = "[096] Similarities between the \"languages\" of Herbal-A and Herbal-B"
    st = h.new_doc(title, "#eeffdd", text_width=1400)

    h.section(st, 2, "Summary")
    h.parags(st, """There have been many studies of the differences between the
      word frequency distributions (WFDs) of different parts of the Voynich
      Manuscript (VMS). Many of these studies have focused on character or
      digraph frequency distributions, which are of course narrow projections
      of the WFDs. These distinct WFDs have been conventionally called
      languages, even though the differences do not seem to be anywhere near
      the differences one usually finds between the WFDs of different natural
      languages, or even between dialects of the same language. We find that
      the lexicon of the Herbal section can be divided by simple morphological
      criteria into a couple dozen classes, such that the total frequency of
      each class is roughly the same in languages A and B. Specifically, each
      class consists of all the words that yield the same string when some
      glyphs are mapped to 'D', some are mapped to 'K', and others are
      deleted. Except for a few rarely occurring words, the result is a string
      of 'D's and 'K's, here called the root of the word. Said another way,
      the change from language A to language B seems to preserve certain
      features of the words, codified by their roots. See this sub-page for
      details on the input data files and definitions of special terms used in
      this note.""")

    h.section(st, 2, "Differences between languages")
    h.parags(st, """The following plot illustrates the difference between the
      two languages. Each point is a word that occurs in Herbal-A and/or
      Herbal-B. The horizontal and vertical coordinates are the freqs of that
      word in the Herbal A ("hea") and Herbal B ("heb") sections,
      respectively.""")

    fig_wd_url = "st_words/hea_heb_freqs_plot.png"
    # NOTE(review): the source had the markup stripped to an empty f-string
    # (fig_wd_con = f"") while fig_wd_url went unused; reconstructed as an
    # <img> tag referencing that URL -- confirm against the original file.
    fig_wd_con = f'<img src="{fig_wd_url}">'
    h.figure(st, fig_wd_con, caption=None, centered=False)

    h.parags(st, """This plot shows only words that occur at least once on
      Herbal-A and Herbal-B. For this plot, the cleanup described here was
      applied, and then words with invalid characters ('?') were excluded.
      Note that only a few words occur with similar freqs in both sections:
      most notably dar, ol, dal, dam, saiin, otaiin. The freqs of the most
      common word in both, daiin (443 occurrences in total) are also fairly
      similar. (Other analyses suggest that daiin may be a keyword preceding a
      list of ndications or benefits of the plant.) The single-glyph words r
      and s are fairly similar too. Other that those, most words have very
      different freqs in the two sections. In particular, there is a large
      number of words that occur in only one of the sections, and are not
      shown in that plot. Many hypotheses have been advanced to explain this
      difference between the two WFDs.""")

    h.begin_enum(st, "ul")
    h.enum_item_parags(st, """The TOPIC hypothesis proposes the differences
      are caused by differences in topic, emphasis, style, etc. For instance,
      the paragraphs in Herbal-A might include detailed instructions for the
      preparation and dosage, while those in Herbal-B omit those details and
      instead emphasize astrological or alchemical aspects of the plant.""")
    h.enum_item_parags(st, """The LANGUAGE hypothesis instead proposes that
      the plaintext of each section is in a different natural language. Thus
      while both sections may use mostly the same abstract terms with similar
      frequencies, they are expressed and connected by different words.""")
    h.enum_item_parags(st, """The ENCODING hypothesis assumes that the
      language is the same and there is no major change of topic and style,
      but intead there is a change of the encoding of the words into glyphs.
      If the text is not encrypted, this could be a radical change in the
      spelling system, with the reassignment of glyphs and glyph clusters to
      different sounds. If the text is encrypted, this could be a change in
      the encryption algorithm that radically changed the encrypted form of
      most words.""")
    h.enum_item_parags(st, """The HOAX hypothesis maintains that some or all
      of the VMS text is the output of a random text generation process, and
      the difference between the WFDs of Herbal-A and Herbal-B is due to a
      sudden and drastic change in the algorithm or its parameters. Or,
      alternatively, one of the two sections has meaningful text, while the
      other is randomly generated.""")
    h.end_enum(st, "ul")

    h.parags(st, """It has been conjectured that the VMS may be encoded with a
      "codebook cipher", where each word is replaced a "code" -- a number or
      some other string of characters -- according to a "code book". In this
      schema the codes would be assigned is some ad-hoc way, and the code of
      each word would have no simple relation to its sound or normal spelling.
      While technically this would be a variant of the ENCODING hypothesis,
      its effect would be basically the same as that of the LANGUAGE one.""")

    h.section(st, 2, "Word roots")
    h.parags(st, """The main point of this note is that the WFDs of the two
      sections become fairly similar if each word is reduced to its root by
      deleting certain glyphs, and mapping the remaining ones into two
      similarity classes. More specifically, the root of a word is obtained by
      first deleting the following glyphs and glyph combinations""")
    h.begin_enum(st, "ul")
    h.enum_item(st, "{a} {o} {y} (the circles)")
    h.enum_item(st, "{q}")
    h.enum_item(st, "{ch} {che} {sh} {she} {ee} {eee} (the benches)")
    # NOTE(review): the lone "{ " before "(the codas)" looks like a glyph
    # combination truncated in the source -- confirm the missing item(s).
    h.enum_item(st, "{n} {in} {iin} {iiin} {m} {im} {iim} {ir} {iir} { (the codas)")
    h.end_enum(st, "ul")

    h.parags(st, """Then one maps to 'K' all the so-called gallows""")
    h.begin_enum(st, "ul")
    h.enum_item(st, "{k} {ke} {t} {te} {ckh} {ckhe} {cth} {cthe} (the kites)")
    h.enum_item(st, "{p} {f} {cph} {cphe} {cfh} {cfhe} (the puffs)")
    h.end_enum(st, "ul")

    h.parags(st, """And finally one maps to 'D' all the dealers""")
    h.begin_enum(st, "ul")
    h.enum_item(st, "{d} {l} {r} {s}")
    h.end_enum(st, "ul")

    h.parags(st, """Finally, if the result is the empty string (that is, if
      the word has no gallows or dealers), the root is 'Z' by convention.
      Again, the roots are computed after the clean-up described here. Only
      roots that occur at least twice in both sections are shown. It turns out
      that the frequencies of these roots are remarkably the same in both
      languages:""")

    fig_rt_url = "st_roots/hea_heb_freqs_plot.png"
    # NOTE(review): same reconstruction as fig_wd_con above.
    fig_rt_con = f'<img src="{fig_rt_url}">'
    h.figure(st, fig_rt_con, caption=None, centered=False)

    h.parags(st, """This plot strongly suggests that the differences from one
      language to the other consist mainly of (1) unspecified changes in the
      deleted elements (circles, benches, codas, the {q} prefix, and the
      platforms of gallows), which may involve substitution and/or
      displacement and/or deletion and/or insertion of those elements; (2)
      replacement of gallows by other gallows, and (3) replacement of dealers
      by other dealers. Apparently, the switch from language A to language B
      does not entail deletion or insertion of gallows or dealers, nor
      transpositions between dealers and gallows. The plot not quite prove
      these constraints, because it could still be that, for instance, the
      switch implies deletion of some dealers from @KD words and insertions of
      dealers in @K words so that the total frequency of words with root @KD
      remains the same. Still, such perfect compensation over all seems much
      less likely than the conservation of gallows and dealers """)

    h.section(st, 2, "Word classes")
    h.parags(st, """ The set of all the words whose root is {R}, denoted by
      {⟨R⟩}, is the class of {R}. Thus, for example, {⟨@KDD⟩} is all the words
      with exactly one gallows and two dealers somewhere after it; whrereas
      {⟨@DKD⟩} is all the words with one gallows, one dealer somewhere before
      the gallows, and one dealer somewhere after it. The following are the
      main word classes:""")

    # BUG FIX: the source was truncated at "files = path.glob(" -- both the
    # glob pattern and whatever consumed `files` (presumably a per-class table
    # built from st_roots/) were lost.  Closed minimally so the script runs;
    # TODO(review): restore the original pattern and the table-building code.
    path = Path("st_roots")
    files = sorted(path.glob("*"))

    hr.links_section(st)
    h.output_doc(st, sys.stdout, 99, last_edit)
    return 0

# ----------------------------------------------------------------------
if __name__ == "__main__":
    main()