Decided to create another error-tolerant encoding even more "lossy" than HOP. This one collapses FSG A with O, R with 2, S with T. Also ignore spaces (periods): --- fsg2ecc ------------------------ #! /n/gnu/bin/gawk -f # Recoding an interlinear file from the FSG alphabet to # my Super-Lossy Fault-Tolerant encoding (ECC). # Blank lines, "#" comment lines and "<...>" locator lines containing no "." or ";" # are copied unchanged; lines starting with a "<...;X> " locator keep their first # 19 columns verbatim and have the text from column 20 onward recoded. # NOTE(review): the banner printed by BEGIN says "Semi-Analytic", not "Super-Lossy" -- confirm the intended name. BEGIN { print "# Output of fsg2ecc - Stolfi's Semi-Analytic Fault-Tolerant alphabet" } /^ *$/ { print; next } /^ *#/ { print; next } /^<[^>.;]*>/ { print; next } /^<[^>]*\.[^>]*;[A-Z]> / { curtxt = substr($0,20) # We discard "%" and "!" since the conversion # will destroy synchronism anyway. gsub(/[%!]/, "", curtxt); # We also discard spaces ("." in the evt format), # since they are not reliable gsub(/[.]/, "", curtxt); # First, the conversion from FSG to JSA (Stolfi's super-analytic). # The order matters: multi-letter FSG groups are rewritten before their single-letter parts. gsub(/IIIK/, "iiiij", curtxt); gsub(/IIIL/, "iiiiu", curtxt); gsub(/IIIR/, "iiiis", curtxt); gsub(/IIIE/, "iiiix", curtxt); gsub(/IIE/, "iiix", curtxt); gsub(/IIR/, "iiis", curtxt); gsub(/IIK/, "iiij", curtxt); gsub(/HZ/, "cqjc", curtxt); gsub(/PZ/, "cqgc", curtxt); gsub(/DZ/, "cljc", curtxt); gsub(/FZ/, "clgc", curtxt); gsub(/IE/, "iix", curtxt); gsub(/IR/, "iis", curtxt); gsub(/IK/, "iij", curtxt); gsub(/2/, "cs", curtxt); gsub(/4/, "q", curtxt); gsub(/6/, "cj", curtxt); gsub(/7/, "ig", curtxt); gsub(/8/, "cg", curtxt); gsub(/A/, "ci", curtxt); gsub(/C/, "c", curtxt); gsub(/D/, "lj", curtxt); gsub(/E/, "ix", curtxt); gsub(/F/, "lg", curtxt); gsub(/G/, "cy", curtxt); gsub(/H/, "qj", curtxt); gsub(/I/, "i", curtxt); gsub(/K/, "ij", curtxt); gsub(/L/, "iu", curtxt); gsub(/M/, "iiiu", curtxt); gsub(/N/, "iiu", curtxt); gsub(/O/, "o", curtxt); gsub(/P/, "qg", curtxt); gsub(/R/, "is", curtxt); gsub(/S/, "cc", curtxt); # Was "csc" in JSA gsub(/T/, "cc", curtxt); gsub(/V/, "?", curtxt); gsub(/Y/, "?", curtxt); # Now, the conversion from JSA to ECC: # NOTE(review): no rule below rewrites the "u" of an "iu" (FSG L) that is not preceded by another "i", # the "g" of "ig" (FSG 7), or the "j" of "cj" (FSG 6); those letters stay in the output -- confirm intended. gsub(/[ql]j/, "H", curtxt); gsub(/[ql]g/, "P", curtxt); gsub(/ij/, "k", curtxt); gsub(/ii*x/, "e", curtxt); gsub(/is/, "r", curtxt); 
gsub(/iiu/, "n", curtxt); gsub(/y/, "i", curtxt); gsub(/ci/, "a", curtxt); gsub(/cg/, "8", curtxt); gsub(/cs/, "r", curtxt); gsub(/ii*r/, "w", curtxt); gsub(/i*n/, "m", curtxt); gsub(/a/, "o", curtxt); print (substr($0,1,19) curtxt); next } ------------------------------------ cat bio-m-evt.evt \ | fsg2ecc \ > bio-m-ecc.evt cat bio-m-ecc.evt \ | make-consensus-interlin \ > bio-x-ecc.evt cat bio-x-ecc.evt \ | egrep '^<.*;J> ' \ | sed \ -e 's/{[^}]*}//g' \ > bio-j-ecc.evt extract-words-from-interlin \ -chars "8coqHPemrwk" \ bio-j-ecc.evt \ bio-j-ecc lines words bytes file ------ ------- --------- ------------ 1605 1605 35644 bio-j-ecc.wds 767 767 33204 bio-j-ecc.dic 333 333 13811 bio-j-ecc-gut.wds 333 333 13811 bio-j-ecc-gut.dic 840 840 2445 bio-j-ecc-fun.wds 2 2 5 bio-j-ecc-fun.dic 432 432 19388 bio-j-ecc-bad.wds 432 432 19388 bio-j-ecc-bad.dic Here are the statistics. Keep in mind that spaces were deleted, and here " " means line break. Digraph counts: TT 8 c o q H P e m r w k ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 333 . 39 15 51 89 24 38 11 . 66 . . 8 1166 4 2 92 1052 9 2 . 4 . 1 . . c 4351 1 909 2389 585 1 183 18 232 3 30 . . o 3864 189 113 211 261 576 972 41 683 402 384 10 22 q 728 . . 10 718 . . . . . . . . H 1347 . 2 853 484 . . . 5 1 2 . . P 109 . 1 75 33 . . . . . . . . e 958 64 67 360 224 29 162 10 18 . 24 . . m 406 24 24 188 148 13 1 . 2 . 6 . . r 517 31 9 153 302 11 3 2 3 . 3 . . w 10 . . 5 5 . . . . . . . . k 22 20 . . 1 . . . . . 1 . . ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- TOT 13811 333 1166 4351 3864 728 1347 109 958 406 517 10 22 Next-symbol probability (× 99): 8 c o q H P e m r w k -- -- -- -- -- -- -- -- -- -- -- -- . 12 4 15 26 7 11 3 . 20 . . c . 21 54 13 . 4 . 5 . 1 . . o 5 3 5 7 15 25 1 17 10 10 . 1 8 . . 8 89 1 . . . . . . . q . . 1 98 . . . . . . . . H . . 63 36 . . . . . . . . P . 1 68 30 . . . . . . . . w . . 50 50 . . . . . . . . e 7 7 37 23 3 17 1 2 . 2 . . 
m 6 6 46 36 3 . . . . 1 . . r 6 2 29 58 2 1 . 1 . 1 . . k 90 . . 5 . . . . . 5 . . -- -- -- -- -- -- -- -- -- -- -- -- TOT 2 8 31 28 5 10 1 7 3 4 0 0 Note that "e", "m", and "r" have become more similar. It is curious that "8" and "q" have very similar next-symbol statistics. Also curious that P and H become identical... Previous-symbol probability (× 99): TT w k m e H P q r 8 c o -- -- -- -- -- -- -- -- -- -- -- -- -- 2 . . . . 1 2 35 12 13 3 . 1 o 28 56 99 99 98 71 71 37 78 74 10 5 7 c 31 . . . 1 24 13 16 . 6 77 54 15 8 8 1 . . . . . . 1 . . 2 27 q 5 . . . . . . . . . . . 18 H 10 . . . . 1 . . . . . 19 12 P 1 . . . . . . . . . . 2 1 e 7 19 . . . 2 12 9 4 5 6 8 6 m 3 7 . . . . . . 2 1 2 4 4 r 4 9 . . . . . 2 1 1 1 3 8 w 0 . . . . . . . . . . . . k 0 6 . . . . . . . . . . . -- -- -- -- -- -- -- -- -- -- -- -- -- Symbol entropy: 2.693 An encouraging sign: with this encoding, all labels in f77v can be found in the text of the bio section, hand B. Let's try to discern word/syllabe boundaries from the line breaks, in this reduced encoding: cat bio-j-ecc-gut.wds \ | tr -d '\012' \ | enum-ngraphs -v n=2 \ | egrep -v '\*' \ > .bio-j-ecc-tt-2.grm cat .bio-j-ecc-tt-2.grm \ | sed -e 's/^\(.\)\(.\)$/\1:\2/g' \ > .bio-j-ecc-tt-1-1.grm cat .bio-j-ecc-tt-1-1.grm \ | sort | uniq -c | expand \ | compute-freqs \ > .bio-j-ecc-tt-1-1.frq Digraph frequencies around line breaks, ignoring spaces: cat bio-j-ecc-gut.wds \ | sed -e 's/^\(..\).*\(..\)$/\1\2/g' \ | tr -s '\012' ':' \ | enum-ngraphs -v n=3 \ | egrep -v '\*' \ | egrep '^.:.$' \ > .bio-j-ecc-nl-1-1.grm cat .bio-j-ecc-nl-1-1.grm \ | sort | uniq -c | expand \ | compute-freqs \ > .bio-j-ecc-nl-1-1.frq compare-freqs \ .bio-j-ecc-tt-1-1.frq \ .bio-j-ecc-nl-1-1.frq \ | compute-count-ratio \ -v nmin=10 -v mw=10 -v mc=40 \ | sort +0.0 -0.2r +4 -5nr \ > .bio-j-ecc-tt-nl-1-1.cmp cat .bio-j-ecc-tt-nl-1-1.cmp \ | print-pattern-classes \ -v rowchars='co8qHPwemrk' \ -v colchars='co8qHPwemrk' Pattern classes: c o 8 q H P w e m r k -- -- -- -- 
-- -- -- -- -- -- -- c | -- -- -- -? -- -- . -- -? -- . q | -- -- . . . . . . . . . H | -- -- -? . . . . -? -? -? . P | -- -- -? . . . . . . . . w | -? -? . . . . . . . . . 8 | -- -- -? || -? -? . -? . -? . o | -- || || ++ -- || -- -- -- ++ -- e | -- -- || || -- || . || . || . m | -- -- || || -? +? . +? . ## . r | -- -- || || +? +? . -? . +? . k | -? +? +? +? . +? . -? . +? . Fixing the count ratio and classification as in previous manual classification experiment: --- compute-count-ratio-new ------------------------ #! /n/gnu/bin/gawk -f # # Usage: "$0 -v nmin=NNN -v mw=N.NNN -v mc=N.NNN" # # Computes the ratio of two counts for a list of patterns. # The input must be the output of compare-freqs, in the # format " NT FT NL FL patt", where "NT","NL" are # two counts, and "FT","FL" the corresponding relative # frequencies. The output will have the format # " NT FT NL FL rat mk patt" where "rat=(NL)/(NT+2)". # # The "mk" field is a class code, assigned based on the # ratio and its certainty, and the parameters "mw", "mc", # and "nmin", as follows: # rat >= 1/mw -> "++" if NT >= nmin, else "+?"; # rat >= 0.005 -> "::" if NL >= nmin, else ":?"; # otherwise -> "??" if 2*NT < mc, "-?" if NT < 2*mc, else "--". # NOTE(review): the first test compares NT with nmin while the second compares NL -- confirm the asymmetry is intended. function classify(NT, NL, ratio, nmin, mw, mc) { if (ratio >= 1.0/mw) { if (NT >= nmin) { return "++" } # Probably word break else { return "+?" } # unimportant but looks more like a word break } else if (ratio >= 0.005) { if (NL >= nmin) { return "::" } # possible syllabe break else { return ":?" } # uncertain but looks more like syllabe break } else { if (2*NT < mc) { return "??" } # too rare, can't tell else if (NT < 2*mc) { return "-?" 
} # uncertain but looks more like non-break else { return "--" } # non-break } } # Header lines ("##..." and "# ..."): drop the two-character prefix, reprint the first two fields, and insert the titles/rules for the two added columns before the third field. /^##/ { $0 = substr($0, 3); printf "##%11.11s %11.11s RelFr MK %s\n", $1, $2, $3; next } /^# / { $0 = substr($0, 3); printf "# %11.11s %11.11s ----- -- %s\n", $1, $2, $3; next } # Data lines (any line containing digit, ".", digit): require nonzero mw, mc and nmin, then print the counts, the ratio and the class code. /[0-9]\.[0-9]/ { if (mw == 0) { print "must define mw" > "/dev/stderr"; exit 1; } if (mc == 0) { print "must define mc" > "/dev/stderr"; exit 1; } if (nmin == 0) { print "must define nmin" > "/dev/stderr"; exit 1; } NT = $1 NL = $3 rat = (NL/(NT+2)); mark = classify(NT, NL, rat, nmin, mw, mc) printf " %5d %5.3f %5d %5.3f %6.3f %s %s\n", $1, $2, $3, $4, rat, mark, $5; next } ---------------------------------------------------- compare-freqs \ .bio-j-ecc-tt-1-1.frq \ .bio-j-ecc-nl-1-1.frq \ | compute-count-ratio-new \ -v nmin=5 -v mw=8 -v mc=40 \ | sort +0.0 -0.2r +4 -5nr \ > .bio-j-ecc-tt-nl-1-1-new.cmp cat .bio-j-ecc-tt-nl-1-1-new.cmp \ | print-pattern-classes \ -v rowchars='qHPwco8rekm' \ -v colchars='mwkco8eHPqr' m w k c o 8 e H P q r -- -- -- -- -- -- -- -- -- -- -- q | . . . ?? -- . . . . . . H | ?? . . -- -- ?? ?? . . . ?? P | . . . -? -? ?? . . . . . w | . . . ?? ?? . . . . . . c | ?? . . -- -- -- -- -- ?? +? -? o | -- ?? -? :: :: ++ :: :: ++ :: :: 8 | . . . -- -- ?? ?? ?? +? ++ +? r | . . . :? :? ++ ?? ++ ++ ++ ++ e | . . . :? :: :: ++ :? ++ ++ ++ k | . . . +? +? +? +? . +? ++ ++ m | . . . -- :? :? +? ?? +? ++ ++ Non-breaks: [qHPw]:. .:[mwk] [c]:[co8eHPr] [8]:[co] [m]:[c] "Word" breaks: [8rk]:[8] [8erkm]:[eHPqr] [o]:[8P] [k]:[co] Possible "Syllabe" breaks: all else. Recomputing with mw=5 instead of 8: compare-freqs \ .bio-j-ecc-tt-1-1.frq \ .bio-j-ecc-nl-1-1.frq \ | compute-count-ratio-new \ -v nmin=5 -v mw=5 -v mc=40 \ | sort +0.0 -0.2r +4 -5nr \ > .bio-j-ecc-tt-nl-1-1-new.cmp cat .bio-j-ecc-tt-nl-1-1-new.cmp \ | print-pattern-classes \ -v rowchars='qHPwco8rekm' \ -v colchars='mwkco8eHPqr' m w k c o 8 e H P q r -- -- -- -- -- -- -- -- -- -- -- q | . . . ?? -- . . . . . . H | ?? . . -- -- ?? ?? . . . ?? 
P | . . . -? -? ?? . . . . . w | . . . ?? ?? . . . . . . c | ?? . . -- -- -- -- -- ?? +? -? o | -- ?? -? :: :: :: :: :: ++ :: :: 8 | . . . -- -- ?? ?? ?? +? :? +? e | . . . :? :: :: :? :? ++ ++ ++ r | . . . :? :? ++ ?? ++ ++ ++ ++ k | . . . +? +? +? +? . +? ++ ++ m | . . . -- :? :? +? ?? +? ++ ++ Non-breaks: [qHPw]:. .:[mwk] [c]:[Pr] [8]:[co] [m]:[c] "Word" breaks: [8erkm]:[eHPqr] [8]:[8] [rkm]:[o8] [k]:[c] Possible "Syllabe" breaks: all else (should check digraphs). Overall tetragram frequencies: cat bio-j-ecc-gut.wds \ | tr -d ' \012' \ | enum-ngraphs -v n=4 \ | egrep -v '\*' \ | sed \ -e 's/^\(..\)\(..\)$/\1:\2/g' \ > .bio-j-ecc-gut-tt-2-2.grm cat .bio-j-ecc-gut-tt-2-2.grm \ | egrep -v '[qHPw]:.|.:[mwk]|[c]:[co8eHPr]|[8]:[co]|[m]:[c]' \ | egrep -v '[8rk]:[8]|[8erkm]:[eHPqr]|[o]:[8P]|[k]:[co]' \ | sort | uniq -c | expand \ | compute-freqs \ > .bio-j-ecc-gut-tt-2-2.frq Tetragram frequencies around line breaks, ignoring spaces: cat bio-j-ecc-gut.wds \ | sed -e 's/^\(..\).*\(..\)$/\1\2/g' \ | tr -s '\012' ':' \ | enum-ngraphs -v n=5 \ | egrep -v '\*' \ | egrep '^..:..$' \ > .bio-j-ecc-gut-nl-2-2.grm cat .bio-j-ecc-gut-nl-2-2.grm \ | egrep -v '[qHPw]:.|.:[mwk]|[c]:[co8eHPr]|[8]:[co]|[m]:[c]' \ | egrep -v '[8rk]:[8]|[8erkm]:[eHPqr]|[o]:[8P]|[k]:[co]' \ | sort | uniq -c | expand \ | compute-freqs \ > .bio-j-ecc-gut-nl-2-2.frq Comparisons: compare-freqs \ .bio-j-ecc-gut-tt-2-2.frq \ .bio-j-ecc-gut-nl-2-2.frq \ | compute-count-ratio-new \ -v nmin=5 -v mw=8 -v mc=40 \ | sort +0.0 -0.2r +4 -5nr \ > .bio-j-ecc-gut-tt-nl-2-2-new.cmp cat .bio-j-ecc-gut-tt-nl-2-2-new.cmp \ | print-pattern-classes oc cc 8o 8c oH oP oe or om o8 oq oo ok ow qo qc ro rq Ho Hc eo ec rc e8 eq er ee eH eP r8 rH rP re rr ce cH cP cm co He H8 Hm 8P 8e 8r -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- oo | . ?? . . ?? . ?? . . . . . . . . . ?? . -? -- ?? -? ?? ?? ?? ?? ?? -? ?? . . ?? . . . ?? . . . . ?? . . . . 
qo | . ?? . . ?? . ?? ?? . . . . . . ?? . ?? ?? -- -- ?? -? ?? ?? ?? ?? ?? ?? ?? . . . . . . ?? . . . ?? . . . . . ko | . ?? . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . wo | . . . . . . . . . . . . . . . . ?? . ?? ?? . . . . . . . ?? . . . . . . . . . . . . . . . . . Ho | +? ++ . . :? ?? ?? +? . . . . . . ++ +? :? ?? ?? +? ?? ?? -? ?? . . . ?? ?? ?? . . . . . . . . . . . . . . . Po | . ?? . . . . . . . . . . . . . . ++ . ?? ?? ?? ?? ?? . . . . ?? ?? . ?? . . . . ?? . . . . . . . . . eo | +? ++ . . +? . +? . ?? . . . . . ++ . ++ ?? :? -? ?? -? ?? ?? ?? ?? ?? ?? . ?? . ?? . ?? . . . . . . . . . . . mo | . ?? . . . . . . . . . . . . +? . +? +? -? -? ?? ?? ?? ?? ?? ?? ?? ?? ?? . . . . . . ?? . . . . . ?? . . . ro | +? ?? . . +? . +? . ?? +? . . . . +? . :? ?? ?? :? :? -? ?? ?? ?? ?? ?? -? ?? ?? ?? ?? ?? ?? . ?? . . . ?? . . . . . 8o | ++ :? . . :: ?? :? ?? ?? +? . ?? . . :: ?? ++ ?? ++ ++ ?? :? :? ?? ?? ?? . ?? ?? ?? ?? ?? . ?? ?? ?? +? . . . . . . . . co | +? :? . . :? ?? :? ?? . ?? . ?? ?? . :: ?? :? ?? :? :? -? :? :? ?? ?? ?? ?? -? ?? ?? ?? ?? . ?? . ?? ?? . . . . . . . . oe | ++ -- :? ++ :? ?? :? -? ?? ?? ?? ?? ?? . . . . . . . . . . . . . . . . . . . . . ?? ?? ?? . . . . . . . . om | ++ . :? :? :? ?? -? ?? . ?? ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ?? ?? or | ++ :? . . -? ?? :? -? -? ?? ?? ?? ?? ?? . . . . . . . . . . . . . . . . . . . . -? ?? ?? ?? . . . . . . . ce | +? :? :? :? ?? ?? :? ?? ?? ?? ?? ?? ?? . . . . . . . . . . . . . . . . . . . . . ?? ?? . . . . . . ?? . . Hc | . . . . . . . . . . . . . . +? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . cr | . ?? . . ?? . ?? ?? ?? . ?? ?? . . . . . . . . . . . . . . . . . . . . . . ?? . . . ?? . . . . . . er | . ?? . . . . ?? ?? ?? . . +? . . . . . . . . . . . . . . . . . . . . . . ?? . . . . . . . . . . kr | . . . . . . ?? ?? ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . mr | . . . . . . ?? ?? ?? . . 
. . . . . . . . . . . . . . . . . . . . . . . ?? . . . . . . . . . . rr | . ?? . . . . ?? ?? ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 8e | . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . He | . ?? . . . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . cc | . . . . . . . . . . . . . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ee | . ?? . . . . ?? ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ke | . . . . . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . me | . ?? . . . . ?? . . ?? . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . re | . ?? . . . . . . . . . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Hm | . . . . . . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . cm | . . . . . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 8r | . . . . . . ?? . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Hr | . . . . . . . . ?? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- oc cc 8o 8c oH oP oe or om o8 oq oo ok ow qo qc ro rq Ho Hc eo ec rc e8 eq er ee eH eP r8 rH rP re rr ce cH cP cm co He H8 Hm 8P 8e 8r Note that :oH resembles :cH, could it be a mistreading? 
From this table, the only reasonably certain entries are "Word" boundary: eo:cc eo:ro eo:qo Ho:cc Ho:qo Po:ro 8o:oc 8o:ro 8o:Ho 8o:Hc oe:oc oe:8c om:oc or:oc Non-boundary: oo:Hc qo:Ho qo:Hc oe:cc "Syllabe" boundary: 8o:qo 8o:oH co:qo We could extend these to "don't care" cases as follows: "Word" boundary: [HPerm8c]o:o[crm8] [HPemr]o:(cc|qo|qc) [emr]o:o[HPeqokw] [Pem8o]o:r[oq] 8o:H[oc] Ho:Hc (oe|om|or|ce):oc oe:8c Hc:qo "Syllabe" boundary: [HP8c]o:o[HPeqokw] [8c]o:(cc|qo|qc|ec|rc) Ho:ro eo:Ho ro:(ro|Hc|eo) co:(r[oq]|H[oc]) (om|or|ce):(cc|8o|8c) oe:8o o[em]:oH (oe|or|ce):oe Non-break: ([cekmr8H]r|oo|qo|ko|wo):.. ..:(e[8qreHP]|r[8HPer]|c[eHPmo]|8[Per]) ([HPem]o|oe|or|om|ce):(eo|ec|rc|8o|8c) [r8c]o:8[oc] (oe|om|or|ce):(o[Pr8mqokw]|q[oc]|r[oq]|H[oc]) (ro|Ho):(rq|Ho) (mo|Po):(Ho|Hc) ro:(ec|rc) co:eo eo:Hc om:oe or:oH ce:oH oe:cc cat bio-j-ecc-gut.wds \ | sed -e 's/\(.\)/\1 /g' -e 's/ $//g' \ | split-ecc-by-nl-patterns \ | split-ecc-by-nl-patterns \ | tr -d ' \-' | tr '+:' ' \-' \ > .bio-j-ecc-gut-split.ecc Here is a sample of the result: 8ocHcoe Hok ooHcco-eccco-Hce-8o-ccco-oHccco-qoHcc8o Pccc8o-qoHcc8o-oHomccc8o-qoHor-ccoe-oeccc8o-qoHo Pccc8o Hcc8o-qoHc8o-qoHc8o-qoHc8o-qoHc8o-qoHomoeccc8o rom qoHom qoe Hccoeo romccc8o r-o-eor-ccc8o-oHcc8o-qoHo Pccc8o-r-cccPcco-eccc8o ro 8ce-ccce-cco-Hoeccc8o-qoHok roecccc8o-qoeccc8o-qoe-o-Homccor ro-r-o-eo qoHccc8o-qoeccco-qoHo cccocHcco-qoHomor qoHomoe Hcco-qoe Ho-ro-romccccHcoeo r-oe 8omoecccoe-8omoe qoeo 8o ro 8o Hccc8o Pccc8o-qoHcco-r-o-e-oe-8owccccHco-qoe ecccc8o-qoHcc8oe-oeccc8o qo 8omccccHo qoHco-qoHomcccHo qoHce-8omccc8o-oHce-oeccc8o-oHo-r-o-eok roe Hc8o-oHce-8o roHo-oHo-roHo-r-oe Homoe Hc8o qoHc8o 8o-ccccHo qoHc8o-qoHcc8o-qoHccc8oe-oe qoHcc8o-qoHcc8o-qoHc8o-qoHc8o-qoHcc8oe-8o occc8o-qoHcc8o-qoHcc8o-oe Hcc8o-oHco-Hoe-8o 8ccc8o-qoHc8o-qoHcc8o-qoHcco-qoHcc8o 8or occc8o-cccHo-r-oe-8o-qoHomccHo-roHo-r-oe-8o Ditto, without "-"s: 8ocHcoe Hok ooHccoecccoHce8occcooHcccoqoHcc8o Pccc8oqoHcc8ooHomccc8oqoHorccoeoeccc8oqoHo Pccc8o 
Hcc8oqoHc8oqoHc8oqoHc8oqoHc8oqoHomoeccc8o rom qoHom qoe Hccoeo romccc8o roeorccc8ooHcc8oqoHo Pccc8orcccPccoeccc8o ro 8ceccceccoHoeccc8oqoHok roecccc8oqoeccc8oqoeoHomccor roroeo qoHccc8oqoecccoqoHo cccocHccoqoHomor qoHomoe Hccoqoe HororomccccHcoeo roe 8omoecccoe8omoe qoeo 8o ro 8o Hccc8o Pccc8oqoHccoroeoe8owccccHcoqoe ecccc8oqoHcc8oeoeccc8o qo 8omccccHo qoHcoqoHomcccHo qoHce8omccc8ooHceoeccc8ooHoroeok roe Hc8ooHce8o roHooHoroHoroe Homoe Hc8o qoHc8o 8occccHo qoHc8oqoHcc8oqoHccc8oeoe qoHcc8oqoHcc8oqoHc8oqoHc8oqoHcc8oe8o occc8oqoHcc8oqoHcc8ooe Hcc8ooHcoHoe8o 8ccc8oqoHc8oqoHcc8oqoHccoqoHcc8o 8or occc8occcHoroe8oqoHomccHoroHoroe8o I have split Landini's file into one chunk per page csplit \ --prefix 'chunk-' \ --suffix '%03d.evt' \ - '^# *$' '{*}' and then further edited it manually, splitting each page into homogeneous "textual units" (all normal text, all labels, etc.) The files are L16/fNNN and L16/fNNN.L, where fNNN is the panel number (as in f85r1) and L is the location code within that panel. Files without location code contain general comments about the panel. See L16/README for a detailed description of the files and my edits.