#! /bin/sed -f
# Last edited on 1998-07-16 02:46:20 by stolfi
#
# Factors pinyin words with numeric tones into
# their constituents C{V:T}F where
#
#   C is a consonant (single or digraph) or '@' for none
#   V is a vowel group
#   T is a numeric tone or '0' for neutral
#   F is a final 'r' 'n' 'ng' or '@' for none.
#
# Examples:
#
#   zhong1 -> zh{o:1}ng      ma   -> m{a:0}@
#   yi1    -> @{i:1}@        you3 -> @{iou:3}@
#   liu2   -> l{iou:2}@      dun1 -> d{ue:1}n
#
# Only the tone digits 0-4 are recognized; any other digit is left
# where it stands.
#
# Every command must stay on its own line: sed treats '#' as starting
# a comment that runs to the end of the line, so joining lines would
# silently disable the commands that follow a comment.
#
# The order of the substitutions matters: later rules rely on the
# '{', '}' and '@' markers inserted by earlier ones.
#
# Ignore comments (branch to the end of the script, so that lines
# starting with '#' are printed unchanged):
/^[#]/b
#
# Provide a default "e" vowel for isolated "r".  The four rules cover
# an "r" that is the whole line, at the start of the line, at the end
# of the line, and in the middle of the line.  A neighbouring lowercase
# letter or "ü" means the "r" belongs to a word and is left alone:
s/^r$/er/
s/^r\([^a-zü]\)/er\1/
s/\([^a-zü]\)r$/\1er/g
s/\([^a-zü]\)r\([^a-zü]\)/\1er\2/g
#
# Mark off the vowel group (a vowel, "y" or "w", followed by any
# further vowels) by wrapping it in braces:
s/\([aeiouüyw][aeiouü]*\)/{\1}/g
#
# Provide '@' for empty consonant (vowel group at the start of the
# line, or preceded by something that is not a lowercase letter):
s/^{/@{/
s/\([^@a-z]\){/\1@{/g
#
# Provide '@' for empty final (vowel group at the end of the line,
# or followed by something that is not a lowercase letter):
s/}$/}@/
s/}\([^@a-z]\)/}@\1/g
#
# Unravel cryptic "ü"s (written as "u" after "j", "q", "x", and as
# "yu" when there is no consonant):
s/\([jqx]\){u/\1{ü/g
s/@{yu/@{ü/g
#
# Normalize "y" "w" (the "yi"/"wu" rules must run before the bare
# "y"/"w" rules so that "yi" becomes "i" rather than "ii", and "wu"
# becomes "u" rather than "uu"):
s/@{yi/@{i/g
s/@{y/@{i/g
s/@{wu/@{u/g
s/@{w/@{u/g
#
# Unravel contracted "iu" "ui" "un" (only after a real consonant,
# i.e. when the character before the brace is not '@'):
s/\([^@]\){iu}/\1{iou}/g
s/\([^@]\){ui}/\1{uei}/g
s/\([^@]\){u}n/\1{ue}n/g
#
# Move the tone closer to the vowel group (the digit hops over the
# final and lands inside the braces, after a ':'):
s/}\([nrg@]*\)\([0-4]\)/:\2}\1/g
#
# Provide a default zero tone (for any vowel group that did not
# receive a tone digit in the previous step):
s/\([^0-4]\)}/\1:0}/g
#