#! /usr/bin/gawk -f
# Extracts significant characters from a text.
# Last edited on 1998-07-12 08:26:49 by stolfi

BEGIN {
  usage = ( \
    "extract-signif-chars \\\n" \
    " [ -v errors=CHARS ] \\\n" \
    " [ -v ignore=CHARS ] \\\n" \
    " [ -v blanks=CHARS ] \\\n" \
    " [ -v normal=CHARS ] \\\n" \
    " < INFILE > OUTFILE" \
  );
  #
  # Extracts significant characters from INFILE.  Outputs a sequence
  # of fixed-format records of the form CLASS STR,
  # with no separation between the fields; where
  # STR is a substring of the input, and CLASS is its class,
  # according to this table:
  #
  #   CLASS = 0: STR contains non-significant chars.
  #   CLASS = 1: STR is a word separator.
  #   CLASS = 2: STR is a paragraph separator.
  #   CLASS = 3: STR contains a single significant char.
  #
  # The string STR has newlines turned into CRs ("\015").
  #
  # Non-printable, non-blank ISO Latin-1 characters are always errors.
  # Letters (plain and accented) and digits are significant by
  # default.  The other printable ISO Latin-1 characters, and
  # LF (#10), FF (#12), SP (#32), and NBSP (#160), are
  # word separators by default.
  #
  # These defaults can be overridden by the following option
  # variables (set with "-v"):
  #
  #   "normal" (string) significant characters.
  #   "errors" (string) invalid input characters.
  #   "ignore" (string) characters to be simply ignored.
  #   "blanks" (string) word separator characters.
  #
  # A string STR of "ignore"s and "blanks" containing at least two
  # newlines (provided they are not "ignore"d and not in "#"-comments)
  # will be a class 2 string.  In any case a class 2 string will be
  # output before and after the whole text.
  #
  # Any other string STR of consecutive "ignore" and "blank"
  # characters with at least one "blank" is interpreted as a word
  # break (class 1).

  # "abort" is the pending exit status; negative means "no error so far".
  abort = -1;

  # LF, FF, SP and NBSP.  Built with sprintf so that every blank is
  # keyed exactly like the sprintf("%c", i) characters that
  # check_options() validates; in a single-byte locale this is the
  # same string as "\012\014\040\240".
  iso_blanks = sprintf("%c%c%c%c", 10, 12, 32, 160);

  check_options();
  init_buf();
}

# We maintain a string buffer "buf" with all non-significant
# characters seen since the last significant char.
# Buffer state: "buf" accumulates the input text seen since the last
# significant character was output.  The variable "numLines" counts
# the newline characters in "buf" that were classified as blanks, and
# "numBlanks" counts all "blanks" characters.  These are cooked at
# begin-text and end-text to force a paragraph break output.

function init_buf() {
  # Empties the buffer.  Starting with "numLines = 2" makes the first
  # flush_buf() emit a class 2 (paragraph break) record.
  buf = "";
  numBlanks = 0;
  numLines = 2;
}

function flush_buf(   cl) {
  # Writes the buffered text as one record "CLASS STR" and resets the
  # buffer and its counters.  CLASS is 2 if more than one newline was
  # counted, else 1 if any blank was counted, else 0.  A class 0
  # record with an empty string is not written at all.
  if (numLines > 1) {
    cl = 2;
  } else if (numBlanks > 0) {
    cl = 1;
  } else {
    cl = 0;
  }
  if ((cl != 0) || (buf != "")) {
    printf "%d%s\n", cl, encode(buf);
  }
  buf = "";
  # "paraBrk" is not read anywhere in the visible code; the reset is
  # kept in case other code depends on it.
  paraBrk = 0;
  numBlanks = 0;
  numLines = 0;
}

function encode(str) {
  # Returns "str" with every newline replaced by CR ("\015"), so that
  # each output record stays on a single line.
  gsub(/\n/, "\015", str);
  return str;
}

# processing an input character:

function process_char(c,   cl) {
  # Dispatches character "c" according to "class[c]":
  #   -1: reports an error (which aborts the run);
  #    0: appends "c" to the buffer;
  #    1: appends "c" to the buffer and counts it as a blank
  #       (and as a line break if it is a newline);
  #   otherwise: flushes the buffer and writes a class 3 record.
  cl = class[c];
  if (cl == -1) {
    error(("line " NR ": invalid character \"" c "\""));
  } else if (cl == 0) {
    buf = (buf c);
  } else if (cl == 1) {
    buf = (buf c);
    numBlanks++;
    if (c == "\n") { numLines++; }
  } else {
    flush_buf();
    printf "3%c\n", c;
  }
}

# Record processing:

# "#"-comment lines go to the buffer verbatim; their characters are
# not classified, so they never count as blanks or line breaks.
/^ *[#]/ {
  if (abort >= 0) { exit(abort); }
  buf = (buf $0 "\n");
  next;
}

# All other lines: an optional leading locator is copied to the buffer
# unclassified, and the rest of the line is processed char by char.
// {
  if (abort >= 0) { exit(abort); }
  # NOTE(review): the pattern in the collapsed source read /^]*> */,
  # which looks like /^<[^>]*> */ with "<[^>" stripped as if it were
  # a markup tag.  The "<...>" locator form is assumed here; confirm
  # against the actual input format.
  if (match($0, /^<[^>]*> */)) {
    loc = substr($0, 1, RLENGTH);
    lin = substr($0, RLENGTH+1);
  } else {
    loc = "";
    lin = $0;
  }
  buf = (buf loc);
  n = length(lin);
  for (i = 1; i <= n; i++) {
    process_char(substr(lin, i, 1));
  }
  process_char("\n");
  next;
}

# Flush any buffered input (forcing a paragraph break):

END {
  if (abort >= 0) { exit(abort); }
  numLines = 2;
  flush_buf();
}

function check_options(   i,c,mk) {
  # Analyzes/defaults the option variables
  #
  #   "normal" "errors" "ignore" "blanks"
  #
  # Defines the global variable
  #
  #   "class"
  #
  # Calls error() (which aborts the run) if the options are
  # inconsistent.

  # "mk[c]" is number of explicit definitions for character "c":
  split("", mk);

  # "class[c]" is the input class of character "c".
  #   -1 - character is an error.
  #    0 - character is to be ignored.
  #    1 - character is a blank.
  #    3 - character is significant.
  split("", class);

  # --- default character classes --------------------------------
  # The defaults are derived from the ISO Latin-1 character codes,
  # using the same sprintf("%c", i) keys as the consistency check
  # below, so they do not depend on how non-ASCII literals in this
  # script file happen to be encoded.
  for (i = 0; i < 256; i++) {
    c = sprintf("%c", i);
    if ((i < 10) || (i == 11) || ((i > 12) && (i < 32)) || ((i > 126) && (i < 160))) {
      # Non-printable, non-blank characters are errors.
      # Note: sets "mk[c]" to prevent redefinition.
      class[c] = -1;
      mk[c] += 1;
    } else if (((i >= 48) && (i <= 57)) || ((i >= 65) && (i <= 90)) || ((i >= 97) && (i <= 122)) || ((i >= 192) && (i != 215) && (i != 247))) {
      # Digits, plain letters, and accented letters (codes 192-255
      # except the multiplication and division signs).
      class[c] = 3;
    } else {
      # LF, FF, SP, NBSP and all remaining printable characters
      # (punctuation and symbols) are word separators.
      class[c] = 1;
    }
  }

  # --- "normal" --------------------------------------------------
  for (i = 1; i <= length(normal); i++) {
    c = substr(normal, i, 1);
    # At this point class -1 marks exactly the non-printable defaults.
    if (((c in class) && (class[c] == -1)) || (index(iso_blanks, c) != 0)) {
      error(("normal characters must be printable and non-blank"));
    }
    class[c] = 3;
    mk[c] += 1;
  }

  # --- process "errors" option ----------------------------------------
  for (i = 1; i <= length(errors); i++) {
    c = substr(errors, i, 1);
    class[c] = -1;
    mk[c] += 1;
  }

  # --- process "ignore" option ----------------------------------------
  for (i = 1; i <= length(ignore); i++) {
    c = substr(ignore, i, 1);
    class[c] = 0;
    mk[c] += 1;
  }

  # --- process "blanks" option ---------------------------------------
  for (i = 1; i <= length(blanks); i++) {
    c = substr(blanks, i, 1);
    class[c] = 1;
    mk[c] += 1;
  }

  # --- consistency checking ----------------------------------------
  # Check that every character has a class and that no character was
  # explicitly defined more than once.  The "not defined" test is a
  # safety net: the default loop above already covers all 256 codes.
  for (i = 0; i < 256; i++) {
    c = sprintf("%c", i);
    if (! (c in class)) {
      error(("character \"\\" sprintf("%03o", i) "\" not defined"));
    } else if (mk[c] > 1) {
      error(("character \"\\" sprintf("%03o", i) "\" multiply defined"));
    }
  }
}

function error(msg) {
  # Prints "msg" to the standard error stream, records a failure
  # status in "abort" for the rules that check it, and terminates
  # with exit status 1.
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}