#! /usr/bin/gawk
# Last edited on 2026-01-12 17:16:12 by stolfi

# Common functions for scripts of Notes/077.
# To be included in other gawk scripts.

function extract_words(raw_lin,   lin)
  { # Returns the words of the text line {raw_lin} as a single string,
    # in lower case, separated by single blanks.
    #
    # Specifically: removes the leading locus ID (a "<...>" tag made of
    # letters, digits, '.' and ';', plus any blanks that follow it),
    # converts the rest to lower case, replaces each string of one or
    # more punctuation chars (any of "=.,' ()") by a single ' ', and
    # removes the leading and trailing blanks that result.
    #
    # Returns the empty string if {raw_lin} has no words.
    #
    # The parameter {lin} is a local work variable; callers must pass
    # only {raw_lin}.  Declaring it here keeps this function from
    # clobbering a global {lin} of the including script.

    lin = raw_lin;

    # Remove the locus ID, if present at the start of the line:
    sub(/^[<][0-9A-Za-z.;]+[>][ ]*/, "", lin);

    lin = tolower(lin);

    # Blank is itself in the punctuation class, so this step also
    # squeezes every run of blanks down to exactly one blank:
    gsub(/[=.,' ()]+/, " ", lin);

    # At most one blank can remain at each end:
    sub(/^[ ]+/, "", lin);
    sub(/[ ]+$/, "", lin);

    return lin;
  }