#! /usr/bin/python3
# Last edited on 2026-02-22 16:14:53 by stolfi

from math import fabs, exp, log, inf, isfinite, floor
import os, sys, re
from sys import stdin as inp, stdout as out, stderr as err
import argparser

PROG_NAME = "compute_freqs_from_counts"
PROG_COPYRIGHT = "Copyright © 2026 by the State University of Campinas"

# Command-line synopsis.  Each shell-style "\" continuation is followed by
# a newline so that the usage message prints one option per line.
PROG_HELP = (
    "  " + PROG_NAME + " \\\n"
    "    -encoding { utf | bytes } \\\n"
    "    [ -total STRING ] \\\n"
    + argparser.help_info_HELP + " \\\n"
    "    < INFILE.wct > OUTFILE.wfr"
)

# Full manual text, shown by the documentation options of {argparser}.
PROG_INFO = (
    "SYNOPSIS\n"
    + PROG_HELP + "\n"
    "\n"
    "DESCRIPTION\n"
    "  Reads a file of {COUNT} {ITEM} pairs, as produced by \"uniq -c\".\n"
    "  Outputs a similar file with {COUNT} {FREQ} {ITEM} lines, where {FREQ}\n"
    "  is the fraction of {COUNT} relative to the total of all {COUNT}s.\n"
    "\n"
    "  Any #-comments and blank lines are passed through.\n"
    "\n"
    "OPTIONS\n"
    "  -encoding {ENC}\n"
    "    This mandatory argument specifies the input and output file\n"
    "    encoding, which affects the {ITEM}s: {ENC=\"utf\"} means Unicode\n"
    "    in UTF-8, and {ENC=\"bytes\"} means one byte per character\n"
    "    (e.g. ISO-Latin-1).\n"
    "\n"
    "  -total {TOTSTR}\n"
    "    If this optional keyword is given, also outputs an extra line\n"
    "    with {ITEM=TOTSTR}, {FREQ = 1.0}, and {COUNT} = total item count.\n"
    "\n"
    "DOCUMENTATION OPTIONS\n"
    + argparser.help_info_INFO + "\n"
    "\n"
    "SEE ALSO\n"
    "  Look deep into your soul; what do you see?\n"
    "\n"
    "AUTHOR\n"
    "  Created 2026-02-22 by Jorge Stolfi, IC-UNICAMP.\n"
    "\n"
    "MODIFICATION HISTORY\n"
    "  All entries by the author above unless indicated otherwise.\n"
    "  2025-07-30 Created.\n"
    "\n"
    "WARRANTY\n"
    "  " + argparser.help_info_NO_WARRANTY + "\n"
    "\n"
    "RIGHTS\n"
    "  " + PROG_COPYRIGHT + ".\n"
    "\n"
    "  " + argparser.help_info_STANDARD_RIGHTS
)

# Maps each accepted "-encoding" value to the Python codec used for
# both standard input and standard output.
ENCODINGS = {"utf": "utf-8", "bytes": "iso-8859-1"}

# A blank line, or a line whose first non-space character is '#'.
COMMENT_LINE_RE = re.compile(r"[ ]*([#].*|$)")

# A data line: optional spaces, a non-negative decimal {COUNT} (integer or
# fractional), one or more spaces, and an {ITEM} without embedded spaces.
DATA_LINE_RE = re.compile(r"[ ]*([0-9]+|[0-9]+[.][0-9]*|[.][0-9]+)[ ]+([^ ]+)")


def main():
    """Run the program: read counts from stdin, write counts and
    frequencies to stdout, as described in {PROG_INFO}."""
    o = parse_options()

    codec = ENCODINGS[o['encoding']]
    inp.reconfigure(encoding=codec)
    out.reconfigure(encoding=codec)

    # The input data is stored in the list {data}. Each element is a pair
    # {(COUNT,ITEM)} except that {COUNT} is {None} for a comment or blank
    # line and {ITEM} is then the line (without the end-of-line).
    # The returned {tot_item} (number of data lines) is not used here.
    data, max_count, tot_count, tot_item, prec_count = read_data(inp)

    # The count column must be wide enough for the largest number that
    # will be printed in it, which is the total if a total line is wanted.
    want_total = o['total'] is not None
    widest_count = tot_count if want_total else max_count
    ct_fmt = choose_count_format(widest_count, prec_count)
    fr_fmt = "%10.8f"

    # Avoid division by zero when there is no data or all counts are zero.
    den = tot_count if tot_count > 0 else 1

    for ct, it in data:
        if ct is None:
            # Comment or blank line: pass through unchanged.
            out.write(it)
            out.write("\n")
        else:
            output_count_freq_line(out, ct, ct_fmt, ct / den, fr_fmt, it)

    if want_total:
        output_count_freq_line(out, tot_count, ct_fmt, 1.0, fr_fmt, o['total'])
    out.flush()
    return
    # ----------------------------------------------------------------------


def choose_count_format(max_count, prec_count):
    """Return a '%'-style format for the {COUNT} column.

    The format has {prec_count} decimal fraction digits (an integer "%d"
    format if {prec_count} is zero) and a field width just large enough
    to print {max_count}."""
    if prec_count == 0:
        width = len("%d" % max_count)
        return f"%{width}d"
    width = len(f"%.{prec_count}f" % max_count)
    return f"%{width}.{prec_count}f"
    # ----------------------------------------------------------------------


def output_count_freq_line(wr, ct, ct_fmt, fr, fr_fmt, item):
    """Write to {wr} one output line with the count {ct} formatted with
    {ct_fmt}, the frequency {fr} formatted with {fr_fmt}, and {item},
    separated by single spaces."""
    wr.write(f"{ct_fmt % ct} {fr_fmt % fr} {item}\n")
    return
    # ----------------------------------------------------------------------


def parse_count(ct_str):
    """Convert the decimal numeral {ct_str} to a number.

    Returns a pair {(count, fr_digs)}.  Trailing zero decimals are
    discarded; if no decimals remain, {count} is an {int} and {fr_digs}
    is 0.  Otherwise {count} is a {float} and {fr_digs} is the number of
    fraction digits that remained.  Aborts the program if the value is
    too large to be represented as a finite float."""
    if "." in ct_str:
        # Nominally fractional: remove trailing zero decimals, then
        # the '.' if no decimals are left.
        ct_str = ct_str.rstrip("0")
        if ct_str.endswith("."):
            ct_str = ct_str[:-1]
        if ct_str == "":
            # The numeral was like ".000", which has no integer digits.
            ct_str = "0"

    dot_pos = ct_str.find(".")
    if dot_pos < 0:
        return int(ct_str), 0

    # Definitely fractional:
    fr_digs = len(ct_str) - dot_pos - 1
    assert fr_digs > 0
    count = float(ct_str)
    if not isfinite(count):
        prog_error(f"invalid count {ct_str = !r}")
    return count, fr_digs
    # ----------------------------------------------------------------------


def read_data(rd):
    """Read all lines from the text stream {rd}.

    Returns a tuple {(data, max_count, tot_count, tot_item, prec_count)}
    where

      {data} is a list with one pair per input line: {(COUNT, ITEM)} for
      a data line, or {(None, LINE)} for a comment or blank line, where
      {LINE} is the line stripped of trailing whitespace;

      {max_count} and {tot_count} are the maximum and the sum of all
      {COUNT}s (both 0 if there are no data lines);

      {tot_item} is the number of data lines;

      {prec_count} is the largest number of significant fraction digits
      seen in any {COUNT} (0 if all are integers).

    Aborts the program with a message if a line is malformed."""
    data = []
    tot_item = 0
    tot_count = 0
    max_count = 0
    prec_count = 0
    for nread, raw_line in enumerate(rd, start=1):
        line = raw_line.rstrip()

        if COMMENT_LINE_RE.match(line) is not None:
            data.append((None, line))
            continue

        m = DATA_LINE_RE.fullmatch(line)
        if m is None:
            file_line_error("stdin", nread, "invalid line format", line)

        count, fr_digs = parse_count(m.group(1))
        prec_count = max(prec_count, fr_digs)
        data.append((count, m.group(2)))
        tot_item += 1
        tot_count += count
        max_count = max(max_count, count)

    return data, max_count, tot_count, tot_item, prec_count
    # ----------------------------------------------------------------------


def parse_options():
    """Parse the command line arguments.

    Returns a dict {o} where {o['encoding']} is the value of the
    mandatory "-encoding" option (either "utf" or "bytes") and
    {o['total']} is the value of the "-total" option, or {None} if
    that option was not given."""
    pp = argparser.ArgParser(sys.argv, sys.stderr, PROG_HELP, PROG_INFO)

    # Defaults:
    o = dict()
    o['encoding'] = None
    o['total'] = None

    pp.get_keyword("-encoding")
    enc = pp.get_next()
    if enc not in ENCODINGS:
        prog_error(f"invalid \"-encoding\" value {enc!r}: must be \"utf\" or \"bytes\"")
    o['encoding'] = enc

    if pp.keyword_present("-total"):
        o['total'] = pp.get_next()

    pp.skip_parsed()
    pp.finish()
    return o
    # ----------------------------------------------------------------------


def prog_error(msg):
    """Write the fatal error message {msg} to stderr and exit with
    status 1."""
    err.write(f"{PROG_NAME}: ** {msg}\n")
    sys.exit(1)
    # ----------------------------------------------------------------------


def file_line_error(fname, nline, msg, line):
    """Write to stderr the fatal error message {msg} for line number
    {nline} of file {fname}, followed by the offending {line} if it is
    not {None}, and exit with status 1."""
    err.write(f"{fname}:{nline}: ** {msg}\n")
    if line is not None:
        err.write(f"  [[{line}]]\n")
    sys.exit(1)
    # ----------------------------------------------------------------------


if __name__ == "__main__":
    main()