#! /usr/bin/gawk -f
# Last edited on 2022-11-17 13:05:18 by stolfi

# Reads one or more input files containing lists of Google Scholar hits
# marked up as HTML list items.  Splits each entry into a separate file
# "{odir}/{year}/{key}.bib" where the {key} is built from the author
# names and year.
#
# Must be executed with "-f ~/lib/read_table.gawk" (which provides
# {read_table}) and with the variable {odir} defined (e.g. "-v odir=out").
#
# Expects a file called "key-map.tbl" that maps synthetic
# bib keys to the keys used in {stolfi,others}.bib.
# See comments in {read_table} for the file format.
#
# The byte-wise UTF-8 decoding in {texify_unicode} presumably requires
# that gawk treat strings as bytes (e.g. "LC_ALL=C" or "gawk -b") --
# TODO confirm how the script is actually invoked.
#
# NOTE(review): in the copy of this file that was reviewed, literal HTML
# tags inside several patterns and messages of the main rules had been
# lost.  The tags "<li>", "<a href=...>", "<i>" and "</li>" below were
# reconstructed from the surviving closing-tag patterns and from the way
# the captured fields are used; check them against the pristine script
# and against real input before relying on them.

BEGIN {
  abort = -1;   # Exit status set by the error functions; negative means no error.
  if (odir == "") { arg_error("must define {odir}"); }

  ifile = "";   # Previous input file name.

  # Used to convert numbers to "aa", "ab", etc.  (The letter "w" was
  # missing from this alphabet, which corrupted suffixes from "aw" on.)
  alf = "abcdefghijklmnopqrstuvwxyz";

  split("", kc);  # The number of times bare key {k} occurred is {kc[k]}.

  # Read the key mapping table {key_map}:
  tbl_fname = "key-map.tbl";
  printf "reading %s ...\n", tbl_fname > "/dev/stderr";
  split("", key_map);
  read_table(tbl_fname, 0, key_map, 1);
  n_mapped = 0;   # Counts keys that were actually mapped.

  # Initialize the entry data:
  clear_entry();
}

# Stop processing input as soon as an error has been flagged:
(abort >= 0) { exit(abort); }

(FILENAME != ifile) {
  # Report input file name change:
  printf "== %s ==\n", FILENAME > "/dev/stderr";
  ifile = FILENAME;
}

# General cleanup, applied to every input line:
{
  # Remove boldface markup:
  gsub(/<[\/]?b>/, "", $0);

  # Replace HTML character entities.  "&amp;" must be decoded last so
  # that, e.g., "&amp;lt;" is not decoded twice.
  gsub(/[&]lt;/, "<", $0);
  gsub(/[&]gt;/, ">", $0);
  gsub(/[&]qt;/, "'", $0);
  gsub(/[&][#]39;/, "'", $0);
  # In a {gsub} replacement a bare "&" stands for the matched text, so
  # the ampersand must be escaped to obtain a literal "&":
  gsub(/[&]amp;/, "\\&", $0);

  # Remove a final "</p>" and trailing blanks.
  # NOTE(review): the second pattern may originally have named another
  # trailing tag that was lost from the reviewed copy -- confirm.
  gsub(/<[\/]p> *$/, "", $0);
  gsub(/ *$/, "", $0);
}

# Start of an entry: the list item holding the URL and the title.
/^ *<li>/ {
  if (ti != "") { data_error("missing </li> at end of entry"); }
  # NOTE(review): reconstructed pattern; field 1 is the URL, field 2 the
  # title.  Something else may have stood between "<li>" and "<a".
  if (! match($0, /^ *<li> *<a href="(.*)"> *(.*)<[\/]a> *$/, fld)) {
    data_error("bad <li> line format");
  }
  ur = cleanup_url(fld[1]);
  ti = cleanup_title(fld[2]);
  next;
}

# End of an entry: write it out and reset the entry variables.
/^ *<[\/]li>/ {
  if (ti == "") { data_error("spurious </li> line"); }
  dump_entry();
  clear_entry();
  next;
}

# Any other line inside an entry: first the author line, then quotes.
(ti != "") {
  if (au == "") {
    # Assume it is the author line:
    if (match($0, /^ *<i> *(.*)[ ][-][ ] *(.*)[,]? *([12][0129][0-9][0-9]) *[ ][-][ ] *(.*) *<[\/]i> *$/, fld)) {
      # Normal Google Scholar hit:
      au = cleanup_authors(fld[1]);
      jn = cleanup_journal(fld[2]);
      yr = cleanup_year(fld[3]);
      sr = cleanup_source(fld[4]);
    } else if (match($0, /^ *<i> *(.*)[ ][-][ ] *(.*) *<[\/]i> *$/, fld)) {
      # Incomplete hit (no journal or year):
      au = cleanup_authors(fld[1]);
      jn = "";
      yr = "2022";  # Just to have a valid year.
      sr = cleanup_source(fld[2]);
    } else {
      data_error("bad author line format");
    }
  } else {
    # Assume quote line:
    qt = append_quote(qt, $0);
  }
  if (match($0, /<[\/]li>/)) { data_error("</li> mark not on separate line"); }
}

END {
  if (abort >= 0) { exit(abort); }
  if (ti != "") { data_error("missing </li> at end of entry"); }
  printf "%d keys were remapped\n", n_mapped > "/dev/stderr";
}

function clear_entry() {
  # Resets the global variables that describe the entry being parsed.
  ti = "";  # Title.
  au = "";  # Author list with "and" as in Bibtex.
  jn = "";  # Journal name, book title, etc.
  yr = "";  # Year.
  sr = "";  # Source of Google Scholar data.
  ur = "";  # URL as per Google Scholar.
  qt = "";  # Quotes from paper.
}

function bib_trim_blanks(x) {
  # Returns {x} without leading and trailing blanks.
  gsub(/^[ ]+/, "", x);
  gsub(/[ ]+$/, "", x);
  return x;
}

function bib_shell_quote(x) {
  # Returns {x} wrapped in single quotes so that the shell takes it as
  # one literal word; embedded single quotes are written as '\''.
  gsub(/'/, "'\\''", x);
  return ("'" x "'");
}

function cleanup_url(x) {
  # Returns the URL {x} with "&amp;" decoded to "&".
  # gsub(/[?]casa_token=[^&]*[&]/, "?", x);
  # gsub(/[?]casa_token=[^&]*$/, "", x);
  gsub(/[&]amp;/, "\\&", x);  # "\\&" yields a literal "&"; a bare "&" would be a no-op.
  return x;
}

function cleanup_title(x) {
  # Returns the title {x} trimmed, with ad-hoc fixes for specific
  # entries whose title was garbled in the listing.
  x = bib_trim_blanks(x);
  # https://www.morganclaypool.com/doi/pdf/10.2200/S00983ED1V01Y202001MAS032
  gsub(/2[.]5.*INTERVAL RECIPROCAL/, "Affine Arithmetic Based Solution of Uncertain Static and Dynamic Problems", x);
  # http://repositorio.ufjf.br/jspui/bitstream/ufjf/2339/1/biancamariacostaaraujo.pdf
  gsub(/MESTRADO EM ENGENHARIA EL.*TRICA/, "Aritméticas Intervalares Aplicadas à Solução do Problema de Fluxo de Potência via Equações de Injeção de Corrente", x);
  return x;
}

function cleanup_authors(x) {
  # Returns the author list {x} trimmed, with ad-hoc fixes for specific
  # entries whose author field was garbled in the listing.
  x = bib_trim_blanks(x);
  # https://www.morganclaypool.com/doi/pdf/10.2200/S00983ED1V01Y202001MAS032
  gsub(/^D.* .*x.* Affine.*$/, "S Chakraverty, S Rout", x);
  # http://repositorio.ufjf.br/jspui/bitstream/ufjf/2339/1/biancamariacostaaraujo.pdf
  gsub(/^AIA.*CORRENTE.*$/, "Bianca Maria Costa Araújo", x);
  return x;
}

function cleanup_journal(x) {
  # Returns the journal name {x} trimmed.
  return bib_trim_blanks(x);
}

function cleanup_year(x) {
  # Returns {x} unchanged if it is a year in 1800..2022; otherwise
  # reports a data error (which terminates the program).
  if (x !~ /^(1[89][0-9][0-9]|20([01][0-9]|2[012]))$/) {
    data_error("invalid year \"" x "\"");
  }
  return x;
}

function cleanup_source(x) {
  # Returns the source field {x} trimmed.
  return bib_trim_blanks(x);
}

function append_quote(x,y,   z) {
  # Returns the quote text {x} with the line {y} appended, separated by
  # one blank.  The line {y} is trimmed and its runs of blanks are
  # squeezed first; an empty {y} leaves {x} unchanged.
  gsub(/^[ ]+/, "", y);
  gsub(/[ ]+$/, "", y);
  gsub(/[ ][ ]+/, " ", y);
  if (y == "") { return x; }
  if (x != "") { x = (x " "); }
  z = (x y);
  return z;
}

function dump_entry(   ydir,fname,txau,key) {
  # Writes the current entry (global {ti,au,jn,yr,ur,qt}) as a Bibtex
  # record to the file "{odir}/{yr}/{key}.bib", creating the year
  # directory if needed.
  txau = texify_authors(au);
  key = make_key(txau, yr);
  ydir = (odir "/" yr);
  # Quote the path so that blanks or shell metacharacters in {odir}
  # cannot break (or hijack) the command:
  if (system("mkdir -p " bib_shell_quote(ydir)) != 0) {
    arg_error("cannot create directory \"" ydir "\"");
  }
  fname = (ydir "/" key ".bib");

  # The file is truncated by the first ">" only; later prints append.
  printf "@entry{%s,\n", key > fname;
  printf " author = {%s},\n", txau > fname;
  printf " title = {%s},\n", texify_text(ti) > fname;
  printf " journal = {%s},\n", texify_text(jn) > fname;
  printf " volume = {},\n" > fname;
  printf " number = {},\n" > fname;
  printf " pages = {},\n" > fname;
  printf " year = %s,\n", yr > fname;
  printf " month = ,\n" > fname;
  printf " doi = {},\n" > fname;
  printf " comment = {},\n" > fname;
  printf " abstract = {},\n" > fname;
  printf " url = {{\\url{%s}}},\n", ur > fname;
  printf " quotes = {%s}\n", texify_text(qt) > fname;
  printf "}\n" > fname;
  close(fname);
}

function texify_text(x) {
  # Returns {x} with accented and special characters replaced by TeX
  # macros, characters beyond U+00FF replaced by "\uc{HHHH}" (see
  # {texify_unicode}), and "&" and "%" protected.  Reports a data error
  # if any non-ASCII character is left over.
  x = texify_unicode(x);

  gsub(/à/, "{\\`a}", x);
  gsub(/á/, "{\\'a}", x);
  gsub(/ä/, "{\\\"a}", x);
  gsub(/ã/, "{\\~a}", x);
  gsub(/â/, "{\\^a}", x);
  gsub(/é/, "{\\'e}", x);
  gsub(/è/, "{\\`e}", x);
  gsub(/ë/, "{\\\"e}", x);
  gsub(/ê/, "{\\^e}", x);
  gsub(/í/, "{\\'\\i}", x);
  gsub(/ì/, "{\\`\\i}", x);
  gsub(/î/, "{\\^\\i}", x);
  gsub(/ï/, "{\\\"\\i}", x);
  gsub(/ó/, "{\\'o}", x);
  gsub(/ò/, "{\\`o}", x);
  gsub(/ö/, "{\\\"o}", x);
  gsub(/õ/, "{\\~o}", x);
  gsub(/ô/, "{\\^o}", x);
  gsub(/ø/, "{\\o}", x);
  gsub(/ú/, "{\\'u}", x);
  gsub(/ù/, "{\\`u}", x);
  gsub(/ü/, "{\\\"u}", x);
  gsub(/û/, "{\\^u}", x);   # Was missing although the uppercase form is handled.
  gsub(/ý/, "{\\'y}", x);
  gsub(/ÿ/, "{\\\"y}", x);
  gsub(/ç/, "{\\c{c}}", x);
  gsub(/ñ/, "{\\~n}", x);
  gsub(/æ/, "{\\ae}", x);
  gsub(/ß/, "{\\ss}", x);

  gsub(/Á/, "{\\'A}", x);
  gsub(/À/, "{\\`A}", x);
  gsub(/Ä/, "{\\\"A}", x);
  gsub(/Ã/, "{\\~A}", x);
  gsub(/Â/, "{\\^A}", x);
  gsub(/É/, "{\\'E}", x);
  gsub(/È/, "{\\`E}", x);
  gsub(/Ë/, "{\\\"E}", x);
  gsub(/Ê/, "{\\^E}", x);
  # NOTE(review): "\I" is not a standard TeX macro; plain "I" under the
  # accent is probably what is wanted.  Left unchanged because
  # {make_key} looks for "\I" and a change could alter generated keys.
  gsub(/Í/, "{\\'\\I}", x);
  gsub(/Ì/, "{\\`\\I}", x);
  gsub(/Î/, "{\\^\\I}", x);
  gsub(/Ï/, "{\\\"\\I}", x);
  gsub(/Ó/, "{\\'O}", x);
  gsub(/Ò/, "{\\`O}", x);
  gsub(/Ö/, "{\\\"O}", x);
  gsub(/Õ/, "{\\~O}", x);
  gsub(/Ô/, "{\\^O}", x);
  gsub(/Ø/, "{\\O}", x);
  gsub(/Ú/, "{\\'U}", x);
  gsub(/Ù/, "{\\`U}", x);
  gsub(/Ü/, "{\\\"U}", x);
  gsub(/Û/, "{\\^U}", x);
  gsub(/Ý/, "{\\'Y}", x);
  gsub(/Ç/, "{\\c{C}}", x);  # The cedilla macro is "\c" (was "\C").
  gsub(/Ñ/, "{\\~N}", x);
  gsub(/Æ/, "{\\AE}", x);

  gsub(/·/, "{\\cdot}", x);
  gsub(/×/, "{\\times}", x);
  gsub(/±/, "{\\pm}", x);

  gsub(/[\\]uc\{0163\}/, "{\\k{t}}", x);
  gsub(/[\\]uc\{0160\}/, "{\\v{S}}", x);

  # NOTE(review): meant for the non-breaking space; in byte mode U+00A0
  # is the pair "\302\240", so this leaves a stray "\302" that trips the
  # ASCII check below -- confirm the intended locale.
  gsub(/[\240]/, " ", x);

  # Protect every "&" that is not already preceded by a backslash
  # (assumes text mode).  In the {gensub} replacement "\\\\\\&" yields a
  # literal "\&".  The loop is needed for adjacent ampersands, since a
  # match consumes the character before the "&".
  while (x ~ /(^|[^\\])[&]/) {
    x = gensub(/(^|[^\\])[&]/, "\\1{\\\\\\&}", "g", x);
  }

  gsub(/[%]/, "{\\%}", x);  # Protects "%" from being TeX comment.

  if (x !~ /^[\040-\176]*$/) {
    data_error("non-ascii character in TEX string \"" x "\"");
  }
  return x;
}

function texify_unicode(x,   n,k,i,ck,ci,m,h,y) {
  # Returns {x} with every UTF-8 encoded character beyond U+00FF
  # replaced by "\uc{HHHH}", where {HHHH} is the hex code point as
  # returned by {hexcode}.  ASCII characters and two-byte characters
  # with lead byte up to "\303" (U+0080 to U+00FF) are copied unchanged.
  n = length(x);
  k = 1;
  y = "";
  while (k <= n) {
    ck = substr(x,k,1);
    # Determine the number of bytes in the next utf-8 encoded char:
    if ((ck >= "\300") && (ck <= "\337")) {
      m = 2;
    } else if ((ck >= "\340") && (ck <= "\357")) {
      m = 3;
    } else if ((ck >= "\360") && (ck <= "\367")) {
      m = 4;
    } else if ((ck >= "\370") && (ck <= "\373")) {
      m = 5;
    } else if ((ck >= "\374") && (ck <= "\375")) {
      m = 6;
    } else {
      m = 1;
    }
    # The whole character must fit in what is left of {x} from {k} on
    # (the old test compared {m} with the total length instead):
    if (k + m - 1 > n) { data_error("truncated utf-8 character"); }

    if (m == 1) {
      # One-byte character, copied as is:
      y = (y ck);
    } else {
      # The next {m-1} bytes must start with bits "10", that is,
      # octal "\200" to "\277":
      for (i = 1; i < m; i++) {
        ci = substr(x,k+i,1);
        if ((ci < "\200") || (ci > "\277")) {
          data_error("invalid utf-8 continuation char");
        }
      }
      ci = substr(x,k+1,1);
      if ((m == 2) && (ck <= "\303")) {
        # In the range U+0080 to U+00FF: just copy both bytes.
        y = (y ck ci);
      } else {
        # Beyond U+00FF: convert the utf-8 encoding to the hex code point.
        h = hexcode(substr(x,k,m));
        y = (y "\\uc{" h "}");
      }
    }
    # Skip the utf-8 encoded character:
    k = k + m;
  }
  return y;
}

function hexcode(c,   h,cmd) {
  # Returns the hex unicode point (4 or more hex digits) of the
  # multi-byte string {c}, which is supposed to be one character in
  # utf-8 encoding.
  #
  # This is a big crock: it writes {c} to the scratch file ".tmpchar" in
  # the current directory and runs the external programs {recode} and
  # {od}, leaving the result in ".tmphex".

  # Write to disk:
  system("rm -f .tmpchar");
  printf "%s", c > ".tmpchar";
  close(".tmpchar");

  # Convert to 4-byte unicode chars and then display the hexadecimal:
  cmd = "cat .tmpchar | recode u8..UCS-4 | od --endian=big -t x4 > .tmphex";
  system(cmd);
  h = "";
  if ((getline h < ".tmphex") <= 0) {
    close(".tmphex");
    prog_error("could not get hex code of \"" c "\" from {recode}/{od}");
  }
  close(".tmphex");

  # Remove file position offset:
  gsub(/^ *[0-9][0-9][0-9][0-9]+ +/, "", h);
  # Remove encoding indicator, if any:
  gsub(/^fe ff +/, "", h);
  # Remove leading "00" or "0000" (but not "000000"):
  gsub(/^00/, "", h);
  gsub(/^00/, "", h);

  return h;
}

function bib_subst_uc(s,codes,repl,   n,i,c,re) {
  # Returns {s} with every occurrence of a given sequence of "\uc{HHHH}"
  # items replaced by the plain string {repl}.  The sequence is given in
  # {codes} as blank-separated hex codes; the pseudo-code "_" stands for
  # a literal blank between items.  {repl} must not contain "&" or "\".
  n = split(codes, c, " ");
  re = "";
  for (i = 1; i <= n; i++) {
    if (c[i] == "_") {
      re = (re " ");
    } else {
      re = (re "[\\\\]uc[{]" c[i] "[}]");
    }
  }
  gsub(re, repl, s);
  return s;
}

function texify_authors(au) {
  # Returns the author list {au} texified with {texify_text}, with the
  # known CJK and Cyrillic names replaced by romanized forms and the
  # comma separators replaced by " and " as Bibtex requires.
  au = texify_text(au);

  # CJK comma:
  au = bib_subst_uc(au, "ff0c", ", ");

  # Special authors:

  # https://cir.nii.ac.jp/crid/1570009752446900992
  au = bib_subst_uc(au, "5965 7530 5c1a 4f38", "H Okuda");
  au = bib_subst_uc(au, "5b89 7559 8aa0 543e", "S Yasutome");
  au = bib_subst_uc(au, "90fd 5009 4fe1 6a39", "N Tokura");

  # https://cir.nii.ac.jp/crid/1571417127447012352 = https://cir.nii.ac.jp/crid/1571417127447012352
  # https://ci.nii.ac.jp/naid/110003292159/ = https://cir.nii.ac.jp/crid/1571698602293259008
  au = bib_subst_uc(au, "5d0e 5c71 8cb4 884c", "T Sakiyama");
  au = bib_subst_uc(au, "67cf 6728 96c5 82f1", "M Kashiwagi");

  # https://ci.nii.ac.jp/naid/110003292221/ = https://cir.nii.ac.jp/crid/1520009410272491776
  au = bib_subst_uc(au, "795e 6ca2 96c4 667a", "Y Kanazawa");
  au = bib_subst_uc(au, "5927 77f3 9032 4e00", "S Oishi");

  # https://www.ieice.org/ken/paper/20080131raBF/
  au = bib_subst_uc(au, "5185 6751 5275", "H Uchimura");
  au = bib_subst_uc(au, "67cf 6728 5553 4e00 90ce", "K Kashiwagi");

  # https://ipsj.ixsq.nii.ac.jp/ej/?item_id=152845
  au = bib_subst_uc(au, "5bae 5cf6 4fe1 4e5f", "S Miyajima");
  au = bib_subst_uc(au, "5bae 7530 5b5d 5bcc", "T Miyata");

  # https://ci.nii.ac.jp/naid/110003291360/ = https://cir.nii.ac.jp/crid/1520853832584743680
  # https://www.ieice.org/ken/paper/200909242aoU/
  au = bib_subst_uc(au, "65b0 5bae 5f18 654f", "H Shingu");
  au = bib_subst_uc(au, "795e 6fa4 96c4 667a", "Y Kanazawa");

  # https://ir.nctu.edu.tw/handle/11536/38189
  au = bib_subst_uc(au, "9673 5f65 5b87", "YY Chen");
  au = bib_subst_uc(au, "5468 666f 63da", "JY Jou");

  # https://chuo-u.repo.nii.ac.jp/?item_id=4004
  au = bib_subst_uc(au, "4e09 5cf6 548c 535a", "K Mishima");

  # https://library.naist.jp/library/Gakunai/IEICE/general/2012/Settings/pdf/a_02_013.pdf
  au = bib_subst_uc(au, "4e95 539f 6d69 4ecb", "K Ihara");

  # https://www.airitilibrary.com/Publication/alDetailedMesh?docid=U0017-2907201910202500
  au = bib_subst_uc(au, "66fe 9865", "Z Hao");

  # http://jssst.or.jp/files/user/taikai/2016/GENERAL/general5-2.pdf
  au = bib_subst_uc(au, "677e 672c 7fd4 592a", "S Matsumoto");
  au = bib_subst_uc(au, "4e0a 7530 548c 7d00", "K Ueda");

  # https://oatd.org/oatd/record?record=handle%5C%3A2065%5C%2F637
  au = bib_subst_uc(au, "6c34 53e3 7fa9 96c4", "Y Minakuchi");

  # https://www.ieice.org/ken/paper/20140306MBla/
  # https://journal.bupt.edu.cn/CN/Y2015/V38/I2/69
  au = bib_subst_uc(au, "8c22 6c38 5f3a", "YQ Xie");
  au = bib_subst_uc(au, "9648 5efa 519b", "JJ Chen");
  au = bib_subst_uc(au, "66f9 9e3f 94a7", "HJ Cao");

  # https://elib.spbstu.ru/dl/3/2020/vr/vr20-3728.pdf/info
  au = bib_subst_uc(au, "0414 0413 _ 0414 0435 043c 0447 0435 043d 043a 043e", "DD Georgievich");

  # http://xddl.ncepujournal.com/cn/article/doi/10.19725/j.cnki.1007-2322.2019.1072
  au = bib_subst_uc(au, "5173 4f73 6b23", "JX Guan");
  au = bib_subst_uc(au, "8fb9 7ade", "J Bian");
  au = bib_subst_uc(au, "674e 56fd 5e86", "GQ Li");
  au = bib_subst_uc(au, "738b 9e64", "H Wang");

  # http://dgjsxb.ces-transaction.com/CN/article/downloadArticleFile.do?attachType=PDF&id=6382
  au = bib_subst_uc(au, "675c 840d 9759", "PJ Du");
  au = bib_subst_uc(au, "6768 660e", "M Yang");
  au = bib_subst_uc(au, "66f9 826f 6676", "LJ Cao");
  au = bib_subst_uc(au, "7fdf 9e64 5cf0", "HF Zhai");
  au = bib_subst_uc(au, "6768 4f73 5cfb", "JJ Yang");

  # http://verifiedby.me/kv/affine/affine.pdf
  # https://ieeexplore.ieee.org/abstract/document/6673287/
  au = bib_subst_uc(au, "5bae 524d 77e5 9686", "T Miyamae");

  # Turn the comma-separated list into a Bibtex "and" list:
  gsub(/, *$/, "", au);
  gsub(/, */, " and ", au);
  return au;
}

function make_key(txau,yr,   xa,na,af,i,ky,gn,sn,zi,nd,xd) {
  # Makes a Bibtex key from texified author names {txau} and year {yr}.
  # Appends seq tag "aa", "ab", "ac" ... for disamb.
  # Then maps the key through the global {key_map} table, if
  # it is defined there, incrementing the global counter {n_mapped}.
  # Also updates the global duplicate counts {kc}.

  xa = txau;

  # Replace special letter macros by ascii letters.
  # NOTE(review): in gawk "\b" inside a regexp is a backspace, not a
  # word boundary (that is "\y"), so the six patterns using it never
  # match and the letters are dropped by the accent removal below.
  # Left unchanged on purpose: fixing it alters the synthetic keys that
  # "key-map.tbl" was presumably built from -- confirm before changing.
  gsub(/[\\]o\b/, "oe", xa);
  gsub(/[\\]ae\b/, "ae", xa);
  gsub(/[\\]i\b/, "i", xa);
  gsub(/[\\]["]u/, "ue", xa);
  gsub(/[\\]["]a/, "ae", xa);
  gsub(/[\\]["]o/, "oe", xa);

  gsub(/[\\]O\b/, "OE", xa);
  gsub(/[\\]AE\b/, "AE", xa);
  gsub(/[\\]I\b/, "I", xa);
  gsub(/[\\]["]U/, "UE", xa);
  gsub(/[\\]["]A/, "AE", xa);
  gsub(/[\\]["]O/, "OE", xa);

  # Remove all accent macros:
  gsub(/[\\][a-zA-Z]+/, "", xa);
  gsub(/[\\]./, "", xa);
  gsub(/[{}]/, "", xa);

  # Remove hyphens, periods, apostrophes:
  gsub(/[-.']/, "", xa);

  # Split authors:
  na = split(xa, af, / and /);

  # Assemble bare key {ky}:
  ky = "";
  for (i = 1; i <= na; i++) {
    gn = af[i]; gsub(/[ ].*$/, "", gn);       # Initials.
    sn = af[i]; gsub(/^[-A-Z]*[ ]/, "", sn);  # Last name.
    gsub(/^ *[dD][ieaos]+ +/, "", sn);  # Remove preposition from surname.
    gsub(/^ *[Vv][oa][n] +/, "", sn);   # Remove german/dutch preposition from surname.
    gsub(/^ *[Ll][ea]+ +/, "", sn);     # Remove detached article from surname.
    gsub(/[ ]/, "", sn);                # Join multiword surnames.
    # Append initials to name and pad with "x" to ensure 3 letters:
    zi = tolower(sn gn "xxx");
    # Take first 3 letters:
    zi = substr(zi, 1, 3);
    if (length(zi) != 3) {
      prog_error("could not get 3 letters \"" af[i] "\" --> \"" zi "\"");
    }
    ky = (ky zi "-");
  }
  ky = (ky substr(yr, 3, 2));  # Append year 00-99.

  # Check for duplicates and add disamb suffix "aa", "ab", etc:
  nd = ((ky in kc) ? kc[ky] : 0);
  kc[ky] = nd + 1;
  if (nd >= 26*26) {
    prog_error("too many entries with key \"" ky "\" for a two-letter suffix");
  }
  xd = (substr(alf, 1 + int(nd/26), 1) substr(alf, 1 + (nd % 26), 1));
  ky = (ky "-" xd);

  printf " %s", ky > "/dev/stderr";
  if (ky in key_map) {
    # Map through the key table:
    ky = key_map[ky];
    printf " --> %s\n", ky > "/dev/stderr";
    n_mapped++;
  } else {
    printf " seems new\n" > "/dev/stderr";
  }
  return ky;
}

function data_error(msg) {
  # Reports an error in the input data, showing the file name, line
  # number and current line, and terminates the program with status 1.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit(abort);
}

function arg_error(msg) {
  # Reports an error in the command line arguments or environment and
  # terminates the program with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}

function prog_error(msg) {
  # Reports an internal error and terminates the program with status 1.
  printf "** PROG ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}