#! /usr/bin/gawk -f
# Last edited on 2003-07-04 02:59:46 by stolfi
BEGIN {
  usage = ( "cat nec-hits.html | extract-papers-from-nec-hits > hits-raw.bib" );
  #
  # Reads an HTML page returned by NEC Citeseer in response to a
  # [Citations search] query. Extracts the papers in a format vaguely
  # reminiscent of Bibtex.
  #
  # NOTE(review): the HTML markup in the examples below had been lost from
  # this comment.  It is shown here as implied by the closing tags that the
  # main rule matches ("</a>", "</span>", "</i>"); the exact attributes and
  # the placement of the "<i>" tags should be confirmed against a real
  # Citeseer result page.
  #
  # The typical entry looks like this
  #
  #   <a href="...">Context</a>
  #   <span ...>Doc</span>
  #   <i>219.7 209 (6):</i> L. Guibas and J. Stolfi.
  #   <i>Primitives for the manipulation of general subdivisions and the
  #   computation of Voronoi diagrams</i>. ACM Trans. on Graphics,
  #   4(2):74--123, April 1985.
  #
  # except that it is all in a single line. The string
  #
  #   <span ...>Doc</span>
  #
  # may be replaced by a link like
  #
  #   <a href="...">Doc</a>
  #
  # if the paper has an entry in the database. In this case there is a
  # newline after the "</a>".
  #

  # Make sure {fld} exists as an (empty) array before match() fills it.
  split("", fld);

  # Header of the output file: list the input files named on the command line.
  printf "%% Created by extract-papers-from-nec-hits on files:\n";
  for (i = 1; i < ARGC; i++) { printf "%% %s\n", ARGV[i]; }
  printf "\n";
}
# Main rule: handles every input line that holds both the "Context" link and
# the "Doc" link (or its placeholder).  Parses the hit into citation counts,
# authors, title and location, and prints it as a Bibtex-like "@necitem"
# entry.  Any parsing step reported through data_error() stops the program.
#
# NOTE(review): the tag-bearing parts of several patterns in this rule were
# missing from the source as received (the rule pattern was unterminated, and
# the link patterns used fld[1] without a capture group).  They have been
# reconstructed:
#   - "<i>" and "<span[^<>]*>" follow directly from the surviving closing
#     tags and from the "-3" back-up in the authors step;
#   - the 'href' capture of the Context/Doc links and the final "<" of the
#     rule pattern are best guesses.
# Confirm all of them against a real Citeseer result page.
/>Context<.*>Doc</ {
  lin = cleanup_html_crud($0);

  # Join lines if Doc has a link (the entry then continues on the next line).
  # Append only when a line was actually read, so that at end of input the
  # current line is not glued to itself.
  if (match(lin, />Doc<[\/]a>/))
    { if ((getline) > 0) { lin = ( lin " " cleanup_html_crud($0) ); } }

  # The entry should now be on a single line: the Context link, the Doc link
  # (or placeholder), the citation counts in italics, the authors, the title
  # in italics, and finally the publication data.

  # Unparsed stuff in line:
  misc = "???";

  # Extract the URL of the citations-in-context for this paper:
  ctxurl = "???"
  if (match(lin, /^[ ]*<a[ ]+href=["]?([^<>" ]*)["]?[^<>]*>Context<[\/]a>/, fld))
    { ctxurl = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_error(("cannot find Context link")); }

  # Remove "Doc" link (or placeholder):
  docurl = "???";
  if (match(lin, /^[ ]*<a[ ]+href=["]?([^<>" ]*)["]?[^<>]*>Doc<[\/]a>/, fld))
    { docurl = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else if (match(lin, /^[ ]*<span[^<>]*>Doc<[\/]span>/))
    { lin = substr(lin,RSTART+RLENGTH); }
  else
    { data_error(("cannot find Doc link")); }

  # Extract the number of citations:
  ncites = "???"
  if (match(lin, /^[ ]*<i>[ ]*([.0-9 ]*[ ]+[.0-9 ]*[(][.0-9 ]*[)])[ ]*[:][ ]*<[\/]i>/, fld))
    { ncites = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_error(("cannot find citation count")); }

  # Extract the authors' names (everything up to the "<i>" that opens the title):
  auths = "???"
  if (match(lin, /^ *([^<>]*)<i>/, fld))
    { auths = fld[1];
      # Remove final punctuation (unless abbrev dot).  The pattern is anchored
      # at the end, so replacing the first match is all that is needed.
      auths = gensub(/([a-zA-Z][a-zA-Z])[ .]+$/, "\\1", 1, auths);
      auths = normalize_spaces(auths);
      # Back up 3 characters so that the "<i>" stays in {lin} for the title step.
      lin = substr(lin,RSTART+RLENGTH-3);
    }
  else
    { data_warning(("cannot find authors"));
      misc = lin; lin = "";
    }

  # Extract the paper title (plus any dots that follow the closing "</i>"):
  title = "???"
  if (match(lin, /^[ ]*<i>([^<>]*)<[\/]i>([. ]*)/, fld))
    { title = normalize_spaces((fld[1] fld[2]));
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_warning(("cannot find title"));
      # Do not overwrite the unparsed text already saved by the authors step.
      if (lin != "") { misc = lin; lin = ""; }
    }

  # Extract the bibliography data:
  where = "???"
  if (match(lin, /^[ ]*([^<>]*)$/, fld))
    { where = normalize_spaces(fld[1]);
      lin = substr(lin,RSTART+RLENGTH);
    }
  else
    { data_error(("cannot find location")); }  # Does not return.

  # Extract a "NEC ID number" from the context link:
  if (ctxurl == "???")
    { key = "??"; }
  else if (match(ctxurl, /[\/]context[\/]([0-9\/]+)$/, fld))
    { key = fld[1]; gsub(/[\/]/, "-", key); }
  else
    { data_warning(("weird ctxurl"));
      # Do not let the key of the previous entry leak into this one.
      key = "??";
    }

  # Output entry:
  printf "@necitem{??\n";
  printf " neckey = {%s}\n", key;
  printf " necauthor = {%s}\n", auths;
  printf " nectitle = {%s}\n", title;
  if (where != "???") { printf " necwhere = {%s}\n", where; }
  if (misc != "???") { printf " necmisc = {%s}\n", misc; }
  printf " citations = {NEC: %s}\n", ncites;
  if (docurl != "???") { printf " docurl = {{\\url{%s}}}\n", docurl; }
  printf " ctxurl = {{\\url{%s}}}\n", ctxurl;
  printf "}\n"
  printf "\n"
  next;
}

# Every other input line is ignored.
// { next; }
function cleanup_html_crud(lin)
{
  # Returns a copy of the string {lin} with HTML clutter that is irrelevant
  # for parsing removed: non-breaking-space entities and line-break tags
  # become single blanks, and boldface tags are deleted.

  # Remove funny spaces and line breaks:
  gsub(/[&]nbsp[;]/, " ", lin);
  # NOTE(review): this pattern was split by a raw newline in the source as
  # received (which does not parse); "<br>" is the presumed original, given
  # the "line breaks" wording above -- confirm.
  gsub(/<br>/, " ", lin);
  # Remove boldface marks (seem to be superfluous for parsing):
  gsub(/<[\/]*b>/, "", lin);
  return lin;
}
function normalize_spaces(str)
{
  # Returns a tidied copy of {str}: stray blanks and punctuation are trimmed
  # from both ends, separators standing right before a stronger punctuation
  # mark are dropped, and runs of blanks are squeezed to one blank.

  # Leading blanks and punctuation go away (anchored, so one pass suffices):
  sub(/^[ ,;:.]+/, "", str);

  # Drop whatever weaker separators precede ".", ":", ";" and ",", in that order:
  gsub(/[ ,;:]+[.]/, ".", str);
  gsub(/[ ,;]+[:]/, ":", str);
  gsub(/[ ,]+[;]/, ";", str);
  gsub(/[ ]+[,]/, ",", str);

  # Squeeze runs of blanks:
  gsub(/[ ]+/, " ", str);

  # Trailing blanks and punctuation go away too (a final "." is kept):
  sub(/[ ,;:]+$/, "", str);

  return str;
}
function data_warning(msg)
{
  # Reports a non-fatal problem on the error output: the message {msg} tagged
  # with the current file name and line number, followed by the global work
  # string {lin} and the current input record.  Processing continues.
  printf("%s:%d: ++ Warning: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  printf(" lin = «%s»\n", lin) > "/dev/stderr";
  printf(" $0 = «%s»\n", $0) > "/dev/stderr";
}
function data_error(msg)
{
  # Reports a fatal problem on the error output: the message {msg} tagged
  # with the current file name and line number, followed by the global work
  # string {lin} and the current input record.  Then records the failure in
  # the global {abort} and terminates the program with that exit code.
  printf("%s:%d: ** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  printf(" lin = «%s»\n", lin) > "/dev/stderr";
  printf(" $0 = «%s»\n", $0) > "/dev/stderr";
  abort = -1;
  exit abort;
}