commit 3c005729906d4fbacc1c3fe7208f120d75ccc547
parent edfc196e2ead5a08fa9418750ad7cd7230940b1b
Author: Dan Stillman <dstillman@zotero.org>
Date: Sun, 11 Jan 2009 02:17:36 +0000
Updated version from dev list
Diffstat:
1 file changed, 151 insertions(+), 50 deletions(-)
diff --git a/translators/Nagoya University OPAC.js b/translators/Nagoya University OPAC.js
@@ -8,7 +8,7 @@
"maxVersion":"",
"priority":100,
"inRepository":true,
- "lastUpdated":"2008-07-10 06:15:00"
+ "lastUpdated":"2009-01-11 02:17:07"
}
function detectWeb(doc, url) {
@@ -20,31 +20,96 @@ function detectWeb(doc, url) {
}
}
-// initially posted to zotero-dev as an attachment -- sorry for the extra list traffic that caused
+/*
+ * Set the texts used to find raw citation elements
+ */
+function setSpec() { // takes no arguments; returns the label table that getData() iterates over
+ var spec = new Array(); // only string keys are assigned below, so this Array is used as a key/value map
+ spec['title'] = ['題および','title and statement']; // every entry is [Japanese label text, English label text]
+ spec['year'] = ['出版・頒布','publication,distribution']; // scrape() derives date, place and publisher from this row
+ spec['isbn'] = ['国際標準図書','international standard book']; // scrape() derives item.ISBN from this row
+ spec['authors'] = ['著者標目','author link']; // read by parseJapaneseAuthors() as data['authors']
+ spec['series'] = ['書誌構造','parent bibliography']; // scrape() derives item.series from this row
+ return spec;
+}
+
+/*
+ * Extract raw string sets from the page. This is the only function that uses
+ * XPath. The string sets retrieved for each label registered by setSpec are
+ * stored as a list, to cope with the possibility of multiple instances of the
+ * same label with different data.
+ */
+function getData(doc, spec) { // doc: document to query; spec: label table (see setSpec); returns an object mapping each matched key to an array of strings
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) { // resolver is null when the document has no namespace
+ if (prefix == 'x') return namespace; else return null; // only the prefix 'x' is mapped
+ } : null; // NOTE(review): the XPath below uses no 'x:' prefix, so this resolver looks unused by it — confirm
+ var data = new Object();
+ for (key in spec) { // NOTE(review): 'key' is not declared with var, so it becomes an implicit global
+ var check = doc.evaluate("//td[contains(text(),'"+spec[key][0]+"') or contains(text(),'"+spec[key][1]+"')]/following-sibling::td", doc, nsResolver, XPathResult.ANY_TYPE, null); // td cells that follow, as siblings, a td whose text contains either label
+ var c = check.iterateNext();
+ while (c) { // visit every match so that repeated labels keep all of their values
+ if (!data[key] ) { // list is created lazily: a key with no match stays undefined in data
+ data[key] = new Array();
+ }
+ data[key].push(Zotero.Utilities.cleanString(c.textContent)); // store the cell text after Zotero.Utilities.cleanString()
+ c = check.iterateNext();
+ }
+ }
+ return data;
+}
+/*
+ * Chop a semicolon-delimited string of authors out of a raw title string,
+ * check it for Japanese characters, and save the raw string for each author
+ * to an array. If no Japanese characters were found, also save each author
+ * directly to the item object as a creator.
+ */
parseRomanAuthors = function (item,data) {
- var result = false;
- var datastring = data['title'][0].replace(/.*\//, "")
- if ( datastring.match(/.*[^- 0-9()\[\];:.a-zA-Z].*/) ) {
- return result;
+ var datastring = data['title'][0]; // only the first title entry is examined for a byline
+ // don't bother if there is no author info
+ if ( ! datastring.match(/.*\/.*/) ) { // a byline is recognised by the presence of a "/"
+ return true; // truthy result: scrape() then calls parseJapaneseAuthors()
}
- var authors = datastring.split(";");
+ // cut off the title
+ datastring = datastring.replace(/.*\//, ""); // greedy match: removes everything through the last "/"
+ // raise flag if there are Japanese characters (the test is: any character outside the ASCII set in the class below)
+ var japanese_check = datastring.match(/.*[^- &0-9()\[\];:,.a-zA-Z].*/); // match array, or null when every character is in the set
+ // replace comma with semicolon in certain cases, to prepare for split
+ datastring = datastring.replace(/,(\s+[a-zA-Z]{3,})/, ";$1"); // NOTE(review): no g flag on this or the next three replacements, so only the first occurrence of each is rewritten — confirm intended
+ datastring = datastring.replace(/,(\s+[a-zA-Z]{1}[^a-zA-Z])/, ";$1"); // comma followed by a single letter and then a non-letter
+ datastring = datastring.replace(/(\s+and\s+)/, "; "); // " and " between names becomes a separator
+ datastring = datastring.replace(/(\s+&\s+)/, "; "); // " & " between names becomes a separator
+ // split the authors
+ var authors = datastring.replace(/\|.*/, "").split(";"); // drop everything from the first "|" onward, then split on ";"
+ // this is parsing the authors for a single work. if there is a special byline, we
+ // assume that it applies to all subsequent entries until overridden.
+ var authortype = 'author'; // set once before the loop so that a hint carries over to later names
for (i in authors) {
- authortype = authors[i].replace(/^([ a-z]*).*/, "$1");
- if ( authortype.match(/.*edit.*/) ) {
+ item.authorstrings.push(authors[i]); // raw string kept for parseJapaneseAuthors(); item.authorstrings must already be an array (scrape() creates it)
+ var authortypehint = authors[i].replace(/^([ ,.:a-z]*).*/, "$1"); // leading run of lowercase letters, spaces and ",.:"; NOTE(review): the name-stripping replace further down removes only [ a-z], not ",.:" — confirm
+ if ( authortypehint.match(/.*(edit|organiz).*/) ) { // hint containing "edit" or "organiz" switches the type to editor
authortype = "editor";
- } else if ( authortype.match(/.*trans.*/) ) {
+ } else if ( authortypehint.match(/.*trans.*/) ) { // hint containing "trans" switches the type to translator
authortype = "translator";
- } else {
- authortype = "author";
}
author = authors[i].replace(/^[ a-z]*/, "").replace( /\.\.\..*/, "" );
- item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
- result = true;
+ // need to test for length because the replacement of commas with semicolons
+ // can cause a short split at the end of a byline that originally ended in a comma
+ if ( ! japanese_check && author.length ) { // creators are saved here only when the flag above was not raised
+ item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
+ }
}
- return result;
+ return japanese_check; // match array (truthy) or null (falsy), not a strict boolean
}
+/*
+ * For each author link, attempt to find a hint that the person
+ * is an editor or translator, first in the link text itself, then in
+ * the list of raw author strings captured by parseRomanAuthors.
+ * Clean out cruft, reverse the order of each name, and save
+ * directly to the item object.
+ */
parseJapaneseAuthors = function ( item, data ) {
var authortype = author;
var authors = data['authors'];
@@ -56,58 +121,94 @@ parseJapaneseAuthors = function ( item, data ) {
} else {
authortype = 'author';
}
- var author = authors[i].replace(/[*]/g,"").replace(/[0-9<(|].*/, "").replace(/(.*?),(.*)/, "$2 $1");
+ var author = authors[i].replace(/[*]/g,"").replace(/[0-9<()|].*/, "").replace(/(.*?),(.*)/, "$2 $1");
+ // If we claim to be an author, double-check in the English entries for a translator hint.
+ // This is an enormous pain, but the original records are a mess, with different conventions
+ // for Japanese and foreign records, sometimes mixed up in the same entry. What are you
+ // going to do.
+ for ( x in item.authorstrings ) {
+ var authorstring = item.authorstrings[x];
+ Zotero.debug(authorstring);
+ var name = author.split(" ");
+ name.reverse();
+ if ( authorstring.indexOf( name[0] ) > -1 && authorstring.match(/.*(訳|譯|譯註)$/) ) {
+ authortype = 'translator';
+ break;
+ } else if ( authorstring.indexOf( name[0] ) > -1 && authorstring.match(/.*(編|編著)$/) ) {
+ authortype = 'editor';
+ break;
+ }
+ }
+ delete item.authorstrings;
item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
}
}
+/*
+ * Split extracted title field. This always starts as a single list item,
+ * but can contain entries for several works, as in an omnibus volume of
+ * translated works, for example. Such records separate the elements of
+ * the omnibus with periods that have no trailing space, so we use that as
+ * the split point. We discard the phonetic information appended to the end
+ * of the string in Japanese records.
+ */
+function splitTitle(data) { // rewrites data['title'] in place; returns nothing
+ // split in data array
+ var titlestring = data['title'][0].replace(/\|.*/, ""); // drops everything from the first "|" onward; NOTE(review): throws if data['title'] is undefined, which getData() leaves unset when no cell matched
+ data['title'] = titlestring.split(" . "); // NOTE(review): the separator is " . " with a space on each side, while the comment above this function describes periods with no trailing space — confirm which is intended
+}
+
+/*
+ * The scrape function brings the various parsing functions together
+ */
function scrape(doc,url) {
var item = new Zotero.Item("book");
- var spec = new Array();
- spec['title'] = ['題および','title and statement'];
- spec['year'] = ['出版・頒布','publication,distribution'];
- spec['isbn'] = ['国際標準図書','international standard book'];
- spec['authors'] = ['著者標目','author link'];
- spec['series'] = ['書誌構造','parent bibliography'];
- var data = {};
- for (key in spec) {
- var check = doc.evaluate("//td[contains(text(),'"+spec[key][0]+"') or contains(text(),'"+spec[key][1]+"')]/following-sibling::td", doc, null, XPathResult.ANY_TYPE, null);
- var c = check.iterateNext();
- if (!data[key] && c) {
- data[key] = [];
- }
- while (c) {
- data[key].push(Zotero.Utilities.cleanString(c.textContent));
- c = check.iterateNext();
- }
- }
+ item.authorstrings = new Array(); // scratch list filled by parseRomanAuthors(); NOTE(review): only parseJapaneseAuthors() deletes it, so it is still on the item at item.complete() when that call is skipped — confirm harmless
+ var spec = setSpec(); // label table
+ var data = getData(doc, spec); // raw strings per label; keys with no match are absent
+ splitTitle(data); // NOTE(review): runs before the data['title'] check below, so a page with no title row throws inside splitTitle()
if (data['title']) {
- item.title = data['title'][0].replace(/\/.*/, "");
- // if authors are in roman letters, use them
- has_author = parseRomanAuthors( item, data );
- // otherwise, use author links
- if (!has_author) {
+ var titles = new Array();
+ for (i in data['title']) { // NOTE(review): 'i' is not declared with var in this function (implicit global)
+ titles.push( data['title'][i].replace(/\s*\/.*/, "") ); // keep only the text before the first "/" of each piece
+ }
+ item.title = titles.join(", "); // all pieces are combined into one title string
+ jse_authors = parseRomanAuthors( item, data ); // NOTE(review): assigned without var (implicit global); truthy when no byline was found or the byline had characters outside the roman set
+ if ( jse_authors ) { // fall back to the author-link entries
parseJapaneseAuthors( item, data );
}
}
+
if (data['year']) {
- item.date = data['year'][0].replace(/.*?([0-9][.0-9][0-9]+).*/, "$1");
- item.place = data['year'][0].replace(/:.*/, "");
- item.publisher = data['year'][0].replace(/.*:(.*),.*/, "$1");
+ // sometimes there are multiple "date" fields, some of which are filled
+ // with other random information
+ for (i in data['year']) {
+ var year = data['year'][i];
+ if ( year.match(/.*[0-9]{3}.*/) ) { // use the first entry that contains three consecutive digits
+ item.date = year.replace(/.*?([0-9][.0-9][0-9]+).*/, "$1"); // first run shaped: digit, digit-or-dot, one or more digits
+ item.place = year.replace(/:.*/, "").replace(/[\[\]]/g, ""); // text before the first ":" with square brackets removed
+ item.publisher = year.replace(/.*:(.*),.*/, "$1"); // text between a ":" and a later ","; NOTE(review): when the pattern does not match, replace() returns the whole string unchanged
+ break;
+ }
+ }
}
- // apparently the series field does not exist in this capture type
- //if (data['series']) {
- // Zotero.debug('series: '+data['series'][0]);
- // item.series = data['series'][0].replace(/<.*/, "");
- //}
+ if (data['series']) {
+ item.series = data['series'][0].replace(/<.*/, ""); // first series entry, cut at the first "<"
+ }
+
+ if (data['isbn']) {
+ item.ISBN = data['isbn'][0].replace(/[^0-9]*([0-9]+).*/, "$1"); // keeps only the first run of digits; NOTE(review): a hyphenated value or a trailing "X" would be truncated — confirm against real records
+ }
item.complete();
}
+
function doWeb(doc, url) {
- var articles = [url];
- Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
+ articles = [url]; // NOTE(review): the 'var' from the removed line is gone, so 'articles' becomes an implicit global
+ Zotero.Utilities.processDocuments(articles, scrape, function() { // passes the one-URL list, scrape, and a callback that calls Zotero.done()
+ Zotero.done();
+ });
Zotero.wait();
}
-