www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 3c005729906d4fbacc1c3fe7208f120d75ccc547
parent edfc196e2ead5a08fa9418750ad7cd7230940b1b
Author: Dan Stillman <dstillman@zotero.org>
Date:   Sun, 11 Jan 2009 02:17:36 +0000

Updated version from dev list


Diffstat:
M translators/Nagoya University OPAC.js | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 151 insertions(+), 50 deletions(-)

diff --git a/translators/Nagoya University OPAC.js b/translators/Nagoya University OPAC.js @@ -8,7 +8,7 @@ "maxVersion":"", "priority":100, "inRepository":true, - "lastUpdated":"2008-07-10 06:15:00" + "lastUpdated":"2009-01-11 02:17:07" } function detectWeb(doc, url) { @@ -20,31 +20,96 @@ function detectWeb(doc, url) { } } -// initially posted to zotero-dev as an attachment -- sorry for the extra list traffic that caused +/* + * Set the texts used to find raw citation elements + */ +function setSpec() { + var spec = new Array(); + spec['title'] = ['題および','title and statement']; + spec['year'] = ['出版・頒布','publication,distribution']; + spec['isbn'] = ['国際標準図書','international standard book']; + spec['authors'] = ['著者標目','author link']; + spec['series'] = ['書誌構造','parent bibliography']; + return spec; +} + +/* + * Extract raw string sets from the page. This is the only function that uses + * xpath. The string sets retrieved for each label registered by setSpec is + * stored as a list, to cope with the possibility of multiple instances of the + * same label with different data. + */ +function getData(doc, spec) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + var data = new Object(); + for (key in spec) { + var check = doc.evaluate("//td[contains(text(),'"+spec[key][0]+"') or contains(text(),'"+spec[key][1]+"')]/following-sibling::td", doc, nsResolver, XPathResult.ANY_TYPE, null); + var c = check.iterateNext(); + while (c) { + if (!data[key] ) { + data[key] = new Array(); + } + data[key].push(Zotero.Utilities.cleanString(c.textContent)); + c = check.iterateNext(); + } + } + return data; +} +/* + * Chop a semicolon-delimited string of authors out of a raw title string, + * check it for Japanese characters, and save the raw string for each author + * to an array. If no Japanese authors were found, save directly to the item + * object. 
+ */ parseRomanAuthors = function (item,data) { - var result = false; - var datastring = data['title'][0].replace(/.*\//, "") - if ( datastring.match(/.*[^- 0-9()\[\];:.a-zA-Z].*/) ) { - return result; + var datastring = data['title'][0]; + // don't bother if there is no author info + if ( ! datastring.match(/.*\/.*/) ) { + return true; } - var authors = datastring.split(";"); + // cut off the title + datastring = datastring.replace(/.*\//, ""); + // raise flag if there are japanese characters + var japanese_check = datastring.match(/.*[^- &0-9()\[\];:,.a-zA-Z].*/); + // replace comma with semicolon in certain cases, to prepare for split + datastring = datastring.replace(/,(\s+[a-zA-Z]{3,})/, ";$1"); + datastring = datastring.replace(/,(\s+[a-zA-Z]{1}[^a-zA-Z])/, ";$1"); + datastring = datastring.replace(/(\s+and\s+)/, "; "); + datastring = datastring.replace(/(\s+&\s+)/, "; "); + // split the authors + var authors = datastring.replace(/\|.*/, "").split(";"); + // this is parsing the authors for a single work. if there is a special byline, we + // assume that it applies to all subsequent entries until overridden. + var authortype = 'author'; for (i in authors) { - authortype = authors[i].replace(/^([ a-z]*).*/, "$1"); - if ( authortype.match(/.*edit.*/) ) { + item.authorstrings.push(authors[i]); + var authortypehint = authors[i].replace(/^([ ,.:a-z]*).*/, "$1"); + if ( authortypehint.match(/.*(edit|organiz).*/) ) { authortype = "editor"; - } else if ( authortype.match(/.*trans.*/) ) { + } else if ( authortypehint.match(/.*trans.*/) ) { authortype = "translator"; - } else { - authortype = "author"; } author = authors[i].replace(/^[ a-z]*/, "").replace( /\.\.\..*/, "" ); - item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype)); - result = true; + // need to test for length because the replacement of commas with semicolons + // can cause a short split at the end of a byline that originally ended in a comma + if ( ! 
japanese_check && author.length ) { + item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype)); + } } - return result; + return japanese_check; } +/* + * For each author link, attempt to find a hint that the person + * is an editor or translator, first in the link text itself, then in + * the list of raw author strings captured by parseRomanAuthors. + * Clean out cruft, reverse the order of each name, and save + * directly to the item object. + */ parseJapaneseAuthors = function ( item, data ) { var authortype = author; var authors = data['authors']; @@ -56,58 +121,94 @@ parseJapaneseAuthors = function ( item, data ) { } else { authortype = 'author'; } - var author = authors[i].replace(/[*]/g,"").replace(/[0-9<(|].*/, "").replace(/(.*?),(.*)/, "$2 $1"); + var author = authors[i].replace(/[*]/g,"").replace(/[0-9<()|].*/, "").replace(/(.*?),(.*)/, "$2 $1"); + // If we claim to be an author, double-check in the English entries for a translator hint. + // This is an enormous pain, but the original records are a mess, with different conventions + // for Japanese and foreign records, sometimes mixed up in the same entry. What are you + // going to do. + for ( x in item.authorstrings ) { + var authorstring = item.authorstrings[x]; + Zotero.debug(authorstring); + var name = author.split(" "); + name.reverse(); + if ( authorstring.indexOf( name[0] ) > -1 && authorstring.match(/.*(訳|譯|譯註)$/) ) { + authortype = 'translator'; + break; + } else if ( authorstring.indexOf( name[0] ) > -1 && authorstring.match(/.*(編|編著)$/) ) { + authortype = 'editor'; + break; + } + } + delete item.authorstrings; item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype)); } } +/* + * Split extracted title field. This always starts as a single list item, + * but can contain entries for several works, as in an omnibus volume of + * translated works, for example. 
Such records separate the elements of + * the omnibus with periods that have no trailing space, so we use that as + * the split point. We discard the phonetic information appended to the end + * of the string in Japanese records. + */ +function splitTitle(data) { + // split in data array + var titlestring = data['title'][0].replace(/\|.*/, ""); + data['title'] = titlestring.split(" . "); +} + +/* + * The scrape function brings the various parsing functions together + */ function scrape(doc,url) { var item = new Zotero.Item("book"); - var spec = new Array(); - spec['title'] = ['題および','title and statement']; - spec['year'] = ['出版・頒布','publication,distribution']; - spec['isbn'] = ['国際標準図書','international standard book']; - spec['authors'] = ['著者標目','author link']; - spec['series'] = ['書誌構造','parent bibliography']; - var data = {}; - for (key in spec) { - var check = doc.evaluate("//td[contains(text(),'"+spec[key][0]+"') or contains(text(),'"+spec[key][1]+"')]/following-sibling::td", doc, null, XPathResult.ANY_TYPE, null); - var c = check.iterateNext(); - if (!data[key] && c) { - data[key] = []; - } - while (c) { - data[key].push(Zotero.Utilities.cleanString(c.textContent)); - c = check.iterateNext(); - } - } + item.authorstrings = new Array(); + var spec = setSpec(); + var data = getData(doc, spec); + splitTitle(data); if (data['title']) { - item.title = data['title'][0].replace(/\/.*/, ""); - // if authors are in roman letters, use them - has_author = parseRomanAuthors( item, data ); - // otherwise, use author links - if (!has_author) { + var titles = new Array(); + for (i in data['title']) { + titles.push( data['title'][i].replace(/\s*\/.*/, "") ); + } + item.title = titles.join(", "); + jse_authors = parseRomanAuthors( item, data ); + if ( jse_authors ) { parseJapaneseAuthors( item, data ); } } + if (data['year']) { - item.date = data['year'][0].replace(/.*?([0-9][.0-9][0-9]+).*/, "$1"); - item.place = data['year'][0].replace(/:.*/, ""); - item.publisher = 
data['year'][0].replace(/.*:(.*),.*/, "$1"); + // sometimes there are multiple "date" fields, some of which are filled + // with other random information + for (i in data['year']) { + var year = data['year'][i]; + if ( year.match(/.*[0-9]{3}.*/) ) { + item.date = year.replace(/.*?([0-9][.0-9][0-9]+).*/, "$1"); + item.place = year.replace(/:.*/, "").replace(/[\[\]]/g, ""); + item.publisher = year.replace(/.*:(.*),.*/, "$1"); + break; + } + } } - // apparently the series field does not exist in this capture type - //if (data['series']) { - // Zotero.debug('series: '+data['series'][0]); - // item.series = data['series'][0].replace(/<.*/, ""); - //} + if (data['series']) { + item.series = data['series'][0].replace(/<.*/, ""); + } + + if (data['isbn']) { + item.ISBN = data['isbn'][0].replace(/[^0-9]*([0-9]+).*/, "$1"); + } item.complete(); } + function doWeb(doc, url) { - var articles = [url]; - Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();}); + articles = [url]; + Zotero.Utilities.processDocuments(articles, scrape, function() { + Zotero.done(); + }); Zotero.wait(); } -