www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 3e30f21ede623d1a100d5fe67139cad84807dadc
parent 5951768ada405cbf8cd9bd09dd4aeeac2aa509c7
Author: Simon Kornblith <simon@simonster.com>
Date:   Tue, 19 Apr 2011 20:00:26 +0000

Rewritten unAPI translator


Diffstat:
M translators/unAPI.js | 462 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
1 file changed, 268 insertions(+), 194 deletions(-)

diff --git a/translators/unAPI.js b/translators/unAPI.js
@@ -4,257 +4,331 @@
 	"label":"unAPI",
 	"creator":"Simon Kornblith",
 	"target":null,
-	"minVersion":"1.0.0b4.r1",
+	"minVersion":"2.1",
 	"maxVersion":"",
 	"priority":200,
 	"inRepository":true,
 	"detectXPath":"//link[@rel='unapi-server']",
-	"lastUpdated":"2010-09-23 04:19:20"
+	"lastUpdated":"2011-04-19 19:40:07"
 }
-var RECOGNIZABLE_FORMATS = ["mods", "marc", "endnote", "ris", "bibtex", "rdf"];
+var RECOGNIZABLE_FORMATS = ["rdf_zotero", "rdf_bibliontology", "mods", "marc", "unimarc", "ris",
+	"refer", "bibtex", "rdf_dc"];
 var FORMAT_GUIDS = {
+	"rdf_zotero":"5e3ad958-ac79-463d-812b-a86a9235c28f",
+	"rdf_bibliontology":"14763d25-8ba0-45df-8f52-b8d1108e7ac9",
 	"mods":"0e2235e7-babf-413c-9acf-f27cce5f059c",
 	"marc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
-	"endnote":"881f60f2-0802-411a-9228-ce5f47b64c7d",
+	"unimarc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
 	"ris":"32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7",
+	"refer":"881f60f2-0802-411a-9228-ce5f47b64c7d",
 	"bibtex":"9cb70025-a888-4a29-a210-93ec52da40d4",
-	"rdf":"5e3ad958-ac79-463d-812b-a86a9235c28f"
+	"rdf_dc":"5e3ad958-ac79-463d-812b-a86a9235c28f"
 };
-var unAPIResolver, unsearchedIds, foundIds, foundItems, foundFormat, foundFormatName, domain;
+var unAPIResolver = false;
+var defaultFormat, unAPIIDs;
-function detectWeb(doc, url) {
-	// initialize variables
-	unsearchedIds = [];
-	foundIds = [];
-	foundItems = [];
-	foundFormat = [];
-	foundFormatName = [];
-
-	// Set the domain we're scraping
-	domain = doc.location.href.match(/https?:\/\/([^/]+)/);
-
-	// This and the x: prefix in the XPath are to work around an issue with pages
-	// served as application/xhtml+xml
-	//
-	// https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
-	function nsResolver() {
-		return 'http://www.w3.org/1999/xhtml';
-	}
+/**
+ * A class to describe an unAPI format description
+ * @property isSupported {Boolean} Whether Zotero supports a format contained in this description
+ * @property name {String} The unAPI format name, used to retrieve item descriptions
+ * @property translatorID {String} The ID of the translator used to read this format
+ *
+ * @constructor
+ * @param {String} aXML unAPI format description XML
+ */
+UnAPIFormat = function(aXML) {
+	var parser = new DOMParser();
+	var doc = parser.parseFromString(aXML.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""), "text/xml");
-	// look for a resolver
-	unAPIResolver = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
-	if(!unAPIResolver) return false;
-	unAPIResolver = unAPIResolver.getAttribute("href");
+	var foundFormat = new Object();
-	// look for abbrs
-	var abbrs = doc.getElementsByTagName("abbr");
-	for each(var abbr in abbrs) {
-		if(abbr.getAttribute && abbr.getAttribute("class") &&
-				abbr.getAttribute("class").split(" ").indexOf("unapi-id") != -1 && abbr.getAttribute("title")) {
-			// found an abbr
-			unsearchedIds.push(escape(abbr.getAttribute("title")));
+	// Loop through to determine format name
+	var nodes = doc.documentElement.getElementsByTagName("format");
+	var nNodes = nodes.length;
+	var node, name, lowerName, format;
+	for(var i=0; i<nNodes; i++) {
+		node = nodes[i];
+		name = node.getAttribute("name");
+		lowerName = name.toLowerCase();
+		format = false;
+
+		// Look for formats we can recognize
+		if(["rdf_zotero", "rdf_bibliontology", "bibtex", "endnote", "rdf_dc"].indexOf(lowerName) != -1) {
+			format = lowerName;
+		} else if(lowerName == "rdf_bibliontology") {
+			format = "rdf_bibliontology";
+		} else if(lowerName === "mods"
+				|| node.getAttribute("namespace_uri") === "http://www.loc.gov/mods/v3"
+				|| node.getAttribute("docs") === "http://www.loc.gov/standards/mods/"
+				|| node.getAttribute("type") === "application/mods+xml") {
+			format = "mods";
+		} else if(lowerName.match(/^marc\b/)
+				|| node.getAttribute("type") === "application/marc") {
+			format = "marc";
+		} else if(lowerName.match(/^unimarc\b/)
+				|| node.getAttribute("type") === "application/unimarc") {
+			format = "unimarc";
+		} else if(node.getAttribute("docs") == "http://www.refman.com/support/risformat_intro.asp"
+				|| lowerName.match(/^ris\b/)) {
+			format = "ris";
 		}
+
+		if(format) foundFormat[format] = name;
 	}
-	if(!unsearchedIds.length) return false;
-	// now we need to see if the server actually gives us bibliographic metadata.
-	Zotero.wait();
+	// Loop through again to determine optimal supported format
+	for(var i=0; i<RECOGNIZABLE_FORMATS.length; i++) {
+		if(foundFormat[RECOGNIZABLE_FORMATS[i]]) {
+			this.isSupported = true;
+			this.name = foundFormat[RECOGNIZABLE_FORMATS[i]];
+			this.translatorID = FORMAT_GUIDS[RECOGNIZABLE_FORMATS[i]];
+			return;
+		}
+	}
-	if(unsearchedIds.length == 1) {
-		// if there's only one abbr tag, we should go ahead and retrieve types for it
-		getItemType();
-	} else {
-		// if there's more than one, we should first see if the resolver gives metadata for all of them
-		Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
-			var format = checkFormats(text);
-			if(format) {
-				// move unsearchedIds to foundIds
-				foundIds = unsearchedIds;
-				unsearchedIds = [];
-				// save format and formatName
-				foundFormat = format[0];
-				foundFormatName = format[1];
-
-				Zotero.done("multiple");
+	this.isSupported = false;
+}
+
+/**
+ * A class encapsulating an UnAPI ID
+ * @property format {UnAPIFormat} Information regarding the format
+ * @property items {Zotero.Item[]} Items corresponding to this ID
+ *
+ * @constructor
+ * @param {String} id The ID contained in an abbr tag
+ */
+UnAPIID = function(id) {
+	this.id = id;
+	unAPIIDs[id] = this;
+}
+
+UnAPIID.prototype = {
+	/**
+	 * Gets the item type for this item
+	 * @param {Function} callback Callback to be passed itemType when it is known
+	 */
+	"getItemType":function(callback) {
+		var me = this;
+		this.getItems(function(items) {
+			if(items.length === 0) {
+				callback(false);
+			} else if(items.length === 1) {
+				callback(items[0].itemType);
 			} else {
-				getItemType();
+				callback("multiple");
 			}
 		});
-	}
-}
-
-function getItemType() {
-	// if there are no items left to search, use the only item's type (if there is one) or give up
-	if(!unsearchedIds.length) {
-		if(foundIds.length) {
-			getOnlyItem();
-		} else {
-			Zotero.done(false);
-		}
-		return;
-	}
+	},
-	var id = unsearchedIds.shift();
-	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
-		var format = checkFormats(text);
-		if(format) {
-			// save data
-			foundIds.push(id);
-			foundFormat.push(format[0]);
-			foundFormatName.push(format[1]);
-
-			if(foundIds.length == 2) {
-				// this is our second; use multiple
-				Zotero.done("multiple");
+	/**
+	 * Gets items associated with this ID
+	 * @param {Function} callback Callback to be passed items when they have been retrieved
+	 */
+	"getItems":function(callback) {
+		if(this.items) {
+			callback(me.items);
+			return;
+		}
+
+		var me = this;
+		this.items = [];
+		this.isSupported(function(isSupported) {
+			if(!isSupported) {
+				callback([]);
 				return;
 			}
+
+			Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+me.id+"&format="+me.format.name, function(text) {
+				var translator = Zotero.loadTranslator("import");
+				translator.setTranslator(me.format.translatorID);
+				translator.setString(text);
+				translator.setHandler("itemDone", function(obj, item) {
+					// add item to array
+					me.items.push(item);
+				});
+				translator.setHandler("done", function(obj) {
+					// run callback on item array
+					callback(me.items);
+				});
+				translator.translate();
+			});
+		});
+	},
+
+	/**
+	 * Determines whether Zotero can handle this ID
+	 * @param {Function} callback Callback to be passed isSupported when it is known
+	 */
+	"isSupported":function(callback) {
+		if(this.hasOwnProperty("format")) {
+			callback(me.format.isSupported);
+			return;
 		}
-		// keep going
-		getItemType();
-	});
-}
-
-function checkFormats(text) {
-	text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
-	var xml = new XML(text);
-
-	var foundFormat = new Object();
-
-	// this is such an ugly, disgusting hack, and I hate how Mozilla decided to neuter an ECMA standard
-	for each(var format in xml.format) {
-		var name = format.@name.toString();
-		var lowerName = name.toLowerCase();
+		var me = this;
-		if(format.@namespace_uri == "http://www.loc.gov/mods/v3" || lowerName == "mods" || format.@docs == "http://www.loc.gov/standards/mods/") {
-			if(!foundFormat["mods"] || lowerName.indexOf("full") != -1) {
-				foundFormat["mods"] = escape(name);
-			}
-		} else if(lowerName.match(/^marc\b/)) {
-			if(!foundFormat["marc"] || lowerName.indexOf("utf8") != -1) {
-				foundFormat["marc"] = escape(name);
-			}
-		} else if(lowerName == "rdf_dc") {
-			foundFormat["rdf"] = escape(name);
-		} else if(format.@docs.text() == "http://www.refman.com/support/risformat_intro.asp" || lowerName.match(/^ris\b/)) {
-			if(!foundFormat["ris"] || lowerName.indexOf("utf8") != -1) {
-				foundFormat["ris"] = escape(name);
+		getDefaultFormat(function() {
+			// first try default format, since this won't require >1 HTTP request
+			if(defaultFormat.isSupported) {
+				me.format = defaultFormat;
+				callback(true);
+			} else {
+				// if no supported default format, try format for this item
+				Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+me.id, function(text) {
+					me.format = UnAPIFormat(text);
+					callback(!!me.format.isSupported);
+				});
 			}
-		} else if(lowerName == "bibtex") {
-			foundFormat["bibtex"] = escape(name);
-		} else if(lowerName == "endnote") {
-			foundFormat["endnote"] = escape(name);
-		}
+		});
+	}
+}
+
+/**
+ * This and the x: prefix in the XPath are to work around an issue with pages
+ * served as application/xhtml+xml
+ *
+ * https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
+ */
+function nsResolver() {
+	return 'http://www.w3.org/1999/xhtml';
+}
+
+/**
+ * Extracts UnAPIIDs from a document
+ * @param {document} A document object from which to extract unAPIIds
+ * @return {UnAPIID[]} The unAPI ID objects extracted from the document
+ */
+function getUnAPIIDs(doc) {
+	// look for a resolver
+	var newUnAPIResolver = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if(!newUnAPIResolver) return [];
+	newUnAPIResolver = newUnAPIResolver.getAttribute("href");
+	if(unAPIResolver !== newUnAPIResolver) {
+		// if unAPI resolver has changed, clear
+		defaultFormat = false;
+		unAPIResolver = newUnAPIResolver;
+		unAPIIDs = [];
 	}
-	// loop through again, this time respecting preferences
-	for each(var format in RECOGNIZABLE_FORMATS) {
-		if(foundFormat[format]) return [format, foundFormat[format]];
+	// look for abbrs
+	var abbrs = doc.evaluate('//x:abbr[contains(@class, " unapi-id") or contains(@class, "unapi-id ") or @class="unapi-id"][@title]',
+		doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var abbr;
+	var ids = [];
+	while(abbr = abbrs.iterateNext()) {
+		var id = abbr.getAttribute("title");
+		ids.push(unAPIIDs[id] ? unAPIIDs[id] : new UnAPIID(id));
 	}
-	return false;
+	return ids;
 }
-function getOnlyItem() {
-	// retrieve the only item
-	retrieveItem(foundIds[0], foundFormat[0], foundFormatName[0], function(obj, item) {
-		foundItems.push(item);
-		Zotero.done(item.itemType);
-	});
-}
-function retrieveItem(id, format, formatName, callback) {
-	// retrieve URL
-	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id+"&format="+formatName, function(text) {
-		var translator = Zotero.loadTranslator("import");
-		translator.setTranslator(FORMAT_GUIDS[format]);
-		translator.setString(text);
-		translator.setHandler("itemDone", callback);
-		translator.translate();
+/**
+ * Retrieves the list of formats available for all items accessible through this unAPI resolver
+ * @param {Function} callback A callback to be passed the format when it is available
+ */
+function getDefaultFormat(callback) {
+	if(defaultFormat) {
+		callback(defaultFormat);
+	} else {
+		Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
+			// determine format of this item
+			defaultFormat = new UnAPIFormat(text);
+			callback(defaultFormat);
+		});
+	}
+}
+/**
+ * Determines itemType for detection
+ */
+function determineDetectItemType(ids, supportedId) {
+	var id = ids.shift();
+	id.isSupported(function(isSupported) {
+		if(isSupported && supportedId !== undefined) {
+			// If there are multiple items with valid itemTypes, use "multiple"
+			Zotero.done("multiple");
+		} else if(ids.length) {
+			// If IDs remain to be handled, handle the next one
+			determineDetectItemType(ids, (isSupported ? id : supportedId));
+		} else {
+			// If all IDs have been handled, get foundItemType for only supported ID
+			supportedId.getItemType(Zotero.done);
+		}
 	});
 }
 /**
- * Get formats and names for all usable ids; when done, get all items
+ * Get all items
+ * @param {UnAPIID[]} ids List of UnAPI IDs
+ * @param {Function} callback Function to pass item array to when items have been retrieved
+ * @param {Zotero.Item[]} items Item array; used for recursive calls
  **/
-function getAllIds() {
-	if(!unsearchedIds.length) {
-		// once all ids have been gotten, get all items
-		getAllItems();
-		return;
-	}
-
-	var id = unsearchedIds.shift();
-	Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
-		var format = checkFormats(text);
-		if(format) {
-			// save data
-			foundIds.push(id);
-			foundFormat.push(format[0]);
-			foundFormatName.push(format[1]);
-		}
+function getAllItems(ids, callback, items) {
+	var id = ids.shift();
+	id.getItems(function(retrievedItems) {
+		var collectedItems = (items ? items.concat(retrievedItems) : retrievedItems);
-		// keep going
-		getAllIds();
+		if(ids.length) {
+			getAllItems(ids, callback, collectedItems);
+		} else {
+			callback(collectedItems);
+		}
 	});
 }
-/**
- * Get all items; when done, show selectItems or scrape
- **/
-function getAllItems() {
-	if(foundItems.length == foundIds.length) {
-		if(foundItems.length == 1) {
-			// Set the item Repository to the domain
-			foundItems[0].repository = domain[1];
-			// if only one item, send complete()
-			foundItems[0].complete();
-		} else if(foundItems.length > 0) {
-			// if multiple items, show selectItems
+function detectWeb(doc, url) {
+	// get unAPI IDs
+	var ids = getUnAPIIDs(doc);
+	if(!ids.length) return false;
+
+	// now we need to see if the server actually gives us bibliographic metadata, and determine the
+	// type
+	Zotero.wait();
+
+	if(!ids.length === 1) {
+		// Only one item, so we will just get its item type
+		ids[0].getItemType(Zotero.done);
+	} else {
+		// Several items. We will need to call determineDetectItemType
+		determineDetectItemType(ids);
+	}
+}
+
+function doWeb(doc, url) {
+	var ids = getUnAPIIDs(doc);
+
+	Zotero.wait();
+
+	getAllItems(ids, function(items) {
+		// get the domain we're scraping, so we can use it for libraryCatalog
+		domain = doc.location.href.match(/https?:\/\/([^/]+)/);
+
+		if(items.length == 1) {
+			// If only one item, just complete it
+			items[0].libraryCatalog = domain[1];
+			items[0].complete();
+		} else if(items.length > 0) {
+			// If multiple items, extract their titles
 			var itemTitles = [];
-			for(var i in foundItems) {
-				itemTitles[i] = foundItems[i].title;
+			for(var i in items) {
+				itemTitles[i] = items[i].title;
 			}
+			// Show item selection dialog
 			var chosenItems = Zotero.selectItems(itemTitles);
 			if(!chosenItems) Zotero.done(true);
+			// Complete items
 			for(var i in chosenItems) {
-				// Set the item Repository to the domain
-				foundItems[i].repository = domain[1];
-				foundItems[i].complete();
+				items[i].libraryCatalog = domain[1];
+				items[i].complete();
 			}
 		}
-		// reset items
-		foundItems = [];
-
 		Zotero.done();
 		return;
-	}
-
-	var id = foundIds[foundItems.length];
-	// foundFormat can be either a string or an array
-	if(typeof(foundFormat) == "string") {
-		var format = foundFormat;
-		var formatName = foundFormatName;
-	} else {
-		var format = foundFormat[foundItems.length];
-		var formatName = foundFormatName[foundItems.length];
-	}
-
-	// get item
-	retrieveItem(id, format, formatName, function(obj, item) {
-		foundItems.push(item);
-		getAllItems();
 	});
 }
-
-function doWeb() {
-	Zotero.wait();
-
-	// retrieve data for all ids
-	getAllIds();
-}