commit 3e30f21ede623d1a100d5fe67139cad84807dadc
parent 5951768ada405cbf8cd9bd09dd4aeeac2aa509c7
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 19 Apr 2011 20:00:26 +0000
Rewritten unAPI translator
Diffstat:
| M | translators/unAPI.js | | | 462 | ++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- |
1 file changed, 268 insertions(+), 194 deletions(-)
diff --git a/translators/unAPI.js b/translators/unAPI.js
@@ -4,257 +4,331 @@
"label":"unAPI",
"creator":"Simon Kornblith",
"target":null,
- "minVersion":"1.0.0b4.r1",
+ "minVersion":"2.1",
"maxVersion":"",
"priority":200,
"inRepository":true,
"detectXPath":"//link[@rel='unapi-server']",
- "lastUpdated":"2010-09-23 04:19:20"
+ "lastUpdated":"2011-04-19 19:40:07"
}
-var RECOGNIZABLE_FORMATS = ["mods", "marc", "endnote", "ris", "bibtex", "rdf"];
+// Format keys Zotero can import, in descending order of preference: UnAPIFormat selects the
+// first key in this list that the server offers.
+var RECOGNIZABLE_FORMATS = ["rdf_zotero", "rdf_bibliontology", "mods", "marc", "unimarc", "ris",
+ "refer", "bibtex", "rdf_dc"];
+// Maps each format key to the ID of the import translator used to read it.
+// NOTE(review): "unimarc" shares its ID with "marc", and "rdf_dc" shares its ID with
+// "rdf_zotero"; presumably one translator handles both variants -- confirm.
var FORMAT_GUIDS = {
+ "rdf_zotero":"5e3ad958-ac79-463d-812b-a86a9235c28f",
+ "rdf_bibliontology":"14763d25-8ba0-45df-8f52-b8d1108e7ac9",
"mods":"0e2235e7-babf-413c-9acf-f27cce5f059c",
"marc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
- "endnote":"881f60f2-0802-411a-9228-ce5f47b64c7d",
+ "unimarc":"a6ee60df-1ddc-4aae-bb25-45e0537be973",
"ris":"32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7",
+ "refer":"881f60f2-0802-411a-9228-ce5f47b64c7d",
"bibtex":"9cb70025-a888-4a29-a210-93ec52da40d4",
- "rdf":"5e3ad958-ac79-463d-812b-a86a9235c28f"
+ "rdf_dc":"5e3ad958-ac79-463d-812b-a86a9235c28f"
};
-var unAPIResolver, unsearchedIds, foundIds, foundItems, foundFormat, foundFormatName, domain;
+// href of the page's <link rel="unapi-server"> element; false until getUnAPIIDs has found one
+var unAPIResolver = false;
+// defaultFormat: UnAPIFormat built from the resolver's response to a request without an ID
+// (reset to false whenever the resolver changes)
+// unAPIIDs: cache of UnAPIID objects, keyed by ID string (reset whenever the resolver changes)
+var defaultFormat, unAPIIDs;
-function detectWeb(doc, url) {
- // initialize variables
- unsearchedIds = [];
- foundIds = [];
- foundItems = [];
- foundFormat = [];
- foundFormatName = [];
-
- // Set the domain we're scraping
- domain = doc.location.href.match(/https?:\/\/([^/]+)/);
-
- // This and the x: prefix in the XPath are to work around an issue with pages
- // served as application/xhtml+xml
- //
- // https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
- function nsResolver() {
- return 'http://www.w3.org/1999/xhtml';
- }
+/**
+ * A class to describe an unAPI format description
+ * @property isSupported {Boolean} Whether Zotero supports a format contained in this description
+ * @property name {String} The unAPI format name, used to retrieve item descriptions
+ * (only set when isSupported is true)
+ * @property translatorID {String} The ID of the translator used to read this format
+ * (only set when isSupported is true)
+ *
+ * @constructor
+ * @param {String} aXML unAPI format description XML
+ */
+UnAPIFormat = function(aXML) {
+ var parser = new DOMParser();
+ // The DOCTYPE and XML declaration are stripped before the text is handed to the parser
+ var doc = parser.parseFromString(aXML.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""), "text/xml");
- // look for a resolver
- unAPIResolver = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
- if(!unAPIResolver) return false;
- unAPIResolver = unAPIResolver.getAttribute("href");
+ // Maps a RECOGNIZABLE_FORMATS key to the format name the server uses for it
+ var foundFormat = new Object();
- // look for abbrs
- var abbrs = doc.getElementsByTagName("abbr");
- for each(var abbr in abbrs) {
- if(abbr.getAttribute && abbr.getAttribute("class") &&
- abbr.getAttribute("class").split(" ").indexOf("unapi-id") != -1 && abbr.getAttribute("title")) {
- // found an abbr
- unsearchedIds.push(escape(abbr.getAttribute("title")));
+ // Loop through to determine format name
+ var nodes = doc.documentElement.getElementsByTagName("format");
+ var nNodes = nodes.length;
+ var node, name, lowerName, format;
+ for(var i=0; i<nNodes; i++) {
+ node = nodes[i];
+ name = node.getAttribute("name");
+ // A <format> element without a name cannot be requested from the server, so skip it
+ // (this also avoids calling toLowerCase() on a missing attribute)
+ if(!name) continue;
+ lowerName = name.toLowerCase();
+ format = false;
+
+ // Look for formats we can recognize
+ if(["rdf_zotero", "rdf_bibliontology", "bibtex", "rdf_dc"].indexOf(lowerName) != -1) {
+ format = lowerName;
+ } else if(lowerName === "endnote" || lowerName === "refer") {
+ // RECOGNIZABLE_FORMATS and FORMAT_GUIDS have no "endnote" key; the translator that
+ // reads this format is registered under "refer"
+ format = "refer";
+ } else if(lowerName === "mods"
+ || node.getAttribute("namespace_uri") === "http://www.loc.gov/mods/v3"
+ || node.getAttribute("docs") === "http://www.loc.gov/standards/mods/"
+ || node.getAttribute("type") === "application/mods+xml") {
+ format = "mods";
+ } else if(lowerName.match(/^marc\b/)
+ || node.getAttribute("type") === "application/marc") {
+ format = "marc";
+ } else if(lowerName.match(/^unimarc\b/)
+ || node.getAttribute("type") === "application/unimarc") {
+ format = "unimarc";
+ } else if(node.getAttribute("docs") == "http://www.refman.com/support/risformat_intro.asp"
+ || lowerName.match(/^ris\b/)) {
+ format = "ris";
}
+
+ // If several <format> elements map to the same key, the last one seen wins
+ if(format) foundFormat[format] = name;
}
- if(!unsearchedIds.length) return false;
- // now we need to see if the server actually gives us bibliographic metadata.
- Zotero.wait();
+ // Loop through again to determine optimal supported format
+ for(var i=0; i<RECOGNIZABLE_FORMATS.length; i++) {
+ if(foundFormat[RECOGNIZABLE_FORMATS[i]]) {
+ this.isSupported = true;
+ this.name = foundFormat[RECOGNIZABLE_FORMATS[i]];
+ this.translatorID = FORMAT_GUIDS[RECOGNIZABLE_FORMATS[i]];
+ return;
+ }
+ }
- if(unsearchedIds.length == 1) {
- // if there's only one abbr tag, we should go ahead and retrieve types for it
- getItemType();
- } else {
- // if there's more than one, we should first see if the resolver gives metadata for all of them
- Zotero.Utilities.HTTP.doGet(unAPIResolver, function(text) {
- var format = checkFormats(text);
- if(format) {
- // move unsearchedIds to foundIds
- foundIds = unsearchedIds;
- unsearchedIds = [];
- // save format and formatName
- foundFormat = format[0];
- foundFormatName = format[1];
-
- Zotero.done("multiple");
+ // None of the offered formats is one we recognize
+ this.isSupported = false;
+}
+
+/**
+ * A class encapsulating an UnAPI ID. Constructing one also stores it in the unAPIIDs cache
+ * under its ID string.
+ * @property id {String} The ID string passed to the constructor
+ * @property format {UnAPIFormat} Information regarding the format (set by isSupported)
+ * @property items {Zotero.Item[]} Items corresponding to this ID (set by getItems)
+ *
+ * @constructor
+ * @param {String} id The ID contained in an abbr tag
+ */
+UnAPIID = function(id) {
+ this.id = id;
+ // Cache this object so that getUnAPIIDs can reuse it for the same ID
+ // NOTE(review): assumes unAPIIDs has already been initialized by getUnAPIIDs -- confirm
+ unAPIIDs[id] = this;
+}
+
+UnAPIID.prototype = {
+ /**
+ * Gets the item type for this item by retrieving its items via getItems
+ * @param {Function} callback Callback to be passed itemType when it is known: false if no
+ * items were retrieved, the item's itemType if exactly one was retrieved, or "multiple"
+ * if more than one was retrieved
+ */
+ "getItemType":function(callback) {
+ // NOTE(review): "me" is assigned but never read in this method
+ var me = this;
+ this.getItems(function(items) {
+ if(items.length === 0) {
+ // nothing retrievable for this ID
+ callback(false);
+ } else if(items.length === 1) {
+ callback(items[0].itemType);
} else {
- getItemType();
+ callback("multiple");
}
});
- }
-}
-
-function getItemType() {
- // if there are no items left to search, use the only item's type (if there is one) or give up
- if(!unsearchedIds.length) {
- if(foundIds.length) {
- getOnlyItem();
- } else {
- Zotero.done(false);
- }
- return;
- }
+ },
- var id = unsearchedIds.shift();
- Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
- var format = checkFormats(text);
- if(format) {
- // save data
- foundIds.push(id);
- foundFormat.push(format[0]);
- foundFormatName.push(format[1]);
-
- if(foundIds.length == 2) {
- // this is our second; use multiple
- Zotero.done("multiple");
+ /**
+ * Gets items associated with this ID. The item array is stored on this.items, and later
+ * calls are answered from that stored array without another request.
+ * @param {Function} callback Callback to be passed items when they have been retrieved
+ * (an empty array if no supported format is available for this ID)
+ */
+ "getItems":function(callback) {
+ // Captured before any use so that both the cached path and the async callbacks can rely on it
+ var me = this;
+ if(this.items) {
+ callback(this.items);
+ return;
+ }
+
+ this.items = [];
+ this.isSupported(function(isSupported) {
+ if(!isSupported) {
+ callback([]);
return;
}
+
+ // The ID and format name are arbitrary strings, so they must be encoded before being
+ // placed in the query string
+ var itemURL = unAPIResolver+"?id="+encodeURIComponent(me.id)
+ +"&format="+encodeURIComponent(me.format.name);
+ Zotero.Utilities.HTTP.doGet(itemURL, function(text) {
+ var translator = Zotero.loadTranslator("import");
+ translator.setTranslator(me.format.translatorID);
+ translator.setString(text);
+ translator.setHandler("itemDone", function(obj, item) {
+ // add item to array
+ me.items.push(item);
+ });
+ translator.setHandler("done", function(obj) {
+ // run callback on item array
+ callback(me.items);
+ });
+ translator.translate();
+ });
+ });
+ },
+
+ /**
+ * Determines whether Zotero can handle this ID. The format that was found is stored on
+ * this.format, and later calls are answered from it without another request.
+ * @param {Function} callback Callback to be passed isSupported when it is known
+ */
+ "isSupported":function(callback) {
+ if(this.hasOwnProperty("format")) {
+ // "me" is not assigned yet at this point, so read the cached format through "this"
+ callback(this.format.isSupported);
+ return;
}
- // keep going
- getItemType();
- });
-}
-
-function checkFormats(text) {
- text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
- var xml = new XML(text);
-
- var foundFormat = new Object();
-
- // this is such an ugly, disgusting hack, and I hate how Mozilla decided to neuter an ECMA standard
- for each(var format in xml.format) {
- var name = format.@name.toString();
- var lowerName = name.toLowerCase();
+ var me = this;
- if(format.@namespace_uri == "http://www.loc.gov/mods/v3" || lowerName == "mods" || format.@docs == "http://www.loc.gov/standards/mods/") {
- if(!foundFormat["mods"] || lowerName.indexOf("full") != -1) {
- foundFormat["mods"] = escape(name);
- }
- } else if(lowerName.match(/^marc\b/)) {
- if(!foundFormat["marc"] || lowerName.indexOf("utf8") != -1) {
- foundFormat["marc"] = escape(name);
- }
- } else if(lowerName == "rdf_dc") {
- foundFormat["rdf"] = escape(name);
- } else if(format.@docs.text() == "http://www.refman.com/support/risformat_intro.asp" || lowerName.match(/^ris\b/)) {
- if(!foundFormat["ris"] || lowerName.indexOf("utf8") != -1) {
- foundFormat["ris"] = escape(name);
+ getDefaultFormat(function() {
+ // first try default format, since this won't require >1 HTTP request
+ if(defaultFormat.isSupported) {
+ me.format = defaultFormat;
+ callback(true);
+ } else {
+ // if no supported default format, try format for this item
+ // (the ID is an arbitrary string, so it is encoded for use in the query string)
+ Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+encodeURIComponent(me.id), function(text) {
+ // UnAPIFormat is a constructor; without "new" it returns undefined
+ me.format = new UnAPIFormat(text);
+ callback(!!me.format.isSupported);
+ });
}
- } else if(lowerName == "bibtex") {
- foundFormat["bibtex"] = escape(name);
- } else if(lowerName == "endnote") {
- foundFormat["endnote"] = escape(name);
- }
+ });
+ }
+}
+
+/**
+ * Namespace resolver handed to doc.evaluate() together with the x: prefix used in the XPath
+ * expressions. Both exist to work around an issue with pages served as application/xhtml+xml.
+ *
+ * https://developer.mozilla.org/en/Introduction_to_using_XPath_in_JavaScript#Implementing_a_default_namespace_for_XML_documents
+ *
+ * @return {String} The XHTML namespace URI, regardless of the prefix being resolved
+ */
+function nsResolver() {
+ var xhtmlNamespace = 'http://www.w3.org/1999/xhtml';
+ return xhtmlNamespace;
+}
+
+/**
+ * Extracts UnAPIIDs from a document. Also records the document's unAPI resolver and, when the
+ * resolver differs from the previous one, discards the cached default format and cached IDs.
+ * @param {document} doc A document object from which to extract unAPIIds
+ * @return {UnAPIID[]} The unAPI ID objects extracted from the document (empty if the document
+ * has no unAPI resolver)
+ */
+function getUnAPIIDs(doc) {
+ // look for a resolver
+ var newUnAPIResolver = doc.evaluate('//x:link[@rel="unapi-server"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if(!newUnAPIResolver) return [];
+ newUnAPIResolver = newUnAPIResolver.getAttribute("href");
+ if(unAPIResolver !== newUnAPIResolver) {
+ // if unAPI resolver has changed, clear
+ defaultFormat = false;
+ unAPIResolver = newUnAPIResolver;
+ // a plain object rather than an Array: the keys are arbitrary ID strings, and an ID such
+ // as "length" would collide with Array's own properties
+ unAPIIDs = {};
}
- // loop through again, this time respecting preferences
- for each(var format in RECOGNIZABLE_FORMATS) {
- if(foundFormat[format]) return [format, foundFormat[format]];
+ // look for abbrs whose class list contains the exact token "unapi-id" (padding the
+ // normalized class string with spaces avoids matching e.g. "unapi-idx")
+ var abbrs = doc.evaluate('//x:abbr[contains(concat(" ", normalize-space(@class), " "), " unapi-id ")][@title]',
+ doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var abbr;
+ var ids = [];
+ while(abbr = abbrs.iterateNext()) {
+ var id = abbr.getAttribute("title");
+ // own-property check so that IDs such as "toString" are not mistaken for cached entries
+ var isCached = Object.prototype.hasOwnProperty.call(unAPIIDs, id);
+ ids.push(isCached ? unAPIIDs[id] : new UnAPIID(id));
}
- return false;
+ return ids;
}
-function getOnlyItem() {
- // retrieve the only item
- retrieveItem(foundIds[0], foundFormat[0], foundFormatName[0], function(obj, item) {
- foundItems.push(item);
- Zotero.done(item.itemType);
- });
-}
-function retrieveItem(id, format, formatName, callback) {
- // retrieve URL
- Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id+"&format="+formatName, function(text) {
- var translator = Zotero.loadTranslator("import");
- translator.setTranslator(FORMAT_GUIDS[format]);
- translator.setString(text);
- translator.setHandler("itemDone", callback);
- translator.translate();
+/**
+ * Retrieves the list of formats available for all items accessible through this unAPI resolver.
+ * The result is kept in defaultFormat, so the resolver is only queried while no default format
+ * is stored.
+ * @param {Function} callback A callback to be passed the format when it is available
+ */
+function getDefaultFormat(callback) {
+ if(!defaultFormat) {
+ // Not known yet: ask the resolver (no ID in the request) and remember the answer
+ Zotero.Utilities.HTTP.doGet(unAPIResolver, function(responseText) {
+ defaultFormat = new UnAPIFormat(responseText);
+ callback(defaultFormat);
+ });
+ return;
+ }
+
+ callback(defaultFormat);
+}
+/**
+ * Determines itemType for detection when the page has several unAPI IDs, and reports it
+ * through Zotero.done: "multiple" as soon as a second supported ID is found, the item type of
+ * the only supported ID if there is exactly one, or false if no ID is supported.
+ * @param {UnAPIID[]} ids IDs still to be checked; must not be empty. IDs are removed from this
+ * array as they are processed.
+ * @param {UnAPIID} [supportedId] The supported ID found so far; used for recursive calls
+ */
+function determineDetectItemType(ids, supportedId) {
+ var id = ids.shift();
+ id.isSupported(function(isSupported) {
+ if(isSupported && supportedId !== undefined) {
+ // If there are multiple items with valid itemTypes, use "multiple"
+ Zotero.done("multiple");
+ return;
+ }
+
+ // The current ID counts too: it may be the only supported one even when it is the last
+ var onlySupportedId = (isSupported ? id : supportedId);
+ if(ids.length) {
+ // If IDs remain to be handled, handle the next one
+ determineDetectItemType(ids, onlySupportedId);
+ } else if(onlySupportedId !== undefined) {
+ // If all IDs have been handled, get foundItemType for only supported ID
+ onlySupportedId.getItemType(function(itemType) {
+ Zotero.done(itemType);
+ });
+ } else {
+ // No ID is supported, so there is nothing to detect
+ Zotero.done(false);
+ }
});
}
/**
- * Get formats and names for all usable ids; when done, get all items
+ * Get all items, one ID at a time
+ * @param {UnAPIID[]} ids List of UnAPI IDs; IDs are removed from this array as they are
+ * processed
+ * @param {Function} callback Function to pass item array to when items have been retrieved
+ * @param {Zotero.Item[]} items Item array; used for recursive calls
**/
-function getAllIds() {
- if(!unsearchedIds.length) {
- // once all ids have been gotten, get all items
- getAllItems();
- return;
- }
-
- var id = unsearchedIds.shift();
- Zotero.Utilities.HTTP.doGet(unAPIResolver+"?id="+id, function(text) {
- var format = checkFormats(text);
- if(format) {
- // save data
- foundIds.push(id);
- foundFormat.push(format[0]);
- foundFormatName.push(format[1]);
- }
+function getAllItems(ids, callback, items) {
+ // With no IDs to process there is nothing to shift; report what has been collected so far
+ if(!ids.length) {
+ callback(items ? items : []);
+ return;
+ }
+
+ var id = ids.shift();
+ id.getItems(function(retrievedItems) {
+ var collectedItems = (items ? items.concat(retrievedItems) : retrievedItems);
- // keep going
- getAllIds();
+ if(ids.length) {
+ getAllItems(ids, callback, collectedItems);
+ } else {
+ callback(collectedItems);
+ }
});
}
-/**
- * Get all items; when done, show selectItems or scrape
- **/
-function getAllItems() {
- if(foundItems.length == foundIds.length) {
- if(foundItems.length == 1) {
- // Set the item Repository to the domain
- foundItems[0].repository = domain[1];
- // if only one item, send complete()
- foundItems[0].complete();
- } else if(foundItems.length > 0) {
- // if multiple items, show selectItems
+/**
+ * Detects whether the document exposes items through unAPI. Returns false immediately if the
+ * document has no unAPI IDs; otherwise the result is reported asynchronously via Zotero.done.
+ * @param {document} doc The document to inspect
+ * @param {String} url Not used by this function
+ */
+function detectWeb(doc, url) {
+ // get unAPI IDs
+ var ids = getUnAPIIDs(doc);
+ if(!ids.length) return false;
+
+ // now we need to see if the server actually gives us bibliographic metadata, and determine the
+ // type
+ Zotero.wait();
+
+ if(ids.length === 1) {
+ // Only one item, so we will just get its item type
+ ids[0].getItemType(function(itemType) {
+ Zotero.done(itemType);
+ });
+ } else {
+ // Several items. We will need to call determineDetectItemType
+ determineDetectItemType(ids);
+ }
+}
+
+/**
+ * Retrieves the items behind every unAPI ID in the document. A single item is completed
+ * directly; for several items the user is asked to choose via Zotero.selectItems and only the
+ * chosen ones are completed.
+ * @param {document} doc The document to scrape
+ * @param {String} url Not used by this function
+ */
+function doWeb(doc, url) {
+ var ids = getUnAPIIDs(doc);
+ // Nothing to retrieve
+ if(!ids.length) return;
+
+ Zotero.wait();
+
+ getAllItems(ids, function(items) {
+ // get the domain we're scraping, so we can use it for libraryCatalog
+ // (declared with var so that it does not leak into the global scope)
+ var domain = doc.location.href.match(/https?:\/\/([^/]+)/);
+ // the match is null for URLs that are not http(s); libraryCatalog is left unset then
+ var libraryCatalog = (domain ? domain[1] : undefined);
+
+ if(items.length == 1) {
+ // If only one item, just complete it
+ if(libraryCatalog) items[0].libraryCatalog = libraryCatalog;
+ items[0].complete();
+ } else if(items.length > 0) {
+ // If multiple items, extract their titles
var itemTitles = [];
- for(var i in foundItems) {
- itemTitles[i] = foundItems[i].title;
+ for(var i=0; i<items.length; i++) {
+ itemTitles[i] = items[i].title;
}
+ // Show item selection dialog
var chosenItems = Zotero.selectItems(itemTitles);
- if(!chosenItems) Zotero.done(true);
+ if(!chosenItems) {
+ // No selection was returned: finish here rather than falling through to a second
+ // Zotero.done() call below
+ Zotero.done(true);
+ return;
+ }
+ // Complete items
for(var i in chosenItems) {
- // Set the item Repository to the domain
- foundItems[i].repository = domain[1];
- foundItems[i].complete();
+ if(libraryCatalog) items[i].libraryCatalog = libraryCatalog;
+ items[i].complete();
}
}
- // reset items
- foundItems = [];
-
Zotero.done();
return;
- }
-
- var id = foundIds[foundItems.length];
- // foundFormat can be either a string or an array
- if(typeof(foundFormat) == "string") {
- var format = foundFormat;
- var formatName = foundFormatName;
- } else {
- var format = foundFormat[foundItems.length];
- var formatName = foundFormatName[foundItems.length];
- }
-
- // get item
- retrieveItem(id, format, formatName, function(obj, item) {
- foundItems.push(item);
- getAllItems();
});
}
-
-function doWeb() {
- Zotero.wait();
-
- // retrieve data for all ids
- getAllIds();
-}