commit 51108446e3517a7a63e80e4f5c2e2c244333fef6
parent 009a4ad5201eb25368a5f6452f73b7bd6206eaed
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 15 Aug 2006 19:46:42 +0000
closes #187, make berkeley's library work
closes #186, stop translators from hanging
When a document loads inside a frameset, we now check whether we can scrape each individual frame.
All functions involving tabs have been vastly simplified because, in the process of figuring this out, I discovered Firefox 2's new tab events (TabClose and TabSelect).
If a translator throws an exception inside a callback passed to loadDocument(), doGet(), doPost(), or processDocuments(), a translate error message will appear, and the translator will not hang.
Diffstat:
4 files changed, 253 insertions(+), 161 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -23,7 +23,6 @@ var Scholar_Ingester_Interface = function() {}
* loading
*/
Scholar_Ingester_Interface.init = function() {
- Scholar_Ingester_Interface.browsers = new Array();
Scholar_Ingester_Interface.browserData = new Object();
Scholar_Ingester_Interface._scrapePopupShowing = false;
Scholar.Ingester.ProxyMonitor.init();
@@ -42,8 +41,10 @@ Scholar_Ingester_Interface.chromeLoad = function() {
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
// these give us TabClose and TabSelect, for updating when tabs are closed/switched
- Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
- Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
+ Scholar_Ingester_Interface.tabBrowser.addEventListener("TabClose",
+ Scholar_Ingester_Interface.tabClose, false);
+ Scholar_Ingester_Interface.tabBrowser.addEventListener("TabSelect",
+ Scholar_Ingester_Interface.tabSelect, false);
// this is for pageshow, for updating the status of the book icon
Scholar_Ingester_Interface.appContent.addEventListener("pageshow",
Scholar_Ingester_Interface.contentLoad, true);
@@ -53,8 +54,7 @@ Scholar_Ingester_Interface.chromeLoad = function() {
* When chrome unloads, delete our document objects and remove our listeners
*/
Scholar_Ingester_Interface.chromeUnload = function() {
- delete Scholar_Ingester_Interface.browserData, Scholar_Ingester_Interface.browsers;
- this.tabBrowser.removeProgressListener(this);
+ delete Scholar_Ingester_Interface.browserData;
}
/*
@@ -77,7 +77,7 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
}
var translate = new Scholar.Translate("web");
- translate.setBrowser(browser);
+ translate.setDocument(data.document);
// use first translator available
translate.setTranslator(data.translators[0]);
translate.setHandler("select", Scholar_Ingester_Interface._selectItems);
@@ -90,86 +90,69 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
/*
* An event handler called when a new document is loaded. Creates a new document
* object, and updates the status of the capture icon
-
*/
Scholar_Ingester_Interface.contentLoad = function(event) {
- if (event.originalTarget instanceof HTMLDocument) {
- // Stolen off the Mozilla extension developer's website, a routine to
- // determine the root document loaded from a frameset
- if (event.originalTarget.defaultView.frameElement) {
- var doc = event.originalTarget;
- while (doc.defaultView.frameElement) {
- doc=doc.defaultView.frameElement.ownerDocument;
- }
- // Frame within a tab was loaded. doc is the root document of the frameset
- } else {
- var doc = event.originalTarget;
- // Page was loaded. doc is the document that loaded.
+ if(event.originalTarget instanceof HTMLDocument) {
+ var doc = event.originalTarget;
+ var rootDoc = doc;
+
+ // get the appropriate root document to check which browser we're on
+ Scholar.debug("getting root document");
+ while(rootDoc.defaultView.frameElement) {
+ rootDoc = rootDoc.defaultView.frameElement.ownerDocument;
}
// Figure out what browser this contentDocument is associated with
var browser;
+ Scholar.debug("getting browser");
for(var i=0; i<Scholar_Ingester_Interface.tabBrowser.browsers.length; i++) {
- if(doc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) {
+ if(rootDoc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) {
browser = Scholar_Ingester_Interface.tabBrowser.browsers[i];
break;
}
}
if(!browser) {
- Scholar.debug("Could not find browser!");
return;
}
+ Scholar.debug("getting data");
// get data object
var data = Scholar_Ingester_Interface._getData(browser);
+
+ // if there's already a scrapable page in the browser window, and it's
+ // still there, return
+ if(data.translators && data.translators.length && data.document.location) {
+ return;
+ }
+
+ Scholar.debug("translating");
// get translators
var translate = new Scholar.Translate("web");
- translate.setBrowser(browser);
+ translate.setDocument(doc);
data.translators = translate.getTranslators();
// update status
Scholar_Ingester_Interface._updateStatus(data);
+ // add document
+ if(data.translators && data.translators.length) {
+ data.document = doc;
+ }
}
}
/*
- * Dummy event handlers for all the events we don't care about
+ * called when a tab is closed
*/
-Scholar_Ingester_Interface.Listener = function() {}
-Scholar_Ingester_Interface.Listener.onStatusChange = function() {}
-Scholar_Ingester_Interface.Listener.onSecurityChange = function() {}
-Scholar_Ingester_Interface.Listener.onProgressChange = function() {}
-Scholar_Ingester_Interface.Listener.onStateChange = function() {}
+Scholar_Ingester_Interface.tabClose = function(event) {
+ // discard the data kept for the browser linked to the tab being closed
+ Scholar_Ingester_Interface._deleteData(event.target.linkedBrowser);
+}
/*
- * onLocationChange is called when tabs are switched. Use it to retrieve the
- * appropriate status indicator for the current tab, and to free useless objects
+ * called when a tab is switched
*/
-Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
- var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
-
- // Remove document object of any browser that no longer exists
- for (var i = 0; i < Scholar_Ingester_Interface.browsers.length; i++) {
- var browser = Scholar_Ingester_Interface.browsers[i];
- var exists = false;
-
- for (var j = 0; j < browsers.length; j++) {
- if (browser == browsers[j]) {
- exists = true;
- break;
- }
- }
-
- if (!exists) {
- Scholar_Ingester_Interface.browsers.splice(i,1);
-
- // To execute if document object does not exist
- Scholar_Ingester_Interface._deleteDocument(browser);
- }
- }
-
+Scholar_Ingester_Interface.tabSelect = function(event) {
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
Scholar_Ingester_Interface._updateStatus(data);
-
// Make sure scrape progress is gone
Scholar_Ingester_Interface.Progress.kill();
}
diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js
@@ -29,8 +29,8 @@
* PUBLIC PROPERTIES:
*
* type - the text type of translator (set by constructor, should be read only)
- * browser - the browser object to be used for web scraping (read-only; set
- * with setBrowser)
+ * document - the document object to be used for web scraping (read-only; set
+ * with setDocument)
* translator - the translator currently in use (read-only; set with
* setTranslator)
* location - the location of the target (read-only; set with setLocation)
@@ -115,9 +115,9 @@ Scholar.Translate = function(type, saveItem) {
/*
* sets the document to be used for web translation; also sets the location
*/
-Scholar.Translate.prototype.setBrowser = function(browser) {
- this.browser = browser;
- this.setLocation(browser.contentDocument.location.href);
+Scholar.Translate.prototype.setDocument = function(doc) {
+ this.document = doc;
+ this.setLocation(doc.location.href);
}
/*
@@ -428,7 +428,7 @@ Scholar.Translate.prototype._generateSandbox = function() {
var sandboxURL = "";
if(this.type == "web") {
// use real URL, not proxied version, to create sandbox
- sandboxURL = this.browser.contentDocument.location.href;
+ sandboxURL = this.document.location.href;
} else {
// generate sandbox for search by extracting domain from translator
// target, if one exists
@@ -446,8 +446,8 @@ Scholar.Translate.prototype._generateSandbox = function() {
this._sandbox.Scholar = new Object();
// add ingester utilities
- this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this.locationIsProxied);
- this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this.locationIsProxied);
+ this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this);
+ this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this);
// set up selectItems handler
this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) };
@@ -584,7 +584,7 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension
try {
if(this.type == "web") {
- returnValue = this._sandbox.detectWeb(this.browser.contentDocument, this.location);
+ returnValue = this._sandbox.detectWeb(this.document, this.location);
} else if(this.type == "search") {
returnValue = this._sandbox.detectSearch(this.search);
} else if(this.type == "import") {
@@ -954,7 +954,7 @@ Scholar.Translate.prototype._runHandler = function(type, argument) {
*/
Scholar.Translate.prototype._web = function() {
try {
- this._sandbox.doWeb(this.browser.contentDocument, this.location);
+ this._sandbox.doWeb(this.document, this.location);
} catch(e) {
Scholar.debug(e+' in executing code for '+this.translator[0].label);
return false;
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -164,8 +164,8 @@ Scholar.Utilities.prototype.itemTypeExists = function(type) {
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
// classes relating to data extraction specifically from HTML documents.
-Scholar.Utilities.Ingester = function(proxiedURL) {
- this.proxiedURL = proxiedURL;
+Scholar.Utilities.Ingester = function(translate, proxiedURL) {
+ this.translate = translate;
}
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
@@ -252,43 +252,62 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
- if(this.proxiedURL) {
- url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
- }
- Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
+ this.processDocuments([ url ], succeeded, null, failed);
}
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
- if(this.proxiedURL) {
+ if(this.translate.locationIsProxied) {
for(i in urls) {
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
}
}
+
+ // unless the translator has supplied its own error handler, handle errors by
+ // logging them and ending the translation via _translationComplete(false)
+ if(!exception) {
+ var translate = this.translate;
+ exception = function(e) {
+ Scholar.debug("an error occurred in code called by processDocuments: "+e);
+ translate._translationComplete(false);
+ }
+ }
+
Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception);
}
-Scholar.Utilities.Ingester.HTTP = function(proxiedURL) {
- this.proxiedURL = proxiedURL;
+Scholar.Utilities.Ingester.HTTP = function(translate) {
+ this.translate = translate;
}
Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) {
- if(this.proxiedURL) {
+ if(this.translate.locationIsProxied) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
}
- Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
+
+ var translate = this.translate;
+ Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) {
+ try {
+ onDone(xmlhttp.responseText, xmlhttp);
+ } catch(e) {
+ Scholar.debug("an error occurred in code called by doGet: "+e);
+ translate._translationComplete(false);
+ }
+ })
}
Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) {
- if(this.proxiedURL) {
+ if(this.translate.locationIsProxied) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
}
- Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
-}
-
-Scholar.Utilities.Ingester.HTTP.prototype.doOptions = function(url, onDone) {
- if(this.proxiedURL) {
- url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
- }
- Scholar.Utilities.HTTP.doOptions(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
+
+ var translate = this.translate;
+ Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) {
+ try {
+ onDone(xmlhttp.responseText, xmlhttp);
+ } catch(e) {
+ Scholar.debug("an error occurred in code called by doPost: "+e);
+ translate._translationComplete(false);
+ }
+ })
}
// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
@@ -310,7 +329,7 @@ Scholar.Utilities.HTTP = new function() {
* doGet can be called as:
* Scholar.Utilities.HTTP.doGet(url, onDone)
**/
- function doGet(url, onDone) {
+ function doGet(url, onDone, onError) {
Scholar.debug("HTTP GET "+url);
if (this.browserIsOffline()){
return false;
@@ -429,17 +448,14 @@ Scholar.Utilities.HTTP = new function() {
// Download complete
case 4:
- try {
- if (onDone){
- onDone(xmlhttp);
- }
- }
- catch (e){
- Scholar.debug(e, 2);
+ if(onDone){
+ onDone(xmlhttp);
}
break;
}
}
+
+
}
// Downloads and processes documents with processor()
@@ -455,63 +471,71 @@ Scholar.Utilities.HTTP = new function() {
Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, done, exception, saveBrowser) {
var hiddenBrowser = Scholar.Browser.createHiddenBrowser();
var prevUrl, url;
+
+ if (urls.length == 0) {
+ if(firstDoc) {
+ processor(firstDoc, done);
+ } else {
+ done();
+ }
+ return;
+ }
+ var urlIndex = -1;
- try {
- if (urls.length == 0) {
- if(firstDoc) {
- processor(firstDoc, done);
- } else {
- done();
- }
- return;
+ var removeListeners = function() {
+ hiddenBrowser.removeEventListener("load", onLoad, true);
+ if(!saveBrowser) {
+ Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
}
-
- var urlIndex = -1;
- var doLoad = function() {
- urlIndex++;
- if (urlIndex < urls.length) {
- url = urls[urlIndex];
- try {
- Scholar.debug("loading "+url);
- hiddenBrowser.loadURI(url);
- } catch (e) {
- Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
+ }
+ var doLoad = function() {
+ urlIndex++;
+ if (urlIndex < urls.length) {
+ url = urls[urlIndex];
+ try {
+ Scholar.debug("loading "+url);
+ hiddenBrowser.loadURI(url);
+ } catch (e) {
+ removeListeners();
+ if(exception) {
exception(e);
+ return;
+ } else {
+ throw(e);
}
- } else {
- hiddenBrowser.removeEventListener("load", onLoad, true);
- if(!saveBrowser) {
- Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
- }
- done();
}
- };
- var onLoad = function() {
- Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
- if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
- prevUrl = hiddenBrowser.contentDocument.location.href;
- try {
- processor(hiddenBrowser.contentDocument);
- } catch (e) {
- Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
+ } else {
+ removeListeners();
+ done();
+ }
+ };
+ var onLoad = function() {
+ Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
+ if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
+ prevUrl = hiddenBrowser.contentDocument.location.href;
+ try {
+ processor(hiddenBrowser.contentDocument);
+ } catch (e) {
+ removeListeners();
+ if(exception) {
exception(e);
+ return;
+ } else {
+ throw(e);
}
- doLoad();
- }
- };
- var init = function() {
- hiddenBrowser.addEventListener("load", onLoad, true);
-
- if (firstDoc) {
- processor(firstDoc, doLoad);
- } else {
- doLoad();
}
+ doLoad();
}
+ };
+ var init = function() {
+ hiddenBrowser.addEventListener("load", onLoad, true);
- init();
- } catch (e) {
- Scholar.debug("processDocuments: " + e);
- exception(e);
+ if (firstDoc) {
+ processor(firstDoc, doLoad);
+ } else {
+ doLoad();
+ }
}
+
+ init();
}
\ No newline at end of file
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,7 +1,7 @@
--- 48
+-- 49
-- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
'function detectWeb(doc, url) {
@@ -112,7 +112,7 @@ function doWeb(doc, url) {
}
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
- function() { Scholar.done(); }, function() {});
+ function() { Scholar.done(); }, null);
Scholar.wait();
} else {
@@ -646,7 +646,7 @@ function doWeb(doc, url) {
}
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
- function() { Scholar.done(); }, function() {});
+ function() { Scholar.done(); }, null);
Scholar.wait();
} else {
@@ -763,7 +763,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
newItem.complete();
Scholar.done();
- }, function() {});
+ }, null);
} else { // Search results page
// Require link to match this
var tagRegexp = new RegExp();
@@ -952,7 +952,7 @@ function doWeb(doc, url) {
}
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
- function() { Scholar.done() }, function() {});
+ function() { Scholar.done() }, null);
Scholar.wait();
}
@@ -1127,7 +1127,7 @@ function doWeb(doc, url) {
}
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
- function() { Scholar.done(); }, function() {});
+ function() { Scholar.done(); }, null);
Scholar.wait();
} else {
@@ -1136,7 +1136,7 @@ function doWeb(doc, url) {
if(m && (m[1] == "1" || m[1] == "2")) {
scrape(doc);
} else if(m) {
- Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, function() {});
+ Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, null);
Scholar.wait();
}
}
@@ -1366,7 +1366,7 @@ function doWeb(doc, url) {
}
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
- function() { Scholar.done(); }, function() {});
+ function() { Scholar.done(); }, null);
Scholar.wait();
}
@@ -1457,7 +1457,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
newItem.source = uri;
record.translate(newItem);
newItem.complete();
- }, function() { Scholar.done(); }, function() {});
+ }, function() { Scholar.done(); }, null);
Scholar.wait();
}');
@@ -1544,7 +1544,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
newItem.source = uri;
record.translate(newItem);
newItem.complete();
- }, function() { Scholar.done() }, function() {});
+ }, function() { Scholar.done() }, null);
Scholar.wait();
}');
@@ -1647,7 +1647,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
newItem.source = uri;
record.translate(newItem);
newItem.complete();
- }, function(){ Scholar.done(); }, function() {});
+ }, function(){ Scholar.done(); }, null);
Scholar.wait();
}');
@@ -1721,8 +1721,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006
Scholar.wait();
}');
-
-REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
+REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|GeacFETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
'function detectWeb(doc, url) {
if(doc.location.href.indexOf("/GeacQUERY") > 0) {
return "multiple";
@@ -1804,7 +1803,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
newItem.source = uri;
record.translate(newItem);
newItem.complete();
- }, function() { Scholar.done(); }, function() {});
+ }, function() { Scholar.done(); }, null);
Scholar.wait();
}');
@@ -2037,7 +2036,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
newItem.source = uri;
record.translate(newItem);
newItem.complete();
- }, function() {Scholar.done(); }, function() {});
+ }, function() {Scholar.done(); }, null);
Scholar.wait();
}');
@@ -2568,7 +2567,79 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
}
}
newItem.complete();
- }, function() { Scholar.done(); }, function() {});
+ }, function() { Scholar.done(); }, null);
+
+ Scholar.wait();
+}');
+
+REPLACE INTO "translators" VALUES ('9c335444-a562-4f88-b291-607e8f46a9bb', '2006-08-15 15:42:00', 4, 'Berkeley Library', 'Simon Kornblith', '^http://[^/]*berkeley.edu[^/]*/WebZ/(?:html/results.html|FETCH)\?.*sessionid=',
+'function detectWeb(doc, url) {
+ var resultsRegexp = /\/WebZ\/html\/results.html/i
+ if(resultsRegexp.test(url)) {
+ return "multiple";
+ } else {
+ return "book";
+ }
+}',
+'function reformURL(url) {
+ return url.replace(/fmtclass=[^&]*/, "")+":fmtclass=marc";
+}
+
+function doWeb(doc, url) {
+ var resultsRegexp = /\/WebZ\/html\/results.html/i
+
+ if(resultsRegexp.test(url)) {
+ var items = Scholar.Utilities.getItemArray(doc, doc, "/WebZ/FETCH", "^[0-9]*$");
+ items = Scholar.selectItems(items);
+
+ if(!items) {
+ return true;
+ }
+
+ var urls = new Array();
+ for(var i in items) {
+ urls.push(reformURL(i));
+ }
+ } else {
+ var urls = [reformURL(url)];
+ }
+
+ var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
+
+ Scholar.Utilities.processDocuments(urls, function(newDoc) {
+ Scholar.Utilities.debug(newDoc.getElementsByTagName("body")[0].innerHTML);
+ var uri = newDoc.location.href;
+
+ var namespace = newDoc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ var elmts = newDoc.evaluate(''//table/tbody/tr[@valign="top"]'',
+ newDoc, nsResolver, XPathResult.ANY_TYPE, null);
+
+ var record = new marc.MARC_Record();
+ while(elmt = elmts.iterateNext()) {
+ var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
+ var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
+ var ind1 = value[4];
+ var ind2 = value[6];
+ value = Scholar.Utilities.cleanString(value.substr(6)).
+ replace(/\$([a-z0-9]) /g, record.subfield_delimiter+"$1");
+ if(value[0] != record.subfield_delimiter) {
+ value = record.subfield_delimiter+"a"+value;
+ }
+
+ if(field != 0) {
+ record.add_field(field, ind1, ind2, value);
+ }
+ }
+
+ var newItem = new Scholar.Item();
+ newItem.source = uri;
+ record.translate(newItem);
+ newItem.complete();
+ }, function() { Scholar.done(); }, null);
Scholar.wait();
}');
@@ -2644,9 +2715,7 @@ function doSearch(item) {
Scholar.done(false);
});
}
- }, function() {
- error();
- });
+ }, null);
Scholar.wait();
}');
@@ -4604,7 +4673,16 @@ MARC_Record.prototype.get_field_subfields = function(tag) { // returns a two-dim
}
MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
- if (tag.length != 3) { return false; }
+ /*if(tag.length != 3) {
+ return false;
+ }*/
+
+ if (tag.length < 3) {
+ tag = Scholar.Utilities.lpad(tag.toString(),"0",3);
+ } else if(tag.length > 3) {
+ return false;
+ }
+
var F = new this.MARC_field(this,tag,ind1,ind2,value);
// adds pointer to list of fields
this.variable_fields[this.variable_fields.length] = F;
@@ -4666,9 +4744,11 @@ MARC_Record.prototype._clean = function(value) {
}
MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) {
+
if(!part) {
part = ''a'';
}
+
var field = this.get_field_subfields(fieldNo);
Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part);
if(field) {
@@ -4685,6 +4765,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam
}
}
if(value) {
+ this._gotField = true;
value = this._clean(value);
if(execMe) {
@@ -4807,6 +4888,10 @@ MARC_Record.prototype.translate = function(item) {
// Set type
item.itemType = "book";
+
+ if(!this._gotField) {
+ throw("tried to create a marc record with no fields!");
+ }
}
MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides