commit 93652a137cf25526c2ed5480f9adfda409128897
parent c42991a5bf204def797fd19d68046b3aebcf6a87
Author: Simon Kornblith <simon@simonster.com>
Date: Fri, 2 Jun 2006 23:53:42 +0000
Fix issues with asynchronous scraping and XMLHttpRequest
Diffstat:
3 files changed, 106 insertions(+), 62 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -211,31 +211,36 @@ Scholar.Ingester.Interface._deleteDocument = function(browser) {
* Callback to be executed when scraping is complete
*/
Scholar.Ingester.Interface._finishScraping = function(documentObject) {
- Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
-
- var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID"));
+ if(documentObject.item) {
+ Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
- var titleLabel = Scholar.getString("itemFields.title") + ":"
- Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, this.item.getField("title"));
- var creators = documentObject.item.numCreators();
- if(creators) {
- for(var i=0; i<creators; i++) {
- var creator = documentObject.item.getCreator(i);
- var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
- var data = creator.firstName + ' ' + creator.lastName;
- Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
- }
- }
-
- for(i in fields) {
- var data = documentObject.item.getField(fields[i]);
- if(data) {
- var name = Scholar.ItemFields.getName(fields[i]);
- if(name != "source") {
- var label = Scholar.getString("itemFields."+ name) + ":";
+ var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID"));
+
+ var titleLabel = Scholar.getString("itemFields.title") + ":"
+ Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, this.item.getField("title"));
+ var creators = documentObject.item.numCreators();
+ if(creators) {
+ for(var i=0; i<creators; i++) {
+ var creator = documentObject.item.getCreator(i);
+ var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
+ var data = creator.firstName + ' ' + creator.lastName;
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
}
}
+
+ for(i in fields) {
+ var data = documentObject.item.getField(fields[i]);
+ if(data) {
+ var name = Scholar.ItemFields.getName(fields[i]);
+ if(name != "source") {
+ var label = Scholar.getString("itemFields."+ name) + ":";
+ Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
+ }
+ }
+ }
+ } else {
+ Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
+ Scholar.Ingester.Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
}
setTimeout(function() { Scholar.Ingester.Interface.scrapeProgress.fade() }, 2000);
@@ -311,6 +316,19 @@ Scholar.Ingester.Interface.Progress.prototype.addResult = function(label, data)
this.table.appendChild(tr);
}
+/*
+ * Appends a row to this.table holding a single cell of free text that spans
+ * two columns
+ *
+ * description - text to show; it is inserted as a text node
+ */
+Scholar.Ingester.Interface.Progress.prototype.addDescription = function(description) {
+ var tr = this.document.createElement("tr");
+ var descriptionTd = this.document.createElement("td");
+ descriptionTd.style.fontSize = '10px';
+ // colspan is an element attribute, not a CSS property, so it has to be set
+ // on the cell itself; assigning style.colspan does not make the cell span
+ descriptionTd.setAttribute("colspan", "2");
+
+ descriptionTd.appendChild(this.document.createTextNode(description));
+ tr.appendChild(descriptionTd);
+ this.table.appendChild(tr);
+}
+
+
Scholar.Ingester.Interface.Progress.prototype.fade = function() {
// Icky, icky hack to keep objects
var me = this;
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -200,46 +200,55 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.
-// It looks like these are simple front-ends for XMLHttpRequest. They're a
-// component of the Piggy Bank API, so they're implemented here.
-Scholar.Ingester.Utilities.HTTPUtilities = function() {}
+// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
+// accessed outside the sandbox, and even if it could, it wouldn't let scripts
+// access across domains, so everything's replicated here.
+Scholar.Ingester.HTTPUtilities = function(contentWindow) {
+ // Window whose XMLHttpRequest constructor doGet/doPost/doOptions instantiate
+ this.window = contentWindow;
+}
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
- var xmlhttp = new XMLHttpRequest();
-
- xmlhttp.open('GET', url, true);
- xmlhttp.overrideMimeType("text/xml");
- xmlhttp.onreadystatechange = function() {
- Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(null);
+/*
+ * Starts an asynchronous GET request for url
+ *
+ * url      - address to request
+ * onStatus - handed unchanged to stateChange()
+ * onDone   - handed unchanged to stateChange()
+ *
+ * Returns nothing; stateChange() is called on every readyState change of the
+ * request and decides what to do with the two callbacks
+ */
+Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
+ // XMLHttpRequest comes from the window passed to the constructor
+ var xmlhttp = new this.window.XMLHttpRequest();
+
+ // Third argument true = asynchronous request
+ xmlhttp.open('GET', url, true);
+ // Have the response treated as text/xml whatever MIME type the server sends
+ xmlhttp.overrideMimeType("text/xml");
+
+ // Reference kept for the closure below, which calls stateChange() on this object
+ var me = this;
+ xmlhttp.onreadystatechange = function() {
+ me.stateChange(xmlhttp, onStatus, onDone);
+ };
+ // GET carries no request body
+ xmlhttp.send(null);
}
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
- var xmlhttp = new XMLHttpRequest();
-
- xmlhttp.open('POST', url, true);
- xmlhttp.overrideMimeType("text/xml");
- xmlhttp.onreadystatechange = function() {
- Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(body);
+/*
+ * Starts an asynchronous POST request to url with body as the request body
+ *
+ * url      - address to request
+ * body     - passed as-is to XMLHttpRequest.send()
+ * onStatus - handed unchanged to stateChange()
+ * onDone   - handed unchanged to stateChange()
+ *
+ * Returns nothing; stateChange() is called on every readyState change of the
+ * request and decides what to do with the two callbacks
+ */
+Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
+ // XMLHttpRequest comes from the window passed to the constructor
+ var xmlhttp = new this.window.XMLHttpRequest();
+
+ // Third argument true = asynchronous request
+ xmlhttp.open('POST', url, true);
+ // Have the response treated as text/xml whatever MIME type the server sends
+ xmlhttp.overrideMimeType("text/xml");
+
+ // Reference kept for the closure below, which calls stateChange() on this object
+ var me = this;
+ xmlhttp.onreadystatechange = function() {
+ me.stateChange(xmlhttp, onStatus, onDone);
+ };
+ // NOTE(review): no Content-Type request header is set here - confirm callers do not need one
+ xmlhttp.send(body);
}
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
- var xmlhttp = new XMLHttpRequest();
-
- xmlhttp.open('OPTIONS', url, true);
- xmlhttp.overrideMimeType("text/xml");
- xmlhttp.onreadystatechange = function() {
- Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(body);
+/*
+ * Starts an asynchronous OPTIONS request to url with body as the request body
+ *
+ * url      - address to request
+ * body     - passed as-is to XMLHttpRequest.send()
+ * onStatus - handed unchanged to stateChange()
+ * onDone   - handed unchanged to stateChange()
+ *
+ * Returns nothing; stateChange() is called on every readyState change of the
+ * request and decides what to do with the two callbacks
+ */
+Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
+ // XMLHttpRequest comes from the window passed to the constructor
+ var xmlhttp = new this.window.XMLHttpRequest();
+
+ // Third argument true = asynchronous request
+ xmlhttp.open('OPTIONS', url, true);
+ // Have the response treated as text/xml whatever MIME type the server sends
+ xmlhttp.overrideMimeType("text/xml");
+
+ // Reference kept for the closure below, which calls stateChange() on this object
+ var me = this;
+ xmlhttp.onreadystatechange = function() {
+ me.stateChange(xmlhttp, onStatus, onDone);
+ };
+ xmlhttp.send(body);
}
// Possible point of failure; for some reason, this used to be a separate
// class, so make sure it works
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
+Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
switch (xmlhttp.readyState) {
// Request not yet made
@@ -307,6 +316,8 @@ Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhtt
*/
Scholar.Ingester.Document = function(browserWindow){
this.browser = browserWindow;
+ this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
+ .getService(Ci.nsIAppShellService);
this.scraper = null
this.model = new Scholar.Ingester.Model();
this._generateSandbox();
@@ -379,10 +390,11 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
} catch(e) {
throw e+' in scraperJavaScript for '+this.scraper.label;
+ this._scrapePageComplete();
}
// If synchronous, call _scrapePageComplete();
- if(!scraperSandbox._waitForCompletion) {
+ if(!this._waitForCompletion) {
this._scrapePageComplete();
}
}
@@ -411,7 +423,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
* function before returning
*/
-/*`
+/*
* Called when scraping (synchronous or asynchronous) is complete
*/
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
@@ -420,17 +432,23 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
this._scrapeCallback(this);
}
}
-
+
+/*
+ * Generates a sandbox for scraping/scraper detection
+ */
Scholar.Ingester.Document.prototype._generateSandbox = function() {
+ // The sandbox is constructed from the URL of the document loaded in this.browser
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
+ // The properties below are attached to the sandbox object, presumably for
+ // use by the scraper code evaluated in it - verify against scrapePage
this.sandbox.browser = this.browser;
this.sandbox.doc = this.sandbox.browser.contentDocument;
this.sandbox.utilities = new Scholar.Ingester.Utilities;
+ // HTTP helpers are built around the app shell service's hidden DOM window,
+ // so their XMLHttpRequest constructor comes from that window
+ this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
+ // NOTE(review): this.window is not assigned in the visible part of the
+ // constructor (which sets this.browser) - confirm it is defined elsewhere,
+ // otherwise sandbox.window ends up undefined
+ this.sandbox.window = this.window;
this.sandbox.model = this.model;
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
- this.sandbox.wait = function(){ this._waitForCompletion = true; };
- this.sandbox.done = function(){ this._scrapePageComplete(); };
+ // wait() and done() go through "me" so that they act on this Document:
+ // scrapePage tests this._waitForCompletion, and _scrapePageComplete is a
+ // Document method
+ var me = this;
+ this.sandbox.wait = function(){ me._waitForCompletion = true; };
+ this.sandbox.done = function(){ me._scrapePageComplete(); };
}
/*
@@ -453,9 +471,15 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);
}
if(this.model.data[uri][prefixDC + 'year']) {
- data.date = this.model.data[uri][prefixDC + 'year'].substring(
- this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,
- this.model.data[uri][prefixDC + 'year'].length);
+ if(this.model.data[uri][prefixDC + 'year'].length == 4) {
+ newItem.setField("year", this.model.data[uri][prefixDC + 'year']);
+ } else {
+ try {
+ newItem.setField(this.model.data[uri][prefixDC + 'year'].substring(
+ this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,
+ this.model.data[uri][prefixDC + 'year'].length));
+ } catch(e) {}
+ }
}
if(this.model.data[uri][prefixDC + 'edition']) {
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);
diff --git a/chrome/chromeFiles/locale/en-US/scholar/scholar.properties b/chrome/chromeFiles/locale/en-US/scholar/scholar.properties
@@ -24,4 +24,6 @@ creatorTypes.contributor = Contributor
creatorTypes.editor = Editor
ingester.scraping = Scraping Page...
-ingester.scrapeComplete = Scraping Complete
-\ No newline at end of file
+ingester.scrapeComplete = Scraping Complete
+ingester.scrapeError = Could Not Scrape
+ingester.scrapeErrorDescription = An error occurred while scraping this page. Please try again. If this error persists, contact the scraper author.
+\ No newline at end of file