commit 639a006efb027446995259c361a6c4ac3a522be7
parent 864fa537b56c92aba9484cb95e203385f0b4bb12
Author: Simon Kornblith <simon@simonster.com>
Date: Fri, 2 Jun 2006 03:19:12 +0000
XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?)
Diffstat:
5 files changed, 533 insertions(+), 482 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -2,19 +2,23 @@
// Utilities based on code taken from Greasemonkey
// This code is licensed according to the GPL
-// Prepare the browser and collector instrumentation caches --------------------
Scholar.Ingester.Interface = function() {}
+/*
+ * Initialize some variables and prepare event listeners for when chrome is done
+ * loading
+ */
Scholar.Ingester.Interface.init = function() {
Scholar.Ingester.Interface.browsers = new Array();
+ Scholar.Ingester.Interface.browserDocuments = new Object();
window.addEventListener("load", Scholar.Ingester.Interface.chromeLoad, false);
window.addEventListener("unload", Scholar.Ingester.Interface.chromeUnload, false);
-
- Scholar.Ingester.Interface.browsers = new Array();
- Scholar.Ingester.Interface.browserDocuments = new Object();
}
+/*
+ * When chrome loads, register our event handlers with the appropriate interfaces
+ */
Scholar.Ingester.Interface.chromeLoad = function() {
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
@@ -28,10 +32,23 @@ Scholar.Ingester.Interface.chromeLoad = function() {
Scholar.Ingester.Interface.contentLoad, true);
}
+/*
+ * When chrome unloads, delete our document objects and remove our listeners
+ */
Scholar.Ingester.Interface.chromeUnload = function() {
- this.tabBrowser.removeProgressListener(this);
+ delete Scholar.Ingester.Interface.browserDocuments;
+ this.tabBrowser.removeProgressListener(this);
}
+
+/*
+ * Gets a document object given a browser window object
+ *
+ * NOTE: Browser objects are associated with document objects via keys generated
+ * from the time the browser object is opened. I'm not sure if this is the
+ * appropriate mechanism for handling this, but it's what PiggyBank used and it
+ * appears to work.
+ */
Scholar.Ingester.Interface.getDocument = function(browser) {
try {
var key = browser.getAttribute("scholar-key");
@@ -42,6 +59,10 @@ Scholar.Ingester.Interface.getDocument = function(browser) {
return false;
}
+/*
+ * Creates a new document object for a browser window object, attempts to
+ * retrieve appropriate scraper
+ */
Scholar.Ingester.Interface.setDocument = function(browser) {
try {
var key = browser.getAttribute("scholar-key");
@@ -55,6 +76,9 @@ Scholar.Ingester.Interface.setDocument = function(browser) {
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
}
+/*
+ * Deletes the document object associated with a given browser window object
+ */
Scholar.Ingester.Interface.deleteDocument = function(browser) {
try {
var key = browser.getAttribute("scholar-key");
@@ -66,6 +90,9 @@ Scholar.Ingester.Interface.deleteDocument = function(browser) {
return false;
}
+/*
+ * Scrapes a page (called when the capture icon is clicked)
+ */
Scholar.Ingester.Interface.scrapeThisPage = function() {
var document = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
if(document.scraper) {
@@ -73,6 +100,10 @@ Scholar.Ingester.Interface.scrapeThisPage = function() {
}
}
+/*
+ * Updates the status of the capture icon to reflect the scrapability or lack
+ * thereof of the current page
+ */
Scholar.Ingester.Interface.updateStatus = function(browser) {
var document = Scholar.Ingester.Interface.getDocument(browser);
if(document && document.scraper) {
@@ -82,16 +113,33 @@ Scholar.Ingester.Interface.updateStatus = function(browser) {
}
}
+/*
+ * An event handler called when a new document is loaded. Creates a new document
+ * object, and updates the status of the capture icon
+ *
+ * FIXME: This approach, again borrowed from PiggyBank, does not work properly
+ * when the newly loaded page is not the currently selected page. For example,
+ * if a tab is loaded behind the currently selected page, the ingester will not
+ * create a new object for it.
+ */
Scholar.Ingester.Interface.contentLoad = function() {
Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
}
+/*
+ * Dummy event handlers for all the events we don't care about
+ */
Scholar.Ingester.Interface.Listener = function() {}
Scholar.Ingester.Interface.Listener.onStatusChange = function() {}
Scholar.Ingester.Interface.Listener.onSecurityChange = function() {}
Scholar.Ingester.Interface.Listener.onProgressChange = function() {}
Scholar.Ingester.Interface.Listener.onStateChange = function() {}
+
+/*
+ * onLocationChange is called when tabs are switched. Use it to retrieve the
+ * appropriate status indicator for the current tab, and to free useless objects
+ */
Scholar.Ingester.Interface.Listener.onLocationChange = function() {
var browsers = Scholar.Ingester.Interface.tabBrowser.browsers;
@@ -114,7 +162,7 @@ Scholar.Ingester.Interface.Listener.onLocationChange = function() {
Scholar.Ingester.Interface.deleteDocument(browser);
}
}
-
+
/*// Add a collector to any new browser
for (var i = 0; i < browsers.length; i++) {
var browser = browsers[i];
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul
@@ -6,8 +6,7 @@
<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
<script src="../include.js"/>
-
- <script src="ingester.js"/>
+
<script src="browser.js"/>
<script type="application/x-javascript">
diff --git a/chrome/chromeFiles/content/scholar/ingester/ingester.js b/chrome/chromeFiles/content/scholar/ingester/ingester.js
@@ -1,473 +0,0 @@
-// Firefox Scholar Ingester
-// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
-// This code is licensed according to the GPL
-
-Scholar.Ingester = new function() {}
-
-/////////////////////////////////////////////////////////////////
-//
-// Scholar.Ingester.Model
-//
-/////////////////////////////////////////////////////////////////
-
-// Scholar.Ingester.Model, an object representing an RDF data model with
-// methods to add to that model. In Piggy Bank, this was implemented in Java,
-// but seeing as we don't really want an enormous web server running with FS,
-// but we don't actually need that, so it's much simpler.
-//
-// The Java version of this class can be viewed at
-// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
-Scholar.Ingester.Model = function() {
- this.data = new Object();
-}
-
-// Piggy Bank provides a fourth argument, one that determines if the third
-// argument is a literal or an RDF URI. Since our ontologies are
-// sufficiently restricted, we have no chance of confusing a literal and an
-// RDF URI and thus this is unnecessary.
-Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
- if(!this.data[uri]) this.data[uri] = new Object();
- this.data[uri][rdfUri] = literal;
- Scholar.debug(rdfUri+" for "+uri+" is "+literal);
-}
-
-// Additional functions added for compatibility purposes only
-// No idea if any scraper actually uses these, but just in case, they're
-// implemented so as not to throw an exception
-Scholar.Ingester.Model.prototype.addTag = function() {}
-Scholar.Ingester.Model.prototype.getRepository = function() {}
-Scholar.Ingester.Model.prototype.detachRepository = function() {}
-
-/////////////////////////////////////////////////////////////////
-//
-// Scholar.Ingester.Utilities
-//
-/////////////////////////////////////////////////////////////////
-// Scholar.Ingester.Utilities class, a set of methods to assist in data
-// extraction. Most code here was stolen directly from the Piggy Bank project.
-Scholar.Ingester.Utilities = function() {}
-
-// Adapter for Piggy Bank function to print debug messages; log level is
-// fixed at 4 (could change this)
-Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
- Scholar.debug(msg, 4);
-}
-
-// Appears to trim a string, chopping of newlines/spacing
-Scholar.Ingester.Utilities.prototype.trimString = function(s) {
- var i = 0;
- var spaceChars = " \n\r\t" + String.fromCharCode(160) /* */;
- while (i < s.length) {
- var c = s.charAt(i);
- if (spaceChars.indexOf(c) < 0) {
- break;
- }
- i++;
- }
-
- s = s.substring(i);
-
- i = s.length;
- while (i > 0) {
- var c = s.charAt(i - 1);
- if (spaceChars.indexOf(c) < 0) {
- break;
- }
- i--;
- }
-
- return s.substring(0, i);
-}
-
-// Takes an XPath query and returns the results
-Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
- var elmts = [];
-
- var iterator = doc.evaluate(xpath, parentNode, nsResolver, XPathResult.ANY_TYPE,null);
- var elmt = iterator.iterateNext();
- var i = 0;
- while (elmt) {
- elmts[i++] = elmt;
- elmt = iterator.iterateNext();
- }
- return elmts;
-}
-
-// Loads a single document for a scraper, running succeeded() on success or
-// failed() on failure
-Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
- this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
-}
-
-// Downloads and processes documents with processor()
-// browser - a browser object
-// firstDoc - the first document to process with the processor (if null,
-// first document is processed without processor)
-// urls - an array of URLs to load
-// processor - a function to execute to process each document
-// done - a function to execute when all document processing is complete
-// exception - a function to execute if an exception occurs (exceptions are
-// also logged in the Firefox Scholar log)
-Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
- try {
- if (urls.length == 0) {
- if (firstDoc) {
- processor(firstDoc, done);
- } else {
- done();
- }
- return;
- }
-
- var urlIndex = -1;
- var doLoad = function() {
- urlIndex++;
- if (urlIndex < urls.length) {
- try {
- var url = urls[urlIndex];
- var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
- b.loadURI(url);
- } catch (e) {
- exception(e);
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
- }
- } else {
- window.setTimeout(done, 10);
- }
- };
- var onLoad = function() {
- try {
- var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
- processor(b.contentDocument, doLoad);
- } catch (e) {
- exception(e);
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
- }
- };
- var init = function() {
- var listener;
- listener.onStateChange = function(webProgress, request, stateFlags, status) {
- if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
- request.name == urls[urlIndex]) {
- try {
- Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
- } catch (e) {
- exception(e);
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
- }
- }
- };
-
- var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
- tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
-
- if (firstDoc) {
- processor(firstDoc, doLoad);
- } else {
- doLoad();
- }
- }
-
- w.addEventListener("load", init, false);
- } catch (e) {
- exception(e);
- PB_Debug.print("processDocuments: " + e);
- }
-}
-
-// Appears to look for links in a document containing a certain substring
-Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
- var urls = [];
- var addedURLs = [];
-
- var aElements = doc.evaluate("//a", doc, null, XPathResult.ANY_TYPE,null);
- var aElement = aElements.iterateNext();
- while (aElement) {
- var href = aElement.href;
- if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
- urls.unshift(href);
- addedURLs[href] = true;
- }
- aElement = aElements.iterateNext();
- }
- return urls;
-}
-
-// For now, we're going to skip the getLLsFromAddresses function (which gets
-// latitude and longitude pairs from a series of addresses, but requires the
-// big mess of Java code that is the Piggy Bank server) and the geoHelper
-// tools (which rely on getLLsFromAddresses) since these are probably not
-// essential components for Scholar and would take a great deal of effort to
-// implement. We can, however, always implement them later.
-
-// It looks like these are simple front-ends for XMLHttpRequest. They're a
-// component of the Piggy Bank API, so they're implemented here.
-Scholar.Ingester.Utilities.HTTPUtilities = function() {}
-
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
- var xmlhttp = new XMLHttpRequest();
-
- xmlhttp.open('GET', url, true);
- xmlhttp.overrideMimeType("text/xml");
- xmlhttp.onreadystatechange = function() {
- Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(null);
-}
-
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
- var xmlhttp = new XMLHttpRequest();
-
- xmlhttp.open('POST', url, true);
- xmlhttp.overrideMimeType("text/xml");
- xmlhttp.onreadystatechange = function() {
- Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(body);
-}
-
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
- var xmlhttp = new XMLHttpRequest();
-
- xmlhttp.open('OPTIONS', url, true);
- xmlhttp.overrideMimeType("text/xml");
- xmlhttp.onreadystatechange = function() {
- Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(body);
-}
-
-// Possible point of failure; for some reason, this used to be a separate
-// class, so make sure it works
-Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
- switch (xmlhttp.readyState) {
-
- // Request not yet made
- case 1:
- break;
-
- // Contact established with server but nothing downloaded yet
- case 2:
- try {
- // Check for HTTP status 200
- if (xmlhttp.status != 200) {
- if (onStatus) {
- onStatus(
- xmlhttp.status,
- xmlhttp.statusText,
- xmlhttp
- );
- xmlhttp.abort();
- }
- }
- } catch (e) {
- Scholar.debug(e, 2);
- }
- break;
-
- // Called multiple while downloading in progress
- case 3:
- break;
-
- // Download complete
- case 4:
- try {
- if (onDone) {
- onDone(xmlhttp.responseText, xmlhttp);
- }
- } catch (e) {
- Scholar.debug(e, 2);
- }
- break;
- }
-}
-//////////////////////////////////////////////////////////////////////////////
-//
-// Scholar.Ingester.Document
-//
-//////////////////////////////////////////////////////////////////////////////
-
-/* Public properties:
- * browser - browser window object of document
- * model - data model for semantic scrapers
- * scraper - best scraper to use to scrape page
- *
- * Private properties:
- * _sandbox - sandbox for code execution
- * _progressDialog - dialog showing scrape progress
- */
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Public Scholar.Ingester.Document methods
-//
-//////////////////////////////////////////////////////////////////////////////
-
-/*
- * Constructor for Document object
- */
-Scholar.Ingester.Document = function(browserWindow){
- this.browser = browserWindow;
- this.scraper = null
- this.model = new Scholar.Ingester.Model();
- this._generateSandbox();
-}
-
-/*
- * Retrieves the best scraper to scrape a given page
- */
-Scholar.Ingester.Document.prototype.retrieveScraper = function() {
- Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);
- var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';
- var scrapers = Scholar.DB.query(sql);
- for(var i=0; i<scrapers.length; i++) {
- var currentScraper = scrapers[i];
- if(this.canScrape(currentScraper)) {
- this.scraper = currentScraper;
- Scholar.debug("Found scraper "+this.scraper.label);
- return true;
- }
- }
- return false;
-}
-
-/*
- * Check to see if _scraper_ can scrape this document
- */
-Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
- var canScrape = false;
-
- // Test with regular expression
- // If this is slow, we could preload all scrapers and compile regular
- // expressions, so each check will be faster
- if(currentScraper.urlPattern) {
- var regularExpression = new RegExp(currentScraper.urlPattern, "i");
- if(regularExpression.test(this.browser.contentDocument.location.href)) {
- canScrape = true;
- }
- }
-
- // Test with JavaScript if available and didn't have a regular expression or
- // passed regular expression test
- if((!currentScraper.urlPattern || canScrape)
- && currentScraper.scraperDetectCode) {
- var scraperSandbox = this.sandbox;
- try {
- canScrape = this.evalInSandbox("(function(){\n" +
- currentScraper.scraperDetectCode +
- "\n})()", scraperSandbox);
- } catch(e) {
- throw e+' in scraper '+currentScraper.label;
- }
- }
- return canScrape;
-}
-
-/*
- * Populate model with semantic data regarding this page using _scraper_
- */
-Scholar.Ingester.Document.prototype.scrapePage = function() {
- Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
-
- var scraperSandbox = this.sandbox;
-
- this._progressDialog = openDialog("chrome://scholar/content/ingester/scrape-progress.xul",
- "_blank", "chrome,all,dialog=no", null, null, null);
-
- Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
-
- // If synchronous, call _scrapePageComplete();
- if(!scraperSandbox._waitForCompletion) {
- this._scrapePageComplete();
- }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Private Scholar.Ingester.Document methods
-//
-//////////////////////////////////////////////////////////////////////////////
-
-/*
- * Piggy Bank/FS offers four objects to JavaScript scrapers
- * browser - the object representing the open browser window containing the
- * document to be processes
- * doc - the DOM (basically just browser.contentDocument)
- * model - the object representing the RDF model of data to be returned
- * (see Scholar.Ingester.Model)
- * utilities - a set of utilities for making certain tasks easier
- * (see Scholar.Ingester.Utilities);
- *
- * Piggy Bank/FS also offers two functions to simplify asynchronous requests
- * (these will only be available for scraping, and not for scrape detection)
- * wait() - called on asynchronous requests so that Piggy Bank/FS will not
- * automatically return at the end of code execution
- * done() - when wait() is called, Piggy Bank/FS will wait for this
- * function before returning
- */
-
-/*
- * Called when scraping (synchronous or asynchronous) is complete
- */
-Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
- this._updateDatabase();
- this._progressDialog.close();
-}
-
-Scholar.Ingester.Document.prototype._generateSandbox = function() {
- this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
- this.sandbox.browser = this.browser;
- this.sandbox.doc = this.sandbox.browser.contentDocument;
- this.sandbox.utilities = new Scholar.Ingester.Utilities;
- this.sandbox.model = this.model;
- this.sandbox.XPathResult = XPathResult;
-
- this.sandbox.wait = function(){ this._waitForCompletion = true; };
- this.sandbox.done = function(){ this._scrapePageComplete(); };
-}
-
-/*
- * Add data ingested using RDF to database
- * (Ontologies are hard-coded until we have a real way of dealing with them)
- */
-Scholar.Ingester.Document.prototype._updateDatabase = function() {
- var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
- var prefixDC = 'http://purl.org/dc/elements/1.1/';
- var prefixDCMI = 'http://purl.org/dc/dcmitype/';
- var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
-
- for(var uri in this.model.data) {
- var newItem = Scholar.Items.getNewItemByType(1);
- newItem.setField("source", uri);
- if(this.model.data[uri][prefixDC + 'title']) {
- newItem.setField("title", this.model.data[uri][prefixDC + 'title']);
- }
- if(this.model.data[uri][prefixDC + 'publisher']) {
- newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);
- }
- if(this.model.data[uri][prefixDC + 'year']) {
- data.date = this.model.data[uri][prefixDC + 'year'].substring(
- this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,
- this.model.data[uri][prefixDC + 'year'].length);
- }
- if(this.model.data[uri][prefixDC + 'edition']) {
- newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);
- }
- if(this.model.data[uri][prefixDC + 'identifier']) {
- newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5));
- }
- if(this.model.data[uri][prefixDummy + 'pages']) {
- newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages']);
- }
- if(this.model.data[uri][prefixDC + 'creator']) {
- var creator = this.model.data[uri][prefixDC + 'creator'];
-
- var spaceIndex = creator.lastIndexOf(" ");
- var firstName = creator.substring(spaceIndex+1, creator.length);
- var lastName = creator.substring(0, spaceIndex);
-
- newItem.setCreator(0, firstName, lastName);
- }
- newItem.save();
- }
-}
-\ No newline at end of file
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -0,0 +1,473 @@
+// Firefox Scholar Ingester
+// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
+// This code is licensed according to the GPL
+
+Scholar.Ingester = new function() {}
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Model
+//
+/////////////////////////////////////////////////////////////////
+
+// Scholar.Ingester.Model, an object representing an RDF data model with
+// methods to add to that model. In Piggy Bank, this was implemented in Java,
+// but seeing as we don't really want an enormous web server running with FS,
+// it is implemented here in JavaScript instead, which is much simpler.
+//
+// The Java version of this class can be viewed at
+// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
+Scholar.Ingester.Model = function() {
+ this.data = new Object();
+}
+
+// Piggy Bank provides a fourth argument, one that determines if the third
+// argument is a literal or an RDF URI. Since our ontologies are
+// sufficiently restricted, we have no chance of confusing a literal and an
+// RDF URI and thus this is unnecessary.
+Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
+ if(!this.data[uri]) this.data[uri] = new Object();
+ this.data[uri][rdfUri] = literal;
+ Scholar.debug(rdfUri+" for "+uri+" is "+literal);
+}
+
+// Additional functions added for compatibility purposes only
+// No idea if any scraper actually uses these, but just in case, they're
+// implemented so as not to throw an exception
+Scholar.Ingester.Model.prototype.addTag = function() {}
+Scholar.Ingester.Model.prototype.getRepository = function() {}
+Scholar.Ingester.Model.prototype.detachRepository = function() {}
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Utilities
+//
+/////////////////////////////////////////////////////////////////
+// Scholar.Ingester.Utilities class, a set of methods to assist in data
+// extraction. Most code here was stolen directly from the Piggy Bank project.
+Scholar.Ingester.Utilities = function() {}
+
+// Adapter for Piggy Bank function to print debug messages; log level is
+// fixed at 4 (could change this)
+Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
+ Scholar.debug(msg, 4);
+}
+
+// Appears to trim a string, chopping off newlines/spacing
+Scholar.Ingester.Utilities.prototype.trimString = function(s) {
+ var i = 0;
+ var spaceChars = " \n\r\t" + String.fromCharCode(160) /* */;
+ while (i < s.length) {
+ var c = s.charAt(i);
+ if (spaceChars.indexOf(c) < 0) {
+ break;
+ }
+ i++;
+ }
+
+ s = s.substring(i);
+
+ i = s.length;
+ while (i > 0) {
+ var c = s.charAt(i - 1);
+ if (spaceChars.indexOf(c) < 0) {
+ break;
+ }
+ i--;
+ }
+
+ return s.substring(0, i);
+}
+
+// Takes an XPath query and returns the results
+Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
+ var elmts = [];
+
+ var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
+ var elmt = iterator.iterateNext();
+ var i = 0;
+ while (elmt) {
+ elmts[i++] = elmt;
+ elmt = iterator.iterateNext();
+ }
+ return elmts;
+}
+
+// Loads a single document for a scraper, running succeeded() on success or
+// failed() on failure
+Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
+ this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
+}
+
+// Downloads and processes documents with processor()
+// browser - a browser object
+// firstDoc - the first document to process with the processor (if null,
+// first document is processed without processor)
+// urls - an array of URLs to load
+// processor - a function to execute to process each document
+// done - a function to execute when all document processing is complete
+// exception - a function to execute if an exception occurs (exceptions are
+// also logged in the Firefox Scholar log)
+Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+ try {
+ if (urls.length == 0) {
+ if (firstDoc) {
+ processor(firstDoc, done);
+ } else {
+ done();
+ }
+ return;
+ }
+
+ var urlIndex = -1;
+ var doLoad = function() {
+ urlIndex++;
+ if (urlIndex < urls.length) {
+ try {
+ var url = urls[urlIndex];
+ var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
+ b.loadURI(url);
+ } catch (e) {
+ exception(e);
+ Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
+ }
+ } else {
+ window.setTimeout(done, 10);
+ }
+ };
+ var onLoad = function() {
+ try {
+ var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
+ processor(b.contentDocument, doLoad);
+ } catch (e) {
+ exception(e);
+ Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+ }
+ };
+ var init = function() {
+ var listener;
+ listener.onStateChange = function(webProgress, request, stateFlags, status) {
+ if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
+ request.name == urls[urlIndex]) {
+ try {
+ Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
+ } catch (e) {
+ exception(e);
+ Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
+ }
+ }
+ };
+
+ var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
+ tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
+
+ if (firstDoc) {
+ processor(firstDoc, doLoad);
+ } else {
+ doLoad();
+ }
+ }
+
+ w.addEventListener("load", init, false);
+ } catch (e) {
+ exception(e);
+ PB_Debug.print("processDocuments: " + e);
+ }
+}
+
+// Appears to look for links in a document containing a certain substring
+Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
+ var urls = [];
+ var addedURLs = [];
+
+ var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
+ var aElement = aElements.iterateNext();
+ while (aElement) {
+ var href = aElement.href;
+ if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
+ urls.unshift(href);
+ addedURLs[href] = true;
+ }
+ aElement = aElements.iterateNext();
+ }
+ return urls;
+}
+
+// For now, we're going to skip the getLLsFromAddresses function (which gets
+// latitude and longitude pairs from a series of addresses, but requires the
+// big mess of Java code that is the Piggy Bank server) and the geoHelper
+// tools (which rely on getLLsFromAddresses) since these are probably not
+// essential components for Scholar and would take a great deal of effort to
+// implement. We can, however, always implement them later.
+
+// It looks like these are simple front-ends for XMLHttpRequest. They're a
+// component of the Piggy Bank API, so they're implemented here.
+Scholar.Ingester.Utilities.HTTPUtilities = function() {}
+
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
+ var xmlhttp = new XMLHttpRequest();
+
+ xmlhttp.open('GET', url, true);
+ xmlhttp.overrideMimeType("text/xml");
+ xmlhttp.onreadystatechange = function() {
+ Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
+ };
+ xmlhttp.send(null);
+}
+
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
+ var xmlhttp = new XMLHttpRequest();
+
+ xmlhttp.open('POST', url, true);
+ xmlhttp.overrideMimeType("text/xml");
+ xmlhttp.onreadystatechange = function() {
+ Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
+ };
+ xmlhttp.send(body);
+}
+
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
+ var xmlhttp = new XMLHttpRequest();
+
+ xmlhttp.open('OPTIONS', url, true);
+ xmlhttp.overrideMimeType("text/xml");
+ xmlhttp.onreadystatechange = function() {
+ Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
+ };
+ xmlhttp.send(body);
+}
+
+// Possible point of failure; for some reason, this used to be a separate
+// class, so make sure it works
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
+ switch (xmlhttp.readyState) {
+
+ // Request not yet made
+ case 1:
+ break;
+
+ // Contact established with server but nothing downloaded yet
+ case 2:
+ try {
+ // Check for HTTP status 200
+ if (xmlhttp.status != 200) {
+ if (onStatus) {
+ onStatus(
+ xmlhttp.status,
+ xmlhttp.statusText,
+ xmlhttp
+ );
+ xmlhttp.abort();
+ }
+ }
+ } catch (e) {
+ Scholar.debug(e, 2);
+ }
+ break;
+
+	// Called multiple times while the download is in progress
+ case 3:
+ break;
+
+ // Download complete
+ case 4:
+ try {
+ if (onDone) {
+ onDone(xmlhttp.responseText, xmlhttp);
+ }
+ } catch (e) {
+ Scholar.debug(e, 2);
+ }
+ break;
+ }
+}
+//////////////////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Document
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/* Public properties:
+ * browser - browser window object of document
+ * model - data model for semantic scrapers
+ * scraper - best scraper to use to scrape page
+ *
+ * Private properties:
+ * _sandbox - sandbox for code execution
+ */
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Public Scholar.Ingester.Document methods
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Constructor for Document object
+ */
+Scholar.Ingester.Document = function(browserWindow){
+ this.browser = browserWindow;
+ this.scraper = null
+ this.model = new Scholar.Ingester.Model();
+ this._generateSandbox();
+}
+
+/*
+ * Retrieves the best scraper to scrape a given page
+ */
+Scholar.Ingester.Document.prototype.retrieveScraper = function() {
+ Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);
+ var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';
+ var scrapers = Scholar.DB.query(sql);
+ for(var i=0; i<scrapers.length; i++) {
+ var currentScraper = scrapers[i];
+ if(this.canScrape(currentScraper)) {
+ this.scraper = currentScraper;
+ Scholar.debug("Found scraper "+this.scraper.label);
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * Check to see if _scraper_ can scrape this document
+ */
+Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
+ var canScrape = false;
+
+ // Test with regular expression
+ // If this is slow, we could preload all scrapers and compile regular
+ // expressions, so each check will be faster
+ if(currentScraper.urlPattern) {
+ var regularExpression = new RegExp(currentScraper.urlPattern, "i");
+ if(regularExpression.test(this.browser.contentDocument.location.href)) {
+ canScrape = true;
+ }
+ }
+
+ // Test with JavaScript if available and didn't have a regular expression or
+ // passed regular expression test
+ if((!currentScraper.urlPattern || canScrape)
+ && currentScraper.scraperDetectCode) {
+ var scraperSandbox = this.sandbox;
+ try {
+ canScrape = this.evalInSandbox("(function(){\n" +
+ currentScraper.scraperDetectCode +
+ "\n})()", scraperSandbox);
+ } catch(e) {
+ throw e+' in scraper '+currentScraper.label;
+ }
+ }
+ return canScrape;
+}
+
+/*
+ * Populate model with semantic data regarding this page using _scraper_
+ * Callback will be executed once scraping is complete
+ */
+Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
+ if(callback) {
+ this._scrapeCallback = callback;
+ }
+
+ Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
+
+ var scraperSandbox = this.sandbox;
+
+ Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
+
+ // If synchronous, call _scrapePageComplete();
+ if(!scraperSandbox._waitForCompletion) {
+ this._scrapePageComplete();
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Private Scholar.Ingester.Document methods
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Piggy Bank/FS offers four objects to JavaScript scrapers
+ * browser - the object representing the open browser window containing the
+ *           document to be processed
+ * doc - the DOM (basically just browser.contentDocument)
+ * model - the object representing the RDF model of data to be returned
+ * (see Scholar.Ingester.Model)
+ * utilities - a set of utilities for making certain tasks easier
+ * (see Scholar.Ingester.Utilities);
+ *
+ * Piggy Bank/FS also offers two functions to simplify asynchronous requests
+ * (these will only be available for scraping, and not for scrape detection)
+ * wait() - called on asynchronous requests so that Piggy Bank/FS will not
+ * automatically return at the end of code execution
+ * done() - when wait() is called, Piggy Bank/FS will wait for this
+ * function before returning
+ */
+
+/*
+ * Called when scraping (synchronous or asynchronous) is complete: writes
+ * the collected model data to the database, then fires the callback stored
+ * by scrapePage(), if any.
+ */
+Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
+	this._updateDatabase();
+	if(this._scrapeCallback) {
+		this._scrapeCallback();
+	}
+}
+
+Scholar.Ingester.Document.prototype._generateSandbox = function() {
+ this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
+ this.sandbox.browser = this.browser;
+ this.sandbox.doc = this.sandbox.browser.contentDocument;
+ this.sandbox.utilities = new Scholar.Ingester.Utilities;
+ this.sandbox.model = this.model;
+ this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
+
+ this.sandbox.wait = function(){ this._waitForCompletion = true; };
+ this.sandbox.done = function(){ this._scrapePageComplete(); };
+}
+
+/*
+ * Add data ingested using RDF to database
+ * (Ontologies are hard-coded until we have a real way of dealing with them)
+ */
+Scholar.Ingester.Document.prototype._updateDatabase = function() {
+ var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
+ var prefixDC = 'http://purl.org/dc/elements/1.1/';
+ var prefixDCMI = 'http://purl.org/dc/dcmitype/';
+ var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
+
+ for(var uri in this.model.data) {
+ var newItem = Scholar.Items.getNewItemByType(1);
+ newItem.setField("source", uri);
+ if(this.model.data[uri][prefixDC + 'title']) {
+ newItem.setField("title", this.model.data[uri][prefixDC + 'title']);
+ }
+ if(this.model.data[uri][prefixDC + 'publisher']) {
+ newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);
+ }
+ if(this.model.data[uri][prefixDC + 'year']) {
+ data.date = this.model.data[uri][prefixDC + 'year'].substring(
+ this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,
+ this.model.data[uri][prefixDC + 'year'].length);
+ }
+ if(this.model.data[uri][prefixDC + 'edition']) {
+ newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);
+ }
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5));
+ }
+ if(this.model.data[uri][prefixDC + 'creator']) {
+ var creator = this.model.data[uri][prefixDC + 'creator'];
+
+ var spaceIndex = creator.lastIndexOf(" ");
+ var lastName = creator.substring(spaceIndex+1, creator.length);
+ var firstName = creator.substring(0, spaceIndex);
+
+ newItem.setCreator(0, firstName, lastName);
+ }
+ newItem.save();
+ }
+}
+\ No newline at end of file
diff --git a/components/chnmIScholarService.js b/components/chnmIScholarService.js
@@ -31,6 +31,10 @@ Cc["@mozilla.org/moz/jssubscript-loader;1"]
.getService(Ci.mozIJSSubScriptLoader)
.loadSubScript("chrome://scholar/content/xpcom/notifier.js");
+Cc["@mozilla.org/moz/jssubscript-loader;1"]
+ .getService(Ci.mozIJSSubScriptLoader)
+ .loadSubScript("chrome://scholar/content/xpcom/ingester.js");
+
/********************************************************************/