www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 22eebc6cdfb5f8fb837f65c4980397279c05e9d8
parent b8ad832e74e127786fe970852dfcd682327ef18c
Author: Simon Kornblith <simon@simonster.com>
Date:   Sun, 25 Jun 2006 04:30:43 +0000

Addresses #68: figure out a way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also fixes some spacing issues in browser.js.


Diffstat:
M chrome/chromeFiles/content/scholar/ingester/browser.js | 48 ++++++++++++++++++++++++------------------------
M chrome/chromeFiles/content/scholar/xpcom/ingester.js | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 172 insertions(+), 37 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -27,8 +27,8 @@ Scholar_Ingester_Interface.init = function() { Scholar_Ingester_Interface.browserDocuments = new Object(); Scholar_Ingester_Interface.browserUris = new Array(); - window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false); - window.addEventListener("unload", Scholar_Ingester_Interface.chromeUnload, false); + window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false); + window.addEventListener("unload", Scholar_Ingester_Interface.chromeUnload, false); } /* @@ -39,12 +39,12 @@ Scholar_Ingester_Interface.chromeLoad = function() { Scholar_Ingester_Interface.appContent = document.getElementById("appcontent"); Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image"); - // this gives us onLocationChange + // this gives us onLocationChange, for updating when tabs are switched/created Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener, Components.interfaces.nsIWebProgress.NOTIFY_LOCATION); - // let's use load instead of DOMContentLoaded - Scholar_Ingester_Interface.appContent.addEventListener("pageshow", - Scholar_Ingester_Interface.contentLoad, true); + // this is for pageshow, for updating the status of the book icon + Scholar_Ingester_Interface.appContent.addEventListener("pageshow", + Scholar_Ingester_Interface.contentLoad, true); } /* @@ -132,29 +132,29 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {} * appropriate status indicator for the current tab, and to free useless objects */ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) { - var browsers = Scholar_Ingester_Interface.tabBrowser.browsers; + var browsers = Scholar_Ingester_Interface.tabBrowser.browsers; - // Remove document object of any browser that no longer exists - for (var i = 0; i < 
Scholar_Ingester_Interface.browsers.length; i++) { - var browser = Scholar_Ingester_Interface.browsers[i]; - var exists = false; + // Remove document object of any browser that no longer exists + for (var i = 0; i < Scholar_Ingester_Interface.browsers.length; i++) { + var browser = Scholar_Ingester_Interface.browsers[i]; + var exists = false; - for (var j = 0; j < browsers.length; j++) { - if (browser == browsers[j]) { - exists = true; - break; - } - } + for (var j = 0; j < browsers.length; j++) { + if (browser == browsers[j]) { + exists = true; + break; + } + } - if (!exists) { - Scholar_Ingester_Interface.browsers.splice(i,1); + if (!exists) { + Scholar_Ingester_Interface.browsers.splice(i,1); - // To execute if document object does not exist - Scholar_Ingester_Interface._deleteDocument(browser); - } - } + // To execute if document object does not exist + Scholar_Ingester_Interface._deleteDocument(browser); + } + } - Scholar_Ingester_Interface.updateStatus(); + Scholar_Ingester_Interface.updateStatus(); } ////////////////////////////////////////////////////////////////////////////// diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -21,6 +21,116 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) { ///////////////////////////////////////////////////////////////// // +// Scholar.Ingester.ProxyMonitor +// +///////////////////////////////////////////////////////////////// + +// A singleton for recognizing EZProxies and converting URLs such that databases +// will work from outside them. Unfortunately, this only works with the ($495) +// EZProxy software. If there are open source alternatives, we should support +// them too. 
+ +/* + * Precompile proxy regexps + */ +Scholar.Ingester.ProxyMonitor = new Object(); +Scholar.Ingester.ProxyMonitor._ezProxyRe = new RegExp(); +Scholar.Ingester.ProxyMonitor._ezProxyRe.compile("(https?://([^/:]+)(?:\:[0-9])?/login)\\?(?:.+&)?(url|qurl)=([^&]+)"); +Scholar.Ingester.ProxyMonitor._hostRe = new RegExp(); +Scholar.Ingester.ProxyMonitor._hostRe.compile("^https?://(([^/:]+)(\:[0-9]+)?)"); + +/* + * Returns a page's proper url, adjusting for proxying + * + * This is a bit of a hack, in that it offers an opportunity for spoofing. Not + * really any way around this, but our scrapers should be sufficiently sandboxed + * that it won't be a problem. + */ +Scholar.Ingester.ProxyMonitor.proxyToProper = function(url) { + var m = Scholar.Ingester.ProxyMonitor._ezProxyRe.exec(url); + if(m) { + // EZProxy detected + var loginURL = m[1]; + var host = m[2]; + var arg = m[3]; + var url = m[4]; + + if(arg == "qurl") { + url = unescape(url); + } + + // FIXME - potential memory leak + Scholar.Ingester.ProxyMonitor._now = true; + Scholar.Ingester.ProxyMonitor._url = url; + Scholar.Ingester.ProxyMonitor._host = host; + Scholar.Ingester.ProxyMonitor._loginURL = loginURL; + } else if(Scholar.Ingester.ProxyMonitor._now) { + // EZProxying something + var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url); + + // EZProxy always runs on a higher port + if(url == Scholar.Ingester.ProxyMonitor._loginURL) { + Scholar.debug("EZProxy: detected wrong password; won't disable monitoring yet"); + } else { + if(m) { + var hostAndPort = m[1]; + var host = m[2]; + var port = m[3]; + + if(port) { + // Make sure our host is the same who we logged in under + if(host == Scholar.Ingester.ProxyMonitor._host) { + // Extract host information from the URL we're proxying + var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(Scholar.Ingester.ProxyMonitor._url); + var properHostAndPort = m[1]; + if(m) { + if(!Scholar.Ingester.ProxyMonitor._mapFromProxy) { + Scholar.Ingester.ProxyMonitor._mapFromProxy = 
new Object(); + Scholar.Ingester.ProxyMonitor._mapToProxy = new Object(); + } + Scholar.debug("EZProxy: host "+hostAndPort+" is really "+properHostAndPort); + Scholar.Ingester.ProxyMonitor._mapFromProxy[hostAndPort] = properHostAndPort; + Scholar.Ingester.ProxyMonitor._mapToProxy[properHostAndPort] = hostAndPort; + url = url.replace(hostAndPort, properHostAndPort); + } + } + } + } + Scholar.Ingester.ProxyMonitor._now = false; + } + } else if(Scholar.Ingester.ProxyMonitor._mapFromProxy) { + // EZProxy detection is active + + var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url); + if(m && Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]) { + url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]); + Scholar.debug("EZProxy: proper url is "+url); + } + } + + return url; +} + +/* + * Returns a page's proxied url from the proper url + */ +Scholar.Ingester.ProxyMonitor.properToProxy = function(url) { + if(Scholar.Ingester.ProxyMonitor._mapToProxy) { + // EZProxy detection is active + + var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url); + if(Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]) { + // Actually need to map + url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]); + Scholar.debug("EZProxy: proxied url is "+url); + } + } + + return url; +} + +///////////////////////////////////////////////////////////////// +// // Scholar.Ingester.Model // ///////////////////////////////////////////////////////////////// @@ -63,8 +173,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {} ///////////////////////////////////////////////////////////////// // Scholar.Ingester.Utilities class, a set of methods to assist in data // extraction. Most code here was stolen directly from the Piggy Bank project. 
-Scholar.Ingester.Utilities = function(myWindow) { +Scholar.Ingester.Utilities = function(myWindow, proxiedURL) { this.window = myWindow; + this.proxiedURL = proxiedURL; } // Adapter for Piggy Bank function to print debug messages; log level is @@ -149,8 +260,11 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD var doLoad = function() { urlIndex++; if (urlIndex < urls.length) { + url = urls[urlIndex]; + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } try { - url = urls[urlIndex]; Scholar.debug("loading "+url); hiddenBrowser.loadURI(url); } catch (e) { @@ -477,11 +591,16 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo // These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be // accessed outside the sandbox, and even if it could, it wouldn't let scripts // access across domains, so everything's replicated here. -Scholar.Ingester.HTTPUtilities = function(contentWindow) { +Scholar.Ingester.HTTPUtilities = function(contentWindow, proxiedURL) { this.window = contentWindow; + this.proxiedURL = proxiedURL; } Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + var xmlhttp = new this.window.XMLHttpRequest(); xmlhttp.open('GET', url, true); @@ -495,6 +614,10 @@ Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) } Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + var xmlhttp = new this.window.XMLHttpRequest(); xmlhttp.open('POST', url, true); @@ -508,6 +631,10 @@ Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, } Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) { + if(this.proxiedURL) { + url = 
Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + var xmlhttp = new this.window.XMLHttpRequest(); xmlhttp.open('OPTIONS', url, true); @@ -519,9 +646,7 @@ Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatu }; xmlhttp.send(body); } - -// Possible point of failure; for some reason, this used to be a separate -// class, so make sure it works + Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) { switch (xmlhttp.readyState) { @@ -564,6 +689,7 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu break; } } + ////////////////////////////////////////////////////////////////////////////// // // Scholar.Ingester.Document @@ -597,6 +723,13 @@ Scholar.Ingester.Document = function(browserWindow, myWindow){ this.browser = browserWindow; this.window = myWindow; this.model = new Scholar.Ingester.Model(); + + // Create separate URL to account for proxies + this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(this.browser.contentDocument.location.href); + if(this.url != this.browser.contentDocument.location.href) { + this.proxiedURL = true; + } + this.items = new Array(); this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"] .getService(Ci.nsIAppShellService); @@ -607,7 +740,8 @@ Scholar.Ingester.Document = function(browserWindow, myWindow){ * Retrieves the best scraper to scrape a given page */ Scholar.Ingester.Document.prototype.retrieveScraper = function() { - Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href); + Scholar.debug("Retrieving scrapers for "+this.url); + var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC'; var scrapers = Scholar.DB.query(sql); for(var i=0; i<scrapers.length; i++) { @@ -625,14 +759,14 @@ Scholar.Ingester.Document.prototype.retrieveScraper = function() { * Check to see if _scraper_ can scrape this document */ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) { 
- var canScrape = false; + var canScrape = false; // Test with regular expression // If this is slow, we could preload all scrapers and compile regular // expressions, so each check will be faster if(currentScraper.urlPattern) { var regularExpression = new RegExp(currentScraper.urlPattern, "i"); - if(regularExpression.test(this.browser.contentDocument.location.href)) { + if(regularExpression.test(this.url)) { canScrape = true; } } @@ -672,7 +806,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) { this._scrapeCallback = callback; } - Scholar.debug("Scraping "+this.browser.contentDocument.location.href); + Scholar.debug("Scraping "+this.url); var scraperSandbox = this._sandbox; try { @@ -739,9 +873,10 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) Scholar.Ingester.Document.prototype._generateSandbox = function() { this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); this._sandbox.browser = this.browser; - this._sandbox.doc = this._sandbox.browser.contentDocument; - this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window); - this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow); + this._sandbox.doc = this.browser.contentDocument; + this._sandbox.url = this.url; + this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window, this.proxiedURL); + this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow, this.proxiedURL); this._sandbox.window = this.window; this._sandbox.model = this.model; this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;