closes #68, figure out way to have scrapers work for gated resources behind proxies. most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). this implementation is significantly better than the old one, which refused to work after you'd already logged in once, and is also simpler, because it's stateless. it has to observe every HTTP request, but there's no noticeable speed hit. it also still doesn't work when there's a link from one gated site to another gated site, but as far as i can tell, this only happens on the Gale Group site. - www - Unnamed repository; edit this file 'description' to name the repository.

commit 257ed8f69b21d4781d2e0aaed93c69c1199f2b99
parent 19504e67468d5332d94835e8d5f236c2148f3c7c
Author: Simon Kornblith <simon@simonster.com>
Date:   Tue, 27 Jun 2006 04:08:21 +0000

Closes #68 (figure out a way to have scrapers work for gated resources behind proxies). Most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). This implementation is significantly better than the old one, which refused to work after you'd already logged in once, and it is also simpler, because it's stateless. It has to observe every HTTP request, but there's no noticeable speed hit. It also still doesn't work when there's a link from one gated site to another gated site, but as far as I can tell, this only happens on the Gale Group site.


Diffstat:
M chrome/chromeFiles/content/scholar/ingester/browser.js  | 5 +++--
M chrome/chromeFiles/content/scholar/xpcom/ingester.js  | 192 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M chrome/chromeFiles/content/scholar/xpcom/utilities.js  | 8 +++++++-

3 files changed, 120 insertions(+), 85 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -26,6 +26,7 @@ Scholar_Ingester_Interface.init = function() {
 	Scholar_Ingester_Interface.browsers = new Array();
 	Scholar_Ingester_Interface.browserDocuments = new Object();
 	Scholar_Ingester_Interface.browserUris = new Array();
+	Scholar.Ingester.ProxyMonitor.init();
 	
 	window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false);
 	window.addEventListener("unload", Scholar_Ingester_Interface.chromeUnload, false);
@@ -249,9 +250,9 @@ Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
 		}
 		
 		// Save items
-		/*for(i in obj.items) {
+		for(i in obj.items) {
 			obj.items[i].save();
-		}*/
+		}
 		setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2500);
 	} else if(returnValue) {
 		Scholar_Ingester_Interface.scrapeProgress.kill();
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -75,99 +75,127 @@ Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) {
 /*
  * Precompile proxy regexps
  */
-Scholar.Ingester.ProxyMonitor = new Object();
-Scholar.Ingester.ProxyMonitor._ezProxyRe = new RegExp();
-Scholar.Ingester.ProxyMonitor._ezProxyRe.compile("(https?://([^/:]+)(?:\:[0-9])?/login)\\?(?:.+&)?(url|qurl)=([^&]+)");
-Scholar.Ingester.ProxyMonitor._hostRe = new RegExp();
-Scholar.Ingester.ProxyMonitor._hostRe.compile("^https?://(([^/:]+)(\:[0-9]+)?)");
-
-/*
- * Returns a page's proper url, adjusting for proxying
- *
- * This is a bit of a hack, in that it offers an opportunity for spoofing. Not
- * really any way around this, but our scrapers should be sufficiently sandboxed
- * that it won't be a problem.
- */
-Scholar.Ingester.ProxyMonitor.proxyToProper = function(url) {
-	var m = Scholar.Ingester.ProxyMonitor._ezProxyRe.exec(url);
-	if(m) {
-		// EZProxy detected
-		var loginURL = m[1];
-		var host = m[2];
-		var arg = m[3];
-		var url = m[4];
-		
-		if(arg == "qurl") {
-			url = unescape(url);
+Scholar.Ingester.ProxyMonitor = new function() {
+	var _ezProxyRe = new RegExp();
+	_ezProxyRe.compile("\\?(?:.+&)?(url|qurl)=([^&]+)", "i");
+	/*var _hostRe = new RegExp();
+	_hostRe.compile("^https?://(([^/:]+)(?:\:([0-9]+))?)");*/
+	var ioService = Components.classes["@mozilla.org/network/io-service;1"]
+							  .getService(Components.interfaces.nsIIOService);
+	var on = false;
+	var _mapFromProxy = null;
+	var _mapToProxy = null;
+	
+	this.init = init;
+	this.proxyToProper = proxyToProper;
+	this.properToProxy = properToProxy;
+	this.observe = observe;
+	
+	function init() {
+		if(!on) {
+			var observerService = Components.classes["@mozilla.org/observer-service;1"]
+										.getService(Components.interfaces.nsIObserverService);
+			observerService.addObserver(this, "http-on-examine-response", false);
 		}
-		
-		Scholar.Ingester.ProxyMonitor._now = true;
-		Scholar.Ingester.ProxyMonitor._url = url;
-		Scholar.Ingester.ProxyMonitor._host = host;
-		Scholar.Ingester.ProxyMonitor._loginURL = loginURL;
-	} else if(Scholar.Ingester.ProxyMonitor._now) {
-		// EZProxying something
-		var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
-		
-		// EZProxy always runs on a higher port
-		if(url == Scholar.Ingester.ProxyMonitor._loginURL) {
-			Scholar.debug("EZProxy: detected wrong password; won't disable monitoring yet");
-		} else {
-			if(m) {
-				var hostAndPort = m[1];
-				var host = m[2];
-				var port = m[3];
+		on = true;
+	}
+	
+	function observe(channel) {
+		channel.QueryInterface(Components.interfaces.nsIHttpChannel);
+		if(channel.getResponseHeader("Server") == "EZproxy") {
+			// We're connected to an EZproxy
+			if(channel.responseStatus != "302") {
+				return;
+			}
+			
+			Scholar.debug(channel.URI.spec);
+			// We should be able to scrape the URL out of this
+			var m = _ezProxyRe.exec(channel.URI.spec);
+			if(!m) {
+				return;
+			}
+			
+			// Found URL
+			var variable = m[1];
+			var properURL = m[2];
+			if(variable.toLowerCase() == "qurl") {
+				properURL = unescape(properURL);
+			}
+			var properURI = _parseURL(properURL);
+			if(!properURI) {
+				return;
+			}
+			
+			// Get the new URL
+			var newURL = channel.getResponseHeader("Location");
+			if(!newURL) {
+				return;
+			}
+			var newURI = _parseURL(newURL);
+			if(!newURI) {
+				return;
+			}
+			
+			if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) {
+				// Different ports but the same server means EZproxy active
 				
-				if(port) {
-					// Make sure our host is the same who we logged in under
-					if(host == Scholar.Ingester.ProxyMonitor._host) {
-						// Extract host information from the URL we're proxying
-						var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(Scholar.Ingester.ProxyMonitor._url);
-						var properHostAndPort = m[1];
-						if(m) {
-							if(!Scholar.Ingester.ProxyMonitor._mapFromProxy) {
-								Scholar.Ingester.ProxyMonitor._mapFromProxy = new Object();
-								Scholar.Ingester.ProxyMonitor._mapToProxy = new Object();
-							}
-							Scholar.debug("EZProxy: host "+hostAndPort+" is really "+properHostAndPort);
-							Scholar.Ingester.ProxyMonitor._mapFromProxy[hostAndPort] = properHostAndPort;
-							Scholar.Ingester.ProxyMonitor._mapToProxy[properHostAndPort] = hostAndPort;
-							url = url.replace(hostAndPort, properHostAndPort);
-						}
-					}
+				Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
+				// Initialize variables here so people who never use EZProxies
+				// don't get the (very very minor) speed hit
+				if(!_mapFromProxy) {
+					_mapFromProxy = new Object();
+					_mapToProxy = new Object();
 				}
+				_mapFromProxy[newURI.hostPort] = properURI.hostPort;
+				_mapToProxy[properURI.hostPort] = newURI.hostPort;
 			}
-			Scholar.Ingester.ProxyMonitor._now = false;
-		}
-	} else if(Scholar.Ingester.ProxyMonitor._mapFromProxy) {
-		// EZProxy detection is active
-		
-		var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
-		if(m && Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]) {
-			url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]);
-			Scholar.debug("EZProxy: proper url is "+url);
 		}
 	}
 	
-	return url;
-}
-
-/*
- * Returns a page's proxied url from the proper url
- */
-Scholar.Ingester.ProxyMonitor.properToProxy = function(url) {
-	if(Scholar.Ingester.ProxyMonitor._mapToProxy) {
-		// EZProxy detection is active
+	/*
+	 * Returns a page's proper url, adjusting for proxying
+	 */
+	function proxyToProper(url) {
+		if(_mapFromProxy) {
+			// EZProxy detection is active
+			
+			var uri = _parseURL(url);
+			if(uri && _mapFromProxy[uri.hostPort]) {
+				url = url.replace(uri.hostPort, _mapFromProxy[uri.hostPort]);
+				Scholar.debug("EZProxy: proper url is "+url);
+			}
+		}
 		
-		var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
-		if(Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]) {
-			// Actually need to map
-			url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]);
-			Scholar.debug("EZProxy: proxied url is "+url);
+		return url;
+	}
+	
+	/*
+	 * Returns a page's proxied url from the proper url
+	 */
+	function properToProxy(url) {
+		if(_mapToProxy) {
+			// EZProxy detection is active
+			
+			var uri = _parseURL(url);
+			if(uri && _mapToProxy[uri.hostPort]) {
+				// Actually need to map
+				url = url.replace(uri.hostPort, _mapToProxy[uri.hostPort]);
+				Scholar.debug("EZProxy: proxied url is "+url);
+			}
 		}
+		
+		return url;
 	}
 	
-	return url;
+	/*
+	 * Parses a url into components (hostPort, port, host, and spec)
+	 */
+	function _parseURL(url) {
+		// create an nsIURI (not sure if this is faster than the regular
+		// expression, but it's at least more kosher)
+		var uri = ioService.newURI(url, null, null);
+		return uri;
+	}
 }
 
 /////////////////////////////////////////////////////////////////
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -381,14 +381,20 @@ Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, mo
 // Ingester adapters for Scholar.Utilities.HTTP to handle proxies
 
 Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
+	if(this.proxiedURL) {	// NOTE(review): proxiedURL is only seen being assigned in the HTTPUtilities constructor; confirm it is set on this object
+		url = Scholar.Ingester.ProxyMonitor.properToProxy(url);	// rewrite the proper URL to its proxied form before loading
+	}
 	Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
 }
 Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+	for(var i in urls) {	// rewrite each proper URL in place to its proxied form; `var` keeps the index from leaking as a global
+		urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);	// map the current entry (previously passed the undefined name `url`)
+	}
 	Scholar.Utilities.HTTP.processDocuments(firstDoc, urls, processor, done, exception);
 }
 
 Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) {
-	this.proxiedURL = proxiedURL
+	this.proxiedURL = proxiedURL;
 }
 
 Scholar.Utilities.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {

	www Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	chrome/chromeFiles/content/scholar/ingester/browser.js	\|	5	+++--
M	chrome/chromeFiles/content/scholar/xpcom/ingester.js	\|	192	+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M	chrome/chromeFiles/content/scholar/xpcom/utilities.js	\|	8	+++++++-