commit 2a74e88416e1ddbdea7dafe171d0f69f6cfc955f
parent 8fe72b3e3cd360e2f75a4a5973c85a605e7c4157
Author: Simon Kornblith <simon@simonster.com>
Date: Fri, 23 Jun 2006 20:09:48 +0000
- Make generalized function for finding search results case insensitive
- Scrape DRA search results
Diffstat:
2 files changed, 52 insertions(+), 24 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -339,12 +339,12 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
// Require link to match this
if(urlRe) {
var urlRegexp = new RegExp();
- urlRegexp.compile(urlRe);
+ urlRegexp.compile(urlRe, "i");
}
// Do not allow text to match this
if(rejectRe) {
var rejectRegexp = new RegExp();
- rejectRegexp.compile(rejectRe);
+ rejectRegexp.compile(rejectRe, "i");
}
if(!inHere.length) {
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,7 +1,7 @@
--- 13
+-- 14
-- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 15:21:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 16:09:00'));
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
@@ -1299,34 +1299,62 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) {
wait();');
-REPLACE INTO "scrapers" VALUES('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-18 11:19:00', 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)',
-'if(doc.location.href.indexOf("authority_hits") > 0) {
- var body = doc.getElementsByTagName("body");
- if(body[0].innerHTML.indexOf("ISBN") < 0) {
- return false;
- }
-}
-return true;',
+REPLACE INTO "scrapers" VALUES('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-23 16:09:00', 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', NULL,
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
-var uri = doc.location.href;
-var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i;
-var m = uriRegexp.exec(uri);
-if(uri.indexOf("authority_hits") < 0) {
- var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3];
+var checkItems = false;
+
+if(doc.location.href.indexOf("/authority_hits") > 0) {
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ checkItems = utilities.gatherElementsOnXPath(doc, doc, "/html/body//ol/li", nsResolver);
+}
+
+if(checkItems && checkItems.length) {
+ var items = utilities.getItemArray(doc, checkItems, ''https?://.*/web2/tramp2\.exe/see_record'');
+ items = utilities.selectItems(items);
+
+ if(!items) {
+ return true;
+ }
+
+ var uris = new Array();
+ for(i in items) {
+ uris.push(i);
+ }
} else {
- var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc";
+ var uris = new Array(doc.location.href);
}
-utilities.HTTPUtilities.doGet(newUri, null, function(text) {
- var record = new MARC_Record();
- record.load(text, "binary");
- utilities.importMARCRecord(record, uri, model);
- done();
-})
+for(i in uris) {
+ var uri = uris[i];
+ var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i;
+ var m = uriRegexp.exec(uri);
+ if(uri.indexOf("/authority_hits") < 0) {
+ var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3];
+ } else {
+ var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc";
+ }
+
+ // Keep track of how many requests have been completed
+ var j = 0;
+
+ utilities.HTTPUtilities.doGet(newUri, null, function(text) {
+ var record = new MARC_Record();
+ record.load(text, "binary");
+ utilities.importMARCRecord(record, uris[j], model);
+ j++;
+ if(j == uris.length) {
+ done();
+ }
+ });
+}
wait();');