commit 470f7c463f7f1563216d385f7f67757fdffb48fd
parent 3890e5f1228fd88cd2fb315e30232f88f2c84a32
Author: Simon Kornblith <simon@simonster.com>
Date: Thu, 22 Jun 2006 20:50:57 +0000
The Voyager scraper now actually works on the search results page.
Diffstat:
4 files changed, 104 insertions(+), 24 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/selectitems.js b/chrome/chromeFiles/content/scholar/ingester/selectitems.js
@@ -21,7 +21,7 @@ Scholar_Ingester_Interface_SelectItems = function() {}
Scholar_Ingester_Interface_SelectItems.init = function() {
this.io = window.arguments[0];
this.Scholar_Ingester_Interface = window.arguments[1];
- this.listbox = document.getElementById("scholar-selectitems-links");
+ var listbox = document.getElementById("scholar-selectitems-links");
for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
var itemNode = document.createElement("listitem");
@@ -29,16 +29,29 @@ Scholar_Ingester_Interface_SelectItems.init = function() {
itemNode.setAttribute("value", i);
itemNode.setAttribute("label", this.io.dataIn[i]);
itemNode.setAttribute("checked", false);
- this.listbox.appendChild(itemNode);
+ listbox.appendChild(itemNode);
}
}
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
+ var listbox = document.getElementById("scholar-selectitems-links");
+
+ var returnObject = false;
this.io.dataOut = new Object();
// collect scrapeURLList from listbox
- for(var i=0; i<this.listbox.length; i++) {
- var itemNode = this.listbox[i];
- this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
+ for(var i=0; i<listbox.childNodes.length; i++) {
+ var itemNode = listbox.childNodes[i];
+ if(itemNode.getAttribute("checked") == "true") {
+ this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
+ returnObject = true;
+ }
+ }
+
+ // What a hack! This makes code down the road much easier: an empty object
+ // is still truthy, and because objects have no length we could not otherwise
+ // tell that nothing was selected
+ if(!returnObject) {
+ this.io.dataOut = null;
}
}
\ No newline at end of file
diff --git a/chrome/chromeFiles/content/scholar/ingester/selectitems.xul b/chrome/chromeFiles/content/scholar/ingester/selectitems.xul
@@ -10,7 +10,6 @@ Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivio
persist="width height screenX screenY"
buttons="cancel,accept"
ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
- ondialogcancel="self.close()"
id="scholar-selectitems"
onload="Scholar_Ingester_Interface_SelectItems.init()">
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -700,6 +700,13 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue)
if(this._scrapeCallback) {
this._scrapeCallback(this, returnValue);
}
+ // Get us ready for another scrape
+ delete this.model;
+ delete this.items;
+ this.model = new Scholar.Ingester.Model();
+ this.items = new Array();
+ // This is perhaps a bit paranoid, but we need to get the model redone anyway
+ this._generateSandbox();
}
/*
diff --git a/scrapers.sql b/scrapers.sql
@@ -192,21 +192,72 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+var uri = doc.location.href;
+var postString = '''';
+var form = doc.forms.namedItem(''frm'');
+var newUri = form.action;
+var multiple = false;
+
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
- var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
- var items = utilities.selectItems(items);
+ multiple = true;
+
+ var availableItems = new Object(); // Technically, associative arrays are objects
+
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ // Require link to match this
+ var tagRegexp = new RegExp();
+ tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
+ // Do not allow text to match this
+ var rejectRegexp = new RegExp();
+ rejectRegexp.compile(''\[ [0-9]+ \]'');
+
+ var checkboxes = new Array();
+ var urls = new Array();
+
+ var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver);
+ // Go through table rows
+ for(var i=0; i<tableRows.length; i++) {
+ // CHK is what we need to get it all as one file
+ var input = utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver);
+ checkboxes[i] = input.value;
+ var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
+ urls[i] = links[0].href;
+ utilities.debugPrint(urls[i]+" = "+links[0].href);
+ // Go through links
+ for(var j=0; j<links.length; j++) {
+ if(tagRegexp.test(links[j].href)) {
+ var text = utilities.getNodeString(doc, links[j], ''.//text()'', null);
+ if(text) {
+ text = utilities.cleanString(text);
+ if(!rejectRegexp.test(text)) {
+ if(availableItems[i]) {
+ availableItems[i] += " "+text;
+ } else {
+ availableItems[i] = text;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ var items = utilities.selectItems(availableItems);
if(!items) {
return true;
}
+
+ // add arguments for items we need to grab
+ for(i in items) {
+ postString += "CHK="+checkboxes[i]+"&";
+ }
}
-var uri = doc.location.href;
-
var raw, unicode, latin1;
-var form = doc.forms.namedItem(''frm'');
-var newUri = form.action;
-var postString = '''';
for(i in form.elements) {
if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') {
postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&'';
@@ -227,11 +278,21 @@ for(i in export_options) {
}
postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';
+utilities.debugPrint(postString);
+
// No idea why this doesn''t work as post
utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
- var record = new MARC_Record();
- record.load(text, "binary");
- model = utilities.importMARCRecord(record, uri, model);
+ var records = text.split("\x1D");
+ for(var i=0; i<(records.length-1); i++) {
+ if(multiple) {
+ utilities.debugPrint("uri = urls["+i+"]");
+ uri = urls[i];
+ utilities.debugPrint("my uri = "+uri);
+ }
+ var record = new MARC_Record();
+ record.load(records[i], "binary");
+ utilities.importMARCRecord(record, uri, model);
+ }
done();
})
wait();');
@@ -466,7 +527,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
var record = new MARC_Record();
record.load(text, "MARC_PAC");
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
}, function() {});
@@ -867,7 +928,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
}
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
}, function() {});
@@ -915,7 +976,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
}
}
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
}, function() {})
@@ -952,7 +1013,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
record.add_field(field, ind1, ind2, value);
}
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
}, function() {})
@@ -983,7 +1044,7 @@ if(uri.indexOf("authority_hits") < 0) {
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
var record = new MARC_Record();
record.load(text, "binary");
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
})
wait();');
@@ -1042,7 +1103,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
}
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
}, function() {});
@@ -1120,7 +1181,7 @@ utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=
}
}
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
})
wait();');
@@ -1191,7 +1252,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
record.add_field(tag, ind1, ind2, content);
}
- model = utilities.importMARCRecord(record, uri, model);
+ utilities.importMARCRecord(record, uri, model);
done();
}, function() {});