commit ca3a0e6e5d6fa29936cc376885448504d0707d3f
parent 428eab6a95ae56c54b33a3973a8490bbae2cd024
Author: Simon Kornblith <simon@simonster.com>
Date: Thu, 22 Jun 2006 02:43:40 +0000
Beginnings of search result scraping (does not yet actually do the scraping, but does present the menu)
Diffstat:
7 files changed, 146 insertions(+), 43 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -61,11 +61,21 @@ Scholar_Ingester_Interface.chromeUnload = function() {
Scholar_Ingester_Interface.scrapeThisPage = function() {
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
if(documentObject.scraper) {
+ if(documentObject.scrapeURLList) {
+ // Several scrapable URLs: let the user choose in a modal dialog first. NOTE(review): the scrape below still runs once the dialog closes, even after Cancel - confirm intended
+ Scholar_Ingester_Interface.chooseURL(documentObject);
+ }
Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window, Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping"));
documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping);
}
}
+Scholar_Ingester_Interface.chooseURL = function(documentObject) {
+ Scholar.debug("chooseURL called");
+ var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
+ "_blank","chrome,modal,centerscreen,resizable=yes", documentObject);
+}
+
/*
* Updates the status of the capture icon to reflect the scrapability or lack
* thereof of the current page
@@ -108,7 +118,6 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
* appropriate status indicator for the current tab, and to free useless objects
*/
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
- Scholar.debug("onLocationChange called");
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
// Remove document object of any browser that no longer exists
@@ -130,25 +139,6 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject)
Scholar_Ingester_Interface._deleteDocument(browser);
}
}
-
- /*// Add a collector to any new browser
- for (var i = 0; i < browsers.length; i++) {
- var browser = browsers[i];
- var exists = false;
-
- for (var j = 0; j < Scholar_Ingester_Interface.browsers.length; j++) {
- if (browser == Scholar_Ingester_Interface.browsers[j]) {
- exists = true;
- break;
- }
- }
-
- if (!exists) {
- Scholar_Ingester_Interface.browsers.splice(i,0,browser);
-
- // To execute if window is new
- }
- }*/
Scholar_Ingester_Interface.updateStatus(
Scholar_Ingester_Interface.tabBrowser.selectedBrowser
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul
@@ -1,12 +1,8 @@
<?xml version="1.0"?>
-
-
<!-- Note: Contains Firefox-specific overlay -->
-
<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
<script src="../include.js"/>
-
<script src="browser.js"/>
<script type="application/x-javascript">
diff --git a/chrome/chromeFiles/content/scholar/ingester/selectitems.js b/chrome/chromeFiles/content/scholar/ingester/selectitems.js
@@ -0,0 +1,44 @@
+//////////////////////////////////////////////////////////////////////////////
+//
+// Scholar_Ingester_Interface_SelectItems
+//
+//////////////////////////////////////////////////////////////////////////////
+
+// Backs the item selection dialog (selectitems.xul), which calls init() on load and acceptSelection() on accept
+
+Scholar_Ingester_Interface_SelectItems = function() {}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Public Scholar_Ingester_Interface_SelectItems methods
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Initialize some variables and prepare event listeners for when chrome is done
+ * loading
+ */
+Scholar_Ingester_Interface_SelectItems.init = function() {
+ this.documentObject = window.arguments[0];
+ this.listbox = document.getElementById("scholar-selectitems-links");
+
+ for(i in this.documentObject.scrapeURLList) { // we could use a tree for this if we wanted to
+ var itemNode = document.createElement("listitem");
+ itemNode.setAttribute("type", "checkbox");
+ itemNode.setAttribute("value", i);
+ itemNode.setAttribute("label", this.documentObject.scrapeURLList[i]);
+ itemNode.setAttribute("checked", false);
+ this.listbox.appendChild(itemNode);
+ }
+}
+
+Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
+ // clear scrapeURLList
+ this.documentObject.scrapeURLList = new Object();
+
+ // collect scrapeURLList from listbox
+ for(var i=0; i<this.listbox.length; i++) {
+ var itemNode = this.listbox[i];
+ this.documentObject.scrapeURLList[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
+ }
+}
+\ No newline at end of file
diff --git a/chrome/chromeFiles/content/scholar/ingester/selectitems.xul b/chrome/chromeFiles/content/scholar/ingester/selectitems.xul
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<!--
+Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivion)
+-->
+<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
+<!DOCTYPE window SYSTEM "chrome://scholar/locale/scholar.dtd">
+<dialog xmlns:html="http://www.w3.org/1999/xhtml"
+ xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
+ title="&selectitems.title;" width="400" height="330"
+ persist="width height screenX screenY"
+ buttons="cancel,accept"
+ ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
+ ondialogcancel="self.close()"
+ id="scholar-selectitems"
+ onload="Scholar_Ingester_Interface_SelectItems.init()">
+
+ <script src="../include.js"/>
+ <script src="selectitems.js"/>
+ <caption label="&selectitems.intro.label;" id="scholar-selectitems-intro"/>
+ <box flex="1">
+ <listbox id="scholar-selectitems-links" flex="1" context="scholarSelectContext"></listbox>
+ </box>
+</dialog>
+\ No newline at end of file
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -283,7 +283,7 @@ Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) {
*/
Scholar.Ingester.Utilities.prototype.cleanString = function(s) {
s = this.trimString(s);
- return s.replace(/ +/g, " ");
+ return s.replace(/[ \xA0]+/g, " ");
}
/*
@@ -569,7 +569,15 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
"\n})()", scraperSandbox);
} catch(e) {
Scholar.debug(e+' in scraperDetectCode for '+currentScraper.label);
- canScrape = false;
+ return false;
+ }
+
+ // scraperDetectCode returns an associative array (object) in the case of a search result
+ if(typeof(canScrape) == "object") {
+ Scholar.debug("scraperDetectCode returned a URL list");
+ this.scrapeURLList = canScrape;
+ } else {
+ Scholar.debug("canScrape was a "+typeof(canScrape));
}
}
return canScrape;
diff --git a/chrome/chromeFiles/locale/en-US/scholar/scholar.dtd b/chrome/chromeFiles/locale/en-US/scholar/scholar.dtd
@@ -24,4 +24,9 @@
<!ENTITY toolbar.newCollection.label "New Project">
<!ENTITY toolbar.renameCollection.label "Rename Project...">
<!ENTITY toolbar.removeCollection.label "Remove Project...">
-<!ENTITY toolbar.search.label "Search:">
-\ No newline at end of file
+<!ENTITY toolbar.search.label "Search:">
+
+<!ENTITY selectitems.title "Select Items">
+<!ENTITY selectitems.intro.label "Select which items you'd like to add to your library">
+<!ENTITY selectitems.cancel.label "Cancel">
+<!ENTITY selectitems.select.label "OK">
+\ No newline at end of file
diff --git a/scrapers.sql b/scrapers.sql
@@ -175,24 +175,59 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
wait();');
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-18 11:02:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
-'try {
- if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
+'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
+ // We have search results: collect candidate record links as href => link text
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ var availableItems = new Object(); // Technically, associative arrays are objects
+
+ // Require link to match this
+ var tagRegexp = new RegExp();
+ tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
+ // Do not allow text to match this (backslashes doubled, as above, so the brackets stay literal)
+ var rejectRegexp = new RegExp();
+ rejectRegexp.compile(''\\[ [0-9]+ \\]'');
+
+ var links = doc.getElementsByTagName("a");
+ for(var i=0; i<links.length; i++) {
+ if(tagRegexp.test(links[i].href)) {
+ var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver);
+ if(text) {
+ text = utilities.cleanString(text);
+ if(!rejectRegexp.test(text)) {
+ if(availableItems[links[i].href]) {
+ availableItems[links[i].href] += " "+text;
+ } else {
+ availableItems[links[i].href] = text;
+ }
+ }
+ }
+ }
+ }
+ var hasItems = false; // an empty Object is still truthy, so look for a real entry
+ for(var url in availableItems) { hasItems = true; break; }
+ if(hasItems) {
+ return availableItems;
+ } else {
return false;
}
- var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
- for(i in export_options) {
- if(export_options[i].text == ''Latin1 MARC''
- || export_options[i].text == ''Raw MARC''
- || export_options[i].text == ''UTF-8''
- || export_options[i].text == ''MARC (Unicode/UTF-8)''
- || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
- return true;
- }
+}
+
+var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
+for(var i=0; i<export_options.length; i++) { // index loop: for-in would leak a global and also visit non-option properties
+ if(export_options[i].text == ''Latin1 MARC''
+ || export_options[i].text == ''Raw MARC''
+ || export_options[i].text == ''UTF-8''
+ || export_options[i].text == ''MARC (Unicode/UTF-8)''
+ || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
+ // We have an exportable single record
+ return true;
+ }
- return false;
-} catch(e) {
- return false;
-}',
+}
+return false;',
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';