www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit ca3a0e6e5d6fa29936cc376885448504d0707d3f
parent 428eab6a95ae56c54b33a3973a8490bbae2cd024
Author: Simon Kornblith <simon@simonster.com>
Date:   Thu, 22 Jun 2006 02:43:40 +0000

Beginnings of search result scraping (does not yet actually do the scraping, but does present the menu)


Diffstat:
M chrome/chromeFiles/content/scholar/ingester/browser.js | 30 ++++++++++--------------------
M chrome/chromeFiles/content/scholar/ingester/browser.xul | 4 ----
A chrome/chromeFiles/content/scholar/ingester/selectitems.js | 45 +++++++++++++++++++++++++++++++++++++++++++++
A chrome/chromeFiles/content/scholar/ingester/selectitems.xul | 24 ++++++++++++++++++++++++
M chrome/chromeFiles/content/scholar/xpcom/ingester.js | 12 ++++++++++--
M chrome/chromeFiles/locale/en-US/scholar/scholar.dtd | 9 +++++++--
M scrapers.sql | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++---------------
7 files changed, 146 insertions(+), 43 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -61,11 +61,21 @@ Scholar_Ingester_Interface.chromeUnload = function() { Scholar_Ingester_Interface.scrapeThisPage = function() { var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); if(documentObject.scraper) { + if(documentObject.scrapeURLList) { + // In the case that there are multiple scrapable URLs, make the user choose + Scholar_Ingester_Interface.chooseURL(documentObject); + } Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window, Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping")); documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping); } } +Scholar_Ingester_Interface.chooseURL = function(documentObject) { + Scholar.debug("chooseURL called"); + var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul", + "_blank","chrome,modal,centerscreen,resizable=yes", documentObject); +} + /* * Updates the status of the capture icon to reflect the scrapability or lack * thereof of the current page @@ -108,7 +118,6 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {} * appropriate status indicator for the current tab, and to free useless objects */ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) { - Scholar.debug("onLocationChange called"); var browsers = Scholar_Ingester_Interface.tabBrowser.browsers; // Remove document object of any browser that no longer exists @@ -130,25 +139,6 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) Scholar_Ingester_Interface._deleteDocument(browser); } } - - /*// Add a collector to any new browser - for (var i = 0; i < browsers.length; i++) { - var browser = browsers[i]; - var exists = false; - - for (var j = 0; j < 
Scholar_Ingester_Interface.browsers.length; j++) { - if (browser == Scholar_Ingester_Interface.browsers[j]) { - exists = true; - break; - } - } - - if (!exists) { - Scholar_Ingester_Interface.browsers.splice(i,0,browser); - - // To execute if window is new - } - }*/ Scholar_Ingester_Interface.updateStatus( Scholar_Ingester_Interface.tabBrowser.selectedBrowser diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul @@ -1,12 +1,8 @@ <?xml version="1.0"?> - - <!-- Note: Contains Firefox-specific overlay --> - <overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"> <script src="../include.js"/> - <script src="browser.js"/> <script type="application/x-javascript"> diff --git a/chrome/chromeFiles/content/scholar/ingester/selectitems.js b/chrome/chromeFiles/content/scholar/ingester/selectitems.js @@ -0,0 +1,44 @@ +////////////////////////////////////////////////////////////////////////////// +// +// Scholar_Ingester_Interface_SelectItems +// +////////////////////////////////////////////////////////////////////////////// + +// Class to interface with the browser when ingesting data + +Scholar_Ingester_Interface_SelectItems = function() {} + +////////////////////////////////////////////////////////////////////////////// +// +// Public Scholar_Ingester_Interface_SelectItems methods +// +////////////////////////////////////////////////////////////////////////////// + +/* + * Initialize some variables and prepare event listeners for when chrome is done + * loading + */ +Scholar_Ingester_Interface_SelectItems.init = function() { + this.documentObject = window.arguments[0]; + this.listbox = document.getElementById("scholar-selectitems-links"); + + for(i in this.documentObject.scrapeURLList) { // we could use a tree for this if we wanted to + var itemNode = document.createElement("listitem"); + itemNode.setAttribute("type", "checkbox"); + 
itemNode.setAttribute("value", i); + itemNode.setAttribute("label", this.documentObject.scrapeURLList[i]); + itemNode.setAttribute("checked", false); + this.listbox.appendChild(itemNode); + } +} + +Scholar_Ingester_Interface_SelectItems.acceptSelection = function() { + // clear scrapeURLList + this.documentObject.scrapeURLList = new Object(); + + // collect scrapeURLList from listbox + for(var i=0; i<this.listbox.length; i++) { + var itemNode = this.listbox[i]; + this.documentObject.scrapeURLList[itemNode.getAttribute("value")] = itemNode.getAttribute("label"); + } +} +\ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/ingester/selectitems.xul b/chrome/chromeFiles/content/scholar/ingester/selectitems.xul @@ -0,0 +1,23 @@ +<?xml version="1.0"?> +<!-- +Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivion) +--> +<?xml-stylesheet href="chrome://global/skin/" type="text/css"?> +<!DOCTYPE window SYSTEM "chrome://scholar/locale/scholar.dtd"> +<dialog xmlns:html="http://www.w3.org/1999/xhtml" + xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" + title="&selectitems.title;" width="400" height="330" + persist="width height screenX screenY" + buttons="cancel,accept" + ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()" + ondialogcancel="self.close()" + id="scholar-selectitems" + onload="Scholar_Ingester_Interface_SelectItems.init()"> + + <script src="../include.js"/> + <script src="selectitems.js"/> + <caption label="&selectitems.intro.label;" id="scholar-selectitems-intro"/> + <box flex="1"> + <listbox id="scholar-selectitems-links" flex="1" context="scholarSelectContext"></listbox> + </box> +</dialog> +\ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -283,7 +283,7 @@ Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) { */ 
Scholar.Ingester.Utilities.prototype.cleanString = function(s) { s = this.trimString(s); - return s.replace(/ +/g, " "); + return s.replace(/[ \xA0]+/g, " "); } /* @@ -569,7 +569,15 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) { "\n})()", scraperSandbox); } catch(e) { Scholar.debug(e+' in scraperDetectCode for '+currentScraper.label); - canScrape = false; + return false; + } + + // scraperDetectCode returns an associative array (object) in the case of a search result + if(typeof(canScrape) == "object") { + Scholar.debug("scraperDetectCode returned a URL list"); + this.scrapeURLList = canScrape; + } else { + Scholar.debug("canScrape was a "+typeof(canScrape)); } } return canScrape; diff --git a/chrome/chromeFiles/locale/en-US/scholar/scholar.dtd b/chrome/chromeFiles/locale/en-US/scholar/scholar.dtd @@ -24,4 +24,9 @@ <!ENTITY toolbar.newCollection.label "New Project"> <!ENTITY toolbar.renameCollection.label "Rename Project..."> <!ENTITY toolbar.removeCollection.label "Remove Project..."> -<!ENTITY toolbar.search.label "Search:"> -\ No newline at end of file +<!ENTITY toolbar.search.label "Search:"> + +<!ENTITY selectitems.title "Select Items"> +<!ENTITY selectitems.intro.label "Select which items you'd like to add to your library"> +<!ENTITY selectitems.cancel.label "Cancel"> +<!ENTITY selectitems.select.label "OK"> +\ No newline at end of file diff --git a/scrapers.sql b/scrapers.sql @@ -175,24 +175,59 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex wait();'); REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-18 11:02:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', -'try { - if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { +'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { + // We have search results + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var availableItems = new Object(); // Technically, associative arrays are objects + + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti=''); + // Do not allow text to match this + var rejectRegexp = new RegExp(); + rejectRegexp.compile(''\[ [0-9]+ \]''); + + var links = doc.getElementsByTagName("a"); + for(var i=0; i<links.length; i++) { + if(tagRegexp.test(links[i].href)) { + var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver); + if(text) { + text = utilities.cleanString(text); + if(!rejectRegexp.test(text)) { + if(availableItems[links[i].href]) { + availableItems[links[i].href] += " "+text; + } else { + availableItems[links[i].href] = text; + } + } + } + } + } + + if(availableItems) { + return availableItems; + } else { return false; } - var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; - for(i in export_options) { - if(export_options[i].text == ''Latin1 MARC'' - || export_options[i].text == ''Raw MARC'' - || export_options[i].text == ''UTF-8'' - || export_options[i].text == ''MARC (Unicode/UTF-8)'' - || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') { - return true; - } +} + +var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; +for(i in export_options) { + if(export_options[i].text == ''Latin1 MARC'' + || export_options[i].text == ''Raw MARC'' + || export_options[i].text == ''UTF-8'' + || export_options[i].text == ''MARC (Unicode/UTF-8)'' + || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') { + // We have an exportable single record + return true; } - return false; -} catch(e) { - return false; -}', +} +return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/'';