www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 09d79d6dd778975fd8508994b11ba287323f770b
parent 968348a5d149f0b6f3f8129fa6541e7ed786689d
Author: Simon Kornblith <simon@simonster.com>
Date:   Tue, 20 Jun 2006 17:06:41 +0000

Fix overly optimistic JSTOR scraper


Diffstat:
M chrome/chromeFiles/content/scholar/ingester/browser.js | 52 +++++++++++++++++++++++++++++-----------------------
M chrome/chromeFiles/content/scholar/xpcom/ingester.js | 26 ++++++++++++++++++--------
M scrapers.sql | 29 ++++++++++++++++++++++-------
3 files changed, 69 insertions(+), 38 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -107,6 +107,7 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {} * appropriate status indicator for the current tab, and to free useless objects */ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) { + Scholar.debug("onLocationChange called"); var browsers = Scholar_Ingester_Interface.tabBrowser.browsers; // Remove document object of any browser that no longer exists @@ -213,33 +214,38 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) { */ Scholar_Ingester_Interface._finishScraping = function(obj) { if(obj.items.length) { - var item1 = obj.items[0]; - - Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete")); - - var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID")); + try { // Encased in a try block to fix a as-of-yet unresolved issue + var item1 = obj.items[0]; - var titleLabel = Scholar.getString("itemFields.title") + ":" - Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title")); - var creators = item1.numCreators(); - if(creators) { - for(var i=0; i<creators; i++) { - var creator = item1.getCreator(i); - var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":"; - var data = creator.firstName + ' ' + creator.lastName; - Scholar_Ingester_Interface.scrapeProgress.addResult(label, data); - } - } - - for(i in fields) { - var data = item1.getField(fields[i]); - if(data) { - var name = Scholar.ItemFields.getName(fields[i]); - if(name != "source") { - var label = Scholar.getString("itemFields."+ name) + ":"; + Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete")); + + var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID")); + + // Display title and 
creators + var titleLabel = Scholar.getString("itemFields.title") + ":" + Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title")); + var creators = item1.numCreators(); + if(creators) { + for(var i=0; i<creators; i++) { + var creator = item1.getCreator(i); + var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":"; + var data = creator.firstName + ' ' + creator.lastName; Scholar_Ingester_Interface.scrapeProgress.addResult(label, data); } } + + // Add additional fields for display + for(i in fields) { + var data = item1.getField(fields[i]); + if(data) { + var name = Scholar.ItemFields.getName(fields[i]); + if(name != "source") { + var label = Scholar.getString("itemFields."+ name) + ":"; + Scholar_Ingester_Interface.scrapeProgress.addResult(label, data); + } + } + } + } catch(ex) { } // Save items diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -333,15 +333,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN '); // Extract creators model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor); - model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 
'contributor', this.cleanAuthor); - model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString); - if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author - // in the person subject field as the first entry + model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString); + if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) { + // some LOC entries have no listed author, but have the author in the person subject field as the first entry var field = record.get_field_subfields('600'); if(field[0]) { model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a'])); @@ -694,6 +692,18 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() { creatorIndex++; } } + if(this.model.data[uri][prefixDummy + 'corporateCreator']) { + for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) { + newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1); + creatorIndex++; + } + } + if(this.model.data[uri][prefixDummy + 'corporateContributor']) { + for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) { + newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2); + creatorIndex++; + } + } // Handle years, 
extracting from date if necessary if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) { diff --git a/scrapers.sql b/scrapers.sql @@ -234,7 +234,24 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) { }) wait();'); -REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', +'var namespace = doc.documentElement.namespaceURI; +var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; +} : null; + +// If this is a view page, find the link to the citation +var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +if(!elmts.length) { + var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; + var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +} +utilities.debugPrint(elmts.length); +if(elmts && elmts.length) { + return true; +} +return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; @@ -249,15 +266,13 @@ var uri = doc.location.href; // If this is a view page, find the link to the citation var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(!elmts) { +if(!elmts.length) { var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; var elmts = 
utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); } -if(!elmts) { - exit; -} -var saveCitation = utilities.getNode(doc, elmts[0], ''.'', nsResolver).href; -var viewSavedCitations = utilities.getNode(doc, elmts[1], ''.'', nsResolver).href; + +var saveCitation = elmts[0].href; +var viewSavedCitations = elmts[1].href; saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save''); // Parse save citation link