commit 09d79d6dd778975fd8508994b11ba287323f770b
parent 968348a5d149f0b6f3f8129fa6541e7ed786689d
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 20 Jun 2006 17:06:41 +0000
Fix overly optimistic JSTOR scraper
Diffstat:
3 files changed, 69 insertions(+), 38 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -107,6 +107,7 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
* appropriate status indicator for the current tab, and to free useless objects
*/
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
+ Scholar.debug("onLocationChange called");
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
// Remove document object of any browser that no longer exists
@@ -213,33 +214,38 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
*/
Scholar_Ingester_Interface._finishScraping = function(obj) {
if(obj.items.length) {
- var item1 = obj.items[0];
-
- Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
-
- var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
+ try { // Encased in a try block to fix a as-of-yet unresolved issue
+ var item1 = obj.items[0];
- var titleLabel = Scholar.getString("itemFields.title") + ":"
- Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
- var creators = item1.numCreators();
- if(creators) {
- for(var i=0; i<creators; i++) {
- var creator = item1.getCreator(i);
- var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
- var data = creator.firstName + ' ' + creator.lastName;
- Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
- }
- }
-
- for(i in fields) {
- var data = item1.getField(fields[i]);
- if(data) {
- var name = Scholar.ItemFields.getName(fields[i]);
- if(name != "source") {
- var label = Scholar.getString("itemFields."+ name) + ":";
+ Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
+
+ var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
+
+ // Display title and creators
+ var titleLabel = Scholar.getString("itemFields.title") + ":"
+ Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
+ var creators = item1.numCreators();
+ if(creators) {
+ for(var i=0; i<creators; i++) {
+ var creator = item1.getCreator(i);
+ var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
+ var data = creator.firstName + ' ' + creator.lastName;
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
}
}
+
+ // Add additional fields for display
+ for(i in fields) {
+ var data = item1.getField(fields[i]);
+ if(data) {
+ var name = Scholar.ItemFields.getName(fields[i]);
+ if(name != "source") {
+ var label = Scholar.getString("itemFields."+ name) + ":";
+ Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
+ }
+ }
+ }
+ } catch(ex) {
}
// Save items
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -333,15 +333,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
// Extract creators
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
- model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
- model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
- if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author
- // in the person subject field as the first entry
+ model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
+ if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
+ // some LOC entries have no listed author, but have the author in the person subject field as the first entry
var field = record.get_field_subfields('600');
if(field[0]) {
model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
@@ -694,6 +692,18 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
creatorIndex++;
}
}
+ if(this.model.data[uri][prefixDummy + 'corporateCreator']) {
+ for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) {
+ newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1);
+ creatorIndex++;
+ }
+ }
+ if(this.model.data[uri][prefixDummy + 'corporateContributor']) {
+ for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) {
+ newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2);
+ creatorIndex++;
+ }
+ }
// Handle years, extracting from date if necessary
if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
diff --git a/scrapers.sql b/scrapers.sql
@@ -234,7 +234,24 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
})
wait();');
-REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)',
+'var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// If this is a view page, find the link to the citation
+var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+if(!elmts.length) {
+ var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
+ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+}
+utilities.debugPrint(elmts.length);
+if(elmts && elmts.length) {
+ return true;
+}
+return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
@@ -249,15 +266,13 @@ var uri = doc.location.href;
// If this is a view page, find the link to the citation
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-if(!elmts) {
+if(!elmts.length) {
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
}
-if(!elmts) {
- exit;
-}
-var saveCitation = utilities.getNode(doc, elmts[0], ''.'', nsResolver).href;
-var viewSavedCitations = utilities.getNode(doc, elmts[1], ''.'', nsResolver).href;
+
+var saveCitation = elmts[0].href;
+var viewSavedCitations = elmts[1].href;
saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
// Parse save citation link