commit 9e78d62b1353cff668e2abf51d6da40d4aa0dfec
parent f941bc192c206051ec1a43674582169274618a91
Author: Simon Kornblith <simon@simonster.com>
Date: Sun, 25 Jun 2006 05:03:01 +0000
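Better handling of itemTypes (look up type IDs via Scholar.ItemTypes.getID, defaulting to "website", instead of a hard-coded map; scrapers now emit "journalArticle"/"newspaperArticle" types), and improved date handling in PubMed scraper (build the date from whichever of day, month, and year are present).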
Diffstat:
2 files changed, 32 insertions(+), 21 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -913,16 +913,14 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
- var typeToTypeID = new Object();
- typeToTypeID[prefixDummy + 'book'] = 1;
- typeToTypeID[prefixDummy + 'journal'] = 3;
- typeToTypeID[prefixDummy + 'newspaper'] = 5;
-
try {
for(var uri in this.model.data) {
- var typeID = typeToTypeID[this.model.data[uri][prefixRDF + 'type']];
- if(!typeID) {
- var typeID = 1;
+ // Get typeID, defaulting to "website"
+ try {
+ var type = this.model.data[uri][prefixRDF + 'type'][0].substr(prefixDummy.length);
+ var typeID = Scholar.ItemTypes.getID(type);
+ } catch(ex) {
+ var typeID = Scholar.ItemTypes.getID("website")
}
var newItem = Scholar.Items.getNewItemByType(typeID);
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,7 +1,7 @@
--- 16
+-- 17
-- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-24 13:31:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-25 00:56:00'));
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
@@ -452,7 +452,7 @@ utilities.HTTPUtilities.doPost(''http://www.jstor.org/browse'', postData, null,
}
// Loop through again so that we can add with the stableURL
- model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journal", false);
+ model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false);
for(i in data) {
if(data[i].length) {
for(j in data[i]) {
@@ -505,7 +505,7 @@ if(month && year) {
model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false);
}
-model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);
');
REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-23 12:49:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)',
@@ -898,9 +898,9 @@ for (var i = 0; i < elmts.length; i++) {
value = utilities.superCleanString(value.nodeValue).toLowerCase();
if(value == "periodical") {
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);
} else if(value == "newspaper") {
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false);
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false);
} else {
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
}
@@ -990,7 +990,7 @@ for (var i = 0; i < elmts.length; i++) {
model.addStatement(uri, prefixDC + "creator", utilities.cleanAuthor(value), false);
}
}
-model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);');
REPLACE INTO "scrapers" VALUES('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-18 10:13:00', 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL,
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
@@ -1048,9 +1048,9 @@ if(m) {
m[1] = m[1].substring(3);
}
model.addStatement(uri, prefixDC + "creator", m[1], true);
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false);
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false);
} else {
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);
}
var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/;
@@ -1745,9 +1745,9 @@ for(i in elmts) {
model.addStatement(uri, prefixDC + "creator", fname+" "+surname, true);
}
-model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);');
-REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-24 13:17:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-25 00:56:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
@@ -1832,7 +1832,20 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) {
mapRDF(uri, article.Journal.JournalIssue.Volume.text(), prefixDummy + "volume");
mapRDF(uri, article.Journal.JournalIssue.Issue.text(), prefixDummy + "number");
if(article.Journal.JournalIssue.PubDate.length()) {
- model.addStatement(uri, prefixDC + "date", article.Journal.JournalIssue.PubDate.Day.text()+" "+article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(), true);
+ if(article.Journal.JournalIssue.PubDate.Day.text().toString() != "") {
+ var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Day.text()+", "+article.Journal.JournalIssue.PubDate.Year.text();
+ var jsDate = new Date(date);
+ if(!isNaN(jsDate.valueOf())) {
+ date = utilities.dateToISO(date);
+ }
+ } else if(article.Journal.JournalIssue.PubDate.Month.text().toString() != "") {
+ var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text();
+ } else if(article.Journal.JournalIssue.PubDate.Year.text().toString() != "") {
+ var date = article.Journal.JournalIssue.PubDate.Year.text();
+ }
+ if(date) {
+ model.addStatement(uri, prefixDC + "date", date, true);
+ }
}
}
}
@@ -1850,7 +1863,7 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) {
}
}
}
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);
}
done();