www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 076ee0fad23b71b47c5f1901b6e737a03be43b10
parent 393d19ca85e9cd6231b16d9731730496dc578c4a
Author: Simon Kornblith <simon@simonster.com>
Date:   Thu,  8 Jun 2006 01:26:40 +0000

Add PubMed scraper, fix a few other small bugs



Diffstat:
Mchrome/chromeFiles/content/scholar/xpcom/ingester.js | 6+++---
Mscrapers.sql | 128++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
2 files changed, 111 insertions(+), 23 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -322,7 +322,7 @@ Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) var xmlhttp = new this.window.XMLHttpRequest(); xmlhttp.open('GET', url, true); - xmlhttp.overrideMimeType("text/xml"); + xmlhttp.overrideMimeType("text/plain"); var me = this; xmlhttp.onreadystatechange = function() { @@ -335,7 +335,7 @@ Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, var xmlhttp = new this.window.XMLHttpRequest(); xmlhttp.open('POST', url, true); - xmlhttp.overrideMimeType("text/xml"); + xmlhttp.overrideMimeType("text/plain"); var me = this; xmlhttp.onreadystatechange = function() { @@ -348,7 +348,7 @@ Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatu var xmlhttp = new this.window.XMLHttpRequest(); xmlhttp.open('OPTIONS', url, true); - xmlhttp.overrideMimeType("text/xml"); + xmlhttp.overrideMimeType("text/plain"); var me = this; xmlhttp.onreadystatechange = function() { diff --git a/scrapers.sql b/scrapers.sql @@ -1,4 +1,4 @@ --- 4 +-- 5 DELETE FROM scrapers; INSERT INTO "scrapers" VALUES(1, NULL, NULL, 20060603002000, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -783,7 +783,7 @@ for (var i = 0; i < elmts.length; i++) { if(firstChar == "v") { rdfUri = prefixDummy + "volume"; } else if(firstChar == "i") { - rdfUri = prefixDummy + "issue"; + rdfUri = prefixDummy + "number"; } else if(firstChar == "p") { rdfUri = prefixDummy + "pages"; var pagesRegexp = /p(\w+)\((\w+)\)/; @@ -1312,7 +1312,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { wait();'); -INSERT INTO "scrapers" VALUES(18, NULL, NULL, 20060603002000, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse.jhu.edu/journals/[^/]+/[^/]+/[^/]+.html$', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +INSERT INTO "scrapers" VALUES(18, NULL, NULL, 20060603002000, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/journals/[^/]+/[^/]+/[^/]+\.html$', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; @@ -1322,10 +1322,6 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -function numbersOnly(text) { - return text.replace(/[^0-9]/g, ""); -} - var uri = doc.location.href; var elmts = utilities.gatherElementsOnXPath(doc, doc, ''//comment()'', nsResolver); @@ -1341,24 +1337,29 @@ for(i in elmts) { // expose DOM/XPath to sandboxed scripts var newDOM = new XML(headerData); -function mapDOM(path, rdfUri) { - if(newDOM.header[path]) { - model.addStatement(uri, rdfUri, newDOM[path].text(), true); +function mapRDF(text, rdfUri) { + if(text) { + model.addStatement(uri, rdfUri, text, true); } } -mapDOM("journal", prefixDummy + "publication"); -mapDOM("issn", prefixDummy + "publication", numbersOnly); -mapDOM("volume", prefixDummy + "volume"); -mapDOM("issue", prefixDummy + "issue"); -mapDOM("year", prefixDummy + "year"); -mapDOM("pubdate", prefixDC + "date"); -mapDOM("doctitle", prefixDC + "title"); +mapRDF(newDOM.journal.text(), prefixDummy + "publication"); +mapRDF(newDOM.volume.text(), prefixDummy + "volume"); +mapRDF(newDOM.issue.text(), prefixDummy + "number"); +mapRDF(newDOM.year.text(), prefixDummy + "year"); +mapRDF(newDOM.pubdate.text(), prefixDC + "date"); +mapRDF(newDOM.doctitle.text(), prefixDC + "title"); + +// Do ISSN +var issn = newDOM.issn.text(); +if(issn) { + model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), true); +} // Do pages var fpage = newDOM.fpage.text(); var lpage = newDOM.lpage.text(); -if(fpage) { +if(fpage != "") { var pages = fpage; if(lpage) { pages += "-"+lpage; @@ -1374,4 +1375,91 @@ for(i in elmts) { model.addStatement(uri, prefixDC + "creator", fname+" "+surname, true); } -model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);'); -\ No newline at end of file +model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);'); + +INSERT INTO "scrapers" VALUES(19, NULL, NULL, 20060603002000, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +function mapRDF(text, rdfUri) { + if(text != "") { + model.addStatement(uri, rdfUri, text, true); + } +} + +function stringTrimmer(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +var uri = doc.location.href; +var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id="; +var idRegexp = /[\?\&]list_uids=([0-9\,]+)/; +var m = idRegexp.exec(uri); +newUri += m[1]; + +utilities.HTTPUtilities.doGet(newUri, null, function(text) { + // Remove xml parse instruction and doctype + text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""); + + var xml = new XML(text); + + for(var i=0; i<xml.PubmedArticle.length(); i++) { + utilities.debugPrint("one article..."); + var citation = xml.PubmedArticle[i].MedlineCitation; + + if(citation.PMID.length()) { + model.addStatement(uri, prefixDC + "identifier", "PMID "+citation.PMID.text(), true); + } + + var article = citation.Article; + if(article.ArticleTitle.length()) { + var title = article.ArticleTitle.text().toString(); + if(title.substr(-1) == ".") { + title = title.substring(0, title.length-1); + } + model.addStatement(uri, prefixDC + "title", title, true); + } + + if(article.Journal.length()) { + var issn = article.Journal.ISSN.text(); + if(issn) { + model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), true); + } + + if(article.Journal.Title.length()) { + model.addStatement(uri, prefixDummy + "publication", stringTrimmer(article.Journal.Title.text()), true); + } else if(citation.MedlineJournalInfo.MedlineTA.length()) { + model.addStatement(uri, prefixDummy + "publication", stringTrimmer(citation.MedlineJournalInfo.MedlineTA.text()), true); + } + + if(article.Journal.JournalIssue.length()) { + mapRDF(article.Journal.JournalIssue.Volume.text(), prefixDummy + "volume"); + mapRDF(article.Journal.JournalIssue.Issue.text(), prefixDummy + "number"); + if(article.Journal.JournalIssue.PubDate.length()) { + model.addStatement(uri, prefixDC + "date", article.Journal.JournalIssue.PubDate.Day.text()+" "+article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(), true); + } + } + } + + if(article.AuthorList.length() && article.AuthorList.Author.length()) { + var authors = article.AuthorList.Author; + for(var j=0; j<authors.length(); j++) { + var lastName = authors[j].LastName.text(); + var firstName = authors[j].FirstName.text(); + if(firstName == "") { + var firstName = authors[j].ForeName.text(); + } + if(firstName && lastName) { + model.addStatement(uri, prefixDC + "creator", firstName + " " + lastName); + } + } + } + } + + done(); +}) + +model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); +wait();'); +\ No newline at end of file