commit f303db22e7a73419b1b887ab15ae04c9c3afabf9
parent 70ebccc8271321a98707fbb19b1654c1171711b9
Author: Matt Burton <mcburton@gmail.com>
Date: Tue, 28 Jul 2009 21:55:35 +0000
Fixing Google Patents translator (updated page detection and scraping XPaths).
Diffstat:
1 file changed, 59 insertions(+), 70 deletions(-)
diff --git a/translators/Google Patents.js b/translators/Google Patents.js
@@ -12,8 +12,12 @@
}
function detectWeb(doc, url) {
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
- if (doc.location.href.match("Search")) {
+ if (doc.evaluate('//font[contains(./text(), "Result")]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
return "multiple";
} else if (doc.location.href.match("id")) {
return "patent";
@@ -36,44 +40,28 @@ function scrape(doc, url) {
} : null;
var dataTags = new Object();
- var headings = new Array();
var newItem = new Zotero.Item("patent");
- //checks format type
- if (doc.location.href.match("printsec")) {
-
- var contents = doc.evaluate('//table[@id="summarytable"]/tbody/tr[1]/td', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
- var xPathHeadings = doc.evaluate('//b', doc, nsResolver, XPathResult.ANY_TYPE, null);
-
- var xPathCount = doc.evaluate('count (//b)', doc, nsResolver, XPathResult.ANY_TYPE, null);
-
- if (doc.evaluate('//span[@class="addmd"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
-
- var author = doc.evaluate('//span[@class="addmd"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
- newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "inventor"));
+ //Grab the patent_bibdata items and the text node directly next to them
+ var xPathHeadings = doc.evaluate('//div[@class="patent_bibdata"]//b', doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var xPathContents = doc.evaluate('//div[@class="patent_bibdata"]//b/following::text()[1]', doc, nsResolver, XPathResult.ANY_TYPE, null);
+
+ // create an associative array of the items and their contents
+ var heading, content;
+ while( heading = xPathHeadings.iterateNext(), content = xPathContents.iterateNext()){
+ if(heading.textContent == 'Publication number'){
+ content = doc.evaluate('//div[@class="patent_bibdata"]//b[text()="Publication number"]/following::nobr[1]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
}
+ dataTags[heading.textContent] = content.textContent.replace(": ", '');;
+ //Zotero.debug(dataTags);
+ }
- } else {
-
- var xPathHeadings = doc.evaluate('//div[@class="patent_bibdata"]/p/b', doc, nsResolver, XPathResult.ANY_TYPE, null);
-
- var xPathCount = doc.evaluate('count (//div[@class="patent_bibdata"]/p/b)', doc, nsResolver, XPathResult.ANY_TYPE, null);
-
-
- var xPathContents = doc.evaluate('//div[@class="patent_bibdata"]/p', doc, nsResolver, XPathResult.ANY_TYPE, null);
- var contentsCount = doc.evaluate('count (//div[@class="patent_bibdata"]/p)', doc, nsResolver, XPathResult.ANY_TYPE, null);
-
- var contents;
- for (i = 0; i < contentsCount.numberValue; i++) {
- contents = (contents + xPathContents.iterateNext().textContent + " ");
- }
-
- if (doc.evaluate('//td[3]/p', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
- newItem.abstractNote = (doc.evaluate('//td[3]/p', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace("Abstract", ''));
- }
+ if (doc.evaluate('//td[3]/p', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+ newItem.abstractNote = (doc.evaluate('//td[3]/p', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace("Abstract", ''));
+ }
- }
+ /*
for (var i =0; i < xPathCount.numberValue; i++) {
headings.push(xPathHeadings.iterateNext().textContent);
@@ -83,23 +71,28 @@ function scrape(doc, url) {
var splitContent = new Array();
splitContent = contents.split(/xxx/);
-
+ */
//associate headings with contents.
- for (var i = 0; i < headings.length; i++) {
- fieldTitle = headings[i].replace(/\s+|\W*/g, '');
-
- if (fieldTitle == "USClassification" | fieldTitle == "InternationalClassification" | fieldTitle == "Abstract") {
+
+//extra field
+ newItem.extra = '';
+
+ for (fieldTitle in dataTags) {
+ Zotero.debug(fieldTitle);
+ //fieldTitle = item.replace(/\s+|\W*/g, '');
+ /*
+ if (fieldTitle == "US Classification" | fieldTitle == "International Classification" | fieldTitle == "Abstract") {
dataTags[fieldTitle] = splitContent[i+1];
} else {
dataTags[fieldTitle] = splitContent[i+1].replace(": ", '');
}
-
+ */
if (dataTags[fieldTitle].match("About this patent")) {
dataTags[fieldTitle] = dataTags[fieldTitle].replace("About this patent", '');
}
//author(s)
- if (fieldTitle == "Inventors") {
+ if (fieldTitle == "Inventors" | fieldTitle == "Inventor") {
var authors = dataTags[fieldTitle].split(", ");
for (var j = 0; j < authors.length; j++) {
newItem.creators.push(Zotero.Utilities.cleanAuthor(authors[j], "inventor"));
@@ -107,26 +100,28 @@ function scrape(doc, url) {
} else if (fieldTitle == "Inventor") {
newItem.creators.push(Zotero.Utilities.cleanAuthor(dataTags["Inventor"], "inventor"));
}
+
+ if (fieldTitle == "U.S. Classification" ) {
+ newItem.extra += "U.S. Classification: " + dataTags["U.S. Classification"]+"\n";
+ } else if (fieldTitle == "International Classification" ) {
+ newItem.extra += "International Classification: " + dataTags["International Classification"]+"\n";
+ } else if (fieldTitle == "Filing date" ) {
+ newItem.extra += "Filing Date: " + dataTags["Filing date"]+"\n";
+ } else if (fieldTitle == "Publication number" ) {
+ newItem.extra += "Publication number: " +dataTags["Publication number"]+"\n";
+ }
}
- //extra field
- if (dataTags["USClassification"] && dataTags["InternationalClassification"]) {
- Zotero.debug(doc.title);
- newItem.extra = ("U.S. Classification: " + dataTags["USClassification"] + "; International Classification: " + dataTags["InternationalClassification"]);
- } else if (dataTags["USClassification"] ) {
- newItem.extra = ("U.S. Classification: " + dataTags["USClassification"]);
- } else if (dataTags["InternationalClassification"]) {
- newItem.extra = ("International Classification: " + dataTags["InternationalClassification"]);
- }
+
- associateData (newItem, dataTags, "Patentnumber", "patentNumber");
- associateData (newItem, dataTags, "Issuedate", "date");
+ associateData (newItem, dataTags, "Patent number", "patentNumber");
+ associateData (newItem, dataTags, "Issue date", "date");
associateData (newItem, dataTags, "Assignees", "assignee");
associateData (newItem, dataTags, "Assignee", "assignee");
associateData (newItem, dataTags, "Abstract", "abstractNote");
- associateData (newItem, dataTags, "Applicationnumber", "applicationNumber");
+ associateData (newItem, dataTags, "Application number", "applicationNumber");
- newItem.title = doc.evaluate('//h2[@class="title"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+ newItem.title = doc.evaluate('//h1[@class="title"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
newItem.url = doc.location.href;
newItem.complete();
@@ -138,33 +133,27 @@ function doWeb(doc, url) {
if (prefix == 'x') return namespace; else return null;
} : null;
+ var host = 'http://' + doc.location.host + "/";
+
var articles = new Array();
if (detectWeb(doc, url) == "multiple") {
+ var iterator = doc.evaluate('//a[@class = "big"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var links = [];
+ var element = iterator.iterateNext();
var items = new Object();
-
- var xPathFirstTitle = doc.evaluate('//div[@id="results_container"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
- var firstTitle = xPathFirstTitle.iterateNext();
-
- var titles = doc.evaluate('//p/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
-
- items[firstTitle.href] = firstTitle.textContent;
-
- var next_title;
- while (next_title = titles.iterateNext()) {
- if (next_title.textContent.match("RSS feed")) {
-
- } else {
- items[next_title.href] = next_title.textContent;
- }
+ while(element) {
+ items[element.href] = element.textContent;
+ element = iterator.iterateNext();
}
items = Zotero.selectItems(items);
+ if(!items) return true;
for (var i in items) {
articles.push(i);
}
- } else {
- articles = [url];
+
}
Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
Zotero.wait();
+
}