commit 56e77619c4a7a2fc016b9fc5bc8285805df91999
parent 6d7974307f3543f4dfad2ac3e82c11a904a33275
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 3 Oct 2006 22:13:49 +0000
closes #334, Washington Post scraper shouldn't include " - washingtonpost.com" in title
closes #313, Blacklist known ad sites from scraper detection
closes #306, some New York Times ads prevent page from being recognized
closes #308, attachment import bug
Currently, the ad site blacklist is located at the top of ingester/browser.js. At some point, we may want to switch this to a database table.
Diffstat:
2 files changed, 50 insertions(+), 18 deletions(-)
diff --git a/chrome/content/zotero/ingester/browser.js b/chrome/content/zotero/ingester/browser.js
@@ -12,6 +12,13 @@
var Zotero_Ingester_Interface = function() {}
+Zotero_Ingester_Interface.blacklist = [ // known ad-site domains; contentLoad returns early when doc.domain ends with one of these
+ "googlesyndication.com",
+ "doubleclick.net",
+ "questionmarket.com",
+ "atdmt.com"
+];
+
//////////////////////////////////////////////////////////////////////////////
//
// Public Zotero_Ingester_Interface methods
@@ -138,6 +145,13 @@ Zotero_Ingester_Interface.contentLoad = function(event) {
}
}
+ for each(var blacklistedURL in Zotero_Ingester_Interface.blacklist) { // skip translator detection for ad domains
+ if(doc.domain.substr(doc.domain.length-blacklistedURL.length) == blacklistedURL) { // suffix match on the loaded page's domain
+ Zotero.debug("Ignoring blacklisted URL "+doc.location); // report the loaded page (doc), not the global document
+ return;
+ }
+ }
+
// get translators
var translate = new Zotero.Translate("web");
translate.setDocument(doc);
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,4 +1,4 @@
--- 96
+-- 97
DROP TABLE IF EXISTS translators;
CREATE TABLE translators (
@@ -3408,7 +3408,7 @@ function doWeb(doc, url) {
Zotero.wait();
}');
-REPLACE INTO "translators" VALUES ('ce7a3727-d184-407f-ac12-52837f3361ff', '2006-10-02 17:00:00', 1, 100, 4, 'New York Times', 'Simon Kornblith', '^http://(?:query\.nytimes\.com/search/query|www\.nytimes\.com/.+)',
+REPLACE INTO "translators" VALUES ('ce7a3727-d184-407f-ac12-52837f3361ff', '2006-10-02 17:00:00', 1, 100, 4, 'New York Times', 'Simon Kornblith', '^http://(?:query\.nytimes\.com/search/query|(?:www\.)?nytimes\.com/.+)',
'function detectWeb(doc, url) {
if(doc.title.substr(0, 30) == "The New York Times: Search for") {
var namespace = doc.documentElement.namespaceURI;
@@ -3810,7 +3810,7 @@ REPLACE INTO "translators" VALUES ('d1bf1c29-4432-4ada-8893-2e29fc88fd9e', '2006
downloadable:true});
// grab title from doc title
- newItem.title = doc.title;
+ newItem.title = doc.title.replace(" - washingtonpost.com", "");
var byline = doc.evaluate(''//div[@id="byline"]'', doc, nsResolver,
XPathResult.ANY_TYPE, null).iterateNext();
@@ -4971,7 +4971,7 @@ function doExport() {
// set term type
Zotero.RDF.addStatement(term, rdf+"type", n.dcterms+"URI", false);
// set url value
- Zotero.RDF.addStatement(term, rdf+"value", attachment.url, true);
+ Zotero.RDF.addStatement(term, rdf+"value", item.url, true);
// add relationship to resource
Zotero.RDF.addStatement(resource, n.dc+"identifier", term, false);
}
@@ -4996,7 +4996,7 @@ function doExport() {
// set section title
Zotero.RDF.addStatement(section, n.dc+"title", item.section, true);
// add relationship to resource
- Zotero.RDF.addStatement(resource, n.dc+"isPartOf", section, false);
+ Zotero.RDF.addStatement(resource, n.dcterms+"isPartOf", section, false);
}
// generate container
@@ -5504,13 +5504,22 @@ function doImport() {
// type
var type = Zotero.RDF.getTargets(node, rdf+"type");
+
// also deal with type detection based on parts, so we can differentiate
// magazine and journal articles, and find container elements
var isPartOf = getFirstResults(node, [n.dcterms+"isPartOf"]);
- if(type) {
- type = Zotero.RDF.getResourceURI(type[0]);
-
+ // get parts of parts, because parts are sections of wholes.
+ if(isPartOf) {
+ for(var i=0; i<isPartOf.length; i++) {
+ var subParts = getFirstResults(isPartOf[i], [n.dcterms+"isPartOf"]);
+ if(subParts) {
+ isPartOf = isPartOf.concat(subParts);
+ }
+ }
+ }
+
+ if(type && (type = Zotero.RDF.getResourceURI(type[0]))) {
if(type == n.bib+"Book") {
newItem.itemType = "book";
} else if(type == n.bib+"BookSection") {
@@ -5663,16 +5672,25 @@ function doImport() {
if(identifiers) {
for(var i in identifiers) {
- var beforeSpace = identifiers[i].substr(0, identifiers[i].indexOf(" ")).toUpperCase();
-
- if(beforeSpace == "ISBN") {
- newItem.ISBN = identifiers[i].substr(5).toUpperCase();
- } else if(beforeSpace == "ISSN") {
- newItem.ISSN = identifiers[i].substr(5).toUpperCase();
- } else if(beforeSpace == "DOI") {
- newItem.DOI = identifiers[i].substr(4);
- } else if(!newItem.accessionNumber) {
- newItem.accessionNumber = identifiers[i];
+ if(typeof(identifiers[i]) == "string") {
+ // grab other things
+ var beforeSpace = identifiers[i].substr(0, identifiers[i].indexOf(" ")).toUpperCase();
+
+ if(beforeSpace == "ISBN") {
+ newItem.ISBN = identifiers[i].substr(5).toUpperCase();
+ } else if(beforeSpace == "ISSN") {
+ newItem.ISSN = identifiers[i].substr(5).toUpperCase();
+ } else if(beforeSpace == "DOI") {
+ newItem.DOI = identifiers[i].substr(4);
+ } else if(!newItem.accessionNumber) {
+ newItem.accessionNumber = identifiers[i];
+ }
+ } else {
+ // grab URLs
+ var type = Zotero.RDF.getTargets(identifiers[i], rdf+"type");
+ if(type && (type = Zotero.RDF.getResourceURI(type[0])) && type == n.dcterms+"URI") {
+ newItem.url = getFirstResults(identifiers[i], [rdf+"value"], true);
+ }
}
}
}