www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

feedReader.js (17566B)


      1 /*
      2     ***** BEGIN LICENSE BLOCK *****
      3     
      4     Copyright © 2015 Center for History and New Media
      5                      George Mason University, Fairfax, Virginia, USA
      6                      http://zotero.org
      7     
      8     This file is part of Zotero.
      9     
     10     Zotero is free software: you can redistribute it and/or modify
     11     it under the terms of the GNU Affero General Public License as published by
     12     the Free Software Foundation, either version 3 of the License, or
     13     (at your option) any later version.
     14     
     15     Zotero is distributed in the hope that it will be useful,
     16     but WITHOUT ANY WARRANTY; without even the implied warranty of
     17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     18     GNU Affero General Public License for more details.
     19     
     20     You should have received a copy of the GNU Affero General Public License
     21     along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
     22     
     23     ***** END LICENSE BLOCK *****
     24 */
     25 
     26 
     27 /**
     28  * Sample feeds:
     29  * 
     30  * http://cyber.law.harvard.edu/rss/examples/rss2sample.xml
     31  * http://feeds.feedburner.com/acs/acbcct
     32  * http://www.cell.com/molecular-cell/current.rss
     33  * http://ieeexplore.ieee.org/search/searchresult.jsp?searchField%3DSearch_All%26queryText%3Dwater&searchOrigin=saved_searches&rssFeed=true&rssFeedName=water
     34  * http://www.sciencemag.org/rss/current.xml
     35  * http://rss.sciencedirect.com/publication/science/20925212
     36  * http://www.ncbi.nlm.nih.gov/entrez/eutils/erss.cgi?rss_guid=1fmfIeN4X5Q8HemTZD5Rj6iu6-FQVCn7xc7_IPIIQtS1XiD9bf
     37  * http://export.arxiv.org/rss/astro-ph
     38  * http://fhs.dukejournals.org/rss_feeds/recent.xml
     39  */
     40 
     41 /**
     42  * class Zotero.FeedReader
     43  * Asynchronously reads an ATOM/RSS feed
     44  *
     45  * @param {String} url URL of the feed
     46  *
     47  * @property {Zotero.Promise<Object>} feedProperties An object
     48  *   representing feed properties
     49  * @property {Zotero.Promise<FeedItem>*} ItemIterator Returns an iterator
     50  *   for feed items. The iterator returns FeedItem promises that have to be
     51  *   resolved before requesting the next promise. When all items are exhausted.
     52  *   the promise resolves to null.
     53  * @method {void} terminate Stops retrieving/parsing the feed. Data parsed up
     54  *   to this point is still available.
     55  */
     56 Zotero.FeedReader = function(url) {
     57 	if (!url) throw new Error("Feed URL must be supplied");
     58 
     59 	
     60 	this._url = url;
     61 	this._feedItems = [Zotero.Promise.defer()];
     62 	this._feedProcessed = Zotero.Promise.defer();
     63 
     64 	let feedFetched = Zotero.Promise.defer();
     65 	feedFetched.promise.then(function(feed) {
     66 		let info = {};
     67 		
     68 		info.title = feed.title ? feed.title.plainText() : '';
     69 		info.subtitle = feed.subtitle ? feed.subtitle.plainText() : '';
     70 		
     71 		if (feed.updated) info.updated = new Date(feed.updated);
     72 		
     73 		// categories: MDN says "not yet implemented"
     74 		
     75 		info.creators = Zotero.FeedReader._processCreators(feed, 'authors', 'author');
     76 		
     77 		// TODO: image as icon
     78 		
     79 		let publicationTitle = Zotero.FeedReader._getFeedField(feed, 'publicationName', 'prism')
     80 			|| Zotero.FeedReader._getFeedField(feed, 'pubTitle');
     81 		if (publicationTitle) info.publicationTitle = publicationTitle;
     82 		
     83 		let publisher = Zotero.FeedReader._getFeedField(feed, 'publisher', 'dc');
     84 		if (publisher) info.publisher = publisher;
     85 		
     86 		let rights = (feed.rights && feed.rights.plainText())
     87 			|| Zotero.FeedReader._getFeedField(feed, 'copyright', 'prism')
     88 			|| Zotero.FeedReader._getFeedField(feed, 'rights', 'dc')
     89 			|| Zotero.FeedReader._getFeedField(feed, 'copyright');
     90 		if (rights) info.rights = rights;
     91 		
     92 		let issn = Zotero.FeedReader._getFeedField(feed, 'issn', 'prism');
     93 		if (issn) info.ISSN = issn;
     94 		
     95 		let isbn = Zotero.FeedReader._getFeedField(feed, 'isbn', 'prism')
     96 			|| Zotero.FeedReader._getFeedField(feed, 'isbn')
     97 		if (isbn) info.ISBN = isbn;
     98 		
     99 		let language = Zotero.FeedReader._getFeedField(feed, 'language', 'dc')
    100 			|| Zotero.FeedReader._getFeedField(feed, 'language');
    101 		if (language) info.language = language;
    102 		
    103 		let ttl = Zotero.FeedReader._getFeedField(feed, 'ttl');
    104 		if (ttl) info.ttl = ttl;
    105 		
    106 		this._feedProperties = info;
    107 		this._feed = feed;
    108 	}.bind(this)).then(function(){
    109 		let items = this._feed.items;
    110 		if (items && items.length) {
    111 			for (let i=0; i<items.length; i++) {
    112 				let item = items.queryElementAt(i, Components.interfaces.nsIFeedEntry);
    113 				if (!item) continue;
    114 				
    115 				let feedItem = Zotero.FeedReader._getFeedItem(item, this._feedProperties);
    116 				if (!feedItem) continue;
    117 				
    118 				let lastItem = this._feedItems[this._feedItems.length - 1];
    119 				this._feedItems.push(Zotero.Promise.defer()); // Push a new deferred promise so an iterator has something to return
    120 				lastItem.resolve(feedItem);
    121 			}
    122 		}
    123 		this._feedProcessed.resolve();
    124 	}.bind(this)).catch(function(e) {
    125 		Zotero.debug("Feed processing failed " + e.message);
    126 		this._feedProcessed.reject(e);
    127 	}.bind(this)).finally(function() {
    128 		// Make sure the last promise gets resolved to null
    129 		let lastItem = this._feedItems[this._feedItems.length - 1];
    130 		lastItem.resolve(null);
    131 	}.bind(this));
    132 	
    133 	// Set up asynchronous feed processor
    134 	let feedProcessor = Components.classes["@mozilla.org/feed-processor;1"]
    135 		.createInstance(Components.interfaces.nsIFeedProcessor);
    136 
    137 	let feedUrl = Services.io.newURI(url, null, null);
    138 	feedProcessor.parseAsync(null, feedUrl);
    139 	
    140 	feedProcessor.listener = {
    141 		/*
    142 		 * MDN suggests that we could use nsIFeedProgressListener to handle the feed
    143 		 * as it gets loaded, but this is actually not implemented (as of 32.0.3),
    144 		 * so we have to load the whole feed and handle it in handleResult.
    145 		 */
    146 		handleResult: (result) => {
    147 			if (!result.doc) {
    148 				this.terminate("No Feed");
    149 				return;
    150 			}
    151 			
    152 			let newFeed = result.doc.QueryInterface(Components.interfaces.nsIFeed);
    153 			feedFetched.resolve(newFeed);
    154 		}
    155 	};
    156 	
    157 	Zotero.debug("FeedReader: Fetching feed from " + feedUrl.spec);
    158 	
    159 	this._channel = Services.io.newChannelFromURI2(feedUrl, null, 
    160 		Services.scriptSecurityManager.getSystemPrincipal(), null, 
    161 		Ci.nsILoadInfo.SEC_NORMAL, Ci.nsIContentPolicy.TYPE_OTHER);
    162 	this._channel.loadFlags |= Components.interfaces.nsIRequest.LOAD_BYPASS_CACHE;
    163 	this._channel.asyncOpen(feedProcessor, null); // Sends an HTTP request
    164 }
    165 
    166 /*
    167  * The constructor initiates async feed processing, but _feedProcessed
    168  * needs to be resolved before proceeding.
    169  */
    170 Zotero.FeedReader.prototype.process = Zotero.Promise.coroutine(function* () {
    171 	return this._feedProcessed.promise;
    172 });
    173 
    174 /*
    175  * Terminate feed processing at any given time
    176  * @param {String} status Reason for terminating processing
    177  */
    178 Zotero.FeedReader.prototype.terminate = function(status) {
    179 	Zotero.debug("FeedReader: Terminating feed reader (" + status + ")");
    180 	
    181 	// Reject feed promise if not resolved yet
    182 	if (this._feedProcessed.promise.isPending()) {
    183 		this._feedProcessed.reject(new Error(status));
    184 	}
    185 	
    186 	// Reject feed item promise if not resolved yet
    187 	let lastItem = this._feedItems[this._feedItems.length - 1];
    188 	if (lastItem.promise.isPending()) {
    189 		// It seemed like a good idea to reject the last item but
    190 		// it's not really been useful yet, aside from bluebird
    191 		// throwing errors about unhandled rejections in tests
    192 		// so we suppress them here. TODO: We should probably
    193 		// rethink whether this code makes sense and make it better.
    194 		let er = new Error(status);
    195 		er.handledRejection = true;
    196 		lastItem.reject(er);
    197 	}
    198 	
    199 	// Close feed connection
    200 	if (this._channel.isPending()) {
    201 		this._channel.cancel(Components.results.NS_BINDING_ABORTED);
    202 	}
    203 };
    204 
    205 Zotero.defineProperty(Zotero.FeedReader.prototype, 'feedProperties', {
    206 	get: function(){ 
    207 		if (!this._feedProperties) {
    208 			throw new Error("Feed has not been resolved yet. Try calling FeedReader#process first")
    209 		}
    210 		return this._feedProperties
    211 	}
    212 });
    213 
    214 /*
    215  * Feed item iterator
    216  * Each iteration returns a _promise_ for an item. The promise _MUST_ be
    217  * resolved before requesting the next item.
    218  * The last item will always be resolved to `null`, unless the feed processing
    219  * is terminated ahead of time, in which case it will be rejected with the reason
    220  * for termination.
    221  */
    222 Zotero.defineProperty(Zotero.FeedReader.prototype, 'ItemIterator', {
    223 	get: function() {
    224 		let items = this._feedItems;
    225 		let feedReader = this;
    226 		
    227 		let iterator = function() {
    228 			if (!feedReader._feedProperties) {
    229 				throw new Error("Feed has not been resolved yet. Try calling FeedReader#process first")
    230 			}
    231 			this.index = 0;
    232 		};
    233 		
    234 		iterator.prototype.next = function() {
    235 			let item = items[this.index++];
    236 			return {
    237 				value: item ? item.promise : null,
    238 				done: this.index >= items.length
    239 			};
    240 		};
    241 		
    242 		iterator.prototype.last = function() {
    243 			return items[items.length-1];
    244 		}
    245 		
    246 		return iterator;
    247 	}
    248 }, {lazy: true});
    249 
    250 
    251 /*****************************
    252  * Item processing functions *
    253  *****************************/
    254  	 
    255 /**
    256  * Determine item type based on item data
    257  */
    258 Zotero.FeedReader._guessItemType = function(item) {
    259 	// Default to journalArticle
    260 	item.itemType = 'journalArticle';
    261 	
    262 	if (item.ISSN) {
    263 		return; // journalArticle
    264 	}
    265 	
    266 	if (item.ISBN) {
    267 		item.itemType = 'bookSection';
    268 		return;
    269 	}
    270 	
    271 	if (item.publicationType) {
    272 		let type = item.publicationType.toLowerCase();
    273 		if (type.indexOf('conference') != -1) {
    274 			item.itemType = 'conferencePaper';
    275 			return;
    276 		}
    277 		if (type.indexOf('journal') != -1) {
    278 			item.itemType = 'journalArticle';
    279 			return;
    280 		}
    281 		if (type.indexOf('book') != -1) {
    282 			item.itemType = 'bookSection';
    283 			return;
    284 		}
    285 	}
    286 };
    287 
    288 /*
    289  * Fetch creators from given field of a feed entry
    290  */
    291 Zotero.FeedReader._processCreators = function(feedEntry, field, role) {
    292 	let names = [],
    293 		nameStr;
    294 	try {
    295 		let personArr = feedEntry[field]; // Seems like this part can throw if there is no author data in the feed
    296 		for (let i=0; i<personArr.length; i++) {
    297 			let person = personArr.queryElementAt(i, Components.interfaces.nsIFeedPerson);
    298 			if (!person || !person.name) continue;
    299 			
    300 			let name = Zotero.Utilities.cleanTags(Zotero.Utilities.trimInternal(person.name));
    301 			if (!name) continue;
    302 			
    303 			let commas = name.split(',').length - 1,
    304 					other = name.split(/\s(?:and|&)\s|;/).length - 1,
    305 					separators = commas + other;
    306 			if (personArr.length == 1 &&
    307 				// Has typical name separators
    308 				(other || commas > 1
    309 				// If only one comma and first part has more than one space,
    310 				// it's probably not lastName, firstName
    311 					|| (commas == 1 && name.split(/\s*,/)[0].indexOf(' ') != -1)
    312 				)
    313 			) {
    314 				// Probably multiple authors listed in a single field
    315 				nameStr = name;
    316 				break; // For clarity. personArr.length == 1 anyway
    317 			} else {
    318 				names.push(name);
    319 			}
    320 		}
    321 	} 
    322 	catch(e) {
    323 		if (e.result != Components.results.NS_ERROR_FAILURE) throw e;
    324 		
    325 		if (field != 'authors') return [];
    326 		
    327 		// ieeexplore places these in "authors"... sigh
    328 		nameStr = Zotero.FeedReader._getFeedField(feedEntry, 'authors');
    329 		if (nameStr) nameStr = Zotero.Utilities.trimInternal(nameStr);
    330 		if (!nameStr) return [];
    331 	}
    332 	
    333 	if (nameStr) {
    334 		names = nameStr.split(/\s(?:and|&)\s|\s*[,;]\s*/);
    335 	}
    336 	
    337 	let creators = [];
    338 	for (let i=0; i<names.length; i++) {
    339 		let creator = Zotero.Utilities.cleanAuthor(
    340 			names[i],
    341 			role,
    342 			names[i].split(',').length == 2
    343 		);
    344 		if (!creator.firstName) {
    345 			creator.fieldMode = 1;
    346 		}
    347 		// Sometimes these end up empty when parsing really nasty HTML based fields, so just skip.
    348 		if (!creator.firstName && !creator.lastName) {
    349 			continue;
    350 		}
    351 		
    352 		creators.push(creator);
    353 	}
    354 	return creators;
    355 }
    356 
    357 /*
    358  * Parse feed entry into a Zotero item
    359  */
    360 Zotero.FeedReader._getFeedItem = function(feedEntry, feedInfo) {
    361 	// ID is not required, but most feeds have these and we have to rely on them
    362 	// to handle updating properly
    363 	// Can probably fall back to links on missing id - unlikely to change
    364 	if (!feedEntry.id && !feedEntry.link) {
    365 		Zotero.debug("FeedReader: Feed item missing an ID or link - discarding");
    366 		return;
    367 	}
    368 	
    369 	let item = {
    370 		guid: feedEntry.id || feedEntry.link.spec
    371 	};
    372 			
    373 	if (feedEntry.title) item.title = Zotero.FeedReader._getRichText(feedEntry.title, 'title');
    374 	
    375 	if (feedEntry.summary) {
    376 		item.abstractNote = Zotero.FeedReader._getRichText(feedEntry.summary, 'abstractNote');
    377 		
    378 		if (!item.title) {
    379 			// We will probably have to trim this, so let's use plain text to
    380 			// avoid splitting inside some markup
    381 			let title = Zotero.Utilities.trimInternal(feedEntry.summary.plainText());
    382 			let splitAt = title.lastIndexOf(' ', 50);
    383 			if (splitAt == -1) splitAt = 50;
    384 			
    385 			item.title = title.substr(0, splitAt);
    386 			if (splitAt <= title.length) item.title += '...';
    387 		}
    388 	}
    389 	
    390 	if (feedEntry.link) item.url = feedEntry.link.spec;
    391 	
    392 	if (feedEntry.rights) item.rights = Zotero.FeedReader._getRichText(feedEntry.rights, 'rights');
    393 	
    394 	item.creators = Zotero.FeedReader._processCreators(feedEntry, 'authors', 'author');
    395 	if (!item.creators.length) {
    396 		// Use feed authors as item author. Maybe not the best idea.
    397 		for (let i=0; i<feedInfo.creators.length; i++) {
    398 			if (feedInfo.creators[i].creatorType != 'author') continue;
    399 			item.creators.push(feedInfo.creators[i]);
    400 		}
    401 	}
    402 	
    403 	let contributors = Zotero.FeedReader._processCreators(feedEntry, 'contributors', 'contributor');
    404 	if (contributors.length) item.creators = item.creators.concat(contributors);
    405 	
    406 	/** Done with basic metadata, now look for better data **/
    407 	
    408 	let date = Zotero.FeedReader._getFeedField(feedEntry, 'publicationDate', 'prism')
    409 		|| Zotero.FeedReader._getFeedField(feedEntry, 'date', 'dc')
    410 		// DEBUG: Why not get these from the feedEntry?
    411 		|| Zotero.FeedReader._getFeedField(feedEntry, 'pubDate') // RSS
    412 		|| Zotero.FeedReader._getFeedField(feedEntry, 'updated', 'atom') // Atom
    413 		|| Zotero.FeedReader._getFeedField(feedEntry, 'published', 'atom'); // Atom
    414 		
    415 	
    416 	if (date) item.date = date;
    417 	
    418 	let publicationTitle = Zotero.FeedReader._getFeedField(feedEntry, 'publicationName', 'prism')
    419 		|| Zotero.FeedReader._getFeedField(feedEntry, 'source', 'dc')
    420 		|| Zotero.FeedReader._getFeedField(feedEntry, 'pubTitle');
    421 	if (publicationTitle) item.publicationTitle = publicationTitle;
    422 	
    423 	let publicationType = Zotero.FeedReader._getFeedField(feedEntry, 'pubType');
    424 	if (publicationType) item.publicationType = publicationType;
    425 	
    426 	let startPage = Zotero.FeedReader._getFeedField(feedEntry, 'startPage');
    427 	let endPage = Zotero.FeedReader._getFeedField(feedEntry, 'endPage');
    428 	if (startPage || endPage) {
    429 		item.pages = ( startPage || '' )
    430 			+ ( endPage && startPage ? '–' : '' )
    431 			+ ( endPage || '' );
    432 	}
    433 	
    434 	let issn = Zotero.FeedReader._getFeedField(feedEntry, 'issn', 'prism');
    435 	if (issn) item.ISSN = issn;
    436 	
    437 	let isbn = Zotero.FeedReader._getFeedField(feedEntry, 'isbn', 'prism')
    438 		|| Zotero.FeedReader._getFeedField(feedEntry, 'isbn')
    439 	if (isbn) item.ISBN = isbn;
    440 	
    441 	let identifier = Zotero.FeedReader._getFeedField(feedEntry, 'identifier', 'dc');
    442 	if (identifier) {
    443 		let cleanId = Zotero.Utilities.cleanDOI(identifier);
    444 		if (cleanId) {
    445 			if (!item.DOI) item.DOI = cleanId;
    446 		} else if (cleanId = Zotero.Utilities.cleanISBN(identifier)) {
    447 			if (!item.ISBN) item.ISBN = cleanId;
    448 		} else if (cleanId = Zotero.Utilities.cleanISSN(identifier)) {
    449 			if (!item.ISSN) item.ISSN = cleanId;
    450 		}
    451 	}
    452 	
    453 	let publisher = Zotero.FeedReader._getFeedField(feedEntry, 'publisher', 'dc');
    454 	if (publisher) item.publisher = publisher;
    455 	
    456 	let rights = Zotero.FeedReader._getFeedField(feedEntry, 'copyright', 'prism')
    457 		|| Zotero.FeedReader._getFeedField(feedEntry, 'rights', 'dc')
    458 		|| Zotero.FeedReader._getFeedField(feedEntry, 'copyright');
    459 	if (rights) item.rights = rights;
    460 	
    461 	let language = Zotero.FeedReader._getFeedField(feedEntry, 'language', 'dc')
    462 		|| Zotero.FeedReader._getFeedField(feedEntry, 'language');
    463 	if (language) item.language = language;
    464 	
    465 	/** Incorporate missing values from feed metadata **/
    466 	
    467 	let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language'];
    468 	for (let i=0; i<supplementFields.length; i++) {
    469 		let field = supplementFields[i];
    470 		if (!item[field] && feedInfo[field]) {
    471 			item[field] = feedInfo[field];
    472 		}
    473 	}
    474 	
    475 	Zotero.FeedReader._guessItemType(item);
    476 	
    477 	item.enclosedItems = Zotero.FeedReader._getEnclosedItems(feedEntry);
    478 	
    479 	return item;
    480 }
    481 
    482 /*********************
    483  * Utility functions *
    484  *********************/
    485 /*
    486  * Convert HTML-formatted text to Zotero-compatible formatting
    487  */
    488 Zotero.FeedReader._getRichText = function(feedText, field) {
    489 	let domDiv = Zotero.Utilities.Internal.getDOMDocument().createElement("div");
    490 	let domFragment = feedText.createDocumentFragment(domDiv);
    491 	return Zotero.Utilities.dom2text(domFragment, field);
    492 };
    493 
    494 /*
    495  * Get field value from feed entry by namespace:fieldName
    496  */
    497 // Properties are stored internally as ns+name, but only some namespaces are
    498 // supported. Others are just "null"
    499 let ns = {
    500 	'prism': 'null',
    501 	'dc': 'dc:'
    502 }
    503 Zotero.FeedReader._getFeedField = function(feedEntry, field, namespace) {
    504 	let prefix = namespace ? ns[namespace] || 'null' : '';
    505 	try {
    506 		return feedEntry.fields.getPropertyAsAUTF8String(prefix+field);
    507 	} catch(e) {}
    508 	
    509 	try {
    510 		if (namespace && !ns[namespace]) {
    511 			prefix = namespace + ':';
    512 			return feedEntry.fields.getPropertyAsAUTF8String(prefix+field);
    513 		}
    514 	} catch(e) {}
    515 	
    516 	return;
    517 }
    518 
    519 Zotero.FeedReader._getEnclosedItems = function(feedEntry) {
    520 	var enclosedItems = [];
    521 	
    522 	if (feedEntry.enclosures) {
    523 		for (let i = 0; i < feedEntry.enclosures.length; i++) {
    524 			let elem = feedEntry.enclosures.queryElementAt(0, Components.interfaces.nsIPropertyBag2);
    525 			if (elem.get('url')) {
    526 				let enclosedItem = {url: elem.get('url'), contentType: elem.get('type') || ''};
    527 				enclosedItems.push(enclosedItem);
    528 			}
    529 		}
    530 	}
    531 	
    532 	return enclosedItems;
    533 }