www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit ca36096bcf2401b59334fd58d27ada32d4184179
parent c02baa639d64deb860c4e6769d6f364e4e20729b
Author: Aurimas Vinckevicius <aurimas.dev@gmail.com>
Date:   Thu,  6 Nov 2014 22:08:56 -0600

Add FeedReader

Diffstat:
Achrome/content/zotero/xpcom/feedReader.js | 531+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcomponents/zotero-service.js | 1+
2 files changed, 532 insertions(+), 0 deletions(-)

diff --git a/chrome/content/zotero/xpcom/feedReader.js b/chrome/content/zotero/xpcom/feedReader.js @@ -0,0 +1,530 @@ +/* + ***** BEGIN LICENSE BLOCK ***** + + Copyright © 2015 Center for History and New Media + George Mason University, Fairfax, Virginia, USA + http://zotero.org + + This file is part of Zotero. + + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see <http://www.gnu.org/licenses/>. + + ***** END LICENSE BLOCK ***** +*/ + + +/** + * Sample feeds: + * + * http://cyber.law.harvard.edu/rss/examples/rss2sample.xml + * http://feeds.feedburner.com/acs/acbcct + * http://www.cell.com/molecular-cell/current.rss + * http://ieeexplore.ieee.org/search/searchresult.jsp?searchField%3DSearch_All%26queryText%3Dwater&searchOrigin=saved_searches&rssFeed=true&rssFeedName=water + * http://www.sciencemag.org/rss/current.xml + * http://rss.sciencedirect.com/publication/science/20925212 + * http://www.ncbi.nlm.nih.gov/entrez/eutils/erss.cgi?rss_guid=1fmfIeN4X5Q8HemTZD5Rj6iu6-FQVCn7xc7_IPIIQtS1XiD9bf + * http://export.arxiv.org/rss/astro-ph + */ + +/** + * class Zotero.FeedReader + * Asynchronously reads an ATOM/RSS feed + * + * @param {String} url URL of the feed + * + * @property {Zotero.Promise<Object>} feedProperties An object + * representing feed properties + * @property {Zotero.Promise<FeedItem>*} itemIterator Returns an iterator + * for feed items. The iterator returns FeedItem promises that have to be + * resolved before requesting the next promise. When all items are exhausted. + * the promise resolves to null. + * @method {void} terminate Stops retrieving/parsing the feed. Data parsed up + * to this point is still available. + */ +Zotero.FeedReader = new function() { + let ios = Components.classes["@mozilla.org/network/io-service;1"] + .getService(Components.interfaces.nsIIOService); + + /***************************** + * Item processing functions * + *****************************/ + + /** + * Determine item type based on item data + */ + function guessItemType(item) { + // Default to journalArticle + item.itemType = 'journalArticle'; + + if (item.ISSN) { + return; // journalArticle + } + + if (item.ISBN) { + item.itemType = 'bookSection'; + return; + } + + if (item.publicationType) { + let type = item.publicationType.toLowerCase(); + if (type.indexOf('conference') != -1) { + item.itemType = 'conferencePaper'; + return; + } + if (type.indexOf('journal') != -1) { + item.itemType = 'journalArticle'; + return; + } + if (type.indexOf('book') != -1) { + item.itemType = 'bookSection'; + return; + } + } + }; + + /* + * Fetch creators from given field of a feed entry + */ + function processCreators(feedEntry, field, role) { + let names = [], + nameStr; + try { + let personArr = feedEntry[field]; // Seems like this part can throw if there is no author data in the feed + for (let i=0; i<personArr.length; i++) { + let person = personArr.queryElementAt(i, Components.interfaces.nsIFeedPerson); + if (!person || !person.name) continue; + + let name = Zotero.Utilities.trimInternal(person.name); + if (!name) continue; + + let commas = name.split(',').length - 1, + other = name.split(/\s(?:and|&)\s|;/).length - 1, + separators = commas + other; + if (personArr.length == 1 && + // Has typical name separators + (other || commas > 1 + // If only one comma and first part has more than one space, + // it's probably not lastName, firstName + || (commas == 1 && name.split(/\s*,/)[0].indexOf(' ') != -1) + ) + ) { + // Probably multiple authors listed in a single field + nameStr = name; + break; // For clarity. personArr.length == 1 anyway + } else { + names.push(name); + } + } + } catch(e) { + if (e.result != Components.results.NS_ERROR_FAILURE) throw e + + if (field != 'authors') return []; + + // ieeexplore places these in "authors"... sigh + nameStr = getFeedField(feedEntry, null, 'authors'); + if (nameStr) nameStr = Zotero.Utilities.trimInternal(nameStr); + if (!nameStr) return []; + } + + if (nameStr) { + names = nameStr.split(/\s(?:and|&)\s|\s*[,;]\s*/); + } + + let creators = []; + for (let i=0; i<names.length; i++) { + let creator = Zotero.Utilities.cleanAuthor( + names[i], + role, + names[i].split(',').length == 2 + ); + if (!creator.firstName) { + creator.fieldMode = 1; + } + + creators.push(creator); + } + return creators; + } + + /********************* + * Utility functions * + *********************/ + /* + * Convert HTML-formatted text to Zotero-compatible formatting + */ + let domDiv = Zotero.Utilities.Internal.getDOMDocument().createElement("div"); + function getRichText(feedText, field) { + let domFragment = feedText.createDocumentFragment(domDiv); + return Zotero.Utilities.dom2text(domFragment, field); + } + + /* + * Format JS date as SQL date + time zone offset + */ + function formatDate(date) { + let offset = (date.getTimezoneOffset() / 60) * -1; + let absOffset = Math.abs(offset); + offset = offset + ? ' ' + (offset < 0 ? '-' : '+') + + Zotero.Utilities.lpad(Math.floor(absOffset), '0', 2) + + ('' + ( (absOffset - Math.floor(absOffset)) || '' )).substr(1) // Get ".5" fraction or "" otherwise + : ''; + return Zotero.Date.dateToSQL(date, false) + offset; + } + + /* + * Get field value from feed entry by namespace:fieldName + */ + // Properties are stored internally as ns+name, but only some namespaces are + // supported. Others are just "null" + let ns = { + 'prism': 'null', + 'dc': 'dc:' + } + function getFeedField(feedEntry, namespace, field) { + let prefix = namespace ? ns[namespace] || 'null' : ''; + try { + return feedEntry.fields.getPropertyAsAUTF8String(prefix+field); + } catch(e) {} + + try { + if (namespace && !ns[namespace]) { + prefix = namespace + ':'; + return feedEntry.fields.getPropertyAsAUTF8String(prefix+field); + } + } catch(e) {} + + return; + } + + /* + * Parse feed entry into a Zotero item + */ + function getFeedItem(feedEntry, feedInfo) { + // ID is not required, but most feeds have these and we have to rely on them + // to handle updating properly + if (!feedEntry.id) { + Zotero.debug("FeedReader: Feed item missing an ID"); + return; + } + + let item = { + guid: feedEntry.id + }; + + if (feedEntry.title) item.title = getRichText(feedEntry.title, 'title'); + + if (feedEntry.summary) { + item.abstractNote = getRichText(feedEntry.summary, 'abstractNote'); + + if (!item.title) { + // We will probably have to trim this, so let's use plain text to + // avoid splitting inside some markup + let title = Zotero.Utilities.trimInternal(feedEntry.summary.plainText()); + let splitAt = title.lastIndexOf(' ', 50); + if (splitAt == -1) splitAt = 50; + + item.title = title.substr(0, splitAt); + if (splitAt <= title.length) item.title += '...'; + } + } + + if (feedEntry.link) item.url = feedEntry.link.spec; + + if (feedEntry.updated) item.dateModified = new Date(feedEntry.updated); + + if (feedEntry.published) { + let date = new Date(feedEntry.published); + + if (!date.getUTCSeconds() && !(date.getUTCHours() && date.getUTCMinutes())) { + // There was probably no time, but there may have been a a date range, + // so something could have ended up in the hour _or_ minute field + item.date = getFeedField(feedEntry, null, 'pubDate') + /* In case it was magically pulled from some other field */ + || ( date.getUTCFullYear() + '-' + + (date.getUTCMonth() + 1) + '-' + + date.getUTCDate() ); + } else { + item.date = formatDate(date); + // Add time zone + } + + if (!item.dateModified) { + items.dateModified = date; + } + } + + if (!item.dateModified) { + // When there's no reliable modification date, we can assume that item doesn't get updated + Zotero.debug("FeedReader: Feed item missing a modification date (" + item.guid + ")"); + item.dateModified = null; + } + + if (!item.date && item.dateModified) { + // Use lastModified date + item.date = formatDate(item.dateModified); + } + + // Convert date modified to string, since those are directly comparable + if (item.dateModified) item.dateModified = Zotero.Date.dateToSQL(item.dateModified, true); + + if (feedEntry.rights) item.rights = getRichText(feedEntry.rights, 'rights'); + + item.creators = processCreators(feedEntry, 'authors', 'author'); + if (!item.creators.length) { + // Use feed authors as item author. Maybe not the best idea. + for (let i=0; i<feedInfo.creators.length; i++) { + if (feedInfo.creators[i].creatorType != 'author') continue; + item.creators.push(feedInfo.creators[i]); + } + } + + let contributors = processCreators(feedEntry, 'contributors', 'contributor'); + if (contributors.length) item.creators = item.creators.concat(contributors); + + /** Done with basic metadata, now look for better data **/ + + let date = getFeedField(feedEntry, 'prism', 'publicationDate') + || getFeedField(feedEntry, 'dc', 'date'); + if (date) item.date = date; + + let publicationTitle = getFeedField(feedEntry, 'prism', 'publicationName') + || getFeedField(feedEntry, 'dc', 'source') + || getFeedField(feedEntry, null, 'pubTitle'); + if (publicationTitle) item.publicationTitle = publicationTitle; + + let publicationType = getFeedField(feedEntry, null, 'pubType'); + if (publicationType) item.publicationType = publicationType; + + let startPage = getFeedField(feedEntry, null, 'startPage'); + let endPage = getFeedField(feedEntry, null, 'endPage'); + if (startPage || endPage) { + item.pages = ( startPage || '' ) + + ( endPage && startPage ? '–' : '' ) + + ( endPage || '' ); + } + + let issn = getFeedField(feedEntry, 'prism', 'issn'); + if (issn) item.ISSN = issn; + + let isbn = getFeedField(feedEntry, 'prism', 'isbn') + || getFeedField(feedEntry, null, 'isbn') + if (isbn) item.ISBN = isbn; + + let identifier = getFeedField(feedEntry, 'dc', 'identifier'); + if (identifier) { + let cleanId = Zotero.Utilities.cleanDOI(identifier); + if (cleanId) { + if (!item.DOI) item.DOI = cleanId; + } else if (cleanId = Zotero.Utilities.cleanISBN(identifier)) { + if (!item.ISBN) item.ISBN = cleanId; + } else if (cleanId = Zotero.Utilities.cleanISSN(identifier)) { + if (!item.ISSN) item.ISSN = cleanId; + } + } + + let publisher = getFeedField(feedEntry, 'dc', 'publisher'); + if (publisher) item.publisher = publisher; + + let rights = getFeedField(feedEntry, 'prism', 'copyright') + || getFeedField(feedEntry, 'dc', 'rights') + || getFeedField(feedEntry, null, 'copyright'); + if (rights) item.rights = rights; + + let language = getFeedField(feedEntry, 'dc', 'language') + || getFeedField(feedEntry, null, 'language'); + if (language) item.language = language; + + /** Incorporate missing values from feed metadata **/ + + let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language']; + for (let i=0; i<supplementFields.length; i++) { + let field = supplementFields[i]; + if (!item[field] && feedInfo[field]) { + item[field] = feedInfo[field]; + } + } + + guessItemType(item); + + return item; + } + + /********************* + * FeedReader object * + *********************/ + let FeedReader = function(url) { + if (!url) throw new Error("Feed URL must be supplied"); + + this._feed = Zotero.Promise.defer(); // Fetched asynchronously + + this._feedProperties = this._feed.promise + .then(function(feed) { + let info = {}; + + info.title = feed.title ? feed.title.plainText() : ''; + info.subtitle = feed.subtitle ? feed.subtitle.plainText() : ''; + + if (feed.updated) info.updated = new Date(feed.updated); + + // categories: MDN says "not yet implemented" + + info.creators = processCreators(feed, 'authors', 'author'); + + // TODO: image as icon + + let publicationTitle = getFeedField(feed, 'prism', 'publicationName') + || getFeedField(feed, null, 'pubTitle'); + if (publicationTitle) info.publicationTitle = publicationTitle; + + let publisher = getFeedField(feed, 'dc', 'publisher'); + if (publisher) info.publisher = publisher; + + let rights = (feed.rights && feed.rights.plainText()) + || getFeedField(feed, 'prism', 'copyright') + || getFeedField(feed, 'dc', 'rights') + || getFeedField(feed, null, 'copyright'); + if (rights) info.rights = rights; + + let issn = getFeedField(feed, 'prism', 'issn'); + if (issn) info.ISSN = issn; + + let isbn = getFeedField(feed, 'prism', 'isbn') + || getFeedField(feed, null, 'isbn') + if (isbn) info.ISBN = isbn; + + let language = getFeedField(feed, 'dc', 'language') + || getFeedField(feed, null, 'language'); + if (language) info.language = language; + + let ttl = getFeedField(feed, null, 'ttl'); + if (ttl) info.ttl = ttl; + + return info; + }); + + // Array of deferred item promises + this._feedItems = [Zotero.Promise.defer()]; + + // Process items once they're available and push them into the array + Zotero.Promise.join( + this._feed.promise, + this._feedProperties, + (feed, feedInfo) => { + let items = feed.items; + if (items && items.length) { + for (let i=0; i<items.length; i++) { + let item = items.queryElementAt(i, Components.interfaces.nsIFeedEntry); + if (!item) continue; + + let feedItem = getFeedItem(item, feedInfo); + if (!feedItem) continue; + + let lastItem = this._feedItems[this._feedItems.length - 1]; + this._feedItems.push(Zotero.Promise.defer()); // Push a new deferred promise so an iterator has something to return + lastItem.resolve(feedItem); + } + } + } + ) + .finally(() => { + // Make sure the last promise gets resolved to null + let lastItem = this._feedItems[this._feedItems.length - 1]; + lastItem.resolve(null); + }); + + // Set up asynchronous feed processor + let feedProcessor = Components.classes["@mozilla.org/feed-processor;1"] + .createInstance(Components.interfaces.nsIFeedProcessor); + + let feedUrl = ios.newURI(url, null, null); + feedProcessor.parseAsync(null, feedUrl); + + feedProcessor.listener = { + /* + * MDN suggests that we could use nsIFeedProgressListener to handle the feed + * as it gets loaded, but this is actually not implemented (as of 32.0.3), + * so we have to load the whole feed and handle it in handleResult. + */ + handleResult: (result) => { + if (!result.doc) { + this.terminate("No Feed"); + return; + } + + let newFeed = result.doc.QueryInterface(Components.interfaces.nsIFeed); + this._feed.resolve(newFeed); + } + }; + + Zotero.debug("FeedReader: Fetching feed from " + feedUrl.spec); + + this._channel = ios.newChannelFromURI(feedUrl); + this._channel.asyncOpen(feedProcessor, null); // Sends an HTTP request + } + + Zotero.defineProperty(FeedReader.prototype, 'feedProperties', { + get: function() this._feedProperties + }); + + /* + * Feed item iterator + * Each iteration returns a _promise_ for an item. The promise _MUST_ be + * resolved before requesting the next item. + * The last item will always be resolved to `null`, unless the feed processing + * is terminated ahead of time, in which case it will be rejected with the reason + * for termination. + */ + Zotero.defineProperty(FeedReader.prototype, 'itemIterator', { + get: function() { + let items = this._feedItems; + return new function() { + let i = 0; + this.next = function() { + let item = items[i++]; + return { + value: item ? item.promise : null, + done: i >= items.length + }; + }; + } + } + }); + + /* + * Terminate feed processing at any given time + * @param {String} status Reason for terminating processing + */ + FeedReader.prototype.terminate = function(status) { + Zotero.debug("FeedReader: Terminating feed reader (" + status + ")"); + + // Reject feed promise if not resolved yet + if (this._feed.promise.isPending()) { + this._feed.reject(status); + } + + // Reject feed item promise if not resolved yet + let lastItem = this._feedItems[this._feedItems.length - 1]; + if (lastItem.promise.isPending()) { + lastItem.reject(status); + } + + // Close feed connection + if (channel.isPending) { + channel.cancel(Components.results.NS_BINDING_ABORTED); + } + }; + + return FeedReader; +}; +\ No newline at end of file diff --git a/components/zotero-service.js b/components/zotero-service.js @@ -85,6 +85,7 @@ const xpcomFilesLocal = [ 'data/tags', 'db', 'duplicates', + 'feedReader', 'fulltext', 'id', 'integration',