commit ca36096bcf2401b59334fd58d27ada32d4184179
parent c02baa639d64deb860c4e6769d6f364e4e20729b
Author: Aurimas Vinckevicius <aurimas.dev@gmail.com>
Date: Thu, 6 Nov 2014 22:08:56 -0600
Add FeedReader
Diffstat:
2 files changed, 532 insertions(+), 0 deletions(-)
diff --git a/chrome/content/zotero/xpcom/feedReader.js b/chrome/content/zotero/xpcom/feedReader.js
@@ -0,0 +1,530 @@
+/*
+ ***** BEGIN LICENSE BLOCK *****
+
+ Copyright © 2015 Center for History and New Media
+ George Mason University, Fairfax, Virginia, USA
+ http://zotero.org
+
+ This file is part of Zotero.
+
+ Zotero is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Zotero is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with Zotero. If not, see <http://www.gnu.org/licenses/>.
+
+ ***** END LICENSE BLOCK *****
+*/
+
+
+/**
+ * Sample feeds:
+ *
+ * http://cyber.law.harvard.edu/rss/examples/rss2sample.xml
+ * http://feeds.feedburner.com/acs/acbcct
+ * http://www.cell.com/molecular-cell/current.rss
+ * http://ieeexplore.ieee.org/search/searchresult.jsp?searchField%3DSearch_All%26queryText%3Dwater&searchOrigin=saved_searches&rssFeed=true&rssFeedName=water
+ * http://www.sciencemag.org/rss/current.xml
+ * http://rss.sciencedirect.com/publication/science/20925212
+ * http://www.ncbi.nlm.nih.gov/entrez/eutils/erss.cgi?rss_guid=1fmfIeN4X5Q8HemTZD5Rj6iu6-FQVCn7xc7_IPIIQtS1XiD9bf
+ * http://export.arxiv.org/rss/astro-ph
+ */
+
+/**
+ * class Zotero.FeedReader
+ * Asynchronously reads an ATOM/RSS feed
+ *
+ * @param {String} url URL of the feed
+ *
+ * @property {Zotero.Promise<Object>} feedProperties An object
+ * representing feed properties
+ * @property {Zotero.Promise<FeedItem>*} itemIterator Returns an iterator
+ * for feed items. The iterator returns FeedItem promises that have to be
+ * resolved before requesting the next promise. When all items are exhausted,
+ * the promise resolves to null.
+ * @method {void} terminate Stops retrieving/parsing the feed. Data parsed up
+ * to this point is still available.
+ */
+Zotero.FeedReader = new function() {
+ // Shared IO service; used below to build the feed URI and to open its channel
+ let ioServiceClass = Components.classes["@mozilla.org/network/io-service;1"];
+ let ios = ioServiceClass.getService(Components.interfaces.nsIIOService);
+
+ /*****************************
+ * Item processing functions *
+ *****************************/
+
+ /**
+  * Determine item type based on item data
+  *
+  * Sets item.itemType in place: an ISSN keeps the journalArticle default, an
+  * ISBN means bookSection, otherwise the first known keyword found in
+  * item.publicationType decides. Nothing is returned.
+  *
+  * @param {Object} item Item data; reads ISSN, ISBN and publicationType
+  */
+ function guessItemType(item) {
+     // Checked in order; the first keyword contained in publicationType wins
+     let keywordTypes = [
+         ['conference', 'conferencePaper'],
+         ['journal', 'journalArticle'],
+         ['book', 'bookSection']
+     ];
+
+     // Default to journalArticle
+     item.itemType = 'journalArticle';
+
+     // An ISSN implies a journal, so the default stands
+     if (item.ISSN) return;
+
+     if (item.ISBN) {
+         item.itemType = 'bookSection';
+     } else if (item.publicationType) {
+         let lowered = item.publicationType.toLowerCase();
+         for (let k = 0; k < keywordTypes.length; k++) {
+             if (lowered.indexOf(keywordTypes[k][0]) == -1) continue;
+             item.itemType = keywordTypes[k][1];
+             break;
+         }
+     }
+ }
+
+ /*
+  * Fetch creators from given field of a feed entry
+  *
+  * @param {Object} feedEntry Feed or feed entry whose `field` property is a
+  *     list of nsIFeedPerson objects
+  * @param {String} field Property to read, e.g. 'authors' or 'contributors'
+  * @param {String} role Creator type passed on to Zotero.Utilities.cleanAuthor
+  * @return {Object[]} Creators as returned by cleanAuthor; creators without a
+  *     first name get fieldMode 1. Empty array when no names are found.
+  */
+ function processCreators(feedEntry, field, role) {
+     let names = [],
+         nameStr,
+         noPersonData = false;
+     try {
+         let personArr = feedEntry[field]; // Seems like this part can throw if there is no author data in the feed
+         if (!personArr) {
+             // A missing list must not surface as a TypeError, which would be
+             // rethrown below; handle it like the throwing case instead
+             noPersonData = true;
+         } else {
+             for (let i=0; i<personArr.length; i++) {
+                 let person = personArr.queryElementAt(i, Components.interfaces.nsIFeedPerson);
+                 if (!person || !person.name) continue;
+
+                 let name = Zotero.Utilities.trimInternal(person.name);
+                 if (!name) continue;
+
+                 let commas = name.split(',').length - 1,
+                     other = name.split(/\s(?:and|&)\s|;/).length - 1;
+                 if (personArr.length == 1 &&
+                     // Has typical name separators
+                     (other || commas > 1
+                         // If only one comma and first part has more than one space,
+                         // it's probably not lastName, firstName
+                         || (commas == 1 && name.split(/\s*,/)[0].indexOf(' ') != -1)
+                     )
+                 ) {
+                     // Probably multiple authors listed in a single field
+                     nameStr = name;
+                     break; // For clarity. personArr.length == 1 anyway
+                 } else {
+                     names.push(name);
+                 }
+             }
+         }
+     } catch(e) {
+         if (e.result != Components.results.NS_ERROR_FAILURE) throw e;
+         noPersonData = true;
+     }
+
+     if (noPersonData) {
+         if (field != 'authors') return [];
+
+         // ieeexplore places these in "authors"... sigh
+         nameStr = getFeedField(feedEntry, null, 'authors');
+         if (nameStr) nameStr = Zotero.Utilities.trimInternal(nameStr);
+         if (!nameStr) return [];
+     }
+
+     if (nameStr) {
+         // A single string holding several names; split on "and", "&", "," and ";"
+         names = nameStr.split(/\s(?:and|&)\s|\s*[,;]\s*/);
+     }
+
+     let creators = [];
+     for (let i=0; i<names.length; i++) {
+         let creator = Zotero.Utilities.cleanAuthor(
+             names[i],
+             role,
+             // Exactly one comma: let cleanAuthor treat it as "last, first"
+             names[i].split(',').length == 2
+         );
+         if (!creator.firstName) {
+             creator.fieldMode = 1;
+         }
+
+         creators.push(creator);
+     }
+     return creators;
+ }
+
+ /*********************
+ * Utility functions *
+ *********************/
+ /*
+  * Convert HTML-formatted text to Zotero-compatible formatting
+  *
+  * @param {Object} feedText Feed text object providing createDocumentFragment()
+  *     (presumably an nsIFeedTextConstruct -- confirm)
+  * @param {String} field Name of the target item field, handed to dom2text
+  * @return Whatever Zotero.Utilities.dom2text produces for the fragment
+  */
+ // One shared element that every document fragment is created against
+ let domDocument = Zotero.Utilities.Internal.getDOMDocument();
+ let domDiv = domDocument.createElement("div");
+ function getRichText(feedText, field) {
+     return Zotero.Utilities.dom2text(feedText.createDocumentFragment(domDiv), field);
+ }
+
+ /*
+  * Format JS date as SQL date + time zone offset
+  *
+  * The offset is appended as " +HH" / " -HH", followed by the fractional hour
+  * (e.g. ".5") when there is one. Nothing is appended for a zero offset.
+  *
+  * @param {Date} date
+  * @return {String} Result of Zotero.Date.dateToSQL(date, false) plus the offset
+  */
+ function formatDate(date) {
+     // getTimezoneOffset() is in minutes with the opposite sign
+     let hoursFromUTC = (date.getTimezoneOffset() / 60) * -1;
+     let suffix = '';
+     if (hoursFromUTC) {
+         let magnitude = Math.abs(hoursFromUTC);
+         let wholeHours = Math.floor(magnitude);
+         let fraction = magnitude - wholeHours;
+         let sign = hoursFromUTC < 0 ? '-' : '+';
+         suffix = ' ' + sign
+             + Zotero.Utilities.lpad(wholeHours, '0', 2)
+             + ('' + (fraction || '')).substr(1); // Get ".5" fraction or "" otherwise
+     }
+     return Zotero.Date.dateToSQL(date, false) + suffix;
+ }
+
+ /*
+  * Get field value from feed entry by namespace:fieldName
+  *
+  * @param {Object} feedEntry Object whose `fields` property bag is queried
+  * @param {String} [namespace] Namespace prefix such as 'prism' or 'dc';
+  *     falsy for fields without a namespace
+  * @param {String} field Field name
+  * @return {String|undefined} The value, or undefined if no lookup succeeded
+  */
+ // Properties are stored internally as ns+name, but only some namespaces are
+ // supported. Others are just "null"
+ let ns = {
+     'prism': 'null',
+     'dc': 'dc:'
+ };
+ function getFeedField(feedEntry, namespace, field) {
+     // Property names to try, in order
+     let candidates;
+     if (!namespace) {
+         candidates = ['' + field];
+     } else if (ns[namespace]) {
+         candidates = [ns[namespace] + field];
+     } else {
+         // Unknown namespace: try the "null" storage form first, then the
+         // literal "namespace:field" form
+         candidates = ['null' + field, namespace + ':' + field];
+     }
+
+     for (let c = 0; c < candidates.length; c++) {
+         try {
+             return feedEntry.fields.getPropertyAsAUTF8String(candidates[c]);
+         } catch(e) {
+             // A failed lookup throws; move on to the next candidate
+         }
+     }
+
+     return;
+ }
+
+ /*
+  * Parse feed entry into a Zotero item
+  *
+  * @param {nsIFeedEntry} feedEntry Entry to convert
+  * @param {Object} feedInfo Feed-level metadata; its creators and the
+  *     publicationTitle, ISSN, publisher, rights and language fields fill in
+  *     values that the entry itself lacks
+  * @return {Object|undefined} Plain object of item fields (guid, title,
+  *     creators, dateModified, itemType, ...), or undefined when the entry has
+  *     no ID
+  */
+ function getFeedItem(feedEntry, feedInfo) {
+     // ID is not required, but most feeds have these and we have to rely on them
+     // to handle updating properly
+     if (!feedEntry.id) {
+         Zotero.debug("FeedReader: Feed item missing an ID");
+         return;
+     }
+
+     let item = {
+         guid: feedEntry.id
+     };
+
+     if (feedEntry.title) item.title = getRichText(feedEntry.title, 'title');
+
+     if (feedEntry.summary) {
+         item.abstractNote = getRichText(feedEntry.summary, 'abstractNote');
+
+         if (!item.title) {
+             // We will probably have to trim this, so let's use plain text to
+             // avoid splitting inside some markup
+             let title = Zotero.Utilities.trimInternal(feedEntry.summary.plainText());
+
+             // Only shorten text that is actually too long, so that short
+             // summaries are kept whole and get no ellipsis
+             if (title.length > 50) {
+                 let splitAt = title.lastIndexOf(' ', 50);
+                 if (splitAt == -1) splitAt = 50;
+                 title = title.substr(0, splitAt) + '...';
+             }
+
+             item.title = title;
+         }
+     }
+
+     if (feedEntry.link) item.url = feedEntry.link.spec;
+
+     if (feedEntry.updated) item.dateModified = new Date(feedEntry.updated);
+
+     if (feedEntry.published) {
+         let published = new Date(feedEntry.published);
+
+         if (!published.getUTCSeconds() && !(published.getUTCHours() && published.getUTCMinutes())) {
+             // There was probably no time, but there may have been a date range,
+             // so something could have ended up in the hour _or_ minute field
+             item.date = getFeedField(feedEntry, null, 'pubDate')
+                 /* In case it was magically pulled from some other field */
+                 || ( published.getUTCFullYear() + '-'
+                     + (published.getUTCMonth() + 1) + '-'
+                     + published.getUTCDate() );
+         } else {
+             // Add time zone
+             item.date = formatDate(published);
+         }
+
+         if (!item.dateModified) {
+             // Fall back to the publication date
+             item.dateModified = published;
+         }
+     }
+
+     if (!item.dateModified) {
+         // When there's no reliable modification date, we can assume that item doesn't get updated
+         Zotero.debug("FeedReader: Feed item missing a modification date (" + item.guid + ")");
+         item.dateModified = null;
+     }
+
+     if (!item.date && item.dateModified) {
+         // Use lastModified date
+         item.date = formatDate(item.dateModified);
+     }
+
+     // Convert date modified to string, since those are directly comparable
+     if (item.dateModified) item.dateModified = Zotero.Date.dateToSQL(item.dateModified, true);
+
+     if (feedEntry.rights) item.rights = getRichText(feedEntry.rights, 'rights');
+
+     item.creators = processCreators(feedEntry, 'authors', 'author');
+     if (!item.creators.length) {
+         // Use feed authors as item author. Maybe not the best idea.
+         for (let i=0; i<feedInfo.creators.length; i++) {
+             if (feedInfo.creators[i].creatorType != 'author') continue;
+             item.creators.push(feedInfo.creators[i]);
+         }
+     }
+
+     let contributors = processCreators(feedEntry, 'contributors', 'contributor');
+     if (contributors.length) item.creators = item.creators.concat(contributors);
+
+     /** Done with basic metadata, now look for better data **/
+
+     let date = getFeedField(feedEntry, 'prism', 'publicationDate')
+         || getFeedField(feedEntry, 'dc', 'date');
+     if (date) item.date = date;
+
+     let publicationTitle = getFeedField(feedEntry, 'prism', 'publicationName')
+         || getFeedField(feedEntry, 'dc', 'source')
+         || getFeedField(feedEntry, null, 'pubTitle');
+     if (publicationTitle) item.publicationTitle = publicationTitle;
+
+     let publicationType = getFeedField(feedEntry, null, 'pubType');
+     if (publicationType) item.publicationType = publicationType;
+
+     let startPage = getFeedField(feedEntry, null, 'startPage');
+     let endPage = getFeedField(feedEntry, null, 'endPage');
+     if (startPage || endPage) {
+         // Join with a dash only when both ends of the range are known
+         item.pages = ( startPage || '' )
+             + ( endPage && startPage ? '–' : '' )
+             + ( endPage || '' );
+     }
+
+     let issn = getFeedField(feedEntry, 'prism', 'issn');
+     if (issn) item.ISSN = issn;
+
+     let isbn = getFeedField(feedEntry, 'prism', 'isbn')
+         || getFeedField(feedEntry, null, 'isbn');
+     if (isbn) item.ISBN = isbn;
+
+     let identifier = getFeedField(feedEntry, 'dc', 'identifier');
+     if (identifier) {
+         // dc:identifier may hold a DOI, an ISBN or an ISSN; take the first
+         // interpretation that cleans up to something, without overwriting
+         // a value found above
+         let cleanId = Zotero.Utilities.cleanDOI(identifier);
+         if (cleanId) {
+             if (!item.DOI) item.DOI = cleanId;
+         } else if (cleanId = Zotero.Utilities.cleanISBN(identifier)) {
+             if (!item.ISBN) item.ISBN = cleanId;
+         } else if (cleanId = Zotero.Utilities.cleanISSN(identifier)) {
+             if (!item.ISSN) item.ISSN = cleanId;
+         }
+     }
+
+     let publisher = getFeedField(feedEntry, 'dc', 'publisher');
+     if (publisher) item.publisher = publisher;
+
+     let rights = getFeedField(feedEntry, 'prism', 'copyright')
+         || getFeedField(feedEntry, 'dc', 'rights')
+         || getFeedField(feedEntry, null, 'copyright');
+     if (rights) item.rights = rights;
+
+     let language = getFeedField(feedEntry, 'dc', 'language')
+         || getFeedField(feedEntry, null, 'language');
+     if (language) item.language = language;
+
+     /** Incorporate missing values from feed metadata **/
+
+     let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language'];
+     for (let i=0; i<supplementFields.length; i++) {
+         let field = supplementFields[i];
+         if (!item[field] && feedInfo[field]) {
+             item[field] = feedInfo[field];
+         }
+     }
+
+     guessItemType(item);
+
+     return item;
+ }
+
+ /*********************
+ * FeedReader object *
+ *********************
+ *
+ * Constructor. Starts fetching and parsing the feed at `url` right away and
+ * sets up the promises through which the result is exposed:
+ * - this._feed: deferred that handleResult resolves with the parsed nsIFeed
+ * - this._feedProperties: promise for an object of feed-level metadata
+ * - this._feedItems: array of deferreds, one per parsed item, followed by
+ * one more that marks the end of the feed
+ *
+ * @param {String} url URL of the feed
+ * @throws {Error} If no URL is supplied
+ */
+ let FeedReader = function(url) {
+ if (!url) throw new Error("Feed URL must be supplied");
+
+ this._feed = Zotero.Promise.defer(); // Fetched asynchronously; resolved in handleResult below
+
+ this._feedProperties = this._feed.promise // Feed-level metadata, built once the feed is available
+ .then(function(feed) {
+ let info = {};
+
+ info.title = feed.title ? feed.title.plainText() : '';
+ info.subtitle = feed.subtitle ? feed.subtitle.plainText() : '';
+
+ if (feed.updated) info.updated = new Date(feed.updated);
+
+ // categories: MDN says "not yet implemented"
+
+ info.creators = processCreators(feed, 'authors', 'author'); // getFeedItem falls back to these for entries without authors
+
+ // TODO: image as icon
+
+ let publicationTitle = getFeedField(feed, 'prism', 'publicationName')
+ || getFeedField(feed, null, 'pubTitle');
+ if (publicationTitle) info.publicationTitle = publicationTitle;
+
+ let publisher = getFeedField(feed, 'dc', 'publisher');
+ if (publisher) info.publisher = publisher;
+
+ let rights = (feed.rights && feed.rights.plainText())
+ || getFeedField(feed, 'prism', 'copyright')
+ || getFeedField(feed, 'dc', 'rights')
+ || getFeedField(feed, null, 'copyright');
+ if (rights) info.rights = rights;
+
+ let issn = getFeedField(feed, 'prism', 'issn');
+ if (issn) info.ISSN = issn;
+
+ let isbn = getFeedField(feed, 'prism', 'isbn')
+ || getFeedField(feed, null, 'isbn')
+ if (isbn) info.ISBN = isbn;
+
+ let language = getFeedField(feed, 'dc', 'language')
+ || getFeedField(feed, null, 'language');
+ if (language) info.language = language;
+
+ let ttl = getFeedField(feed, null, 'ttl');
+ if (ttl) info.ttl = ttl;
+
+ return info;
+ });
+
+ // Array of deferred item promises; starts with a single placeholder for the first item
+ this._feedItems = [Zotero.Promise.defer()];
+
+ // Process items once they're available and push them into the array
+ Zotero.Promise.join(
+ this._feed.promise,
+ this._feedProperties,
+ (feed, feedInfo) => {
+ let items = feed.items;
+ if (items && items.length) {
+ for (let i=0; i<items.length; i++) {
+ let item = items.queryElementAt(i, Components.interfaces.nsIFeedEntry);
+ if (!item) continue;
+
+ let feedItem = getFeedItem(item, feedInfo);
+ if (!feedItem) continue; // getFeedItem returns nothing for entries without an ID
+
+ let lastItem = this._feedItems[this._feedItems.length - 1];
+ this._feedItems.push(Zotero.Promise.defer()); // Push a new deferred promise so an iterator has something to return
+ lastItem.resolve(feedItem);
+ }
+ }
+ }
+ )
+ .finally(() => {
+ // Make sure the last promise gets resolved to null (presumably a no-op if terminate already rejected it -- confirm)
+ let lastItem = this._feedItems[this._feedItems.length - 1];
+ lastItem.resolve(null);
+ });
+
+ // Set up asynchronous feed processor
+ let feedProcessor = Components.classes["@mozilla.org/feed-processor;1"]
+ .createInstance(Components.interfaces.nsIFeedProcessor);
+
+ let feedUrl = ios.newURI(url, null, null);
+ feedProcessor.parseAsync(null, feedUrl);
+
+ feedProcessor.listener = {
+ /*
+ * MDN suggests that we could use nsIFeedProgressListener to handle the feed
+ * as it gets loaded, but this is actually not implemented (as of 32.0.3),
+ * so we have to load the whole feed and handle it in handleResult.
+ *
+ * A result without a document terminates the reader with "No Feed";
+ * otherwise the document is resolved as the feed.
+ */
+ handleResult: (result) => {
+ if (!result.doc) {
+ this.terminate("No Feed");
+ return;
+ }
+
+ let newFeed = result.doc.QueryInterface(Components.interfaces.nsIFeed);
+ this._feed.resolve(newFeed);
+ }
+ };
+
+ Zotero.debug("FeedReader: Fetching feed from " + feedUrl.spec);
+
+ this._channel = ios.newChannelFromURI(feedUrl); // Kept on the instance; the feed processor receives the channel's data
+ this._channel.asyncOpen(feedProcessor, null); // Sends an HTTP request
+ }
+
+ // Promise for the feed-level metadata object built in the constructor.
+ // Uses a regular function body rather than the non-standard SpiderMonkey
+ // expression-closure shorthand.
+ Zotero.defineProperty(FeedReader.prototype, 'feedProperties', {
+     get: function() {
+         return this._feedProperties;
+     }
+ });
+
+ /*
+  * Feed item iterator
+  * Each call to next() hands out a _promise_ for an item, wrapped as
+  * { value, done }. The promise _MUST_ be resolved before requesting the
+  * next item.
+  * The last item will always be resolved to `null`, unless the feed processing
+  * is terminated ahead of time, in which case it will be rejected with the reason
+  * for termination.
+  */
+ Zotero.defineProperty(FeedReader.prototype, 'itemIterator', {
+     get: function() {
+         // Live reference: the constructor keeps appending to this array
+         let deferreds = this._feedItems;
+         let position = 0;
+         return {
+             next: function() {
+                 let current = deferreds[position];
+                 position += 1;
+                 return {
+                     value: current ? current.promise : null,
+                     done: position >= deferreds.length
+                 };
+             }
+         };
+     }
+ });
+
+ /*
+  * Terminate feed processing at any given time
+  *
+  * Rejects the feed promise and the latest item promise if they are still
+  * pending, then cancels the network request if it is still in flight.
+  *
+  * @param {String} status Reason for terminating processing
+  */
+ FeedReader.prototype.terminate = function(status) {
+     Zotero.debug("FeedReader: Terminating feed reader (" + status + ")");
+
+     // Reject feed promise if not resolved yet
+     if (this._feed.promise.isPending()) {
+         this._feed.reject(status);
+     }
+
+     // Reject feed item promise if not resolved yet
+     let lastItem = this._feedItems[this._feedItems.length - 1];
+     if (lastItem.promise.isPending()) {
+         lastItem.reject(status);
+     }
+
+     // Close feed connection. The channel is stored on the instance by the
+     // constructor, and isPending is a method, so it has to be called.
+     let channel = this._channel;
+     if (channel && channel.isPending()) {
+         channel.cancel(Components.results.NS_BINDING_ABORTED);
+     }
+ };
+
+ return FeedReader;
+};
+\ No newline at end of file
diff --git a/components/zotero-service.js b/components/zotero-service.js
@@ -85,6 +85,7 @@ const xpcomFilesLocal = [
'data/tags',
'db',
'duplicates',
+ 'feedReader',
'fulltext',
'id',
'integration',