feedReader.js (17566B)
/*
	***** BEGIN LICENSE BLOCK *****
	
	Copyright © 2015 Center for History and New Media
					George Mason University, Fairfax, Virginia, USA
					http://zotero.org
	
	This file is part of Zotero.
	
	Zotero is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.
	
	Zotero is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.
	
	You should have received a copy of the GNU Affero General Public License
	along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
	
	***** END LICENSE BLOCK *****
*/


/**
 * Sample feeds:
 * 
 * http://cyber.law.harvard.edu/rss/examples/rss2sample.xml
 * http://feeds.feedburner.com/acs/acbcct
 * http://www.cell.com/molecular-cell/current.rss
 * http://ieeexplore.ieee.org/search/searchresult.jsp?searchField%3DSearch_All%26queryText%3Dwater&searchOrigin=saved_searches&rssFeed=true&rssFeedName=water
 * http://www.sciencemag.org/rss/current.xml
 * http://rss.sciencedirect.com/publication/science/20925212
 * http://www.ncbi.nlm.nih.gov/entrez/eutils/erss.cgi?rss_guid=1fmfIeN4X5Q8HemTZD5Rj6iu6-FQVCn7xc7_IPIIQtS1XiD9bf
 * http://export.arxiv.org/rss/astro-ph
 * http://fhs.dukejournals.org/rss_feeds/recent.xml
 */

/**
 * class Zotero.FeedReader
 * Asynchronously reads an ATOM/RSS feed
 *
 * The constructor starts fetching and parsing immediately. Callers should
 * wait on FeedReader#process before reading `feedProperties` or creating
 * an `ItemIterator`; both throw until the feed properties have been set.
 *
 * @param {String} url URL of the feed
 *
 * @property {Zotero.Promise<Object>} feedProperties An object
 *   representing feed properties
 * @property {Zotero.Promise<FeedItem>*} ItemIterator Returns an iterator
 *   for feed items. The iterator returns FeedItem promises that have to be
 *   resolved before requesting the next promise. When all items are exhausted,
 *   the promise resolves to null.
 * @method {void} terminate Stops retrieving/parsing the feed. Data parsed up
 *   to this point is still available.
 */
Zotero.FeedReader = function(url) {
	if (!url) throw new Error("Feed URL must be supplied");
	
	this._url = url;
	// Queue of deferreds, one per feed item, always terminated by one extra
	// pending deferred so that an iterator has something to hand out
	this._feedItems = [Zotero.Promise.defer()];
	// Settled once the whole feed has been processed (or processing failed)
	this._feedProcessed = Zotero.Promise.defer();
	
	// Resolved with the nsIFeed from the feed processor's handleResult below
	let feedFetched = Zotero.Promise.defer();
	feedFetched.promise.then(function(feed) {
		// Step 1: collect feed-level metadata
		let info = {};
		
		info.title = feed.title ? feed.title.plainText() : '';
		info.subtitle = feed.subtitle ? feed.subtitle.plainText() : '';
		
		if (feed.updated) info.updated = new Date(feed.updated);
		
		// categories: MDN says "not yet implemented"
		
		info.creators = Zotero.FeedReader._processCreators(feed, 'authors', 'author');
		
		// TODO: image as icon
		
		let publicationTitle = Zotero.FeedReader._getFeedField(feed, 'publicationName', 'prism')
			|| Zotero.FeedReader._getFeedField(feed, 'pubTitle');
		if (publicationTitle) info.publicationTitle = publicationTitle;
		
		let publisher = Zotero.FeedReader._getFeedField(feed, 'publisher', 'dc');
		if (publisher) info.publisher = publisher;
		
		let rights = (feed.rights && feed.rights.plainText())
			|| Zotero.FeedReader._getFeedField(feed, 'copyright', 'prism')
			|| Zotero.FeedReader._getFeedField(feed, 'rights', 'dc')
			|| Zotero.FeedReader._getFeedField(feed, 'copyright');
		if (rights) info.rights = rights;
		
		let issn = Zotero.FeedReader._getFeedField(feed, 'issn', 'prism');
		if (issn) info.ISSN = issn;
		
		let isbn = Zotero.FeedReader._getFeedField(feed, 'isbn', 'prism')
			|| Zotero.FeedReader._getFeedField(feed, 'isbn');
		if (isbn) info.ISBN = isbn;
		
		let language = Zotero.FeedReader._getFeedField(feed, 'language', 'dc')
			|| Zotero.FeedReader._getFeedField(feed, 'language');
		if (language) info.language = language;
		
		let ttl = Zotero.FeedReader._getFeedField(feed, 'ttl');
		if (ttl) info.ttl = ttl;
		
		this._feedProperties = info;
		this._feed = feed;
	}.bind(this)).then(function(){
		// Step 2: convert every feed entry and feed it into the item queue
		let items = this._feed.items;
		if (items && items.length) {
			for (let i=0; i<items.length; i++) {
				let item = items.queryElementAt(i, Components.interfaces.nsIFeedEntry);
				if (!item) continue;
				
				let feedItem = Zotero.FeedReader._getFeedItem(item, this._feedProperties);
				if (!feedItem) continue;
				
				let lastItem = this._feedItems[this._feedItems.length - 1];
				this._feedItems.push(Zotero.Promise.defer()); // Push a new deferred promise so an iterator has something to return
				lastItem.resolve(feedItem);
			}
		}
		this._feedProcessed.resolve();
	}.bind(this)).catch(function(e) {
		Zotero.debug("Feed processing failed " + e.message);
		this._feedProcessed.reject(e);
	}.bind(this)).finally(function() {
		// Make sure the last promise gets resolved to null
		let lastItem = this._feedItems[this._feedItems.length - 1];
		lastItem.resolve(null);
	}.bind(this));
	
	// Set up asynchronous feed processor
	let feedProcessor = Components.classes["@mozilla.org/feed-processor;1"]
		.createInstance(Components.interfaces.nsIFeedProcessor);

	let feedUrl = Services.io.newURI(url, null, null);
	feedProcessor.parseAsync(null, feedUrl);
	
	feedProcessor.listener = {
		/*
		 * MDN suggests that we could use nsIFeedProgressListener to handle the feed
		 * as it gets loaded, but this is actually not implemented (as of 32.0.3),
		 * so we have to load the whole feed and handle it in handleResult.
		 */
		handleResult: (result) => {
			if (!result.doc) {
				this.terminate("No Feed");
				return;
			}
			
			let newFeed = result.doc.QueryInterface(Components.interfaces.nsIFeed);
			feedFetched.resolve(newFeed);
		}
	};
	
	Zotero.debug("FeedReader: Fetching feed from " + feedUrl.spec);
	
	this._channel = Services.io.newChannelFromURI2(feedUrl, null,
		Services.scriptSecurityManager.getSystemPrincipal(), null,
		Ci.nsILoadInfo.SEC_NORMAL, Ci.nsIContentPolicy.TYPE_OTHER);
	this._channel.loadFlags |= Components.interfaces.nsIRequest.LOAD_BYPASS_CACHE;
	this._channel.asyncOpen(feedProcessor, null); // Sends an HTTP request
};

/*
 * The constructor initiates async feed processing, but _feedProcessed
 * needs to be resolved before proceeding.
 *
 * @return {Promise} Settles together with the internal _feedProcessed deferred:
 *   resolved when all entries were processed, rejected on failure/termination
 */
Zotero.FeedReader.prototype.process = Zotero.Promise.coroutine(function* () {
	return this._feedProcessed.promise;
});

/*
 * Terminate feed processing at any given time
 *
 * Rejects the still-pending promises with an Error carrying `status` and
 * cancels the network channel if it is still pending.
 *
 * @param {String} status Reason for terminating processing
 */
Zotero.FeedReader.prototype.terminate = function(status) {
	Zotero.debug("FeedReader: Terminating feed reader (" + status + ")");
	
	// Reject feed promise if not resolved yet
	if (this._feedProcessed.promise.isPending()) {
		this._feedProcessed.reject(new Error(status));
	}
	
	// Reject feed item promise if not resolved yet
	let lastItem = this._feedItems[this._feedItems.length - 1];
	if (lastItem.promise.isPending()) {
		// It seemed like a good idea to reject the last item but
		// it's not really been useful yet, aside from bluebird
		// throwing errors about unhandled rejections in tests
		// so we suppress them here. TODO: We should probably
		// rethink whether this code makes sense and make it better.
		let er = new Error(status);
		er.handledRejection = true;
		lastItem.reject(er);
	}
	
	// Close feed connection
	if (this._channel.isPending()) {
		this._channel.cancel(Components.results.NS_BINDING_ABORTED);
	}
};

/*
 * Feed-level metadata collected while processing the feed.
 *
 * @throws {Error} If the feed properties have not been set yet
 */
Zotero.defineProperty(Zotero.FeedReader.prototype, 'feedProperties', {
	get: function(){
		if (!this._feedProperties) {
			throw new Error("Feed has not been resolved yet. Try calling FeedReader#process first");
		}
		return this._feedProperties;
	}
});

/*
 * Feed item iterator
 * Each iteration returns a _promise_ for an item. The promise _MUST_ be
 * resolved before requesting the next item.
 * The last item will always be resolved to `null`, unless the feed processing
 * is terminated ahead of time, in which case it will be rejected with the reason
 * for termination.
 *
 * The getter returns a constructor; instantiating it throws if the feed
 * properties have not been set yet.
 */
Zotero.defineProperty(Zotero.FeedReader.prototype, 'ItemIterator', {
	get: function() {
		let items = this._feedItems;
		let feedReader = this;
		
		let iterator = function() {
			if (!feedReader._feedProperties) {
				throw new Error("Feed has not been resolved yet. Try calling FeedReader#process first");
			}
			this.index = 0;
		};
		
		// Returns {value, done}: `value` is the promise of the deferred at the
		// current position (or null if there is none); `done` is computed
		// against the index after it has been advanced
		iterator.prototype.next = function() {
			let item = items[this.index++];
			return {
				value: item ? item.promise : null,
				done: this.index >= items.length
			};
		};
		
		// Returns the last deferred currently in the queue (not its promise)
		iterator.prototype.last = function() {
			return items[items.length-1];
		};
		
		return iterator;
	}
}, {lazy: true});


/*****************************
 * Item processing functions *
 *****************************/

/**
 * Determine item type based on item data
 *
 * Sets `item.itemType` in place. Precedence: ISSN -> journalArticle,
 * ISBN -> bookSection, then keywords in `publicationType`
 * ('conference', 'journal', 'book'); defaults to journalArticle.
 *
 * @param {Object} item Item object being built; modified in place
 */
Zotero.FeedReader._guessItemType = function(item) {
	// Default to journalArticle
	item.itemType = 'journalArticle';
	
	if (item.ISSN) {
		return; // journalArticle
	}
	
	if (item.ISBN) {
		item.itemType = 'bookSection';
		return;
	}
	
	if (item.publicationType) {
		let type = item.publicationType.toLowerCase();
		if (type.indexOf('conference') != -1) {
			item.itemType = 'conferencePaper';
			return;
		}
		if (type.indexOf('journal') != -1) {
			item.itemType = 'journalArticle';
			return;
		}
		if (type.indexOf('book') != -1) {
			item.itemType = 'bookSection';
			return;
		}
	}
};

/*
 * Fetch creators from given field of a feed entry
 *
 * @param {Object} feedEntry Feed or feed entry to read from
 * @param {String} field Name of the property holding the person array
 *   (called with 'authors' and 'contributors' in this file)
 * @param {String} role Creator type passed on to Zotero.Utilities.cleanAuthor
 * @return {Object[]} Creators as returned by Zotero.Utilities.cleanAuthor;
 *   entries without any name part are dropped
 */
Zotero.FeedReader._processCreators = function(feedEntry, field, role) {
	let names = [],
		nameStr;
	try {
		let personArr = feedEntry[field]; // Seems like this part can throw if there is no author data in the feed
		for (let i=0; i<personArr.length; i++) {
			let person = personArr.queryElementAt(i, Components.interfaces.nsIFeedPerson);
			if (!person || !person.name) continue;
			
			let name = Zotero.Utilities.cleanTags(Zotero.Utilities.trimInternal(person.name));
			if (!name) continue;
			
			let commas = name.split(',').length - 1,
				other = name.split(/\s(?:and|&)\s|;/).length - 1;
			if (personArr.length == 1 &&
				// Has typical name separators
				(other || commas > 1
					// If only one comma and first part has more than one space,
					// it's probably not lastName, firstName
					|| (commas == 1 && name.split(/\s*,/)[0].indexOf(' ') != -1)
				)
			) {
				// Probably multiple authors listed in a single field
				nameStr = name;
				break; // For clarity. personArr.length == 1 anyway
			} else {
				names.push(name);
			}
		}
	}
	catch(e) {
		// Only NS_ERROR_FAILURE is treated as "no data"; anything else is a real error
		if (e.result != Components.results.NS_ERROR_FAILURE) throw e;
		
		if (field != 'authors') return [];
		
		// ieeexplore places these in "authors"... sigh
		nameStr = Zotero.FeedReader._getFeedField(feedEntry, 'authors');
		if (nameStr) nameStr = Zotero.Utilities.trimInternal(nameStr);
		if (!nameStr) return [];
	}
	
	if (nameStr) {
		names = nameStr.split(/\s(?:and|&)\s|\s*[,;]\s*/);
	}
	
	let creators = [];
	for (let i=0; i<names.length; i++) {
		let creator = Zotero.Utilities.cleanAuthor(
			names[i],
			role,
			// Exactly one comma: treat as "lastName, firstName"
			names[i].split(',').length == 2
		);
		if (!creator.firstName) {
			creator.fieldMode = 1;
		}
		// Sometimes these end up empty when parsing really nasty HTML based fields, so just skip.
		if (!creator.firstName && !creator.lastName) {
			continue;
		}
		
		creators.push(creator);
	}
	return creators;
};

/*
 * Parse feed entry into a Zotero item
 *
 * @param {Object} feedEntry Feed entry (queried as nsIFeedEntry by the caller)
 * @param {Object} feedInfo Feed-level properties, used to fill in missing
 *   item values (creators, publicationTitle, ISSN, publisher, rights, language)
 * @return {Object|undefined} Plain item object, or undefined if the entry
 *   has neither an ID nor a link
 */
Zotero.FeedReader._getFeedItem = function(feedEntry, feedInfo) {
	// ID is not required, but most feeds have these and we have to rely on them
	// to handle updating properly
	// Can probably fall back to links on missing id - unlikely to change
	if (!feedEntry.id && !feedEntry.link) {
		Zotero.debug("FeedReader: Feed item missing an ID or link - discarding");
		return;
	}
	
	let item = {
		guid: feedEntry.id || feedEntry.link.spec
	};
			
	if (feedEntry.title) item.title = Zotero.FeedReader._getRichText(feedEntry.title, 'title');
	
	if (feedEntry.summary) {
		item.abstractNote = Zotero.FeedReader._getRichText(feedEntry.summary, 'abstractNote');
		
		if (!item.title) {
			// We will probably have to trim this, so let's use plain text to
			// avoid splitting inside some markup
			let title = Zotero.Utilities.trimInternal(feedEntry.summary.plainText());
			if (title.length > 50) {
				// Cut at the last space at or before position 50, or hard-cut
				// at 50 if there is none, and mark the truncation
				let splitAt = title.lastIndexOf(' ', 50);
				if (splitAt == -1) splitAt = 50;
				item.title = title.substr(0, splitAt) + '...';
			}
			else {
				// Short enough: use as is. (Previously even short summaries
				// lost their last word and got an ellipsis appended.)
				item.title = title;
			}
		}
	}
	
	if (feedEntry.link) item.url = feedEntry.link.spec;
	
	if (feedEntry.rights) item.rights = Zotero.FeedReader._getRichText(feedEntry.rights, 'rights');
	
	item.creators = Zotero.FeedReader._processCreators(feedEntry, 'authors', 'author');
	if (!item.creators.length) {
		// Use feed authors as item author. Maybe not the best idea.
		for (let i=0; i<feedInfo.creators.length; i++) {
			if (feedInfo.creators[i].creatorType != 'author') continue;
			item.creators.push(feedInfo.creators[i]);
		}
	}
	
	let contributors = Zotero.FeedReader._processCreators(feedEntry, 'contributors', 'contributor');
	if (contributors.length) item.creators = item.creators.concat(contributors);
	
	/** Done with basic metadata, now look for better data **/
	
	let date = Zotero.FeedReader._getFeedField(feedEntry, 'publicationDate', 'prism')
		|| Zotero.FeedReader._getFeedField(feedEntry, 'date', 'dc')
		// DEBUG: Why not get these from the feedEntry?
		|| Zotero.FeedReader._getFeedField(feedEntry, 'pubDate') // RSS
		|| Zotero.FeedReader._getFeedField(feedEntry, 'updated', 'atom') // Atom
		|| Zotero.FeedReader._getFeedField(feedEntry, 'published', 'atom'); // Atom
		
	
	if (date) item.date = date;
	
	let publicationTitle = Zotero.FeedReader._getFeedField(feedEntry, 'publicationName', 'prism')
		|| Zotero.FeedReader._getFeedField(feedEntry, 'source', 'dc')
		|| Zotero.FeedReader._getFeedField(feedEntry, 'pubTitle');
	if (publicationTitle) item.publicationTitle = publicationTitle;
	
	let publicationType = Zotero.FeedReader._getFeedField(feedEntry, 'pubType');
	if (publicationType) item.publicationType = publicationType;
	
	let startPage = Zotero.FeedReader._getFeedField(feedEntry, 'startPage');
	let endPage = Zotero.FeedReader._getFeedField(feedEntry, 'endPage');
	if (startPage || endPage) {
		// "start–end" when both are present, otherwise whichever one exists
		item.pages = ( startPage || '' )
			+ ( endPage && startPage ? '–' : '' )
			+ ( endPage || '' );
	}
	
	let issn = Zotero.FeedReader._getFeedField(feedEntry, 'issn', 'prism');
	if (issn) item.ISSN = issn;
	
	let isbn = Zotero.FeedReader._getFeedField(feedEntry, 'isbn', 'prism')
		|| Zotero.FeedReader._getFeedField(feedEntry, 'isbn');
	if (isbn) item.ISBN = isbn;
	
	// dc:identifier may hold a DOI, ISBN or ISSN; try them in that order and
	// never overwrite a value found above
	let identifier = Zotero.FeedReader._getFeedField(feedEntry, 'identifier', 'dc');
	if (identifier) {
		let cleanId = Zotero.Utilities.cleanDOI(identifier);
		if (cleanId) {
			if (!item.DOI) item.DOI = cleanId;
		} else if (cleanId = Zotero.Utilities.cleanISBN(identifier)) {
			if (!item.ISBN) item.ISBN = cleanId;
		} else if (cleanId = Zotero.Utilities.cleanISSN(identifier)) {
			if (!item.ISSN) item.ISSN = cleanId;
		}
	}
	
	let publisher = Zotero.FeedReader._getFeedField(feedEntry, 'publisher', 'dc');
	if (publisher) item.publisher = publisher;
	
	let rights = Zotero.FeedReader._getFeedField(feedEntry, 'copyright', 'prism')
		|| Zotero.FeedReader._getFeedField(feedEntry, 'rights', 'dc')
		|| Zotero.FeedReader._getFeedField(feedEntry, 'copyright');
	if (rights) item.rights = rights;
	
	let language = Zotero.FeedReader._getFeedField(feedEntry, 'language', 'dc')
		|| Zotero.FeedReader._getFeedField(feedEntry, 'language');
	if (language) item.language = language;
	
	/** Incorporate missing values from feed metadata **/
	
	let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language'];
	for (let i=0; i<supplementFields.length; i++) {
		let field = supplementFields[i];
		if (!item[field] && feedInfo[field]) {
			item[field] = feedInfo[field];
		}
	}
	
	Zotero.FeedReader._guessItemType(item);
	
	item.enclosedItems = Zotero.FeedReader._getEnclosedItems(feedEntry);
	
	return item;
};

/*********************
 * Utility functions *
 *********************/
/*
 * Convert HTML-formatted text to Zotero-compatible formatting
 *
 * @param {Object} feedText Feed text construct; must provide createDocumentFragment()
 * @param {String} field Name of the target field, passed on to Zotero.Utilities.dom2text
 * @return Result of Zotero.Utilities.dom2text for the created fragment
 */
Zotero.FeedReader._getRichText = function(feedText, field) {
	let domDiv = Zotero.Utilities.Internal.getDOMDocument().createElement("div");
	let domFragment = feedText.createDocumentFragment(domDiv);
	return Zotero.Utilities.dom2text(domFragment, field);
};

/*
 * Get field value from feed entry by namespace:fieldName
 */
// Properties are stored internally as ns+name, but only some namespaces are
// supported. Others are just "null"
let ns = {
	'prism': 'null',
	'dc': 'dc:'
};
/*
 * @param {Object} feedEntry Feed or feed entry exposing a `fields` property bag
 * @param {String} field Field name without namespace
 * @param {String} [namespace] Namespace name, e.g. 'prism', 'dc', 'atom'
 * @return {String|undefined} Field value, or undefined if it could not be read
 */
Zotero.FeedReader._getFeedField = function(feedEntry, field, namespace) {
	// Known namespaces use their mapped prefix, unknown ones fall back to
	// 'null', and no namespace means no prefix
	let prefix = namespace ? ns[namespace] || 'null' : '';
	try {
		return feedEntry.fields.getPropertyAsAUTF8String(prefix+field);
	} catch(e) {} // Lookup failed; deliberately ignored so the fallback below can run
	
	try {
		// For namespaces without a mapping, retry with a literal "namespace:" prefix
		if (namespace && !ns[namespace]) {
			prefix = namespace + ':';
			return feedEntry.fields.getPropertyAsAUTF8String(prefix+field);
		}
	} catch(e) {} // Still not found; fall through and return undefined
	
	return;
};

/*
 * Collect the enclosures of a feed entry
 *
 * @param {Object} feedEntry Feed entry; `enclosures` may be missing
 * @return {Object[]} One {url, contentType} object per enclosure that has
 *   a URL; contentType is '' when the enclosure has no type
 */
Zotero.FeedReader._getEnclosedItems = function(feedEntry) {
	var enclosedItems = [];
	
	if (feedEntry.enclosures) {
		for (let i = 0; i < feedEntry.enclosures.length; i++) {
			// Query the i-th enclosure (this used to always query index 0, which
			// returned the first enclosure once per enclosure)
			let elem = feedEntry.enclosures.queryElementAt(i, Components.interfaces.nsIPropertyBag2);
			if (elem.get('url')) {
				let enclosedItem = {url: elem.get('url'), contentType: elem.get('type') || ''};
				enclosedItems.push(enclosedItem);
			}
		}
	}
	
	return enclosedItems;
};