diff --git a/README.md b/README.md
index c1fbc833..58cddc6d 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,9 @@ Returns an object containing the following properties:
 * `siteName`: name of the site;
 * `lang`: content language;
 * `publishedTime`: published time;
+* `modifiedTime`: modified time;
+* `favicon`: site favicon as absolute URI (SVG format if available), or `null` if the page declares none;
+* `image`: URI to article image, extracted from metadata;
 
 The `parse()` method works by modifying the DOM. This removes some elements in the web page, which may be undesirable. You can avoid this by passing the clone of the `document` object to the `Readability` constructor:
 
diff --git a/Readability.js b/Readability.js
index 53dacaca..e933c71f 100644
--- a/Readability.js
+++ b/Readability.js
@@ -455,23 +455,6 @@ Readability.prototype = {
    * @return void
    */
   _fixRelativeUris(articleContent) {
-    var baseURI = this._doc.baseURI;
-    var documentURI = this._doc.documentURI;
-    function toAbsoluteURI(uri) {
-      // Leave hash links alone if the base URI matches the document URI:
-      if (baseURI == documentURI && uri.charAt(0) == "#") {
-        return uri;
-      }
-
-      // Otherwise, resolve against base URI:
-      try {
-        return new URL(uri, baseURI).href;
-      } catch (ex) {
-        // Something went wrong, just return the original:
-      }
-      return uri;
-    }
-
     var links = this._getAllNodesWithTag(articleContent, ["a"]);
     this._forEachNode(links, function (link) {
       var href = link.getAttribute("href");
@@ -495,7 +478,7 @@ Readability.prototype = {
             link.parentNode.replaceChild(container, link);
           }
         } else {
-          link.setAttribute("href", toAbsoluteURI(href));
+          link.setAttribute("href", this._toAbsoluteURI(href));
         }
       }
     });
@@ -515,19 +498,17 @@ Readability.prototype = {
       var srcset = media.getAttribute("srcset");
 
       if (src) {
-        media.setAttribute("src", toAbsoluteURI(src));
+        media.setAttribute("src", this._toAbsoluteURI(src));
       }
 
       if (poster) {
-        media.setAttribute("poster", toAbsoluteURI(poster));
+        media.setAttribute("poster", this._toAbsoluteURI(poster));
       }
 
       if (srcset) {
         var newSrcset = srcset.replace(
           this.REGEXPS.srcsetUrl,
-          function (_, p1, p2, p3) {
-            return toAbsoluteURI(p1) + (p2 || "") + p3;
-          }
+          (_, p1, p2, p3) => this._toAbsoluteURI(p1) + (p2 || "") + p3
         );
 
         media.setAttribute("srcset", newSrcset);
@@ -1748,6 +1729,9 @@ Readability.prototype = {
           if (typeof parsed.datePublished === "string") {
             metadata.datePublished = parsed.datePublished.trim();
           }
+          if (typeof parsed.dateModified === "string") {
+            metadata.dateModified = parsed.dateModified.trim();
+          }
         } catch (err) {
           this.log(err.message);
         }
@@ -1771,11 +1755,11 @@ Readability.prototype = {
 
     // property is a space-separated list of values
     var propertyPattern =
-      /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
+      /^\s*(article|dc|dcterms|og|twitter)\s*:\s*(author|creator|description|image|published_time|modified_time|modified|title|site_name)\s*$/i;
 
     // name is a single value
     var namePattern =
-      /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
+      /^\s*(?:(dc|dcterms|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|image|title|site_name)\s*$/i;
 
     // Find description tags.
     this._forEachNode(metaElements, function (element) {
@@ -1813,7 +1797,7 @@ Readability.prototype = {
     metadata.title =
       jsonld.title ||
       values["dc:title"] ||
-      values["dcterm:title"] ||
+      values["dcterms:title"] ||
       values["og:title"] ||
       values["weibo:article:title"] ||
       values["weibo:webpage:title"] ||
@@ -1835,7 +1819,7 @@ Readability.prototype = {
     metadata.byline =
       jsonld.byline ||
       values["dc:creator"] ||
-      values["dcterm:creator"] ||
+      values["dcterms:creator"] ||
       values.author ||
       values["parsely-author"] ||
       articleAuthor;
@@ -1844,7 +1828,7 @@ Readability.prototype = {
     metadata.excerpt =
       jsonld.excerpt ||
       values["dc:description"] ||
-      values["dcterm:description"] ||
+      values["dcterms:description"] ||
       values["og:description"] ||
       values["weibo:article:description"] ||
       values["weibo:webpage:description"] ||
@@ -1854,6 +1838,12 @@ Readability.prototype = {
     // get site name
     metadata.siteName = jsonld.siteName || values["og:site_name"];
 
+    // get image thumbnail
+    metadata.image = values["og:image"] || values.image || values["twitter:image"];
+
+    // get favicon
+    metadata.favicon = this._getArticleFavicon();
+
     // get article published time
     metadata.publishedTime =
       jsonld.datePublished ||
@@ -1861,6 +1851,13 @@ Readability.prototype = {
       values["parsely-pub-date"] ||
       null;
 
+    // get modified date
+    metadata.modifiedTime =
+      jsonld.dateModified ||
+      values["article:modified_time"] ||
+      values["dcterms:modified"] ||
+      null;
+
     // in many sites the meta value is escaped with HTML entities,
     // so here we need to unescape it
     metadata.title = this._unescapeHtmlEntities(metadata.title);
@@ -1868,10 +1865,73 @@ Readability.prototype = {
     metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
     metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
     metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
+    metadata.modifiedTime = this._unescapeHtmlEntities(metadata.modifiedTime);
 
     return metadata;
   },
 
+  /**
+   * Extract the site favicon from the document's <link> elements. An SVG
+   * icon is preferred; otherwise the last icon link in the document is used.
+   *
+   * @return {string|null} absolute favicon URI, or null if none is declared
+   **/
+  _getArticleFavicon() {
+    var linkElements = this._doc.getElementsByTagName("link");
+    var svgIcon = null;
+    var fallbackIcon = null;
+
+    this._forEachNode(linkElements, function (element) {
+      // rel is a space-separated, case-insensitive list of tokens, so values
+      // such as "shortcut icon" have to be accepted as well
+      var rel = element.getAttribute("rel") || "";
+      var href = element.getAttribute("href");
+      if (!href || !/(?:^|\s)icon(?:\s|$)/i.test(rel)) {
+        return;
+      }
+
+      var type = (element.getAttribute("type") || "").trim().toLowerCase();
+      if (type === "image/svg+xml") {
+        // svg wins as best quality format; keep the first one declared
+        svgIcon = svgIcon || href;
+      } else {
+        // what is missing here is an algorithm which compares all href and
+        // selects the "best" size
+        fallbackIcon = href;
+      }
+    });
+
+    // resolving an empty string would yield the page URL itself, so a
+    // missing favicon is reported as null instead
+    var favicon = svgIcon || fallbackIcon;
+    return favicon ? this._toAbsoluteURI(favicon) : null;
+  },
+
+  /**
+   * Convert a relative to an absolute URI
+   *
+   * @param {string} uri
+   * @return {string} the URI resolved against the document base URI, or the
+   *   original value for same-document hash links and unresolvable URIs
+   **/
+  _toAbsoluteURI(uri) {
+    const baseURI = this._doc.baseURI;
+    const documentURI = this._doc.documentURI;
+
+    // Leave hash links alone if the base URI matches the document URI:
+    if (baseURI === documentURI && uri.charAt(0) === "#") {
+      return uri;
+    }
+
+    // Otherwise, resolve against base URI:
+    try {
+      return new URL(uri, baseURI).href;
+    } catch (ex) {
+      // Something went wrong, just return the original:
+    }
+    return uri;
+  },
+
   /**
    * Check if node is image, or if node contains exactly only one image
    * whether as a direct child or as its descendants.
@@ -2784,7 +2844,10 @@ Readability.prototype = {
       length: textContent.length,
       excerpt: metadata.excerpt,
       siteName: metadata.siteName || this._articleSiteName,
+      image: metadata.image,
+      favicon: metadata.favicon,
       publishedTime: metadata.publishedTime,
+      modifiedTime: metadata.modifiedTime,
     };
   },
 };