From 47f4b3a40a2c15c969416fad96885ddb57ad39ac Mon Sep 17 00:00:00 2001 From: LordEidi Date: Mon, 21 Jul 2025 09:45:42 +0200 Subject: [PATCH 1/5] Extract image URL --- Readability.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Readability.js b/Readability.js index 53dacaca..e521cb04 100644 --- a/Readability.js +++ b/Readability.js @@ -1771,7 +1771,7 @@ Readability.prototype = { // property is a space-separated list of values var propertyPattern = - /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi; + /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|image:alt|image|published_time|title|site_name)\s*/gi; // name is a single value var namePattern = @@ -1854,6 +1854,12 @@ Readability.prototype = { // get site name metadata.siteName = jsonld.siteName || values["og:site_name"]; + // get image thumbnail + metadata.image = values["og:image"] || values["image"] || values["twitter:image"]; + + // get favicon + // metadata.favicon = this._getArticleFavicon() + // get article published time metadata.publishedTime = jsonld.datePublished || @@ -2784,6 +2790,7 @@ Readability.prototype = { length: textContent.length, excerpt: metadata.excerpt, siteName: metadata.siteName || this._articleSiteName, + image: metadata.image, publishedTime: metadata.publishedTime, }; }, From 65ad660f46d289c9f12abdb312c13900e15d4403 Mon Sep 17 00:00:00 2001 From: LordEidi Date: Mon, 21 Jul 2025 10:27:55 +0200 Subject: [PATCH 2/5] Fixed image reference to dot notation according Mozilla QS --- Readability.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readability.js b/Readability.js index e521cb04..698e96e5 100644 --- a/Readability.js +++ b/Readability.js @@ -1855,7 +1855,7 @@ Readability.prototype = { metadata.siteName = jsonld.siteName || values["og:site_name"]; // get image thumbnail - metadata.image = values["og:image"] || values["image"] || values["twitter:image"]; + 
metadata.image = values["og:image"] || values.image || values["twitter:image"]; // get favicon // metadata.favicon = this._getArticleFavicon() From 5936d88ae3cf6f1d6cc735e10f112f986e939ede Mon Sep 17 00:00:00 2001 From: LordEidi Date: Thu, 24 Jul 2025 16:19:55 +0200 Subject: [PATCH 3/5] Adding extraction for favicon and modifiedTime. Fixing typo in "dcterms". --- Readability.js | 82 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 6 deletions(-) diff --git a/Readability.js b/Readability.js index 698e96e5..5a10e306 100644 --- a/Readability.js +++ b/Readability.js @@ -1748,6 +1748,9 @@ Readability.prototype = { if (typeof parsed.datePublished === "string") { metadata.datePublished = parsed.datePublished.trim(); } + if (typeof parsed.dateModified === "string") { + metadata.dateModified = parsed.dateModified.trim(); + } } catch (err) { this.log(err.message); } @@ -1771,11 +1774,11 @@ Readability.prototype = { // property is a space-separated list of values var propertyPattern = - /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|image:alt|image|published_time|title|site_name)\s*/gi; + /\s*(article|dc|dcterms|og|twitter)\s*:\s*(author|creator|description|image:alt|image|published_time|modified|title|site_name)\s*/gi; // name is a single value var namePattern = - /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i; + /^\s*(?:(dc|dcterms|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i; // Find description tags. 
this._forEachNode(metaElements, function (element) { @@ -1813,7 +1816,7 @@ Readability.prototype = { metadata.title = jsonld.title || values["dc:title"] || - values["dcterm:title"] || + values["dcterms:title"] || values["og:title"] || values["weibo:article:title"] || values["weibo:webpage:title"] || @@ -1835,7 +1838,7 @@ Readability.prototype = { metadata.byline = jsonld.byline || values["dc:creator"] || - values["dcterm:creator"] || + values["dcterms:creator"] || values.author || values["parsely-author"] || articleAuthor; @@ -1844,7 +1847,7 @@ Readability.prototype = { metadata.excerpt = jsonld.excerpt || values["dc:description"] || - values["dcterm:description"] || + values["dcterms:description"] || values["og:description"] || values["weibo:article:description"] || values["weibo:webpage:description"] || @@ -1858,7 +1861,7 @@ Readability.prototype = { metadata.image = values["og:image"] || values.image || values["twitter:image"]; // get favicon - // metadata.favicon = this._getArticleFavicon() + metadata.favicon = this._getArticleFavicon() // get article published time metadata.publishedTime = @@ -1867,6 +1870,13 @@ Readability.prototype = { values["parsely-pub-date"] || null; + // get modified date + metadata.modifiedTime = + jsonld.dateModified || + values["article:modified_time"] || + values["dcterms:modified"] || + null; + // in many sites the meta value is escaped with HTML entities, // so here we need to unescape it metadata.title = this._unescapeHtmlEntities(metadata.title); @@ -1874,10 +1884,68 @@ Readability.prototype = { metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime); + metadata.modifiedTime = this._unescapeHtmlEntities(metadata.modifiedTime); return metadata; }, + /** + * Trying to extract the favicon from the page + **/ + _getArticleFavicon() { + + // string to return + var favicon = ""; + + // 
find all link tags + var metaElements = this._doc.getElementsByTagName("link"); + + // iterate over tags. + this._forEachNode(metaElements, function (element) { + + // make sure the type is correct and element contains a href attribute + var rel = element.hasAttribute("rel") ? element.getAttribute("rel") : ""; + if (rel === "icon" && element.hasAttribute("href")) { + favicon = element.getAttribute("href"); + + var type = element.hasAttribute("type") ? element.getAttribute("type") : ""; + if(type === "image/svg+xml") + { + // svg wins as best quality format + return this._toAbsoluteURI(favicon); + } + + // what is missing here is an algorithm which compares all href and selects the "best" size + } + }); + + // make sure to return an absolute URI + return this._toAbsoluteURI(favicon); + }, + + /** + * Convert a relative to an absolute URI + * + * @param {string} uri + **/ + _toAbsoluteURI(uri) { + + // stop processing if uri is empty + if(uri === ""){ + return uri; + } + + // try to parse into URL object + var absolute_uri = URL.parse(uri, this._doc.baseURI); + if(!absolute_uri){ + // parsing failed, return original URI + return uri; + } + + // parsing worked, return absolute URI + return absolute_uri.href; + }, + /** * Check if node is image, or if node contains exactly only one image * whether as a direct child or as its descendants. 
@@ -2791,7 +2859,9 @@ Readability.prototype = { excerpt: metadata.excerpt, siteName: metadata.siteName || this._articleSiteName, image: metadata.image, + favicon: metadata.favicon, publishedTime: metadata.publishedTime, + modifiedTime: metadata.modifiedTime, }; }, }; From 27a90dcc0e807083be7bed23360ce5283cbc3d40 Mon Sep 17 00:00:00 2001 From: LordEidi Date: Fri, 25 Jul 2025 10:25:27 +0200 Subject: [PATCH 4/5] Updating description of returned object --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index c1fbc833..58cddc6d 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,9 @@ Returns an object containing the following properties: * `siteName`: name of the site; * `lang`: content language; * `publishedTime`: published time; +* `modifiedTime`: modified time; +* `favicon`: site favicon as URI, SVG format if available; +* `image`: URI to article image, extracted from metadata; The `parse()` method works by modifying the DOM. This removes some elements in the web page, which may be undesirable. You can avoid this by passing the clone of the `document` object to the `Readability` constructor: From 355d694786d1dd98d56ad0624adfead24723135f Mon Sep 17 00:00:00 2001 From: LordEidi Date: Mon, 4 Aug 2025 09:46:23 +0200 Subject: [PATCH 5/5] - Switched the property regex in the ArticleMetadata function to check for begin and end of string to not extract the wrong properties. Therefore, removed the "image:alt" hack from before (the hack failed for the regular image:size properties and similar anyway). - Moved the toAbsoluteUri function in function to the "root", changed all calls to the new location of the function. Using an arrow function in one location because of scope issues with "this". 
--- Readability.js | 50 ++++++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/Readability.js b/Readability.js index 5a10e306..e933c71f 100644 --- a/Readability.js +++ b/Readability.js @@ -455,23 +455,6 @@ Readability.prototype = { * @return void */ _fixRelativeUris(articleContent) { - var baseURI = this._doc.baseURI; - var documentURI = this._doc.documentURI; - function toAbsoluteURI(uri) { - // Leave hash links alone if the base URI matches the document URI: - if (baseURI == documentURI && uri.charAt(0) == "#") { - return uri; - } - - // Otherwise, resolve against base URI: - try { - return new URL(uri, baseURI).href; - } catch (ex) { - // Something went wrong, just return the original: - } - return uri; - } - var links = this._getAllNodesWithTag(articleContent, ["a"]); this._forEachNode(links, function (link) { var href = link.getAttribute("href"); @@ -495,7 +478,7 @@ Readability.prototype = { link.parentNode.replaceChild(container, link); } } else { - link.setAttribute("href", toAbsoluteURI(href)); + link.setAttribute("href", this._toAbsoluteURI(href)); } } }); @@ -515,19 +498,17 @@ Readability.prototype = { var srcset = media.getAttribute("srcset"); if (src) { - media.setAttribute("src", toAbsoluteURI(src)); + media.setAttribute("src", this._toAbsoluteURI(src)); } if (poster) { - media.setAttribute("poster", toAbsoluteURI(poster)); + media.setAttribute("poster", this._toAbsoluteURI(poster)); } if (srcset) { var newSrcset = srcset.replace( this.REGEXPS.srcsetUrl, - function (_, p1, p2, p3) { - return toAbsoluteURI(p1) + (p2 || "") + p3; - } + (_, p1, p2, p3) => this._toAbsoluteURI(p1) + (p2 || "") + p3 ); media.setAttribute("srcset", newSrcset); @@ -1774,7 +1755,7 @@ Readability.prototype = { // property is a space-separated list of values var propertyPattern = - /\s*(article|dc|dcterms|og|twitter)\s*:\s*(author|creator|description|image:alt|image|published_time|modified|title|site_name)\s*/gi; + 
/^\s*(article|dc|dcterms|og|twitter)\s*:\s*(author|creator|description|image|published_time|modified|title|site_name)\s*$/i; // name is a single value var namePattern = @@ -1930,20 +1911,21 @@ Readability.prototype = { **/ _toAbsoluteURI(uri) { - // stop processing if uri is empty - if(uri === ""){ - return uri; - } + const baseURI = this._doc.baseURI; + const documentURI = this._doc.documentURI; - // try to parse into URL object - var absolute_uri = URL.parse(uri, this._doc.baseURI); - if(!absolute_uri){ - // parsing failed, return original URI + // Leave hash links alone if the base URI matches the document URI: + if (baseURI === documentURI && uri.charAt(0) === "#") { return uri; } - // parsing worked, return absolute URI - return absolute_uri.href; + // Otherwise, resolve against base URI: + try { + return new URL(uri, baseURI).href; + } catch (ex) { + // Something went wrong, just return the original: + } + return uri; }, /**