maintenance update - october 2022 (#696)

mtashley · web-flow · commit ab401822aa02 · 2022-10-07T08:47:41.000-07:00
* fix: add alternative word count method

* fix: replace pages_rendered key with rendered_pages for consistency

* fix: return first lead_image_url when multiple og:image present

* fix: properly pull image src from lazy loaded img

* fix: allow drop cap character in medium custom extractor

* fix: refined medium parser
diff --git a/src/extractors/collect-all-pages.js b/src/extractors/collect-all-pages.js
@@ -54,7 +54,7 @@ export default async function collectAllPages({
   return {
     ...result,
     total_pages: pages,
-    pages_rendered: pages,
+    rendered_pages: pages,
     word_count,
   };
 }
diff --git a/src/extractors/custom/medium.com/index.js b/src/extractors/custom/medium.com/index.js
@@ -15,6 +15,13 @@ export const MediumExtractor = {
     // Is there anything in the content you selected that needs transformed
     // before it's consumable content? E.g., unusual lazy loaded images
     transforms: {
+      // Allow drop cap character.
+      'section span:first-of-type': $node => {
+        const $text = $node.html();
+        if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
+          $node.replaceWith($text);
+        }
+      },
       // Re-write lazy-loaded youtube videos
       iframe: $node => {
         const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@@ -55,7 +62,7 @@ export const MediumExtractor = {
     // Is there anything that is in the result that shouldn't be?
     // The clean selectors will remove anything that matches from
     // the result
-    clean: ['span', 'svg'],
+    clean: ['span a', 'svg'],
   },
 
   date_published: {
diff --git a/src/extractors/custom/techlog.iij.ad.jp/index.test.js b/src/extractors/custom/techlog.iij.ad.jp/index.test.js
@@ -84,14 +84,14 @@ describe('TechlogIijAdJpExtractor', () => {
       );
     });
 
-    it('returns the pages_rendered', async () => {
+    it('returns the rendered_pages', async () => {
       // To pass this test, fill out the pages_rendered selector
       // in ./src/extractors/custom/techlog.iij.ad.jp/index.js.
-      const { pages_rendered } = await result;
+      const { rendered_pages } = await result;
 
       // Update these values with the expected values from
       // the article.
-      assert.equal(pages_rendered, null);
+      assert.equal(rendered_pages, 1);
     });
 
     it('returns the content', async () => {
diff --git a/src/extractors/generic/word-count/extractor.js b/src/extractors/generic/word-count/extractor.js
@@ -2,13 +2,25 @@ import cheerio from 'cheerio';
 
 import { normalizeSpaces } from 'utils/text';
 
+const getWordCount = content => {
+  const $ = cheerio.load(content);
+  const $content = $('div').first();
+  const text = normalizeSpaces($content.text());
+  return text.split(/\s/).length;
+};
+
+const getWordCountAlt = content => {
+  content = content.replace(/<[^>]*>/g, ' ');
+  content = content.replace(/\s+/g, ' ');
+  content = content.trim();
+  return content.split(' ').length;
+};
+
 const GenericWordCountExtractor = {
   extract({ content }) {
-    const $ = cheerio.load(content);
-    const $content = $('div').first();
-
-    const text = normalizeSpaces($content.text());
-    return text.split(/\s/).length;
+    let count = getWordCount(content);
+    if (count === 1) count = getWordCountAlt(content);
+    return count;
   },
 };
 
diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js
@@ -76,11 +76,13 @@ export function select(opts) {
 
   const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts;
 
+  const overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
+
   const matchingSelector = findMatchingSelector(
     $,
     selectors,
     extractHtml,
-    allowMultiple
+    overrideAllowMultiple
   );
 
   if (!matchingSelector) return null;
diff --git a/src/mercury.test.js b/src/mercury.test.js
@@ -78,10 +78,10 @@ describe('Parser', () => {
         'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
       const result = await Parser.parse(url, { fetchAllPages: true });
 
-      const { total_pages, pages_rendered } = result;
+      const { total_pages, rendered_pages } = result;
 
       assert.equal(total_pages, 3);
-      assert.equal(pages_rendered, 3);
+      assert.equal(rendered_pages, 3);
 
       assert.equal(result.next_page_url, `${url}2`);
     });
diff --git a/src/resource/utils/dom/convert-lazy-loaded-images.js b/src/resource/utils/dom/convert-lazy-loaded-images.js
@@ -8,6 +8,17 @@ import { IS_LINK, IS_IMAGE, IS_SRCSET } from './constants';
 // attribute that a is a placeholer. We need to be able to properly fill in
 // the src attribute so the images are no longer lazy loaded.
 export default function convertLazyLoadedImages($) {
+  const extractSrcFromJSON = str => {
+    try {
+      const { src } = JSON.parse(str);
+      if (typeof src === 'string') return src;
+    } catch (e) {
+      return false;
+    }
+
+    return false;
+  };
+
   $('img').each((_, img) => {
     const attrs = getAttrs(img);
 
@@ -22,7 +33,13 @@ export default function convertLazyLoadedImages($) {
         IS_LINK.test(value) &&
         IS_IMAGE.test(value)
       ) {
-        $(img).attr('src', value);
+        // Is the value a JSON object? If so, we should attempt to extract the image src from the data.
+        const existingSrc = extractSrcFromJSON(value);
+        if (existingSrc) {
+          $(img).attr('src', existingSrc);
+        } else {
+          $(img).attr('src', value);
+        }
       }
     });
   });

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -54,7 +54,7 @@ export default async function collectAllPages({` |
| `54` | `54` | `return {` |
| `55` | `55` | `...result,` |
| `56` | `56` | `total_pages: pages,` |
| `57` | | `- pages_rendered: pages,` |
| | `57` | `+ rendered_pages: pages,` |
| `58` | `58` | `word_count,` |
| `59` | `59` | `};` |
| `60` | `60` | `}` |