Skip to content

Commit ab40182

Browse files
authored
maintenance update - october 2022 (#696)
* fix: add alternative word count method
* fix: replace pages_rendered key with rendered_pages for consistency
* fix: return first lead_image_url when multiple og:image present
* fix: properly pull image src from lazy loaded img
* fix: allow drop cap character in medium custom extractor
* fix: refined medium parser
1 parent 8ca8a5f commit ab40182

File tree

7 files changed

+52
-14
lines changed

7 files changed

+52
-14
lines changed

src/extractors/collect-all-pages.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ export default async function collectAllPages({
5454
return {
5555
...result,
5656
total_pages: pages,
57-
pages_rendered: pages,
57+
rendered_pages: pages,
5858
word_count,
5959
};
6060
}

src/extractors/custom/medium.com/index.js

+8-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ export const MediumExtractor = {
1515
// Is there anything in the content you selected that needs to be transformed
1616
// before it's consumable content? E.g., unusual lazy loaded images
1717
transforms: {
18+
// Allow drop cap character.
19+
'section span:first-of-type': $node => {
20+
const $text = $node.html();
21+
if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
22+
$node.replaceWith($text);
23+
}
24+
},
1825
// Re-write lazy-loaded youtube videos
1926
iframe: $node => {
2027
const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@@ -55,7 +62,7 @@ export const MediumExtractor = {
5562
// Is there anything that is in the result that shouldn't be?
5663
// The clean selectors will remove anything that matches from
5764
// the result
58-
clean: ['span', 'svg'],
65+
clean: ['span a', 'svg'],
5966
},
6067

6168
date_published: {

src/extractors/custom/techlog.iij.ad.jp/index.test.js

+3-3
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,14 @@ describe('TechlogIijAdJpExtractor', () => {
8484
);
8585
});
8686

87-
it('returns the pages_rendered', async () => {
87+
it('returns the rendered_pages', async () => {
8888
// To pass this test, fill out the rendered_pages selector
8989
// in ./src/extractors/custom/techlog.iij.ad.jp/index.js.
90-
const { pages_rendered } = await result;
90+
const { rendered_pages } = await result;
9191

9292
// Update these values with the expected values from
9393
// the article.
94-
assert.equal(pages_rendered, null);
94+
assert.equal(rendered_pages, 1);
9595
});
9696

9797
it('returns the content', async () => {

src/extractors/generic/word-count/extractor.js

+17-5
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,25 @@ import cheerio from 'cheerio';
22

33
import { normalizeSpaces } from 'utils/text';
44

5+
const getWordCount = content => {
6+
const $ = cheerio.load(content);
7+
const $content = $('div').first();
8+
const text = normalizeSpaces($content.text());
9+
return text.split(/\s/).length;
10+
};
11+
12+
const getWordCountAlt = content => {
13+
content = content.replace(/<[^>]*>/g, ' ');
14+
content = content.replace(/\s+/g, ' ');
15+
content = content.trim();
16+
return content.split(' ').length;
17+
};
18+
519
const GenericWordCountExtractor = {
620
extract({ content }) {
7-
const $ = cheerio.load(content);
8-
const $content = $('div').first();
9-
10-
const text = normalizeSpaces($content.text());
11-
return text.split(/\s/).length;
21+
let count = getWordCount(content);
22+
if (count === 1) count = getWordCountAlt(content);
23+
return count;
1224
},
1325
};
1426

src/extractors/root-extractor.js

+3-1
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,13 @@ export function select(opts) {
7676

7777
const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts;
7878

79+
const overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
80+
7981
const matchingSelector = findMatchingSelector(
8082
$,
8183
selectors,
8284
extractHtml,
83-
allowMultiple
85+
overrideAllowMultiple
8486
);
8587

8688
if (!matchingSelector) return null;

src/mercury.test.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,10 @@ describe('Parser', () => {
7878
'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
7979
const result = await Parser.parse(url, { fetchAllPages: true });
8080

81-
const { total_pages, pages_rendered } = result;
81+
const { total_pages, rendered_pages } = result;
8282

8383
assert.equal(total_pages, 3);
84-
assert.equal(pages_rendered, 3);
84+
assert.equal(rendered_pages, 3);
8585

8686
assert.equal(result.next_page_url, `${url}2`);
8787
});

src/resource/utils/dom/convert-lazy-loaded-images.js

+18-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ import { IS_LINK, IS_IMAGE, IS_SRCSET } from './constants';
88
// attribute that is a placeholder. We need to be able to properly fill in
99
// the src attribute so the images are no longer lazy loaded.
1010
export default function convertLazyLoadedImages($) {
11+
const extractSrcFromJSON = str => {
12+
try {
13+
const { src } = JSON.parse(str);
14+
if (typeof src === 'string') return src;
15+
} catch (e) {
16+
return false;
17+
}
18+
19+
return false;
20+
};
21+
1122
$('img').each((_, img) => {
1223
const attrs = getAttrs(img);
1324

@@ -22,7 +33,13 @@ export default function convertLazyLoadedImages($) {
2233
IS_LINK.test(value) &&
2334
IS_IMAGE.test(value)
2435
) {
25-
$(img).attr('src', value);
36+
// Is the value a JSON object? If so, we should attempt to extract the image src from the data.
37+
const existingSrc = extractSrcFromJSON(value);
38+
if (existingSrc) {
39+
$(img).attr('src', existingSrc);
40+
} else {
41+
$(img).attr('src', value);
42+
}
2643
}
2744
});
2845
});

0 commit comments

Comments
 (0)