Skip to content

Commit 9bf88b0

Browse files
authored
chore: refactor format output adjustments (#272)
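I had previously implemented the `contentType` output conversion (html, text, markdown) in an overly complicated manner, duplicating it in both the generic extractor and the root extractor's `select`. This PR cleans it up by removing those copies and converting `result.content` once, in `src/mercury.js`, after extraction.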
1 parent 867623a commit 9bf88b0

File tree

6 files changed

+64
-139
lines changed

6 files changed

+64
-139
lines changed

Diff for: dist/mercury.js

+16-38
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ var _objectWithoutProperties = _interopDefault(require('@babel/runtime-corejs2/h
88
var _asyncToGenerator = _interopDefault(require('@babel/runtime-corejs2/helpers/asyncToGenerator'));
99
var URL = _interopDefault(require('url'));
1010
var cheerio = _interopDefault(require('cheerio'));
11+
var TurndownService = _interopDefault(require('turndown'));
1112
var iconv = _interopDefault(require('iconv-lite'));
1213
var _parseInt = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-int'));
1314
var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
@@ -21,7 +22,6 @@ var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
2122
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
2223
var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
2324
var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
24-
var TurndownService = _interopDefault(require('turndown'));
2525
var stringDirection = _interopDefault(require('string-direction'));
2626
var validUrl = _interopDefault(require('valid-url'));
2727
var moment = _interopDefault(require('moment-timezone'));
@@ -6018,9 +6018,7 @@ var GenericExtractor = {
60186018
},
60196019
extract: function extract(options) {
60206020
var html = options.html,
6021-
$ = options.$,
6022-
_options$contentType = options.contentType,
6023-
contentType = _options$contentType === void 0 ? 'html' : _options$contentType;
6021+
$ = options.$;
60246022

60256023
if (html && !$) {
60266024
var loaded = cheerio.load(html);
@@ -6054,24 +6052,13 @@ var GenericExtractor = {
60546052
url = _this$url_and_domain.url,
60556053
domain = _this$url_and_domain.domain;
60566054

6057-
var convertedContent;
6058-
6059-
if (contentType === 'html') {
6060-
convertedContent = content;
6061-
} else if (contentType === 'text') {
6062-
convertedContent = $.text(cheerio.load(content));
6063-
} else if (contentType === 'markdown') {
6064-
var turndownService = new TurndownService();
6065-
convertedContent = turndownService.turndown(content);
6066-
}
6067-
60686055
return {
60696056
title: title,
60706057
author: author,
60716058
date_published: date_published || null,
60726059
dek: dek,
60736060
lead_image_url: lead_image_url,
6074-
content: convertedContent,
6061+
content: content,
60756062
next_page_url: next_page_url,
60766063
url: url,
60776064
domain: domain,
@@ -6161,9 +6148,7 @@ function select(opts) {
61616148
type = opts.type,
61626149
extractionOpts = opts.extractionOpts,
61636150
_opts$extractHtml = opts.extractHtml,
6164-
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml,
6165-
_opts$contentType = opts.contentType,
6166-
contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // Skip if there's not extraction for this type
6151+
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's not extraction for this type
61676152

61686153
if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia
61696154
// contributors), return the string
@@ -6205,19 +6190,7 @@ function select(opts) {
62056190
$content = Cleaners[type]($content, _objectSpread({}, opts, {
62066191
defaultCleaner: defaultCleaner
62076192
}));
6208-
6209-
if (contentType === 'html') {
6210-
return $.html($content);
6211-
}
6212-
6213-
if (contentType === 'text') {
6214-
return $.text($content);
6215-
}
6216-
6217-
if (contentType === 'markdown') {
6218-
var turndownService = new TurndownService();
6219-
return turndownService.turndown($.html($content));
6220-
}
6193+
return $.html($content);
62216194
}
62226195

62236196
var result; // if selector is an array (e.g., ['img', 'src']),
@@ -6270,9 +6243,7 @@ var RootExtractor = {
62706243
var opts = arguments.length > 1 ? arguments[1] : undefined;
62716244
var _opts = opts,
62726245
contentOnly = _opts.contentOnly,
6273-
extractedTitle = _opts.extractedTitle,
6274-
_opts$contentType2 = _opts.contentType,
6275-
contentType = _opts$contentType2 === void 0 ? 'html' : _opts$contentType2; // This is the generic extractor. Run its extract method
6246+
extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method
62766247

62776248
if (extractor.domain === '*') return extractor.extract(opts);
62786249
opts = _objectSpread({}, opts, {
@@ -6283,8 +6254,7 @@ var RootExtractor = {
62836254
var _content = extractResult(_objectSpread({}, opts, {
62846255
type: 'content',
62856256
extractHtml: true,
6286-
title: extractedTitle,
6287-
contentType: contentType
6257+
title: extractedTitle
62886258
}));
62896259

62906260
return {
@@ -6451,6 +6421,7 @@ var Mercury = {
64516421
_result,
64526422
title,
64536423
next_page_url,
6424+
turndownService,
64546425
_args = arguments;
64556426

64566427
return _regeneratorRuntime.wrap(function _callee$(_context) {
@@ -6545,9 +6516,16 @@ var Mercury = {
65456516
});
65466517

65476518
case 23:
6519+
if (contentType === 'markdown') {
6520+
turndownService = new TurndownService();
6521+
result.content = turndownService.turndown(result.content);
6522+
} else if (contentType === 'text') {
6523+
result.content = $.text($(result.content));
6524+
}
6525+
65486526
return _context.abrupt("return", result);
65496527

6550-
case 24:
6528+
case 25:
65516529
case "end":
65526530
return _context.stop();
65536531
}

Diff for: src/extractors/generic/index.js

+2-14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import cheerio from 'cheerio';
2-
import TurndownService from 'turndown';
32
import stringDirection from 'string-direction';
43

54
import GenericContentExtractor from './content/extractor';
@@ -29,7 +28,7 @@ const GenericExtractor = {
2928
direction: ({ title }) => stringDirection.getDirection(title),
3029

3130
extract(options) {
32-
const { html, $, contentType = 'html' } = options;
31+
const { html, $ } = options;
3332

3433
if (html && !$) {
3534
const loaded = cheerio.load(html);
@@ -48,24 +47,13 @@ const GenericExtractor = {
4847
const direction = this.direction({ title });
4948
const { url, domain } = this.url_and_domain(options);
5049

51-
let convertedContent;
52-
53-
if (contentType === 'html') {
54-
convertedContent = content;
55-
} else if (contentType === 'text') {
56-
convertedContent = $.text(cheerio.load(content));
57-
} else if (contentType === 'markdown') {
58-
const turndownService = new TurndownService();
59-
convertedContent = turndownService.turndown(content);
60-
}
61-
6250
return {
6351
title,
6452
author,
6553
date_published: date_published || null,
6654
dek,
6755
lead_image_url,
68-
content: convertedContent,
56+
content,
6957
next_page_url,
7058
url,
7159
domain,

Diff for: src/extractors/root-extractor.js

+3-20
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import TurndownService from 'turndown';
21
import Cleaners from 'cleaners';
32
import { convertNodeTo } from 'utils/dom';
43
import GenericExtractor from './generic';
@@ -67,13 +66,7 @@ function findMatchingSelector($, selectors, extractHtml) {
6766
}
6867

6968
export function select(opts) {
70-
const {
71-
$,
72-
type,
73-
extractionOpts,
74-
extractHtml = false,
75-
contentType = 'html',
76-
} = opts;
69+
const { $, type, extractionOpts, extractHtml = false } = opts;
7770
// Skip if there's not extraction for this type
7871
if (!extractionOpts) return null;
7972

@@ -120,16 +113,7 @@ export function select(opts) {
120113

121114
$content = Cleaners[type]($content, { ...opts, defaultCleaner });
122115

123-
if (contentType === 'html') {
124-
return $.html($content);
125-
}
126-
if (contentType === 'text') {
127-
return $.text($content);
128-
}
129-
if (contentType === 'markdown') {
130-
const turndownService = new TurndownService();
131-
return turndownService.turndown($.html($content));
132-
}
116+
return $.html($content);
133117
}
134118

135119
let result;
@@ -178,7 +162,7 @@ function extractResult(opts) {
178162

179163
const RootExtractor = {
180164
extract(extractor = GenericExtractor, opts) {
181-
const { contentOnly, extractedTitle, contentType = 'html' } = opts;
165+
const { contentOnly, extractedTitle } = opts;
182166
// This is the generic extractor. Run its extract method
183167
if (extractor.domain === '*') return extractor.extract(opts);
184168

@@ -193,7 +177,6 @@ const RootExtractor = {
193177
type: 'content',
194178
extractHtml: true,
195179
title: extractedTitle,
196-
contentType,
197180
});
198181
return {
199182
content,

Diff for: src/extractors/root-extractor.test.js

-67
Original file line numberDiff line numberDiff line change
@@ -32,73 +32,6 @@ describe('RootExtractor', () => {
3232

3333
assert.equal(url, null);
3434
});
35-
it('returns text content if text is passed as contentType', () => {
36-
const fullUrl =
37-
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
38-
const html = fs.readFileSync(
39-
'./src/extractors/custom/nymag.com/fixtures/test.html',
40-
'utf8'
41-
);
42-
const $ = cheerio.load(html);
43-
44-
const { content } = RootExtractor.extract(NYMagExtractor, {
45-
url: fullUrl,
46-
html,
47-
$,
48-
metaCache: [],
49-
fallback: false,
50-
contentType: 'text',
51-
});
52-
53-
const htmlRe = /<[a-z][\s\S]*>/g;
54-
55-
assert.equal(htmlRe.test(content), false);
56-
});
57-
it('returns markdown if markdown is passed as contentType', () => {
58-
const fullUrl =
59-
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
60-
const html = fs.readFileSync(
61-
'./src/extractors/custom/nymag.com/fixtures/test.html',
62-
'utf8'
63-
);
64-
const $ = cheerio.load(html);
65-
66-
const { content } = RootExtractor.extract(NYMagExtractor, {
67-
url: fullUrl,
68-
html,
69-
$,
70-
metaCache: [],
71-
fallback: false,
72-
contentType: 'markdown',
73-
});
74-
75-
const htmlRe = /<[a-z][\s\S]*>/;
76-
const markdownRe = /\[[\w\s]+\]\(.*\)/;
77-
78-
assert.equal(htmlRe.test(content), false);
79-
assert.equal(markdownRe.test(content), true);
80-
});
81-
it('also can select type on Generic Extractor', () => {
82-
const fullUrl =
83-
'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html';
84-
85-
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
86-
const $ = cheerio.load(html);
87-
const { content } = RootExtractor.extract(undefined, {
88-
url: fullUrl,
89-
html,
90-
$,
91-
metaCache: [],
92-
fallback: false,
93-
contentType: 'markdown',
94-
});
95-
96-
const htmlRe = /<[a-z][\s\S]*>/;
97-
const markdownRe = /\[[\w\s]+\]\(.*\)/;
98-
99-
assert.equal(htmlRe.test(content), false);
100-
assert.equal(markdownRe.test(content), true);
101-
});
10235
});
10336

10437
describe('cleanBySelectors($content, $, { clean })', () => {

Diff for: src/mercury.js

+8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import URL from 'url';
22
import cheerio from 'cheerio';
3+
import TurndownService from 'turndown';
34

45
import Resource from 'resource';
56
import { validateUrl, Errors } from 'utils';
@@ -83,6 +84,13 @@ const Mercury = {
8384
};
8485
}
8586

87+
if (contentType === 'markdown') {
88+
const turndownService = new TurndownService();
89+
result.content = turndownService.turndown(result.content);
90+
} else if (contentType === 'text') {
91+
result.content = $.text($(result.content));
92+
}
93+
8694
return result;
8795
},
8896

Diff for: src/mercury.test.js

+35
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import { Errors } from 'utils';
44
import { record } from 'test-helpers';
55
import Mercury from './mercury';
66

7+
const fs = require('fs');
8+
79
describe('Mercury', () => {
810
const recorder = record('mercury-test');
911
beforeAll(recorder.before);
@@ -92,4 +94,37 @@ describe('Mercury', () => {
9294
assert.equal(result.next_page_url, `${url}2`);
9395
});
9496
});
97+
98+
it('returns text content if text is passed as contentType', async () => {
99+
const url =
100+
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
101+
const html = fs.readFileSync(
102+
'./src/extractors/custom/nymag.com/fixtures/test.html',
103+
'utf8'
104+
);
105+
const { content } = await Mercury.parse(url, { html, contentType: 'text' });
106+
107+
const htmlRe = /<[a-z][\s\S]*>/g;
108+
109+
assert.equal(htmlRe.test(content), false);
110+
});
111+
112+
it('returns markdown if markdown is passed as contentType', async () => {
113+
const url =
114+
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
115+
const html = fs.readFileSync(
116+
'./src/extractors/custom/nymag.com/fixtures/test.html',
117+
'utf8'
118+
);
119+
const { content } = await Mercury.parse(url, {
120+
html,
121+
contentType: 'markdown',
122+
});
123+
124+
const htmlRe = /<[a-z][\s\S]*>/;
125+
const markdownRe = /\[[\w\s]+\]\(.*\)/;
126+
127+
assert.equal(htmlRe.test(content), false);
128+
assert.equal(markdownRe.test(content), true);
129+
});
95130
});

0 commit comments

Comments
 (0)