Skip to content

Commit e12c916

Browse files
authored
feat: ability to add custom extractors via api (#484)
* feat: ability to add custom extractors via api * docs: updating readme * fix: example.com was being used in another test * fix: timezone was messing up date_published test * fix: using a unique site for testing * fix: updated custom extractor api * docs: updating readme * fix: removing unused fixture * fix: updating test description * feat: ability to add custom extractors via cli
1 parent f95947f commit e12c916

File tree

10 files changed

+3186
-3
lines changed

10 files changed

+3186
-3
lines changed

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --ext
137137

138138
# Get the value of attributes by adding a pipe to --extend or --extend-list
139139
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"
140+
141+
# Pass optional --add-extractor argument to add a custom extractor at runtime.
142+
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
140143
```
141144

142145
## License

cli.js

+28-3
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,25 @@ const {
1414
l,
1515
header,
1616
h,
17+
addExtractor,
18+
x,
1719
} = argv;
18-
(async (urlToParse, contentType, extendedTypes, extendedListTypes, headers) => {
20+
(async (
21+
urlToParse,
22+
contentType,
23+
extendedTypes,
24+
extendedListTypes,
25+
headers,
26+
addExtractor
27+
) => {
1928
if (!urlToParse) {
2029
console.log(
2130
'\n\
2231
mercury-parser\n\n\
2332
The Mercury Parser extracts semantic content from any url\n\n\
2433
Usage:\n\
2534
\n\
26-
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... \n\
35+
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\
2736
\n\
2837
'
2938
);
@@ -37,6 +46,7 @@ Usage:\n\
3746
text: 'text',
3847
txt: 'text',
3948
};
49+
4050
const extensions = {};
4151
[].concat(extendedTypes || []).forEach(t => {
4252
const [name, selector] = t.split('=');
@@ -53,10 +63,18 @@ Usage:\n\
5363
allowMultiple: true,
5464
};
5565
});
66+
67+
// Attempt to load custom extractor from path.
// `require` resolves a relative path against the directory of THIS file, not
// the directory the user ran the command from. Resolve the path against the
// current working directory first so `--add-extractor ./my-extractor.js`
// works wherever the CLI is invoked; absolute paths pass through unchanged.
let customExtractor;
if (addExtractor) {
  const extractorPath = require('path').resolve(process.cwd(), addExtractor);
  customExtractor = require(extractorPath);
}
72+
5673
const result = await Mercury.parse(urlToParse, {
5774
contentType: contentTypeMap[contentType],
5875
extend: extensions,
5976
headers,
77+
customExtractor,
6078
});
6179
console.log(JSON.stringify(result, null, 2));
6280
} catch (e) {
@@ -75,4 +93,11 @@ Usage:\n\
7593
console.error(`\n${reportBug}\n`);
7694
process.exit(1);
7795
}
78-
})(url, format || f, extend || e, extendList || l, header || h);
96+
})(
97+
url,
98+
format || f,
99+
extend || e,
100+
extendList || l,
101+
header || h,
102+
addExtractor || x
103+
);

fixtures/sandiegouniontribune.com/test.html

+2,989
Large diffs are not rendered by default.

src/extractors/add-extractor.js

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import mergeSupportedDomains from '../utils/merge-supported-domains';
2+
3+
/**
 * Registry of extractors added at runtime through `addExtractor`.
 * Exported so other modules can look entries up in it.
 */
export const apiExtractors = {};

/**
 * Adds a custom extractor to the `apiExtractors` registry.
 *
 * @param {Object} extractor - Extractor definition; must have a truthy
 *   `domain` property.
 * @returns {Object} The `apiExtractors` registry after merging in the result
 *   of `mergeSupportedDomains(extractor)`, or `{ error: true, message }` when
 *   `extractor` is missing or has no `domain`.
 */
export default function addExtractor(extractor) {
  const isUsable = Boolean(extractor) && Boolean(extractor.domain);

  if (isUsable) {
    const domainEntries = mergeSupportedDomains(extractor);
    Object.assign(apiExtractors, domainEntries);
    return apiExtractors;
  }

  return {
    error: true,
    message: 'Unable to add custom extractor. Invalid parameters.',
  };
}

src/extractors/add-extractor.test.js

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import assert from 'assert';
2+
3+
import addExtractor from './add-extractor';
4+
5+
describe('addExtractor(extractor)', () => {
  it('can add multiple custom extractors', () => {
    const domains = ['www.site1.com', 'www.site2.com', 'www.site3.com'];

    let result;
    domains.forEach(domain => {
      result = addExtractor({ domain });
    });

    // The registry is module-level state shared with anything else that has
    // registered an extractor in the same process, so check that each domain
    // added here is present instead of asserting on the total number of keys.
    domains.forEach(domain => {
      assert.equal(domain in result, true);
    });
  });

  it('returns error if an extractor is not provided', () => {
    const result = addExtractor();
    assert.equal(result.error, true);
  });

  it('returns error if a domain key is not included within the custom extractor', () => {
    const result = addExtractor({ test: 'abc' });
    assert.equal(result.error, true);
  });
});

src/extractors/custom/README.md

+59
Original file line numberDiff line numberDiff line change
@@ -349,3 +349,62 @@ This script will open both an `html` and `json` file allowing you to preview you
349349
If you've written a custom extractor, please send us a pull request! Passing tests that demonstrate your parser in action will help us evaluate the parser.
350350
351351
Sometimes you may find that the site you're parsing doesn't provide certain information. For example, some sites don't have deks, and in those instances, you don't need to write a selector for that field. If there's a test for a selector you don't need, you can just remove that test and make note of it in your pull request.
352+
353+
---
354+
355+
## Adding Custom Extractor via API
356+
357+
As of **version 2.1.1**, you can additionally add custom private extractors via API. Make sure that your custom extractor includes a domain name. Note that extractors added via API will take precedence over the packaged custom extractors.
358+
359+
```javascript
360+
const customExtractor = {
361+
domain: 'www.sandiegouniontribune.com',
362+
title: {
363+
selectors: ['h1', '.ArticlePage-headline'],
364+
},
365+
author: {
366+
selectors: ['.ArticlePage-authorInfo-bio-name'],
367+
},
368+
content: {
369+
selectors: ['article'],
370+
},
371+
};
372+
373+
Mercury.addExtractor(customExtractor);
374+
```
375+
376+
---
377+
378+
## Passing custom extractor to addExtractor via CLI
379+
380+
It's also possible to add a custom extractor at runtime via the CLI.
381+
382+
### 1. Create your custom extractor in a standalone file.
383+
384+
```javascript
385+
var customExtractor = {
386+
domain: 'postlight.com',
387+
title: {
388+
selectors: ['h1'],
389+
},
390+
author: {
391+
selectors: ['.byline-name'],
392+
},
393+
content: {
394+
selectors: ['article'],
395+
},
396+
extend: {
397+
uniqueKeyFromFixture: {
398+
selectors: ['.single__hero-category'],
399+
},
400+
},
401+
};
402+
403+
module.exports = customExtractor;
404+
```
405+
406+
### 2. From the CLI, add the `--add-extractor` param:
407+
408+
```bash
409+
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
410+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
var customExtractor = {
2+
domain: 'postlight.com',
3+
title: {
4+
selectors: ['h1'],
5+
},
6+
author: {
7+
selectors: ['.byline-name'],
8+
},
9+
content: {
10+
selectors: ['article'],
11+
},
12+
extend: {
13+
uniqueKeyFromFixture: {
14+
selectors: ['.single__hero-category'],
15+
},
16+
},
17+
};
18+
19+
module.exports = customExtractor;

src/extractors/get-extractor.js

+3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import URL from 'url';
33
import Extractors from './all';
44
import GenericExtractor from './generic';
55
import detectByHtml from './detect-by-html';
6+
import { apiExtractors } from './add-extractor';
67

78
export default function getExtractor(url, parsedUrl, $) {
89
parsedUrl = parsedUrl || URL.parse(url);
@@ -13,6 +14,8 @@ export default function getExtractor(url, parsedUrl, $) {
1314
.join('.');
1415

1516
return (
17+
apiExtractors[hostname] ||
18+
apiExtractors[baseDomain] ||
1619
Extractors[hostname] ||
1720
Extractors[baseDomain] ||
1821
detectByHtml($) ||

src/mercury.js

+11
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import TurndownService from 'turndown';
44

55
import Resource from 'resource';
66
import { validateUrl } from 'utils';
7+
import addCustomExtractor from 'extractors/add-extractor';
78
import getExtractor from 'extractors/get-extractor';
89
import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor';
910
import collectAllPages from 'extractors/collect-all-pages';
@@ -16,6 +17,7 @@ const Mercury = {
1617
contentType = 'html',
1718
headers = {},
1819
extend,
20+
customExtractor,
1921
} = opts;
2022

2123
// if no url was passed and this is the browser version,
@@ -43,6 +45,11 @@ const Mercury = {
4345
return $;
4446
}
4547

48+
// Register a custom extractor passed in through opts (the CLI forwards --add-extractor this way).
49+
if (customExtractor) {
50+
addCustomExtractor(customExtractor);
51+
}
52+
4653
const Extractor = getExtractor(url, parsedUrl, $);
4754
// console.log(`Using extractor for ${Extractor.domain}`);
4855

@@ -112,6 +119,10 @@ const Mercury = {
112119
fetchResource(url) {
113120
return Resource.create(url);
114121
},
122+
123+
addExtractor(extractor) {
124+
return addCustomExtractor(extractor);
125+
},
115126
};
116127

117128
export default Mercury;

src/mercury.test.js

+36
Original file line numberDiff line numberDiff line change
@@ -182,4 +182,40 @@ describe('Mercury', () => {
182182
assert.equal(sites.length, 8);
183183
assert.equal(sites[1], 'http://nymag.com/daily/intelligencer/');
184184
});
185+
186+
it('is able to use custom extractors (with extension) added via api', async () => {
  // Parse a saved copy of the article so the test does not hit the network.
  const url =
    'https://www.sandiegouniontribune.com/business/growth-development/story/2019-08-27/sdsu-mission-valley-stadium-management-firm';
  const fixturePath = './fixtures/sandiegouniontribune.com/test.html';
  const html = fs.readFileSync(fixturePath, 'utf8');

  // Extractor registered through the public API; `extend` adds a field
  // (`testContent`) that is expected to show up on the parse result.
  const customExtractor = {
    domain: 'www.sandiegouniontribune.com',
    title: { selectors: ['h1', '.ArticlePage-headline'] },
    author: { selectors: ['.ArticlePage-authorInfo-bio-name'] },
    content: { selectors: ['article'] },
    extend: {
      testContent: { selectors: ['.ArticlePage-breadcrumbs a'] },
    },
  };
  Mercury.addExtractor(customExtractor);

  const result = await Mercury.parse(url, { html });
  assert.equal(typeof result, 'object');

  // Field-by-field expectations, checked in the listed order.
  const expectedFields = [
    ['author', 'Jennifer Van Grove'],
    ['domain', 'www.sandiegouniontribune.com'],
    ['total_pages', 1],
    ['testContent', 'Growth & Development'],
  ];
  expectedFields.forEach(([field, expected]) => {
    assert.equal(result[field], expected);
  });
});
185221
});

0 commit comments

Comments
 (0)