Skip to content

Commit eacd1ee

Browse files
adampash, toufic-m
authored and committed
feat: custom genius parser. (#284)
also adds ability to transform value returned by an attribute selector
1 parent c389c96 commit eacd1ee

File tree

6 files changed

+1435
-5
lines changed

6 files changed

+1435
-5
lines changed

Diff for: fixtures/genius.com/1550609084053.html

+1,272
Large diffs are not rendered by default.

Diff for: src/extractors/custom/README.md

+2
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,8 @@ As [explained above](#selecting-an-attribute), to return an attribute rather tha
328328
...
329329
```
330330
331+
In rare circumstances, you may want to manipulate the result of the attribute value. In these cases, you can add a third element to the selector array above — a function that will take the value of the attribute and return a value you've transformed it to. E.g., imagine that you want to access a JSON value that's been stringified into an attribute. Your function could take the stringified JSON, parse it, and return just the piece of it you want.
332+
331333
You can refer to the [NewYorkerExtractor](www.newyorker.com/index.js) to see the rest of the basic selectors.
332334
333335
### Step 4: Content extraction

Diff for: src/extractors/custom/genius.com/index.js

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/**
 * Parses the stringified JSON held in the `value` attribute of the
 * `meta[itemprop=page_data]` tag and returns its `song` entry.
 *
 * @param {string} value - Raw attribute value (expected to be JSON).
 * @returns {Object|undefined} The `song` entry, or `undefined` when the
 *   value is not valid JSON or carries no `song`.
 */
const parseSong = value => {
  let pageData;
  try {
    pageData = JSON.parse(value);
  } catch (err) {
    // Malformed page data is an expected "no value" case, not a reason to
    // abort extraction; anything other than a parse error is rethrown.
    if (!(err instanceof SyntaxError)) throw err;
    return undefined;
  }
  return pageData ? pageData.song : undefined;
};

/**
 * Custom extractor for genius.com song pages.
 *
 * `date_published` and `lead_image_url` are read out of the JSON stored in
 * the `value` attribute of `meta[itemprop=page_data]`, using the optional
 * third element of an attribute selector (a function that transforms the
 * attribute value).
 */
export const GeniusComExtractor = {
  domain: 'genius.com',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: ['h2 a'],
  },

  date_published: {
    selectors: [
      [
        'meta[itemprop=page_data]',
        'value',
        res => {
          // Yields undefined when the song entry is missing, which is the
          // same value produced when `release_date` itself is absent.
          const song = parseSong(res);
          return song ? song.release_date : undefined;
        },
      ],
    ],
  },

  dek: {
    // No dek selectors are defined for this extractor.
    selectors: [],
  },

  lead_image_url: {
    selectors: [
      [
        'meta[itemprop=page_data]',
        'value',
        res => {
          // Guard `album` as well: a song entry without an album must not
          // throw while looking up the cover art.
          const song = parseSong(res);
          return song && song.album ? song.album.cover_art_url : undefined;
        },
      ],
    ],
  },

  content: {
    selectors: ['.lyrics'],

    // No content transforms are configured.
    transforms: {},

    // No clean selectors are configured; nothing is stripped from the
    // selected content.
    clean: [],
  },
};

Diff for: src/extractors/custom/genius.com/index.test.js

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import assert from 'assert';
2+
import URL from 'url';
3+
import cheerio from 'cheerio';
4+
import moment from 'moment';
5+
6+
import Mercury from 'mercury';
7+
import getExtractor from 'extractors/get-extractor';
8+
import { excerptContent } from 'utils/text';
9+
10+
const fs = require('fs');
11+
12+
describe('GeniusComExtractor', () => {
  describe('initial test case', () => {
    let result;
    let url;

    // Parse the saved fixture once; each test awaits the shared result.
    beforeAll(() => {
      url = 'https://genius.com/Prince-and-the-revolution-purple-rain-lyrics';
      const fixture = fs.readFileSync('./fixtures/genius.com/1550609084053.html');
      result = Mercury.parse(url, { html: fixture, fallback: false });
    });

    it('is selected properly', () => {
      // The extractor chosen for the URL must be registered under the
      // URL's hostname.
      const { domain } = getExtractor(url);
      const { hostname } = URL.parse(url);
      assert.equal(domain, hostname);
    });

    it('returns the title', async () => {
      const parsed = await result;
      assert.equal(parsed.title, `Purple Rain`);
    });

    it('returns the author', async () => {
      const parsed = await result;
      assert.equal(parsed.author, 'Prince and the Revolution');
    });

    it('returns the date_published', async () => {
      const parsed = await result;

      // Compare only the calendar-date portion of the formatted timestamp.
      const [day] = moment(parsed.date_published)
        .format()
        .split('T');
      assert.equal(day, '1984-06-25');
    });

    it('returns the lead_image_url', async () => {
      const parsed = await result;
      assert.equal(
        parsed.lead_image_url,
        `https://images.genius.com/da3381a38218928924c94db9ea59543b.1000x1000x1.jpg`
      );
    });

    it('returns the content', async () => {
      const parsed = await result;

      // Load the extracted markup (or an empty document) and excerpt the
      // text of the first matched node down to 13 words.
      const $ = cheerio.load(parsed.content || '');
      const leadingText = $('*')
        .first()
        .text();
      const first13 = excerptContent(leadingText, 13);

      assert.equal(
        first13,
        '[Verse 1] I never meant to cause you any sorrow I never meant'
      );
    });
  });
});

Diff for: src/extractors/custom/index.js

+1
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,4 @@ export * from './www.sanwa.co.jp';
103103
export * from './www.elecom.co.jp';
104104
export * from './scan.netsecurity.ne.jp';
105105
export * from './jvndb.jvn.jp';
106+
export * from './genius.com';

Diff for: src/extractors/root-extractor.js

+6-5
Original file line numberDiff line numberDiff line change
@@ -140,14 +140,15 @@ export function select(opts) {
140140
// if selector is an array (e.g., ['img', 'src']),
141141
// extract the attr
142142
if (Array.isArray(matchingSelector)) {
143-
const [selector, attr] = matchingSelector;
143+
const [selector, attr, transform] = matchingSelector;
144144
$match = $(selector);
145145
$match = transformAndClean($match);
146-
result = $match.map((_, el) =>
147-
$(el)
146+
result = $match.map((_, el) => {
147+
const item = $(el)
148148
.attr(attr)
149-
.trim()
150-
);
149+
.trim();
150+
return transform ? transform(item) : item;
151+
});
151152
} else {
152153
$match = $(matchingSelector);
153154
$match = transformAndClean($match);

0 commit comments

Comments
 (0)