Skip to content

Commit 3297ab0

Browse files
authored
feat: bloomberg extractor (#59)
Bloomberg has several templates. I'm supporting three different templates here, but I'm not sure that this is complete by any means. It's also worth noting that SVGs don't make it through the parser terribly well for many reasons. One, for example, is that a lot of SVGs require custom CSS in order for them to make sense. I'm not sure this is something we can expect to address in the parser.
1 parent e55e9da commit 3297ab0

File tree

7 files changed

+1084
-1
lines changed

7 files changed

+1084
-1
lines changed

Diff for: fixtures/www.bloomberg.com/1481135708958.html

+1
Large diffs are not rendered by default.

Diff for: fixtures/www.bloomberg.com/1481136509532.html

+754
Large diffs are not rendered by default.

Diff for: fixtures/www.bloomberg.com/1481138014494.html

+1
Large diffs are not rendered by default.

Diff for: src/extractors/custom/index.js

+1
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ export * from './www.aol.com';
2727
export * from './www.youtube.com';
2828
export * from './www.theguardian.com';
2929
export * from './www.sbnation.com';
30+
export * from './www.bloomberg.com';

Diff for: src/extractors/custom/www.bloomberg.com/index.js

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Custom extractor configuration for www.bloomberg.com.
// Each field (title, author, date_published, dek, lead_image_url, content)
// lists the selectors used to locate that field. The inline comments mark
// which Bloomberg page template a selector targets: normal articles,
// the /graphics/ template, or the /news/ template.
// NOTE(review): two-element arrays look like [selector, attribute] pairs and
// the lists are presumably tried in order -- confirm against the extractor
// runtime, which is not visible here.
export const WwwBloombergComExtractor = {
2+
domain: 'www.bloomberg.com',
3+
4+
title: {
5+
selectors: [
6+
// normal articles
7+
'.lede-headline',
8+
9+
// /graphics/ template
10+
'h1.article-title',
11+
12+
// /news/ template
13+
'h1.lede-text-only__hed',
14+
],
15+
},
16+
17+
author: {
18+
selectors: [
19+
// NOTE(review): meta tags are read through 'value' here and below;
20+
// presumably the parser normalizes meta attributes before selecting -- confirm.
['meta[name="parsely-author"]', 'value'],
20+
'.byline-details__link',
21+
22+
// /graphics/ template
23+
'.bydek',
24+
25+
// /news/ template
26+
'.author',
27+
],
28+
},
29+
30+
date_published: {
31+
selectors: [
32+
['time.published-at', 'datetime'],
33+
['time[datetime]', 'datetime'],
34+
['meta[name="date"]', 'value'],
35+
['meta[name="parsely-pub-date"]', 'value'],
36+
],
37+
},
38+
39+
// No dek selectors are defined for this site; the list is intentionally empty.
dek: {
40+
selectors: [
41+
],
42+
},
43+
44+
lead_image_url: {
45+
selectors: [
46+
['meta[name="og:image"]', 'value'],
47+
],
48+
},
49+
50+
content: {
51+
selectors: [
52+
'.article-body__content',
53+
54+
// /graphics/ template
55+
// NOTE(review): single-element array, unlike the bare strings around it;
// presumably a deliberate multi-match form -- confirm before "simplifying".
['section.copy-block'],
56+
57+
// /news/ template
58+
'.body-copy',
59+
],
60+
61+
// Is there anything in the content you selected that needs to be transformed
62+
// before it's consumable content? E.g., unusual lazy loaded images
63+
// (No transforms are defined for this site.)
transforms: {
64+
},
65+
66+
// Is there anything that is in the result that shouldn't be?
67+
// The clean selectors will remove anything that matches from
68+
// the result
69+
clean: [
70+
'.inline-newsletter',
71+
'.page-ad',
72+
],
73+
},
74+
};
+241
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
import assert from 'assert';
2+
import fs from 'fs';
3+
import URL from 'url';
4+
import cheerio from 'cheerio';
5+
6+
import Mercury from 'mercury';
7+
import getExtractor from 'extractors/get-extractor';
8+
import { excerptContent } from 'utils/text';
9+
10+
// Fixture-driven tests for the Bloomberg extractor. There is one nested suite
// per page template (initial/normal article, /graphics/, /news/). Each suite
// reads a saved HTML fixture from ./fixtures/www.bloomberg.com/, parses it with
// { fallback: false }, and then checks the extractor selection plus the
// title, author, date_published, lead_image_url and the first 13 words of
// the extracted content.
describe('WwwBloombergComExtractor', () => {
11+
describe('initial test case', () => {
12+
let result;
13+
let url;
14+
beforeAll(() => {
15+
url =
16+
'http://www.bloomberg.com/politics/articles/2016-12-07/trump-hits-emblem-of-presidential-power-with-air-force-one-tweet';
17+
const html =
18+
fs.readFileSync('./fixtures/www.bloomberg.com/1481135708958.html');
19+
// The parse result is stored without awaiting it here; every test
// below awaits `result` before destructuring a field from it.
result =
20+
Mercury.parse(url, html, { fallback: false });
21+
});
22+
23+
it('is selected properly', () => {
24+
// This test should be passing by default.
25+
// It sanity checks that the correct parser
26+
// is being selected for URLs from this domain
27+
const extractor = getExtractor(url);
28+
assert.equal(extractor.domain, URL.parse(url).hostname);
29+
});
30+
31+
it('returns the title', async () => {
32+
// To pass this test, fill out the title selector
33+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
34+
const { title } = await result;
35+
36+
// Update these values with the expected values from
37+
// the article.
38+
assert.equal(title, 'Air Force One Costs Billions of Dollars Because It’s a Flying White House');
39+
});
40+
41+
it('returns the author', async () => {
42+
// To pass this test, fill out the author selector
43+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
44+
const { author } = await result;
45+
46+
// Update these values with the expected values from
47+
// the article.
48+
assert.equal(author, 'Margaret Talev');
49+
});
50+
51+
it('returns the date_published', async () => {
52+
// To pass this test, fill out the date_published selector
53+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
54+
const { date_published } = await result;
55+
56+
// Update these values with the expected values from
57+
// the article.
58+
assert.equal(date_published, '2016-12-07T10:00:00.011Z');
59+
});
60+
61+
it('returns the lead_image_url', async () => {
62+
// To pass this test, fill out the lead_image_url selector
63+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
64+
const { lead_image_url } = await result;
65+
66+
// Update these values with the expected values from
67+
// the article.
68+
assert.equal(lead_image_url, 'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioUAfA1V2nzk/v0/-1x-1.jpg');
69+
});
70+
71+
it('returns the content', async () => {
72+
// To pass this test, fill out the content selector
73+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
74+
// You may also want to make use of the clean and transform
75+
// options.
76+
const { content } = await result;
77+
78+
const $ = cheerio.load(content || '');
79+
80+
const first13 = excerptContent($('*').first().text(), 13);
81+
82+
// Update these values with the expected values from
83+
// the article.
84+
assert.equal(first13, 'Donald Trump took aim at one of the most visible emblems of the');
85+
});
86+
});
87+
88+
describe('/graphics/ template', () => {
89+
let result;
90+
let url;
91+
beforeAll(() => {
92+
url =
93+
'https://www.bloomberg.com/graphics/2016-apple-profits/';
94+
const html =
95+
fs.readFileSync('./fixtures/www.bloomberg.com/1481136509532.html');
96+
result =
97+
Mercury.parse(url, html, { fallback: false });
98+
});
99+
100+
it('is selected properly', () => {
101+
// This test should be passing by default.
102+
// It sanity checks that the correct parser
103+
// is being selected for URLs from this domain
104+
const extractor = getExtractor(url);
105+
assert.equal(extractor.domain, URL.parse(url).hostname);
106+
});
107+
108+
it('returns the title', async () => {
109+
// To pass this test, fill out the title selector
110+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
111+
const { title } = await result;
112+
113+
// Update these values with the expected values from
114+
// the article.
115+
assert.equal(title, 'Americans Are Paying Apple Millions to Shelter Overseas Profits');
116+
});
117+
118+
it('returns the author', async () => {
119+
// To pass this test, fill out the author selector
120+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
121+
const { author } = await result;
122+
123+
// Update these values with the expected values from
124+
// the article.
125+
assert.equal(author, 'Andrea Wong');
126+
});
127+
128+
it('returns the date_published', async () => {
129+
// To pass this test, fill out the date_published selector
130+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
131+
const { date_published } = await result;
132+
133+
// Update these values with the expected values from
134+
// the article.
135+
assert.equal(date_published, '2016-12-07T10:00:00.000Z');
136+
});
137+
138+
it('returns the lead_image_url', async () => {
139+
// To pass this test, fill out the lead_image_url selector
140+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
141+
const { lead_image_url } = await result;
142+
143+
// Update these values with the expected values from
144+
// the article.
145+
assert.equal(lead_image_url, 'https://www.bloomberg.com/graphics/2016-apple-profits/img/2016-apple-profits-facebook.png');
146+
});
147+
148+
it('returns the content', async () => {
149+
// To pass this test, fill out the content selector
150+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
151+
// You may also want to make use of the clean and transform
152+
// options.
153+
const { content } = await result;
154+
155+
const $ = cheerio.load(content || '');
156+
157+
const first13 = excerptContent($('*').first().text(), 13);
158+
159+
// Update these values with the expected values from
160+
// the article.
161+
assert.equal(first13, 'Over the years, Apple Inc. has become the poster child for U.S. multinationals');
162+
});
163+
});
164+
165+
describe('/news/ template', () => {
166+
let result;
167+
let url;
168+
beforeAll(() => {
169+
url =
170+
'https://www.bloomberg.com/news/articles/2016-12-06/stock-rally-extends-into-asia-as-traders-await-rbi-aussie-gdp';
171+
const html =
172+
fs.readFileSync('./fixtures/www.bloomberg.com/1481138014494.html');
173+
result =
174+
Mercury.parse(url, html, { fallback: false });
175+
});
176+
177+
it('is selected properly', () => {
178+
// This test should be passing by default.
179+
// It sanity checks that the correct parser
180+
// is being selected for URLs from this domain
181+
const extractor = getExtractor(url);
182+
assert.equal(extractor.domain, URL.parse(url).hostname);
183+
});
184+
185+
it('returns the title', async () => {
186+
// To pass this test, fill out the title selector
187+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
188+
const { title } = await result;
189+
190+
// Update these values with the expected values from
191+
// the article.
192+
assert.equal(title, 'U.S. Stocks Rise to Records, Bonds Gain on ECB Stimulus Optimism');
193+
});
194+
195+
it('returns the author', async () => {
196+
// To pass this test, fill out the author selector
197+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
198+
const { author } = await result;
199+
200+
// Update these values with the expected values from
201+
// the article.
202+
assert.equal(author, 'Jeremy Herron');
203+
});
204+
205+
it('returns the date_published', async () => {
206+
// To pass this test, fill out the date_published selector
207+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
208+
const { date_published } = await result;
209+
210+
// Update these values with the expected values from
211+
// the article.
212+
assert.equal(date_published, '2016-12-06T23:22:22.402Z');
213+
});
214+
215+
it('returns the lead_image_url', async () => {
216+
// To pass this test, fill out the lead_image_url selector
217+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
218+
const { lead_image_url } = await result;
219+
220+
// Update these values with the expected values from
221+
// the article.
222+
assert.equal(lead_image_url, 'https://assets.bwbx.io/javelin/public/images/social-markets-3d32d2f713.jpg');
223+
});
224+
225+
it('returns the content', async () => {
226+
// To pass this test, fill out the content selector
227+
// in ./src/extractors/custom/www.bloomberg.com/index.js.
228+
// You may also want to make use of the clean and transform
229+
// options.
230+
const { content } = await result;
231+
232+
const $ = cheerio.load(content || '');
233+
234+
const first13 = excerptContent($('*').first().text(), 13);
235+
236+
// Update these values with the expected values from
237+
// the article.
238+
assert.equal(first13, 'The Dow Jones Industrial Average rose 220 points as U.S. stock indexes powered');
239+
});
240+
});
241+
});

Diff for: src/utils/dom/constants.js

+12-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,18 @@ export const STRIP_OUTPUT_TAGS = [
2929
// Attribute names targeted for removal (judging by the constant's name; the
// code that consumes it is not visible in this diff).
export const REMOVE_ATTRS = ['style', 'align'];
3030
// Each name wrapped as a CSS attribute selector: '[style]', '[align]'.
export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`);
3131
// The same names joined into one comma-separated string: 'style,align'.
export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');
32-
export const WHITELIST_ATTRS = ['src', 'srcset', 'href', 'class', 'id', 'alt'];
32+
// Attribute names on the whitelist; this list is the source for the
// WHITELIST_ATTRS_RE pattern built from it.
// 'xlink:href', 'width' and 'height' are the entries added by this change.
// NOTE(review): presumably added so SVG markup keeps its references and
// dimensions (the commit message discusses SVG handling) -- confirm.
export const WHITELIST_ATTRS = [
33+
'src',
34+
'srcset',
35+
'href',
36+
'class',
37+
'id',
38+
'alt',
39+
'xlink:href',
40+
'width',
41+
'height',
42+
];
43+
3344
// Anchored (^...$), case-insensitive pattern: matches a string only when it
// is exactly one of the WHITELIST_ATTRS names, not merely contains one.
export const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i');
3445

3546
// removeEmpty

0 commit comments

Comments
 (0)