Skip to content

Commit

Permalink
Fix: extension bugs (#47)
Browse files Browse the repository at this point in the history
* feat: lead image on atlantic stories now included

* feat: supporting buzzfeed "longform" template

* feat: cleaning .partner-box from the atlantic
  • Loading branch information
adampash authored Dec 3, 2016
1 parent 16860f1 commit f9902cf
Show file tree
Hide file tree
Showing 5 changed files with 423 additions and 3 deletions.
361 changes: 361 additions & 0 deletions fixtures/www.buzzfeed.com/1480717502688.html

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion src/extractors/custom/www.buzzfeed.com/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ export const BuzzfeedExtractor = {

content: {
selectors: [
['.longform_custom_header_media', '#buzz_sub_buzz'],
'#buzz_sub_buzz',
// enter content selectors
],

defaultCleaner: false,
Expand All @@ -29,6 +29,17 @@ export const BuzzfeedExtractor = {
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
h2: 'b',

'div.longform_custom_header_media': ($node) => {
if ($node.has('img') && $node.has('.longform_header_image_source')) {
return 'figure';
}

return null;
},

'figure.longform_custom_header_media .longform_header_image_source':
'figcaption',
},

// Is there anything that is in the result that shouldn't be?
Expand All @@ -38,6 +49,7 @@ export const BuzzfeedExtractor = {
'.instapaper_ignore',
'.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline',
'.share-box',
'.print',
],
},

Expand Down
46 changes: 46 additions & 0 deletions src/extractors/custom/www.buzzfeed.com/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,50 @@ describe('BuzzfeedExtractor', () => {
// the article.
assert.equal(first13, 'A few months ago, Vladimir Serbanescu, a 17-year-old artist from Romania, drew this');
});

it('returns big header images in the content', async () => {
  // The first <img> found in the parsed content must be the longform
  // header image of the saved article fixture.
  const expectedSrc =
    'https://img.buzzfeed.com/buzzfeed-static/static/2016-11/21/10/enhanced/buzzfeed-prod-fastlane03/longform-original-25748-1479741827-5.jpg';
  const articleUrl =
    'https://www.buzzfeed.com/katiejmbaker/college-trump-supporters-the-new-counterculture?utm_term=.ckb72b58Y#.oxY8ZOWY3';
  const fixtureHtml =
    fs.readFileSync('./fixtures/www.buzzfeed.com/1480717502688.html');

  // NOTE(review): `fallback: false` presumably restricts the parse to the
  // custom extractor's own selectors — confirm against the parser options.
  const parsed =
    await Mercury.parse(articleUrl, fixtureHtml, { fallback: false });

  // Load an empty document when no content came back so the assertion
  // below fails on the value instead of on a load error.
  const $ = cheerio.load(parsed.content || '');
  const firstImage = $('img').first();

  assert.equal(firstImage.attr('src'), expectedSrc);
});

it('transforms the splash image to a figure and caption', async () => {
  // After the extractor's transforms run, the splash image is expected to
  // sit inside a <figure>, with its credit line in a <figcaption>.
  const expectedSrc =
    'https://img.buzzfeed.com/buzzfeed-static/static/2016-11/21/10/enhanced/buzzfeed-prod-fastlane03/longform-original-25748-1479741827-5.jpg';
  const expectedCaption = 'Adam Maida for BuzzFeed News';
  const articleUrl =
    'https://www.buzzfeed.com/katiejmbaker/college-trump-supporters-the-new-counterculture?utm_term=.ckb72b58Y#.oxY8ZOWY3';
  const fixtureHtml =
    fs.readFileSync('./fixtures/www.buzzfeed.com/1480717502688.html');

  // NOTE(review): `fallback: false` presumably restricts the parse to the
  // custom extractor's own selectors — confirm against the parser options.
  const parsed =
    await Mercury.parse(articleUrl, fixtureHtml, { fallback: false });

  // Load an empty document when no content came back so the assertions
  // below fail on the values instead of on a load error.
  const $ = cheerio.load(parsed.content || '');

  const figureImage = $('figure img').first();
  const caption = $('figure figcaption').first();

  assert.equal(figureImage.attr('src'), expectedSrc);
  assert.equal(caption.text(), expectedCaption);
});
});
3 changes: 2 additions & 1 deletion src/extractors/custom/www.theatlantic.com/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export const TheAtlanticExtractor = {

content: {
selectors: [
['.article-cover figure.lead-img', '.article-body'],
'.article-body',
],

Expand All @@ -28,7 +29,7 @@ export const TheAtlanticExtractor = {
// The clean selectors will remove anything that matches from
// the result
clean: [

'.partner-box',
],
},

Expand Down
2 changes: 1 addition & 1 deletion src/extractors/custom/www.theatlantic.com/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ describe('CustomExtractor', () => {

assert.equal(title, 'Why New Yorkers Received a Push Alert About a Manhunt');
assert.equal(author, 'Kaveh Waddell');
assert.equal(text, 'Updated on September');
assert.equal(text, 'New York police offi');
});
});

0 comments on commit f9902cf

Please sign in to comment.