From 3cf2bb78c400aa774448d687a60ae1cfbdc8db16 Mon Sep 17 00:00:00 2001 From: Janet Date: Thu, 15 Dec 2016 20:48:15 -0500 Subject: [PATCH] feat: vox custom parser (#67) --- fixtures/www.vox.com/1481563623532.html | 1 + src/extractors/custom/index.js | 1 + src/extractors/custom/www.vox.com/index.js | 58 +++++++++++ .../custom/www.vox.com/index.test.js | 97 +++++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 fixtures/www.vox.com/1481563623532.html create mode 100644 src/extractors/custom/www.vox.com/index.js create mode 100644 src/extractors/custom/www.vox.com/index.test.js diff --git a/fixtures/www.vox.com/1481563623532.html b/fixtures/www.vox.com/1481563623532.html new file mode 100644 index 000000000..b58354c21 --- /dev/null +++ b/fixtures/www.vox.com/1481563623532.html @@ -0,0 +1 @@ + Donald Trump’s tweets fit a pattern of harassment Twitter has banned before - Vox
clock menu more-arrow

Donald Trump’s tweets fit a pattern of harassment Twitter has banned before

The President-elect’s targets may endure real-life threats from hordes of his supporters.

\ No newline at end of file
diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js
index 658496358..7a9698c71 100644
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@@ -29,3 +29,4 @@ export * from './www.theguardian.com';
 export * from './www.sbnation.com';
 export * from './www.bloomberg.com';
 export * from './www.bustle.com';
+export * from './www.vox.com';
diff --git a/src/extractors/custom/www.vox.com/index.js b/src/extractors/custom/www.vox.com/index.js
new file mode 100644
index 000000000..7c54bf4f5
--- /dev/null
+++ b/src/extractors/custom/www.vox.com/index.js
@@ -0,0 +1,58 @@
+export const WwwVoxComExtractor = { // custom parser rules for www.vox.com
+  domain: 'www.vox.com',
+
+  title: {
+    selectors: [
+      'h1.c-page-title',
+    ],
+  },
+
+  author: {
+    selectors: [
+      ['meta[name="author"]', 'value'], // presumably a [selector, attribute] pair — TODO confirm
+    ],
+  },
+
+  date_published: {
+    selectors: [
+      ['meta[name="article:published_time"]', 'value'],
+    ],
+  },
+
+  dek: {
+    selectors: [
+      '.p-dek',
+    ],
+  },
+
+  lead_image_url: {
+    selectors: [
+      ['meta[name="og:image"]', 'value'],
+    ],
+  },
+
+  content: {
+    selectors: [
+      ['figure.e-image--hero', '.c-entry-content'],
+      '.c-entry-content',
+    ],
+
+    // Replace each figure's .c-dynamic-image element with the HTML of the <noscript>
+    // found under the same .e-image__image ancestor (e.g., unusual lazy loaded images).
+    transforms: {
+      'figure .e-image__image noscript': ($node) => {
+        const imgHtml = $node.html();
+        $node.parents('.e-image__image').find('.c-dynamic-image').replaceWith(imgHtml);
+      },
+
+      'figure .e-image__meta': 'figcaption', // presumably retags the matched node — confirm
+    },
+
+    // Anything matching a `clean` selector is removed from the
+    // result; the list is empty here, so no selectors are
+    // configured for removal.
+    clean: [
+
+    ],
+  },
+};
diff --git a/src/extractors/custom/www.vox.com/index.test.js b/src/extractors/custom/www.vox.com/index.test.js
new file mode 100644
index 000000000..0493ae95e
--- /dev/null
+++ b/src/extractors/custom/www.vox.com/index.test.js
@@ -0,0 +1,97 @@
+import assert from 'assert';
+import fs from 'fs';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+describe('WwwVoxComExtractor', () => {
+  describe('initial test case', () => {
+    let result;
+    let url;
+    beforeAll(() => {
+      url =
+        'http://www.vox.com/culture/2016/12/10/13898352/trump-twitter-harassment-policy-bannable';
+      const html =
+        fs.readFileSync('./fixtures/www.vox.com/1481563623532.html');
+      result =
+        Mercury.parse(url, html, { fallback: false });
+    });
+
+    it('is selected properly', () => {
+      // This test should be passing by default.
+      // It sanity checks that the extractor chosen for this URL
+      // declares a domain equal to the URL's hostname.
+      const extractor = getExtractor(url);
+      assert.equal(extractor.domain, URL.parse(url).hostname);
+    });
+
+    it('returns the title', async () => {
+      // The title comes from the `title` selector defined
+      // in ./src/extractors/custom/www.vox.com/index.js.
+      const { title } = await result;
+
+      // Expected value is the headline of the fixture article
+      // loaded in beforeAll.
+      assert.equal(title, 'Donald Trump’s tweets fit a pattern of harassment Twitter has banned before');
+    });
+
+    it('returns the author', async () => {
+      // The author comes from the `author` selector defined
+      // in ./src/extractors/custom/www.vox.com/index.js.
+      const { author } = await result;
+
+      // Expected value is the byline of the fixture article
+      // loaded in beforeAll.
+      assert.equal(author, 'Aja Romano');
+    });
+
+    it('returns the date_published', async () => {
+      // The date comes from the `date_published` selector defined
+      // in ./src/extractors/custom/www.vox.com/index.js.
+      const { date_published } = await result;
+
+      // Expected value is the publication timestamp of the
+      // fixture article, as an ISO 8601 string.
+      assert.equal(date_published, '2016-12-10T15:20:01.000Z');
+    });
+
+    it('returns the dek', async () => {
+      // The dek comes from the `dek` selector defined
+      // in ./src/extractors/custom/www.vox.com/index.js.
+      const { dek } = await result;
+
+      // Expected value is the sub-headline of the fixture
+      // article loaded in beforeAll.
+      assert.equal(dek, 'The President-elect’s targets may endure real-life threats from hordes of his supporters.');
+    });
+
+    it('returns the lead_image_url', async () => {
+      // The image URL comes from the `lead_image_url` selector defined
+      // in ./src/extractors/custom/www.vox.com/index.js.
+      const { lead_image_url } = await result;
+
+      // Expected value is the lead image URL recorded for the
+      // fixture article.
+      assert.equal(lead_image_url, 'https://cdn0.vox-cdn.com/thumbor/RuJTDlBH9LAp_9uFqYfnPzWXhj0=/0x175:2500x1564/1080x600/cdn0.vox-cdn.com/uploads/chorus_image/image/52223131/628656068.0.jpeg');
+    });
+
+    it('returns the content', async () => {
+      // The body comes from the `content` selectors defined
+      // in ./src/extractors/custom/www.vox.com/index.js,
+      // together with that file's `transforms` and `clean`
+      // options.
+      const { content } = await result;
+
+      const $ = cheerio.load(content || '');
+
+      const first13 = excerptContent($('*').first().text(), 13);
+
+      // Only the opening excerpt of the extracted text is compared
+      // (13 words, judging by the expected string below).
+      assert.equal(first13, 'Photo by Steve Pope/Getty Images After the election, Twitter issued a statement to');
+    });
+  });
+});