diff --git a/dist/mercury.js b/dist/mercury.js
index 4e5c7ff80..33758d684 100644
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -7,10 +7,10 @@ var _extends = _interopDefault(require('babel-runtime/helpers/extends'));
var _asyncToGenerator = _interopDefault(require('babel-runtime/helpers/asyncToGenerator'));
var URL = _interopDefault(require('url'));
var cheerio = _interopDefault(require('cheerio'));
-var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
-var request = _interopDefault(require('request'));
var iconv = _interopDefault(require('iconv-lite'));
var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
+var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
+var request = _interopDefault(require('request'));
var _Reflect$ownKeys = _interopDefault(require('babel-runtime/core-js/reflect/own-keys'));
var _toConsumableArray = _interopDefault(require('babel-runtime/helpers/toConsumableArray'));
var _defineProperty = _interopDefault(require('babel-runtime/helpers/defineProperty'));
@@ -26,50 +26,6 @@ var difflib = _interopDefault(require('difflib'));
var _Array$from = _interopDefault(require('babel-runtime/core-js/array/from'));
var ellipsize = _interopDefault(require('ellipsize'));
-var _marked = [range].map(_regeneratorRuntime.mark);
-
-function range() {
- var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
- var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
- return _regeneratorRuntime.wrap(function range$(_context) {
- while (1) {
- switch (_context.prev = _context.next) {
- case 0:
- if (!(start <= end)) {
- _context.next = 5;
- break;
- }
-
- _context.next = 3;
- return start += 1;
-
- case 3:
- _context.next = 0;
- break;
-
- case 5:
- case "end":
- return _context.stop();
- }
- }
- }, _marked[0], this);
-}
-
-// extremely simple url validation as a first step
-function validateUrl(_ref) {
- var hostname = _ref.hostname;
-
- // If this isn't a valid url, return an error message
- return !!hostname;
-}
-
-var Errors = {
- badUrl: {
- error: true,
- messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
- }
-};
-
var NORMALIZE_RE = /\s{2,}/g;
function normalizeSpaces(text) {
@@ -116,6 +72,7 @@ var IS_ALPHA_RE = /^[a-z]+$/i;
var IS_DIGIT_RE = /^[0-9]+$/i;
var ENCODING_RE = /charset=([\w-]+)\b/;
+var DEFAULT_ENCODING = 'utf-8';
function pageNumFromUrl(url) {
var matches = url.match(PAGE_IN_HREF_RE);
@@ -224,13 +181,60 @@ function excerptContent(content) {
// used in our fetchResource function to
// ensure correctly encoded responses
function getEncoding(str) {
+ var encoding = DEFAULT_ENCODING; // returned when `str` names no charset, or names one iconv does not recognize
if (ENCODING_RE.test(str)) {
- return ENCODING_RE.exec(str)[1];
+ var testEncode = ENCODING_RE.exec(str)[1]; // capture group 1: the name following "charset="
+ if (iconv.encodingExists(testEncode)) {
+ encoding = testEncode;
+ }
}
+ return encoding; // always an encoding name, never null
+}
- return null;
+var _marked = [range].map(_regeneratorRuntime.mark); // _marked[0] is handed to wrap() inside range
+// Transpiled generator: while start <= end it yields `start` AFTER incrementing it, i.e. start + 1 through end + 1.
+function range() {
+ var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1; // defaults to 1
+ var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1; // defaults to 1
+ return _regeneratorRuntime.wrap(function range$(_context) {
+ while (1) {
+ switch (_context.prev = _context.next) {
+ case 0: // loop test
+ if (!(start <= end)) {
+ _context.next = 5; // finished: jump to the stop case
+ break;
+ }
+
+ _context.next = 3; // resume at case 3 after the yield below
+ return start += 1; // the yielded value is the already-incremented start
+
+ case 3:
+ _context.next = 0; // go back to the loop test
+ break;
+
+ case 5:
+ case "end":
+ return _context.stop();
+ }
+ }
+ }, _marked[0], this);
+}
+
+// extremely simple url validation as a first step
+function validateUrl(_ref) {
+ var hostname = _ref.hostname;
+
+ // Returns a boolean, not an error message: true only when `hostname` is truthy
+ return !!hostname;
}
+var Errors = { // predefined error result objects, keyed by error kind
+ badUrl: {
+ error: true,
+ messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.' // a single string despite the plural key name
+ }
+};
+
// Browser does not like us setting user agent
var REQUEST_HEADERS = cheerio.browser ? {} : {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
@@ -258,21 +262,6 @@ function get(options) {
if (err) {
reject(err);
} else {
- var encoding = getEncoding(response.headers['content-type']);
-
- if (iconv.encodingExists(encoding)) {
- body = iconv.decode(body, encoding);
- }
-
- if (typeof body !== 'string') {
- var $ = cheerio.load(iconv.decode(body, 'utf8'));
- var contentType = $('meta[http-equiv=content-type]').attr('content');
- var properEncoding = getEncoding(contentType);
- if (iconv.encodingExists(properEncoding)) {
- body = iconv.decode(body, properEncoding);
- }
- }
-
resolve({ body: body, response: response });
}
});
@@ -343,11 +332,11 @@ var fetchResource$1 = (function () {
url: parsedUrl.href,
headers: _extends({}, REQUEST_HEADERS),
timeout: FETCH_TIMEOUT,
- // Don't set encoding; fixes issues
- // w/gzipped responses
- encoding: null,
// Accept cookies
jar: true,
+ // Set to null so the response returns as binary and body as buffer
+ // https://github.com/request/request#requestoptions-callback
+ encoding: null,
// Accept and decode gzip
gzip: true,
// Follow any redirect
@@ -1860,7 +1849,7 @@ var Resource = {
throw new Error('Content does not appear to be text.');
}
- var $ = cheerio.load(content);
+ var $ = this.encodeDoc({ content: content, contentType: contentType });
if ($.root().children().length === 0) {
throw new Error('No children, likely a bad parse.');
@@ -1870,6 +1859,26 @@ var Resource = {
$ = convertLazyLoadedImages($);
$ = clean($);
+ return $;
+ },
+ encodeDoc: function encodeDoc(_ref2) {
+ var content = _ref2.content,
+ contentType = _ref2.contentType;
+
+ var encoding = getEncoding(contentType);
+ var decodedContent = iconv.decode(content, encoding);
+ var $ = cheerio.load(decodedContent);
+
+ // after first cheerio.load, check to see if encoding matches
+ var metaContentType = $('meta[http-equiv=content-type]').attr('content');
+ var properEncoding = getEncoding(metaContentType);
+
+ // if encodings in the header/body dont match, use the one in the body
+ if (properEncoding !== encoding) {
+ decodedContent = iconv.decode(content, properEncoding);
+ $ = cheerio.load(decodedContent);
+ }
+
return $;
}
};
@@ -5300,6 +5309,54 @@ var WwwOpposingviewsComExtractor = {
}
};
+var GothamistComExtractor = { // custom extractor definition for gothamist.com and the sister sites in supportedDomains
+ domain: 'gothamist.com',
+
+ supportedDomains: ['chicagoist.com', 'laist.com', 'sfist.com', 'shanghaiist.com', 'dcist.com'],
+
+ title: {
+ selectors: ['h1', '.entry-header h1']
+ },
+
+ author: {
+ selectors: ['.author']
+ },
+
+ date_published: {
+ selectors: ['abbr', 'abbr.published'],
+
+ timezone: 'America/New_York'
+ },
+
+ dek: {
+ selectors: [null] // NOTE(review): null placeholder, presumably "no dek selector" — confirm the selector matcher tolerates null
+ },
+
+ lead_image_url: {
+ selectors: [['meta[name="og:image"]', 'value']] // selector/attribute pair — presumably reads the `value` attribute of the matched tag; TODO confirm
+ },
+
+ content: {
+ selectors: ['.entry-body'],
+
+ // Maps a selector to a tag name: image wrapper divs -> figure, their <i> children -> figcaption.
+ // NOTE(review): presumably each matching node is converted to the named tag — confirm against the transform code.
+ transforms: {
+ 'div.image-none': 'figure',
+ '.image-none i': 'figcaption',
+ 'div.image-left': 'figure',
+ '.image-left i': 'figcaption',
+ 'div.image-right': 'figure',
+ '.image-right i': 'figcaption'
+ },
+
+ // The clean selectors remove anything that matches from the
+ // result: here, <br> tags inside the image wrappers and
+ // .galleryEase elements.
+ clean: ['.image-none br', '.image-left br', '.image-right br', '.galleryEase']
+ }
+};
+
var CustomExtractors = Object.freeze({
@@ -5386,7 +5443,8 @@ var CustomExtractors = Object.freeze({
FortuneComExtractor: FortuneComExtractor,
WwwLinkedinComExtractor: WwwLinkedinComExtractor,
ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
- WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor
+ WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
+ GothamistComExtractor: GothamistComExtractor
});
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
diff --git a/dist/mercury.js.map b/dist/mercury.js.map
index dbe3b92af..b0ba28575 100644
--- a/dist/mercury.js.map
+++ b/dist/mercury.js.map
@@ -1 +1 @@
-{"version":3,"file":null,"sources":["../src/utils/range.js","../src/utils/validate-url.js","../src/utils/errors.js","../src/utils/text/normalize-spaces.js","../src/utils/text/extract-from-url.js","../src/utils/text/constants.js","../src/utils/text/page-num-from-url.js","../src/utils/text/remove-anchor.js","../src/utils/text/article-base-url.js","../src/utils/text/has-sentence-end.js","../src/utils/text/excerpt-content.js","../src/utils/text/get-encoding.js","../src/resource/utils/constants.js","../src/resource/utils/fetch-resource.js","../src/resource/utils/dom/normalize-meta-tags.js","../src/utils/dom/constants.js","../src/utils/dom/strip-unlikely-candidates.js","../src/utils/dom/brs-to-ps.js","../src/utils/dom/paragraphize.js","../src/utils/dom/convert-to-paragraphs.js","../src/utils/dom/convert-node-to.js","../src/utils/dom/clean-images.js","../src/utils/dom/mark-to-keep.js","../src/utils/dom/strip-junk-tags.js","../src/utils/dom/clean-h-ones.js","../src/utils/dom/clean-attributes.js","../src/utils/dom/remove-empty.js","../src/extractors/generic/content/scoring/constants.js","../src/extractors/generic/content/scoring/get-weight.js","../src/extractors/generic/content/scoring/get-score.js","../src/extractors/generic/content/scoring/score-commas.js","../src/extractors/generic/content/scoring/score-length.js","../src/extractors/generic/content/scoring/score-paragraph.js","../src/extractors/generic/content/scoring/set-score.js","../src/extractors/generic/content/scoring/add-score.js","../src/extractors/generic/content/scoring/add-to-parent.js","../src/extractors/generic/content/scoring/get-or-init-score.js","../src/extractors/generic/content/scoring/score-node.js","../src/extractors/generic/content/scoring/score-content.js","../src/extractors/generic/content/scoring/merge-siblings.js","../src/extractors/generic/content/scoring/find-top-candidate.js","../src/extractors/generic/content/scoring/index.js","../src/utils/dom/clean-tags.js","../src/utils/dom/clean-headers.j
s","../src/utils/dom/rewrite-top-level.js","../src/utils/dom/make-links-absolute.js","../src/utils/dom/link-density.js","../src/utils/dom/extract-from-meta.js","../src/utils/dom/extract-from-selectors.js","../src/utils/dom/strip-tags.js","../src/utils/dom/within-comment.js","../src/utils/dom/node-is-sufficient.js","../src/utils/dom/is-wordpress.js","../src/utils/dom/get-attrs.js","../src/utils/dom/set-attr.js","../src/utils/dom/set-attrs.js","../src/utils/dom/index.js","../src/resource/utils/dom/constants.js","../src/resource/utils/dom/convert-lazy-loaded-images.js","../src/resource/utils/dom/clean.js","../src/resource/index.js","../src/utils/merge-supported-domains.js","../src/extractors/custom/blogspot.com/index.js","../src/extractors/custom/nymag.com/index.js","../src/extractors/custom/wikipedia.org/index.js","../src/extractors/custom/twitter.com/index.js","../src/extractors/custom/www.nytimes.com/index.js","../src/extractors/custom/www.theatlantic.com/index.js","../src/extractors/custom/www.newyorker.com/index.js","../src/extractors/custom/www.wired.com/index.js","../src/extractors/custom/www.msn.com/index.js","../src/extractors/custom/www.yahoo.com/index.js","../src/extractors/custom/www.buzzfeed.com/index.js","../src/extractors/custom/fandom.wikia.com/index.js","../src/extractors/custom/www.littlethings.com/index.js","../src/extractors/custom/www.politico.com/index.js","../src/extractors/custom/deadspin.com/index.js","../src/extractors/custom/www.broadwayworld.com/index.js","../src/extractors/custom/www.apartmenttherapy.com/index.js","../src/extractors/custom/medium.com/index.js","../src/extractors/custom/www.tmz.com/index.js","../src/extractors/custom/www.washingtonpost.com/index.js","../src/extractors/custom/www.huffingtonpost.com/index.js","../src/extractors/custom/newrepublic.com/index.js","../src/extractors/custom/money.cnn.com/index.js","../src/extractors/custom/www.theverge.com/index.js","../src/extractors/custom/www.cnn.com/index.js","../src/extractors
/custom/www.aol.com/index.js","../src/extractors/custom/www.youtube.com/index.js","../src/extractors/custom/www.theguardian.com/index.js","../src/extractors/custom/www.sbnation.com/index.js","../src/extractors/custom/www.bloomberg.com/index.js","../src/extractors/custom/www.bustle.com/index.js","../src/extractors/custom/www.npr.org/index.js","../src/extractors/custom/www.recode.net/index.js","../src/extractors/custom/qz.com/index.js","../src/extractors/custom/www.dmagazine.com/index.js","../src/extractors/custom/www.reuters.com/index.js","../src/extractors/custom/mashable.com/index.js","../src/extractors/custom/www.chicagotribune.com/index.js","../src/extractors/custom/www.vox.com/index.js","../src/extractors/custom/news.nationalgeographic.com/index.js","../src/extractors/custom/www.nationalgeographic.com/index.js","../src/extractors/custom/www.latimes.com/index.js","../src/extractors/custom/pagesix.com/index.js","../src/extractors/custom/thefederalistpapers.org/index.js","../src/extractors/custom/www.cbssports.com/index.js","../src/extractors/custom/www.msnbc.com/index.js","../src/extractors/custom/www.thepoliticalinsider.com/index.js","../src/extractors/custom/www.mentalfloss.com/index.js","../src/extractors/custom/abcnews.go.com/index.js","../src/extractors/custom/www.nydailynews.com/index.js","../src/extractors/custom/www.cnbc.com/index.js","../src/extractors/custom/www.popsugar.com/index.js","../src/extractors/custom/observer.com/index.js","../src/extractors/custom/people.com/index.js","../src/extractors/custom/www.usmagazine.com/index.js","../src/extractors/custom/www.rollingstone.com/index.js","../src/extractors/custom/247sports.com/index.js","../src/extractors/custom/uproxx.com/index.js","../src/extractors/custom/www.eonline.com/index.js","../src/extractors/custom/www.miamiherald.com/index.js","../src/extractors/custom/www.refinery29.com/index.js","../src/extractors/custom/www.macrumors.com/index.js","../src/extractors/custom/www.androidcentral.com/index.js"
,"../src/extractors/custom/www.si.com/index.js","../src/extractors/custom/www.rawstory.com/index.js","../src/extractors/custom/www.cnet.com/index.js","../src/extractors/custom/www.cinemablend.com/index.js","../src/extractors/custom/www.today.com/index.js","../src/extractors/custom/www.howtogeek.com/index.js","../src/extractors/custom/www.al.com/index.js","../src/extractors/custom/www.thepennyhoarder.com/index.js","../src/extractors/custom/www.westernjournalism.com/index.js","../src/extractors/custom/fusion.net/index.js","../src/extractors/custom/www.americanow.com/index.js","../src/extractors/custom/sciencefly.com/index.js","../src/extractors/custom/hellogiggles.com/index.js","../src/extractors/custom/thoughtcatalog.com/index.js","../src/extractors/custom/www.nj.com/index.js","../src/extractors/custom/www.inquisitr.com/index.js","../src/extractors/custom/www.nbcnews.com/index.js","../src/extractors/custom/fortune.com/index.js","../src/extractors/custom/www.linkedin.com/index.js","../src/extractors/custom/obamawhitehouse.archives.gov/index.js","../src/extractors/custom/www.opposingviews.com/index.js","../src/extractors/all.js","../src/cleaners/constants.js","../src/cleaners/author.js","../src/cleaners/lead-image-url.js","../src/cleaners/dek.js","../src/cleaners/date-published.js","../src/cleaners/content.js","../src/cleaners/title.js","../src/cleaners/resolve-split-title.js","../src/cleaners/index.js","../src/extractors/generic/content/extract-best-node.js","../src/extractors/generic/content/extractor.js","../src/extractors/generic/title/constants.js","../src/extractors/generic/title/extractor.js","../src/extractors/generic/author/constants.js","../src/extractors/generic/author/extractor.js","../src/extractors/generic/date-published/constants.js","../src/extractors/generic/date-published/extractor.js","../src/extractors/generic/dek/extractor.js","../src/extractors/generic/lead-image-url/constants.js","../src/extractors/generic/lead-image-url/score-image.js","../src/e
xtractors/generic/lead-image-url/extractor.js","../src/extractors/generic/next-page-url/scoring/utils/score-similarity.js","../src/extractors/generic/next-page-url/scoring/utils/score-link-text.js","../src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.js","../src/extractors/generic/next-page-url/scoring/constants.js","../src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.js","../src/extractors/generic/next-page-url/scoring/utils/score-by-parents.js","../src/extractors/generic/next-page-url/scoring/utils/score-prev-link.js","../src/extractors/generic/next-page-url/scoring/utils/should-score.js","../src/extractors/generic/next-page-url/scoring/utils/score-base-url.js","../src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.js","../src/extractors/generic/next-page-url/scoring/utils/score-cap-links.js","../src/extractors/generic/next-page-url/scoring/score-links.js","../src/extractors/generic/next-page-url/extractor.js","../src/extractors/generic/url/constants.js","../src/extractors/generic/url/extractor.js","../src/extractors/generic/excerpt/constants.js","../src/extractors/generic/excerpt/extractor.js","../src/extractors/generic/word-count/extractor.js","../src/extractors/generic/index.js","../src/extractors/detect-by-html.js","../src/extractors/get-extractor.js","../src/extractors/root-extractor.js","../src/extractors/collect-all-pages.js","../src/mercury.js"],"sourcesContent":["export default function* range(start = 1, end = 1) {\n while (start <= end) {\n yield start += 1;\n }\n}\n","// extremely simple url validation as a first step\nexport default function validateUrl({ hostname }) {\n // If this isn't a valid url, return an error message\n return !!hostname;\n}\n","const Errors = {\n badUrl: {\n error: true,\n messages: 'The url parameter passed does not look like a valid URL. 
Please check your data and try again.',\n },\n};\n\nexport default Errors;\n","const NORMALIZE_RE = /\\s{2,}/g;\n\nexport default function normalizeSpaces(text) {\n return text.replace(NORMALIZE_RE, ' ').trim();\n}\n","// Given a node type to search for, and a list of regular expressions,\n// look to see if this extraction can be found in the URL. Expects\n// that each expression in r_list will return group(1) as the proper\n// string to be cleaned.\n// Only used for date_published currently.\nexport default function extractFromUrl(url, regexList) {\n const matchRe = regexList.find(re => re.test(url));\n if (matchRe) {\n return matchRe.exec(url)[1];\n }\n\n return null;\n}\n","// An expression that looks to try to find the page digit within a URL, if\n// it exists.\n// Matches:\n// page=1\n// pg=1\n// p=1\n// paging=12\n// pag=7\n// pagination/1\n// paging/88\n// pa/83\n// p/11\n//\n// Does not match:\n// pg=102\n// page:2\nexport const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');\n\nexport const HAS_ALPHA_RE = /[a-z]/i;\n\nexport const IS_ALPHA_RE = /^[a-z]+$/i;\nexport const IS_DIGIT_RE = /^[0-9]+$/i;\n\nexport const ENCODING_RE = /charset=([\\w-]+)\\b/;\n","import { PAGE_IN_HREF_RE } from './constants';\n\nexport default function pageNumFromUrl(url) {\n const matches = url.match(PAGE_IN_HREF_RE);\n if (!matches) return null;\n\n const pageNum = parseInt(matches[6], 10);\n\n // Return pageNum < 100, otherwise\n // return null\n return pageNum < 100 ? pageNum : null;\n}\n","export default function removeAnchor(url) {\n return url.split('#')[0].replace(/\\/$/, '');\n}\n","import URL from 'url';\n\nimport {\n HAS_ALPHA_RE,\n IS_ALPHA_RE,\n IS_DIGIT_RE,\n PAGE_IN_HREF_RE,\n} from './constants';\n\nfunction isGoodSegment(segment, index, firstSegmentHasLetters) {\n let goodSegment = true;\n\n // If this is purely a number, and it's the first or second\n // url_segment, it's probably a page number. 
Remove it.\n if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {\n goodSegment = true;\n }\n\n // If this is the first url_segment and it's just \"index\",\n // remove it\n if (index === 0 && segment.toLowerCase() === 'index') {\n goodSegment = false;\n }\n\n // If our first or second url_segment is smaller than 3 characters,\n // and the first url_segment had no alphas, remove it.\n if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {\n goodSegment = false;\n }\n\n return goodSegment;\n}\n\n// Take a URL, and return the article base of said URL. That is, no\n// pagination data exists in it. Useful for comparing to other links\n// that might have pagination data within them.\nexport default function articleBaseUrl(url, parsed) {\n const parsedUrl = parsed || URL.parse(url);\n const { protocol, host, path } = parsedUrl;\n\n let firstSegmentHasLetters = false;\n const cleanedSegments = path.split('/')\n .reverse()\n .reduce((acc, rawSegment, index) => {\n let segment = rawSegment;\n\n // Split off and save anything that looks like a file type.\n if (segment.includes('.')) {\n const [possibleSegment, fileExt] = segment.split('.');\n if (IS_ALPHA_RE.test(fileExt)) {\n segment = possibleSegment;\n }\n }\n\n // If our first or second segment has anything looking like a page\n // number, remove it.\n if (PAGE_IN_HREF_RE.test(segment) && index < 2) {\n segment = segment.replace(PAGE_IN_HREF_RE, '');\n }\n\n // If we're on the first segment, check to see if we have any\n // characters in it. 
The first segment is actually the last bit of\n // the URL, and this will be helpful to determine if we're on a URL\n // segment that looks like \"/2/\" for example.\n if (index === 0) {\n firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);\n }\n\n // If it's not marked for deletion, push it to cleaned_segments.\n if (isGoodSegment(segment, index, firstSegmentHasLetters)) {\n acc.push(segment);\n }\n\n return acc;\n }, []);\n\n return `${protocol}//${host}${cleanedSegments.reverse().join('/')}`;\n}\n","// Given a string, return True if it appears to have an ending sentence\n// within it, false otherwise.\nconst SENTENCE_END_RE = new RegExp('.( |$)');\nexport default function hasSentenceEnd(text) {\n return SENTENCE_END_RE.test(text);\n}\n","export default function excerptContent(content, words = 10) {\n return content.trim()\n .split(/\\s+/)\n .slice(0, words)\n .join(' ');\n}\n","import { ENCODING_RE } from './constants';\n\n// check a string for encoding; this is\n// used in our fetchResource function to\n// ensure correctly encoded responses\nexport default function getEncoding(str) {\n if (ENCODING_RE.test(str)) {\n return ENCODING_RE.exec(str)[1];\n }\n\n return null;\n}\n","import cheerio from 'cheerio';\n\n// Browser does not like us setting user agent\nexport const REQUEST_HEADERS = cheerio.browser ? {} : {\n 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',\n};\n\n// The number of milliseconds to attempt to fetch a resource before timing out.\nexport const FETCH_TIMEOUT = 10000;\n\n// Content types that we do not extract content from\nconst BAD_CONTENT_TYPES = [\n 'audio/mpeg',\n 'image/gif',\n 'image/jpeg',\n 'image/jpg',\n];\n\nexport const BAD_CONTENT_TYPES_RE = new RegExp(`^(${BAD_CONTENT_TYPES.join('|')})$`, 'i');\n\n// Use this setting as the maximum size an article can be\n// for us to attempt parsing. 
Defaults to 5 MB.\nexport const MAX_CONTENT_LENGTH = 5242880;\n\n// Turn the global proxy on or off\n// Proxying is not currently enabled in Python source\n// so not implementing logic in port.\nexport const PROXY_DOMAINS = false;\nexport const REQUESTS_PROXIES = {\n http: 'http://38.98.105.139:33333',\n https: 'http://38.98.105.139:33333',\n};\n\nexport const DOMAINS_TO_PROXY = [\n 'nih.gov',\n 'gutenberg.org',\n];\n","import URL from 'url';\nimport request from 'request';\nimport iconv from 'iconv-lite';\nimport cheerio from 'cheerio';\nimport { Errors } from 'utils';\nimport { getEncoding } from 'utils/text';\n\nimport {\n REQUEST_HEADERS,\n FETCH_TIMEOUT,\n BAD_CONTENT_TYPES_RE,\n MAX_CONTENT_LENGTH,\n} from './constants';\n\nfunction get(options) {\n return new Promise((resolve, reject) => {\n request(options, (err, response, body) => {\n if (err) {\n reject(err);\n } else {\n const encoding = getEncoding(response.headers['content-type']);\n\n if (iconv.encodingExists(encoding)) {\n body = iconv.decode(body, encoding);\n }\n\n if (typeof body !== 'string') {\n const $ = cheerio.load(iconv.decode(body, 'utf8'));\n const contentType = $('meta[http-equiv=content-type]').attr('content');\n const properEncoding = getEncoding(contentType);\n if (iconv.encodingExists(properEncoding)) {\n body = iconv.decode(body, properEncoding);\n }\n }\n\n resolve({ body, response });\n }\n });\n });\n}\n\n// Evaluate a response to ensure it's something we should be keeping.\n// This does not validate in the sense of a response being 200 level or\n// not. 
Validation here means that we haven't found reason to bail from\n// further processing of this url.\n\nexport function validateResponse(response, parseNon2xx = false) {\n // Check if we got a valid status code\n // This isn't great, but I'm requiring a statusMessage to be set\n // before short circuiting b/c nock doesn't set it in tests\n // statusMessage only not set in nock response, in which case\n // I check statusCode, which is currently only 200 for OK responses\n // in tests\n if (\n (response.statusMessage && response.statusMessage !== 'OK') ||\n response.statusCode !== 200\n ) {\n if (!response.statusCode) {\n throw new Error(\n `Unable to fetch content. Original exception was ${response.error}`\n );\n } else if (!parseNon2xx) {\n throw new Error(\n `Resource returned a response status code of ${response.statusCode} and resource was instructed to reject non-2xx level status codes.`\n );\n }\n }\n\n const {\n 'content-type': contentType,\n 'content-length': contentLength,\n } = response.headers;\n\n // Check that the content is not in BAD_CONTENT_TYPES\n if (BAD_CONTENT_TYPES_RE.test(contentType)) {\n throw new Error(\n `Content-type for this resource was ${contentType} and is not allowed.`\n );\n }\n\n // Check that the content length is below maximum\n if (contentLength > MAX_CONTENT_LENGTH) {\n throw new Error(\n `Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.`\n );\n }\n\n return true;\n}\n\n// Grabs the last two pieces of the URL and joins them back together\n// This is to get the 'livejournal.com' from 'erotictrains.livejournal.com'\nexport function baseDomain({ host }) {\n return host.split('.').slice(-2).join('.');\n}\n\n// Set our response attribute to the result of fetching our URL.\n// TODO: This should gracefully handle timeouts and raise the\n// proper exceptions on the many failure cases of HTTP.\n// TODO: Ensure we are not fetching something enormous. 
Always return\n// unicode content for HTML, with charset conversion.\n\nexport default async function fetchResource(url, parsedUrl) {\n parsedUrl = parsedUrl || URL.parse(encodeURI(url));\n\n const options = {\n url: parsedUrl.href,\n headers: { ...REQUEST_HEADERS },\n timeout: FETCH_TIMEOUT,\n // Don't set encoding; fixes issues\n // w/gzipped responses\n encoding: null,\n // Accept cookies\n jar: true,\n // Accept and decode gzip\n gzip: true,\n // Follow any redirect\n followAllRedirects: true,\n };\n\n const { response, body } = await get(options);\n\n try {\n validateResponse(response);\n return {\n body,\n response,\n };\n } catch (e) {\n return Errors.badUrl;\n }\n}\n","function convertMetaProp($, from, to) {\n $(`meta[${from}]`).each((_, node) => {\n const $node = $(node);\n\n const value = $node.attr(from);\n $node.attr(to, value);\n $node.removeAttr(from);\n });\n\n return $;\n}\n\n// For ease of use in extracting from meta tags,\n// replace the \"content\" attribute on meta tags with the\n// \"value\" attribute.\n//\n// In addition, normalize 'property' attributes to 'name' for ease of\n// querying later. 
See, e.g., og or twitter meta tags.\n\nexport default function normalizeMetaTags($) {\n $ = convertMetaProp($, 'content', 'value');\n $ = convertMetaProp($, 'property', 'name');\n return $;\n}\n","// Spacer images to be removed\nexport const SPACER_RE = new RegExp('transparent|spacer|blank', 'i');\n\n// The class we will use to mark elements we want to keep\n// but would normally remove\nexport const KEEP_CLASS = 'mercury-parser-keep';\n\nexport const KEEP_SELECTORS = [\n 'iframe[src^=\"https://www.youtube.com\"]',\n 'iframe[src^=\"http://www.youtube.com\"]',\n 'iframe[src^=\"https://player.vimeo\"]',\n 'iframe[src^=\"http://player.vimeo\"]',\n];\n\n// A list of tags to strip from the output if we encounter them.\nexport const STRIP_OUTPUT_TAGS = [\n 'title',\n 'script',\n 'noscript',\n 'link',\n 'style',\n 'hr',\n 'embed',\n 'iframe',\n 'object',\n];\n\n// cleanAttributes\nexport const REMOVE_ATTRS = ['style', 'align'];\nexport const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`);\nexport const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');\nexport const WHITELIST_ATTRS = [\n 'src',\n 'srcset',\n 'href',\n 'class',\n 'id',\n 'alt',\n 'xlink:href',\n 'width',\n 'height',\n];\n\nexport const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i');\n\n// removeEmpty\nexport const REMOVE_EMPTY_TAGS = ['p'];\nexport const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(tag => `${tag}:empty`).join(',');\n\n// cleanTags\nexport const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(',');\n\n// cleanHeaders\nconst HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];\nexport const HEADER_TAG_LIST = HEADER_TAGS.join(',');\n\n// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. 
These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n // 'form', // This is too generic, has too many false positives\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'outbrain',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'taboola',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a
to NOT\n// be turned into a paragraph tag. Shallow div tags without these elements\n// should be turned into tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE =\n new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i');\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = [\n 'figure',\n 'photo',\n 'image',\n 'caption',\n];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');\n\n// XPath to try to determine if a page is wordpress. 
Not always successful.\nexport const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]';\n\n// Match a digit. Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// A list of words that, if found in link text or URLs, likely mean that\n// this link is not a next page link.\nexport const EXTRANEOUS_LINK_HINTS = [\n 'print',\n 'archive',\n 'comment',\n 'discuss',\n 'e-mail',\n 'email',\n 'share',\n 'reply',\n 'all',\n 'login',\n 'sign',\n 'single',\n 'adx',\n 'entry-unrelated',\n];\nexport const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i');\n\n// Match any phrase that looks like it could be page, or paging, or pagination\nexport const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');\n\n// Match any link text/classname/id that looks like it could mean the next\n// page. Things like: next, continue, >, >>, » but not >|, »| as those can\n// mean last page.\n// export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))', 'i');\nexport const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i;\n\n// Match any link text/classname/id that looks like it is an end link: things\n// like \"first\", \"last\", \"end\", etc.\nexport const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');\n\n// Match any link text/classname/id that looks like it means the previous\n// page.\nexport const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');\n\n// Match 2 or more consecutive tags\nexport const BR_TAGS_RE = new RegExp('( ]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp(' ]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. 
Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i');\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(`!(${candidatesWhitelist})|(${candidatesBlacklist})`, 'i');\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","import {\n CANDIDATES_WHITELIST,\n CANDIDATES_BLACKLIST,\n} from './constants';\n\nexport default function stripUnlikelyCandidates($) {\n // Loop through the provided document and remove any non-link nodes\n // that are unlikely candidates for article content.\n //\n // Links are ignored because there are very often links to content\n // that are identified as non-body-content, but may 
be inside\n // article-like content.\n //\n // :param $: a cheerio object to strip nodes from\n // :return $: the cleaned cheerio object\n $('*').not('a').each((index, node) => {\n const $node = $(node);\n const classes = $node.attr('class');\n const id = $node.attr('id');\n if (!id && !classes) return;\n\n const classAndId = `${classes || ''} ${id || ''}`;\n if (CANDIDATES_WHITELIST.test(classAndId)) {\n return;\n } else if (CANDIDATES_BLACKLIST.test(classAndId)) {\n $node.remove();\n }\n });\n\n return $;\n}\n","import { paragraphize } from './index';\n\n// ## NOTES:\n// Another good candidate for refactoring/optimizing.\n// Very imperative code, I don't love it. - AP\n\n// Given cheerio object, convert consecutive tags into\n// tags instead.\n//\n// :param $: A cheerio object\n\nexport default function brsToPs($) {\n let collapsing = false;\n $('br').each((index, element) => {\n const $element = $(element);\n const nextElement = $element.next().get(0);\n\n if (nextElement && nextElement.tagName.toLowerCase() === 'br') {\n collapsing = true;\n $element.remove();\n } else if (collapsing) {\n collapsing = false;\n // $(element).replaceWith('')\n paragraphize(element, $, true);\n }\n });\n\n return $;\n}\n","import { BLOCK_LEVEL_TAGS_RE } from './constants';\n\n// Given a node, turn it into a P if it is not already a P, and\n// make sure it conforms to the constraints of a P tag (I.E. 
does\n// not contain any other block tags.)\n//\n// If the node is a , it treats the following inline siblings\n// as if they were its children.\n//\n// :param node: The node to paragraphize; this is a raw node\n// :param $: The cheerio object to handle dom manipulation\n// :param br: Whether or not the passed node is a br\n\nexport default function paragraphize(node, $, br = false) {\n const $node = $(node);\n\n if (br) {\n let sibling = node.nextSibling;\n const p = $('');\n\n // while the next node is text or not a block level element\n // append it to a new p node\n while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {\n const nextSibling = sibling.nextSibling;\n $(sibling).appendTo(p);\n sibling = nextSibling;\n }\n\n $node.replaceWith(p);\n $node.remove();\n return $;\n }\n\n return $;\n}\n","import { brsToPs, convertNodeTo } from 'utils/dom';\n\nimport { DIV_TO_P_BLOCK_TAGS } from './constants';\n\nfunction convertDivs($) {\n $('div').each((index, div) => {\n const $div = $(div);\n const convertable = $div.children(DIV_TO_P_BLOCK_TAGS).length === 0;\n\n if (convertable) {\n convertNodeTo($div, $, 'p');\n }\n });\n\n return $;\n}\n\nfunction convertSpans($) {\n $('span').each((index, span) => {\n const $span = $(span);\n const convertable = $span.parents('p, div').length === 0;\n if (convertable) {\n convertNodeTo($span, $, 'p');\n }\n });\n\n return $;\n}\n\n// Loop through the provided doc, and convert any p-like elements to\n// actual paragraph tags.\n//\n// Things fitting this criteria:\n// * Multiple consecutive tags.\n// * tags without block level elements inside of them\n// * tags who are not children of or tags.\n//\n// :param $: A cheerio object to search\n// :return cheerio object with new p elements\n// (By-reference mutation, though. 
Returned just for convenience.)\n\nexport default function convertToParagraphs($) {\n $ = brsToPs($);\n $ = convertDivs($);\n $ = convertSpans($);\n\n return $;\n}\n","import { getAttrs } from 'utils/dom';\n\nexport default function convertNodeTo($node, $, tag = 'p') {\n const node = $node.get(0);\n if (!node) {\n return $;\n }\n const attrs = getAttrs(node) || {};\n // console.log(attrs)\n\n const attribString = Reflect.ownKeys(attrs)\n .map(key => `${key}=${attrs[key]}`)\n .join(' ');\n let html;\n\n if ($.browser) {\n // In the browser, the contents of noscript tags aren't rendered, therefore\n // transforms on the noscript tag (commonly used for lazy-loading) don't work\n // as expected. This test case handles that\n html = node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();\n } else {\n html = $node.contents();\n }\n $node.replaceWith(\n `<${tag} ${attribString}>${html}${tag}>`\n );\n return $;\n}\n","import { SPACER_RE } from './constants';\n\nfunction cleanForHeight($img, $) {\n const height = parseInt($img.attr('height'), 10);\n const width = parseInt($img.attr('width'), 10) || 20;\n\n // Remove images that explicitly have very small heights or\n // widths, because they are most likely shims or icons,\n // which aren't very useful for reading.\n if ((height || 20) < 10 || width < 10) {\n $img.remove();\n } else if (height) {\n // Don't ever specify a height on images, so that we can\n // scale with respect to width without screwing up the\n // aspect ratio.\n $img.removeAttr('height');\n }\n\n return $;\n}\n\n// Cleans out images where the source string matches transparent/spacer/etc\n// TODO This seems very aggressive - AP\nfunction removeSpacers($img, $) {\n if (SPACER_RE.test($img.attr('src'))) {\n $img.remove();\n }\n\n return $;\n}\n\nexport default function cleanImages($article, $) {\n $article.find('img').each((index, img) => {\n const $img = $(img);\n\n cleanForHeight($img, $);\n removeSpacers($img, $);\n });\n\n return 
$;\n}\n","import URL from 'url';\n\nimport {\n KEEP_SELECTORS,\n KEEP_CLASS,\n} from './constants';\n\nexport default function markToKeep(article, $, url, tags = []) {\n if (tags.length === 0) {\n tags = KEEP_SELECTORS;\n }\n\n if (url) {\n const { protocol, hostname } = URL.parse(url);\n tags = [...tags, `iframe[src^=\"${protocol}//${hostname}\"]`];\n }\n\n $(tags.join(','), article).addClass(KEEP_CLASS);\n\n return $;\n}\n","import {\n STRIP_OUTPUT_TAGS,\n KEEP_CLASS,\n} from './constants';\n\nexport default function stripJunkTags(article, $, tags = []) {\n if (tags.length === 0) {\n tags = STRIP_OUTPUT_TAGS;\n }\n\n // Remove matching elements, but ignore\n // any element with a class of mercury-parser-keep\n $(tags.join(','), article).not(`.${KEEP_CLASS}`).remove();\n\n // Remove the mercury-parser-keep class from result\n $(`.${KEEP_CLASS}`, article).removeClass(KEEP_CLASS);\n\n return $;\n}\n","import { convertNodeTo } from 'utils/dom';\n\n// H1 tags are typically the article title, which should be extracted\n// by the title extractor instead. If there's less than 3 of them (<3),\n// strip them. 
Otherwise, turn 'em into H2s.\nexport default function cleanHOnes(article, $) {\n const $hOnes = $('h1', article);\n\n if ($hOnes.length < 3) {\n $hOnes.each((index, node) => $(node).remove());\n } else {\n $hOnes.each((index, node) => {\n convertNodeTo($(node), $, 'h2');\n });\n }\n\n return $;\n}\n","import {\n getAttrs,\n setAttrs,\n} from 'utils/dom';\n\nimport { WHITELIST_ATTRS_RE } from './constants';\n\nfunction removeAllButWhitelist($article) {\n $article.find('*').each((index, node) => {\n const attrs = getAttrs(node);\n\n setAttrs(node, Reflect.ownKeys(attrs).reduce((acc, attr) => {\n if (WHITELIST_ATTRS_RE.test(attr)) {\n return { ...acc, [attr]: attrs[attr] };\n }\n\n return acc;\n }, {}));\n });\n\n return $article;\n}\n\n// function removeAttrs(article, $) {\n// REMOVE_ATTRS.forEach((attr) => {\n// $(`[${attr}]`, article).removeAttr(attr);\n// });\n// }\n\n// Remove attributes like style or align\nexport default function cleanAttributes($article) {\n // Grabbing the parent because at this point\n // $article will be wrapped in a div which will\n // have a score set on it.\n return removeAllButWhitelist(\n $article.parent().length ?\n $article.parent() : $article\n );\n}\n","export default function removeEmpty($article, $) {\n $article.find('p').each((index, p) => {\n const $p = $(p);\n if ($p.find('iframe, img').length === 0 && $p.text().trim() === '') $p.remove();\n });\n\n return $;\n}\n","// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. 
These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n 'form',\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a to NOT\n// be turned into a paragraph tag. 
Shallow div tags without these elements\n// should be turned into tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE =\n new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i');\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = [\n 'figure',\n 'photo',\n 'image',\n 'caption',\n];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');\n\n// Match a digit. 
Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// Match 2 or more consecutive tags\nexport const BR_TAGS_RE = new RegExp('( ]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp(' ]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i');\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. 
We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(`!(${candidatesWhitelist})|(${candidatesBlacklist})`, 'i');\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","import {\n NEGATIVE_SCORE_RE,\n POSITIVE_SCORE_RE,\n PHOTO_HINTS_RE,\n READABILITY_ASSET,\n} from './constants';\n\n// Get the score of a node based on its className and id.\nexport default function getWeight(node) {\n const classes = node.attr('class');\n const id = node.attr('id');\n let score = 0;\n\n if (id) {\n // if id exists, try to score on both positive and negative\n if (POSITIVE_SCORE_RE.test(id)) {\n score += 25;\n }\n if (NEGATIVE_SCORE_RE.test(id)) {\n score -= 25;\n }\n }\n\n if (classes) {\n if (score === 0) {\n // if classes exist and id did not contribute to score\n // try to score on both positive and negative\n if (POSITIVE_SCORE_RE.test(classes)) {\n score += 25;\n }\n if (NEGATIVE_SCORE_RE.test(classes)) {\n score -= 25;\n }\n }\n\n // even if score has been set by id, add score for\n // possible photo matches\n // \"try to keep photos if we can\"\n if (PHOTO_HINTS_RE.test(classes)) {\n score += 10;\n }\n\n // add 25 if class matches entry-content-asset,\n // a class apparently instructed for use in the\n // Readability publisher guidelines\n // https://www.readability.com/developers/guidelines\n if (READABILITY_ASSET.test(classes)) 
{\n score += 25;\n }\n }\n\n return score;\n}\n","// returns the score of a node based on\n// the node's score attribute\n// returns null if no score set\nexport default function getScore($node) {\n return parseFloat($node.attr('score')) || null;\n}\n","// return 1 for every comma in text\nexport default function scoreCommas(text) {\n return (text.match(/,/g) || []).length;\n}\n","const idkRe = new RegExp('^(p|pre)$', 'i');\n\nexport default function scoreLength(textLength, tagName = 'p') {\n const chunks = textLength / 50;\n\n if (chunks > 0) {\n let lengthBonus;\n\n // No idea why p or pre are being tamped down here\n // but just following the source for now\n // Not even sure why tagName is included here,\n // since this is only being called from the context\n // of scoreParagraph\n if (idkRe.test(tagName)) {\n lengthBonus = chunks - 2;\n } else {\n lengthBonus = chunks - 1.25;\n }\n\n return Math.min(Math.max(lengthBonus, 0), 3);\n }\n\n return 0;\n}\n","import {\n scoreCommas,\n scoreLength,\n} from './index';\n\n// Score a paragraph using various methods. Things like number of\n// commas, etc. Higher is better.\nexport default function scoreParagraph(node) {\n let score = 1;\n const text = node.text().trim();\n const textLength = text.length;\n\n // If this paragraph is less than 25 characters, don't count it.\n if (textLength < 25) {\n return 0;\n }\n\n // Add points for any commas within this paragraph\n score += scoreCommas(text);\n\n // For every 50 characters in this paragraph, add another point. Up\n // to 3 points.\n score += scoreLength(textLength);\n\n // Articles can end with short paragraphs when people are being clever\n // but they can also end with short paragraphs setting up lists of junk\n // that we strip. 
This negative tweaks junk setup paragraphs just below\n // the cutoff threshold.\n if (text.slice(-1) === ':') {\n score -= 1;\n }\n\n return score;\n}\n","export default function setScore($node, $, score) {\n $node.attr('score', score);\n return $node;\n}\n","import {\n getOrInitScore,\n setScore,\n} from './index';\n\nexport default function addScore($node, $, amount) {\n try {\n const score = getOrInitScore($node, $) + amount;\n setScore($node, $, score);\n } catch (e) {\n // Ignoring; error occurs in scoreNode\n }\n\n return $node;\n}\n","import { addScore } from './index';\n\n// Adds 1/4 of a child's score to its parent\nexport default function addToParent(node, $, score) {\n const parent = node.parent();\n if (parent) {\n addScore(parent, $, score * 0.25);\n }\n\n return node;\n}\n","import {\n getScore,\n scoreNode,\n getWeight,\n addToParent,\n} from './index';\n\n// gets and returns the score if it exists\n// if not, initializes a score based on\n// the node's tag type\nexport default function getOrInitScore($node, $, weightNodes = true) {\n let score = getScore($node);\n\n if (score) {\n return score;\n }\n\n score = scoreNode($node);\n\n if (weightNodes) {\n score += getWeight($node);\n }\n\n addToParent($node, $, score);\n\n return score;\n}\n","import { scoreParagraph } from './index';\nimport {\n PARAGRAPH_SCORE_TAGS,\n CHILD_CONTENT_TAGS,\n BAD_TAGS,\n} from './constants';\n\n// Score an individual node. 
Has some smarts for paragraphs, otherwise\n// just scores based on tag.\nexport default function scoreNode($node) {\n const { tagName } = $node.get(0);\n\n // TODO: Consider ordering by most likely.\n // E.g., if divs are a more common tag on a page,\n // Could save doing that regex test on every node – AP\n if (PARAGRAPH_SCORE_TAGS.test(tagName)) {\n return scoreParagraph($node);\n } else if (tagName.toLowerCase() === 'div') {\n return 5;\n } else if (CHILD_CONTENT_TAGS.test(tagName)) {\n return 3;\n } else if (BAD_TAGS.test(tagName)) {\n return -3;\n } else if (tagName.toLowerCase() === 'th') {\n return -5;\n }\n\n return 0;\n}\n","import { convertNodeTo } from 'utils/dom';\n\nimport { HNEWS_CONTENT_SELECTORS } from './constants';\nimport {\n scoreNode,\n setScore,\n getOrInitScore,\n addScore,\n} from './index';\n\nfunction convertSpans($node, $) {\n if ($node.get(0)) {\n const { tagName } = $node.get(0);\n\n if (tagName === 'span') {\n // convert spans to divs\n convertNodeTo($node, $, 'div');\n }\n }\n}\n\nfunction addScoreTo($node, $, score) {\n if ($node) {\n convertSpans($node, $);\n addScore($node, $, score);\n }\n}\n\nfunction scorePs($, weightNodes) {\n $('p, pre').not('[score]').each((index, node) => {\n // The raw score for this paragraph, before we add any parent/child\n // scores.\n let $node = $(node);\n $node = setScore($node, $, getOrInitScore($node, $, weightNodes));\n\n const $parent = $node.parent();\n const rawScore = scoreNode($node);\n\n addScoreTo($parent, $, rawScore, weightNodes);\n if ($parent) {\n // Add half of the individual content score to the\n // grandparent\n addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);\n }\n });\n\n return $;\n}\n\n// score content. 
Parents get the full value of their children's\n// content score, grandparents half\nexport default function scoreContent($, weightNodes = true) {\n // First, look for special hNews based selectors and give them a big\n // boost, if they exist\n HNEWS_CONTENT_SELECTORS.forEach(([parentSelector, childSelector]) => {\n $(`${parentSelector} ${childSelector}`).each((index, node) => {\n addScore($(node).parent(parentSelector), $, 80);\n });\n });\n\n // Doubling this again\n // Previous solution caused a bug\n // in which parents weren't retaining\n // scores. This is not ideal, and\n // should be fixed.\n scorePs($, weightNodes);\n scorePs($, weightNodes);\n\n return $;\n}\n","import {\n textLength,\n linkDensity,\n} from 'utils/dom';\nimport { hasSentenceEnd } from 'utils/text';\n\nimport { NON_TOP_CANDIDATE_TAGS_RE } from './constants';\nimport { getScore } from './index';\n\n// Now that we have a top_candidate, look through the siblings of\n// it to see if any of them are decently scored. If they are, they\n// may be split parts of the content (Like two divs, a preamble and\n// a body.) 
Example:\n// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14\nexport default function mergeSiblings($candidate, topScore, $) {\n if (!$candidate.parent().length) {\n return $candidate;\n }\n\n const siblingScoreThreshold = Math.max(10, topScore * 0.25);\n const wrappingDiv = $('');\n\n $candidate.parent().children().each((index, sibling) => {\n const $sibling = $(sibling);\n // Ignore tags like BR, HR, etc\n if (NON_TOP_CANDIDATE_TAGS_RE.test(sibling.tagName)) {\n return null;\n }\n\n const siblingScore = getScore($sibling);\n if (siblingScore) {\n if ($sibling.get(0) === $candidate.get(0)) {\n wrappingDiv.append($sibling);\n } else {\n let contentBonus = 0;\n const density = linkDensity($sibling);\n\n // If sibling has a very low link density,\n // give it a small bonus\n if (density < 0.05) {\n contentBonus += 20;\n }\n\n // If sibling has a high link density,\n // give it a penalty\n if (density >= 0.5) {\n contentBonus -= 20;\n }\n\n // If sibling node has the same class as\n // candidate, give it a bonus\n if ($sibling.attr('class') === $candidate.attr('class')) {\n contentBonus += topScore * 0.2;\n }\n\n const newScore = siblingScore + contentBonus;\n\n if (newScore >= siblingScoreThreshold) {\n return wrappingDiv.append($sibling);\n } else if (sibling.tagName === 'p') {\n const siblingContent = $sibling.text();\n const siblingContentLength = textLength(siblingContent);\n\n if (siblingContentLength > 80 && density < 0.25) {\n return wrappingDiv.append($sibling);\n } else if (siblingContentLength <= 80 && density === 0 &&\n hasSentenceEnd(siblingContent)) {\n return wrappingDiv.append($sibling);\n }\n }\n }\n }\n\n return null;\n });\n\n if (wrappingDiv.children().length === 1 &&\n wrappingDiv.children().first().get(0) === $candidate.get(0)) {\n return $candidate;\n }\n\n return wrappingDiv;\n}\n","import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';\nimport { getScore } from './index';\nimport mergeSiblings from './merge-siblings';\n\n// 
After we've calculated scores, loop through all of the possible\n// candidate nodes we found and find the one with the highest score.\nexport default function findTopCandidate($) {\n let $candidate;\n let topScore = 0;\n\n $('[score]').each((index, node) => {\n // Ignore tags like BR, HR, etc\n if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {\n return;\n }\n\n const $node = $(node);\n const score = getScore($node);\n\n if (score > topScore) {\n topScore = score;\n $candidate = $node;\n }\n });\n\n // If we don't have a candidate, return the body\n // or whatever the first element is\n if (!$candidate) {\n return $('body') || $('*').first();\n }\n\n $candidate = mergeSiblings($candidate, topScore, $);\n\n return $candidate;\n}\n","// Scoring\nexport { default as getWeight } from './get-weight';\nexport { default as getScore } from './get-score';\nexport { default as scoreCommas } from './score-commas';\nexport { default as scoreLength } from './score-length';\nexport { default as scoreParagraph } from './score-paragraph';\nexport { default as setScore } from './set-score';\nexport { default as addScore } from './add-score';\nexport { default as addToParent } from './add-to-parent';\nexport { default as getOrInitScore } from './get-or-init-score';\nexport { default as scoreNode } from './score-node';\nexport { default as scoreContent } from './score-content';\nexport { default as findTopCandidate } from './find-top-candidate';\n","import {\n getScore,\n setScore,\n getOrInitScore,\n scoreCommas,\n} from 'extractors/generic/content/scoring';\n\nimport { CLEAN_CONDITIONALLY_TAGS } from './constants';\nimport { normalizeSpaces } from '../text';\nimport { linkDensity } from './index';\n\nfunction removeUnlessContent($node, $, weight) {\n // Explicitly save entry-content-asset tags, which are\n // noted as valuable in the Publisher guidelines. For now\n // this works everywhere. 
We may want to consider making\n // this less of a sure-thing later.\n if ($node.hasClass('entry-content-asset')) {\n return;\n }\n\n const content = normalizeSpaces($node.text());\n\n if (scoreCommas(content) < 10) {\n const pCount = $('p', $node).length;\n const inputCount = $('input', $node).length;\n\n // Looks like a form, too many inputs.\n if (inputCount > (pCount / 3)) {\n $node.remove();\n return;\n }\n\n const contentLength = content.length;\n const imgCount = $('img', $node).length;\n\n // Content is too short, and there are no images, so\n // this is probably junk content.\n if (contentLength < 25 && imgCount === 0) {\n $node.remove();\n return;\n }\n\n const density = linkDensity($node);\n\n // Too high of link density, is probably a menu or\n // something similar.\n // console.log(weight, density, contentLength)\n if (weight < 25 && density > 0.2 && contentLength > 75) {\n $node.remove();\n return;\n }\n\n // Too high of a link density, despite the score being\n // high.\n if (weight >= 25 && density > 0.5) {\n // Don't remove the node if it's a list and the\n // previous sibling starts with a colon though. That\n // means it's probably content.\n const tagName = $node.get(0).tagName.toLowerCase();\n const nodeIsList = tagName === 'ol' || tagName === 'ul';\n if (nodeIsList) {\n const previousNode = $node.prev();\n if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') {\n return;\n }\n }\n\n $node.remove();\n return;\n }\n\n const scriptCount = $('script', $node).length;\n\n // Too many script tags, not enough content.\n if (scriptCount > 0 && contentLength < 150) {\n $node.remove();\n return;\n }\n }\n}\n\n// Given an article, clean it of some superfluous content specified by\n// tags. Things like forms, ads, etc.\n//\n// Tags is an array of tag name's to search through. 
(like div, form,\n// etc)\n//\n// Return this same doc.\nexport default function cleanTags($article, $) {\n $(CLEAN_CONDITIONALLY_TAGS, $article).each((index, node) => {\n const $node = $(node);\n let weight = getScore($node);\n if (!weight) {\n weight = getOrInitScore($node, $);\n setScore($node, $, weight);\n }\n\n // drop node if its weight is < 0\n if (weight < 0) {\n $node.remove();\n } else {\n // deteremine if node seems like content\n removeUnlessContent($node, $, weight);\n }\n });\n\n return $;\n}\n","import { getWeight } from 'extractors/generic/content/scoring';\n\nimport { HEADER_TAG_LIST } from './constants';\nimport { normalizeSpaces } from '../text';\n\nexport default function cleanHeaders($article, $, title = '') {\n $(HEADER_TAG_LIST, $article).each((index, header) => {\n const $header = $(header);\n // Remove any headers that appear before all other p tags in the\n // document. This probably means that it was part of the title, a\n // subtitle or something else extraneous like a datestamp or byline,\n // all of which should be handled by other metadata handling.\n if ($($header, $article).prevAll('p').length === 0) {\n return $header.remove();\n }\n\n // Remove any headers that match the title exactly.\n if (normalizeSpaces($(header).text()) === title) {\n return $header.remove();\n }\n\n // If this header has a negative weight, it's probably junk.\n // Get rid of it.\n if (getWeight($(header)) < 0) {\n return $header.remove();\n }\n\n return $header;\n });\n\n return $;\n}\n","import { convertNodeTo } from 'utils/dom';\n\n// Rewrite the tag name to div if it's a top level node like body or\n// html to avoid later complications with multiple body tags.\nexport default function rewriteTopLevel(article, $) {\n // I'm not using context here because\n // it's problematic when converting the\n // top-level/root node - AP\n $ = convertNodeTo($('html'), $, 'div');\n $ = convertNodeTo($('body'), $, 'div');\n\n return $;\n}\n","import URL from 
'url';\n\nimport {\n getAttrs,\n setAttr,\n} from 'utils/dom';\n\nfunction absolutize($, rootUrl, attr, $content) {\n $(`[${attr}]`, $content).each((_, node) => {\n const attrs = getAttrs(node);\n const url = attrs[attr];\n\n if (url) {\n const absoluteUrl = URL.resolve(rootUrl, url);\n setAttr(node, attr, absoluteUrl);\n }\n });\n}\n\nexport default function makeLinksAbsolute($content, $, url) {\n ['href', 'src'].forEach(attr => absolutize($, url, attr, $content));\n\n return $content;\n}\n","export function textLength(text) {\n return text.trim()\n .replace(/\\s+/g, ' ')\n .length;\n}\n\n// Determines what percentage of the text\n// in a node is link text\n// Takes a node, returns a float\nexport function linkDensity($node) {\n const totalTextLength = textLength($node.text());\n\n const linkText = $node.find('a').text();\n const linkLength = textLength(linkText);\n\n if (totalTextLength > 0) {\n return linkLength / totalTextLength;\n } else if (totalTextLength === 0 && linkLength > 0) {\n return 1;\n }\n\n return 0;\n}\n","import { stripTags } from 'utils/dom';\n\n// Given a node type to search for, and a list of meta tag names to\n// search for, find a meta tag associated.\nexport default function extractFromMeta(\n $,\n metaNames,\n cachedNames,\n cleanTags = true\n) {\n const foundNames = metaNames.filter(name => cachedNames.indexOf(name) !== -1);\n\n for (const name of foundNames) {\n const type = 'name';\n const value = 'value';\n\n const nodes = $(`meta[${type}=\"${name}\"]`);\n\n // Get the unique value of every matching node, in case there\n // are two meta tags with the same name and value.\n // Remove empty values.\n const values =\n nodes.map((index, node) => $(node).attr(value))\n .toArray()\n .filter(text => text !== '');\n\n // If we have more than one value for the same name, we have a\n // conflict and can't trust any of them. Skip this name. If we have\n // zero, that means our meta tags had no values. 
Skip this name\n // also.\n if (values.length === 1) {\n let metaValue;\n // Meta values that contain HTML should be stripped, as they\n // weren't subject to cleaning previously.\n if (cleanTags) {\n metaValue = stripTags(values[0], $);\n } else {\n metaValue = values[0];\n }\n\n return metaValue;\n }\n }\n\n // If nothing is found, return null\n return null;\n}\n","import { withinComment } from 'utils/dom';\n\nfunction isGoodNode($node, maxChildren) {\n // If it has a number of children, it's more likely a container\n // element. Skip it.\n if ($node.children().length > maxChildren) {\n return false;\n }\n // If it looks to be within a comment, skip it.\n if (withinComment($node)) {\n return false;\n }\n\n return true;\n}\n\n// Given a a list of selectors find content that may\n// be extractable from the document. This is for flat\n// meta-information, like author, title, date published, etc.\nexport default function extractFromSelectors(\n $,\n selectors,\n maxChildren = 1,\n textOnly = true\n) {\n for (const selector of selectors) {\n const nodes = $(selector);\n\n // If we didn't get exactly one of this selector, this may be\n // a list of articles or comments. Skip it.\n if (nodes.length === 1) {\n const $node = $(nodes[0]);\n\n if (isGoodNode($node, maxChildren)) {\n let content;\n if (textOnly) {\n content = $node.text();\n } else {\n content = $node.html();\n }\n\n if (content) {\n return content;\n }\n }\n }\n }\n\n return null;\n}\n","// strips all tags from a string of text\nexport default function stripTags(text, $) {\n // Wrapping text in html element prevents errors when text\n // has no html\n const cleanText = $(`${text}`).text();\n return cleanText === '' ? 
text : cleanText;\n}\n","import { getAttrs } from 'utils/dom';\n\nexport default function withinComment($node) {\n const parents = $node.parents().toArray();\n const commentParent = parents.find((parent) => {\n const attrs = getAttrs(parent);\n const { class: nodeClass, id } = attrs;\n const classAndId = `${nodeClass} ${id}`;\n return classAndId.includes('comment');\n });\n\n return commentParent !== undefined;\n}\n","// Given a node, determine if it's article-like enough to return\n// param: node (a cheerio node)\n// return: boolean\n\nexport default function nodeIsSufficient($node) {\n return $node.text().trim().length >= 100;\n}\n","import { IS_WP_SELECTOR } from './constants';\n\nexport default function isWordpress($) {\n return $(IS_WP_SELECTOR).length > 0;\n}\n","export default function getAttrs(node) {\n const { attribs, attributes } = node;\n\n if (!attribs && attributes) {\n const attrs = Reflect.ownKeys(attributes).reduce((acc, index) => {\n const attr = attributes[index];\n\n if (!attr.name || !attr.value) return acc;\n\n acc[attr.name] = attr.value;\n return acc;\n }, {});\n return attrs;\n }\n\n return attribs;\n}\n","export default function setAttr(node, attr, val) {\n if (node.attribs) {\n node.attribs[attr] = val;\n } else if (node.attributes) {\n node.setAttribute(attr, val);\n }\n\n return node;\n}\n","export default function setAttrs(node, attrs) {\n if (node.attribs) {\n node.attribs = attrs;\n } else if (node.attributes) {\n while (node.attributes.length > 0) {\n node.removeAttribute(node.attributes[0].name);\n }\n\n Reflect.ownKeys(attrs).forEach((key) => {\n node.setAttribute(key, attrs[key]);\n });\n }\n\n return node;\n}\n","// DOM manipulation\nexport { default as stripUnlikelyCandidates } from './strip-unlikely-candidates';\nexport { default as brsToPs } from './brs-to-ps';\nexport { default as paragraphize } from './paragraphize';\nexport { default as convertToParagraphs } from './convert-to-paragraphs';\nexport { default as 
convertNodeTo } from './convert-node-to';\nexport { default as cleanImages } from './clean-images';\nexport { default as markToKeep } from './mark-to-keep';\nexport { default as stripJunkTags } from './strip-junk-tags';\nexport { default as cleanHOnes } from './clean-h-ones';\nexport { default as cleanAttributes } from './clean-attributes';\nexport { default as removeEmpty } from './remove-empty';\nexport { default as cleanTags } from './clean-tags';\nexport { default as cleanHeaders } from './clean-headers';\nexport { default as rewriteTopLevel } from './rewrite-top-level';\nexport { default as makeLinksAbsolute } from './make-links-absolute';\nexport { textLength, linkDensity } from './link-density';\nexport { default as extractFromMeta } from './extract-from-meta';\nexport { default as extractFromSelectors } from './extract-from-selectors';\nexport { default as stripTags } from './strip-tags';\nexport { default as withinComment } from './within-comment';\nexport { default as nodeIsSufficient } from './node-is-sufficient';\nexport { default as isWordpress } from './is-wordpress';\nexport { default as getAttrs } from './get-attrs';\nexport { default as setAttr } from './set-attr';\nexport { default as setAttrs } from './set-attrs';\n","export const IS_LINK = new RegExp('https?://', 'i');\nexport const IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i');\n\nexport const TAGS_TO_REMOVE = [\n 'script',\n 'style',\n 'form',\n].join(',');\n","import { getAttrs } from 'utils/dom';\n\nimport {\n IS_LINK,\n IS_IMAGE,\n} from './constants';\n\n// Convert all instances of images with potentially\n// lazy loaded images into normal images.\n// Many sites will have img tags with no source, or an image tag with a src\n// attribute that a is a placeholer. 
We need to be able to properly fill in\n// the src attribute so the images are no longer lazy loaded.\nexport default function convertLazyLoadedImages($) {\n $('img').each((_, img) => {\n const attrs = getAttrs(img);\n\n Reflect.ownKeys(attrs).forEach((attr) => {\n const value = attrs[attr];\n\n if (attr !== 'src' && IS_LINK.test(value) &&\n IS_IMAGE.test(value)) {\n $(img).attr('src', value);\n }\n });\n });\n\n return $;\n}\n","import { TAGS_TO_REMOVE } from './constants';\n\nfunction isComment(index, node) {\n return node.type === 'comment';\n}\n\nfunction cleanComments($) {\n $.root().find('*')\n .contents()\n .filter(isComment)\n .remove();\n\n return $;\n}\n\nexport default function clean($) {\n $(TAGS_TO_REMOVE).remove();\n\n $ = cleanComments($);\n return $;\n}\n","import cheerio from 'cheerio';\n\nimport { fetchResource } from './utils';\nimport {\n normalizeMetaTags,\n convertLazyLoadedImages,\n clean,\n} from './utils/dom';\n\nconst Resource = {\n\n // Create a Resource.\n //\n // :param url: The URL for the document we should retrieve.\n // :param response: If set, use as the response rather than\n // attempting to fetch it ourselves. 
Expects a\n // string.\n async create(url, preparedResponse, parsedUrl) {\n let result;\n\n if (preparedResponse) {\n const validResponse = {\n statusMessage: 'OK',\n statusCode: 200,\n headers: {\n 'content-type': 'text/html',\n 'content-length': 500,\n },\n };\n\n result = { body: preparedResponse, response: validResponse };\n } else {\n result = await fetchResource(url, parsedUrl);\n }\n\n if (result.error) {\n result.failed = true;\n return result;\n }\n\n return this.generateDoc(result);\n },\n\n generateDoc({ body: content, response }) {\n const { 'content-type': contentType } = response.headers;\n\n // TODO: Implement is_text function from\n // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57\n if (!contentType.includes('html') &&\n !contentType.includes('text')) {\n throw new Error('Content does not appear to be text.');\n }\n\n let $ = cheerio.load(content);\n\n if ($.root().children().length === 0) {\n throw new Error('No children, likely a bad parse.');\n }\n\n $ = normalizeMetaTags($);\n $ = convertLazyLoadedImages($);\n $ = clean($);\n\n return $;\n },\n};\n\nexport default Resource;\n","const merge = (extractor, domains) => (\n domains.reduce((acc, domain) => {\n acc[domain] = extractor;\n return acc;\n }, {})\n);\n\nexport default function mergeSupportedDomains(extractor) {\n return extractor.supportedDomains ?\n merge(extractor, [extractor.domain, ...extractor.supportedDomains])\n :\n merge(extractor, [extractor.domain]);\n}\n","export const BloggerExtractor = {\n domain: 'blogspot.com',\n content: {\n // Blogger is insane and does not load its content\n // initially in the page, but it's all there\n // in noscript\n selectors: [\n '.post-content noscript',\n ],\n\n // Selectors to remove from the extracted content\n clean: [\n ],\n\n // Convert the noscript tag to a div\n transforms: {\n noscript: 'div',\n },\n },\n\n author: {\n selectors: [\n '.post-author-name',\n ],\n 
},\n\n title: {\n selectors: [\n '.post h2.title',\n ],\n },\n\n date_published: {\n selectors: [\n 'span.publishdate',\n ],\n },\n};\n","export const NYMagExtractor = {\n domain: 'nymag.com',\n content: {\n // Order by most likely. Extractor will stop on first occurrence\n selectors: [\n 'div.article-content',\n 'section.body',\n 'article.article',\n ],\n\n // Selectors to remove from the extracted content\n clean: [\n '.ad',\n '.single-related-story',\n ],\n\n // Object of tranformations to make on matched elements\n // Each key is the selector, each value is the tag to\n // transform to.\n // If a function is given, it should return a string\n // to convert to or nothing (in which case it will not perform\n // the transformation.\n transforms: {\n // Convert h1s to h2s\n h1: 'h2',\n\n // Convert lazy-loaded noscript images to figures\n noscript: ($node, $) => {\n const $children = $.browser ? $($node.text()) : $node.children();\n if ($children.length === 1 && $children.get(0) !== undefined &&\n $children.get(0).tagName.toLowerCase() === 'img') {\n return 'figure';\n }\n\n return null;\n },\n },\n },\n\n title: {\n selectors: [\n 'h1.lede-feature-title',\n 'h1.headline-primary',\n 'h1',\n ],\n },\n\n author: {\n selectors: [\n '.by-authors',\n '.lede-feature-author',\n ],\n },\n\n dek: {\n selectors: [\n '.lede-feature-teaser',\n ],\n },\n\n date_published: {\n selectors: [\n ['time.article-timestamp[datetime]', 'datetime'],\n 'time.article-timestamp',\n ],\n },\n};\n","export const WikipediaExtractor = {\n domain: 'wikipedia.org',\n content: {\n selectors: [\n '#mw-content-text',\n ],\n\n defaultCleaner: false,\n\n // transform top infobox to an image with caption\n transforms: {\n '.infobox img': ($node) => {\n const $parent = $node.parents('.infobox');\n // Only prepend the first image in .infobox\n if ($parent.children('img').length === 0) {\n $parent.prepend($node);\n }\n },\n '.infobox caption': 'figcaption',\n '.infobox': 'figure',\n },\n\n // Selectors to 
remove from the extracted content\n clean: [\n '.mw-editsection',\n 'figure tr, figure td, figure tbody',\n '#toc',\n '.navbox',\n ],\n\n },\n\n author: 'Wikipedia Contributors',\n\n title: {\n selectors: [\n 'h2.title',\n ],\n },\n\n date_published: {\n selectors: [\n '#footer-info-lastmod',\n ],\n },\n\n};\n","export const TwitterExtractor = {\n domain: 'twitter.com',\n\n content: {\n transforms: {\n // We're transforming essentially the whole page here.\n // Twitter doesn't have nice selectors, so our initial\n // selector grabs the whole page, then we're re-writing\n // it to fit our needs before we clean it up.\n '.permalink[role=main]': ($node, $) => {\n const tweets = $node.find('.tweet');\n const $tweetContainer = $('');\n $tweetContainer.append(tweets);\n $node.replaceWith($tweetContainer);\n },\n\n // Twitter wraps @ with s, which\n // renders as a strikethrough\n s: 'span',\n },\n\n selectors: [\n '.permalink[role=main]',\n ],\n\n defaultCleaner: false,\n\n clean: [\n '.stream-item-footer',\n 'button',\n '.tweet-details-fixer',\n ],\n },\n\n author: {\n selectors: [\n '.tweet.permalink-tweet .username',\n ],\n },\n\n date_published: {\n selectors: [\n ['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms'],\n // '.tweet.permalink-tweet .metadata',\n ],\n },\n\n};\n","export const NYTimesExtractor = {\n domain: 'www.nytimes.com',\n\n title: {\n selectors: [\n '.g-headline',\n 'h1.headline',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n '.g-byline',\n '.byline',\n ],\n },\n\n content: {\n selectors: [\n 'div.g-blocks',\n 'article#story',\n ],\n\n transforms: {\n 'img.g-lazy': ($node) => {\n let src = $node.attr('src');\n // const widths = $node.attr('data-widths')\n // .slice(1)\n // .slice(0, -1)\n // .split(',');\n // if (widths.length) {\n // width = widths.slice(-1);\n // } else {\n // width = '900';\n // }\n const width = 640;\n\n src = src.replace('{{size}}', width);\n $node.attr('src', src);\n },\n },\n\n 
clean: [\n '.ad',\n 'header#story-header',\n '.story-body-1 .lede.video',\n '.visually-hidden',\n '#newsletter-promo',\n '.promo',\n '.comments-button',\n '.hidden',\n '.comments',\n '.supplemental',\n '.nocontent',\n '.story-footer-links',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: null,\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\nexport const TheAtlanticExtractor = {\n domain: 'www.theatlantic.com',\n title: {\n selectors: [\n 'h1.hed',\n ],\n },\n\n author: {\n selectors: [\n 'article#article .article-cover-extra .metadata .byline a',\n ],\n },\n\n content: {\n selectors: [\n ['.article-cover figure.lead-img', '.article-body'],\n '.article-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.partner-box',\n ],\n },\n\n date_published: {\n selectors: [\n ['time[itemProp=\"datePublished\"]', 'datetime'],\n ],\n },\n\n lead_image_url: null,\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const NewYorkerExtractor = {\n domain: 'www.newyorker.com',\n title: {\n selectors: [\n 'h1.title',\n ],\n },\n\n author: {\n selectors: [\n '.contributors',\n ],\n },\n\n content: {\n selectors: [\n 'div#articleBody',\n 'div.articleBody',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ['time[itemProp=\"datePublished\"]', 'content'],\n ],\n\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n '.dek',\n 'h2.dek',\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const WiredExtractor = {\n domain: 'www.wired.com',\n title: {\n selectors: [\n 'h1.post-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'a[rel=\"author\"]',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n 'article.content',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.visually-hidden',\n 'figcaption img.photo',\n\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[itemprop=\"datePublished\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const MSNExtractor = {\n domain: 'www.msn.com',\n title: {\n selectors: [\n 'h1',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'span.authorname-txt',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n 'div.richtext',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n 'span.caption',\n\n ],\n },\n\n date_published: {\n selectors: [\n 'span.time',\n ],\n },\n\n lead_image_url: {\n selectors: [\n\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const YahooExtractor = {\n domain: 'www.yahoo.com',\n title: {\n selectors: [\n 'header.canvas-header',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'span.provider-name',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n '.content-canvas',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.figure-caption',\n\n ],\n },\n\n date_published: {\n selectors: [\n ['time.date[datetime]', 'datetime'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n // enter dek selectors\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const BuzzfeedExtractor = {\n domain: 'www.buzzfeed.com',\n title: {\n selectors: [\n 'h1[id=\"post-title\"]',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'a[data-action=\"user/username\"]', 'byline__author',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n ['.longform_custom_header_media', '#buzz_sub_buzz'],\n '#buzz_sub_buzz',\n ],\n\n defaultCleaner: false,\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n h2: 'b',\n\n 'div.longform_custom_header_media': ($node) => {\n if ($node.has('img') && $node.has('.longform_header_image_source')) {\n return 'figure';\n }\n\n return null;\n },\n\n 'figure.longform_custom_header_media .longform_header_image_source':\n 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.instapaper_ignore',\n '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline',\n '.share-box',\n '.print',\n ],\n },\n\n date_published: {\n selectors: [\n '.buzz-datetime',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const WikiaExtractor = {\n domain: 'fandom.wikia.com',\n title: {\n selectors: [\n 'h1.entry-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n '.author vcard', '.fn',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n '.grid-content',\n '.entry-content',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const LittleThingsExtractor = {\n domain: 'www.littlethings.com',\n title: {\n selectors: [\n 'h1.post-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n '.mainContentIntro',\n '.content-wrapper',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const PoliticoExtractor = {\n domain: 'www.politico.com',\n title: {\n selectors: [\n // enter title selectors\n ['meta[name=\"og:title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n '.story-main-content .byline .vcard',\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n '.story-main-content',\n '.content-group', '.story-core',\n '.story-text',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n 'figcaption',\n ],\n },\n\n date_published: {\n selectors: [\n ['.story-main-content .timestamp time[datetime]', 'datetime'],\n\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter lead_image_url selectors\n ['meta[name=\"og:image\"]', 'value'],\n\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","export const DeadspinExtractor = {\n domain: 'deadspin.com',\n\n supportedDomains: [\n 'jezebel.com',\n 'lifehacker.com',\n 'kotaku.com',\n 'gizmodo.com',\n 'jalopnik.com',\n 'kinja.com',\n ],\n\n title: {\n selectors: [\n 'h1.headline',\n ],\n },\n\n author: {\n selectors: [\n '.author',\n ],\n },\n\n content: {\n selectors: [\n '.post-content',\n '.entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n 'iframe.lazyload[data-recommend-id^=\"youtube://\"]': ($node) => {\n const youtubeId = $node.attr('id').split('youtube-')[1];\n $node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.magnifier',\n '.lightbox',\n ],\n },\n\n date_published: {\n selectors: [\n ['time.updated[datetime]', 'datetime'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const BroadwayWorldExtractor = {\n domain: 'www.broadwayworld.com',\n title: {\n selectors: [\n 'h1.article-title',\n ],\n },\n\n author: {\n selectors: [\n 'span[itemprop=author]',\n ],\n },\n\n content: {\n selectors: [\n 'div[itemprop=articlebody]',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[itemprop=datePublished]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const ApartmentTherapyExtractor = {\n domain: 'www.apartmenttherapy.com',\n title: {\n selectors: [\n 'h1.headline',\n ],\n },\n\n author: {\n selectors: [\n '.PostByline__name',\n ],\n },\n\n content: {\n selectors: [\n 'div.post__content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n 'div[data-render-react-id=\"images/LazyPicture\"]': ($node, $) => {\n const data = JSON.parse($node.attr('data-props'));\n const { src } = data.sources[0];\n const $img = $('').attr('src', src);\n $node.replaceWith($img);\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n ['.PostByline__timestamp[datetime]', 'datetime'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","export const MediumExtractor = {\n domain: 'medium.com',\n\n supportedDomains: [\n 'trackchanges.postlight.com',\n ],\n\n title: {\n selectors: [\n 'h1',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.section-content'],\n '.section-content',\n 'article > div > section',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n // Re-write lazy-loaded youtube videos\n iframe: ($node) => {\n const ytRe =\n /https:\\/\\/i.embed.ly\\/.+url=https:\\/\\/i\\.ytimg\\.com\\/vi\\/(\\w+)\\//;\n const thumb = decodeURIComponent($node.attr('data-thumbnail'));\n\n if (ytRe.test(thumb)) {\n const [_, youtubeId] = thumb.match(ytRe) // eslint-disable-line\n $node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);\n const $parent = $node.parents('figure');\n const $caption = $parent.find('figcaption');\n $parent.empty().append([$node, $caption]);\n }\n },\n\n // rewrite figures to pull out image and caption, remove rest\n figure: ($node) => {\n // ignore if figure has an iframe\n if ($node.find('iframe').length > 0) return;\n\n const $img = $node.find('img').slice(-1)[0];\n const $caption = $node.find('figcaption');\n $node.empty().append([$img, $caption]);\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n ['time[datetime]', 'datetime'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","export const WwwTmzComExtractor = {\n domain: 'www.tmz.com',\n\n title: {\n selectors: [\n '.post-title-breadcrumb',\n 'h1',\n '.headline',\n ],\n },\n\n author: 'TMZ STAFF',\n\n date_published: {\n selectors: [\n '.article-posted-date',\n ],\n\n timezone: 'America/Los_Angeles',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article-content',\n '.all-post-body',\n ],\n\n // Is there anything in the content you selected that needs 
transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.lightbox-link',\n ],\n },\n};\n","export const WwwWashingtonpostComExtractor = {\n domain: 'www.washingtonpost.com',\n\n title: {\n selectors: [\n 'h1',\n '#topper-headline-wrapper',\n ],\n },\n\n author: {\n selectors: [\n '.pb-byline',\n ],\n },\n\n date_published: {\n selectors: [\n ['.pb-timestamp[itemprop=\"datePublished\"]', 'content'],\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n 'div.inline-content': ($node) => {\n if ($node.has('img,iframe,video').length > 0) {\n return 'figure';\n }\n\n $node.remove();\n return null;\n },\n '.pb-caption': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.interstitial-link',\n '.newsletter-inline-unit',\n ],\n },\n};\n","export const WwwHuffingtonpostComExtractor = {\n domain: 'www.huffingtonpost.com',\n\n title: {\n selectors: [\n 'h1.headline__title',\n ],\n },\n\n author: {\n selectors: [\n 'span.author-card__details__name',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:modified_time\"]', 'value'],\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n 'h2.headline__subtitle',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.entry__body',\n ],\n\n defaultCleaner: false,\n\n // Is there 
anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n // 'div.top-media': ($node) => {\n // const $figure = $node.children('figure');\n // $node.replaceWith($figure);\n // },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.pull-quote',\n '.tag-cloud',\n '.embed-asset',\n '.below-entry',\n '.entry-corrections',\n '#suggested-story',\n ],\n },\n};\n","export const NewrepublicComExtractor = {\n domain: 'newrepublic.com',\n\n title: {\n selectors: [\n 'h1.article-headline',\n '.minutes-primary h1.minute-title',\n ],\n },\n\n author: {\n selectors: [\n 'div.author-list',\n '.minutes-primary h3.minute-byline',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n\n timezone: 'America/New_York',\n },\n\n dek: {\n selectors: [\n 'h2.article-subhead',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.content-body',\n '.minutes-primary div.content-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n 'aside',\n ],\n },\n};\n","export const MoneyCnnComExtractor = {\n domain: 'money.cnn.com',\n\n title: {\n selectors: [\n '.article-title',\n ],\n },\n\n author: {\n selectors: [\n '.byline a',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"date\"]', 'value'],\n ],\n\n timezone: 'GMT',\n },\n\n dek: {\n selectors: [\n '#storytext h2',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '#storytext',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.inStoryHeading',\n ],\n },\n};\n","export const WwwThevergeComExtractor = {\n domain: 'www.theverge.com',\n\n supportedDomains: ['www.polygon.com'],\n\n title: {\n selectors: [\n 'h1',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n 'h2.p-dek',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n // feature template multi-match\n ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'],\n // regular post multi-match\n ['.e-image--hero', '.c-entry-content'],\n // feature template fallback\n '.l-wrapper .l-feature',\n // regular post fallback\n 'div.c-entry-content',\n ],\n\n // Transform lazy-loaded images\n transforms: {\n noscript: ($node) => {\n const $children = $node.children();\n if 
($children.length === 1 && $children.get(0).tagName === 'img') {\n return 'span';\n }\n\n return null;\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.aside',\n 'img.c-dynamic-image', // images come from noscript transform\n ],\n },\n};\n","export const WwwCnnComExtractor = {\n domain: 'www.cnn.com',\n\n title: {\n selectors: [\n 'h1.pg-headline',\n 'h1',\n ],\n },\n\n author: {\n selectors: [\n '.metadata__byline__author',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"pubdate\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n // a more specific selector to grab the lead image and the body\n ['.media__video--thumbnail', '.zn-body-text'],\n // a fallback for the above\n '.zn-body-text',\n 'div[itemprop=\"articleBody\"]',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': ($node) => {\n const $text = $node.html();\n if ($text) {\n return 'p';\n }\n\n return null;\n },\n\n // this transform cleans the short, all-link sections linking\n // to related content but not marked as such in any way.\n '.zn-body__paragraph': ($node) => {\n if ($node.has('a')) {\n if ($node.text().trim() === $node.find('a').text().trim()) {\n $node.remove();\n }\n }\n },\n\n '.media__video--thumbnail': 'figure',\n\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n ],\n },\n};\n","export const WwwAolComExtractor = {\n domain: 'www.aol.com',\n\n title: {\n selectors: [\n 'h1.p-article__title',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n '.p-article__byline__date',\n ],\n\n timezone: 'America/New_York',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwYoutubeComExtractor = {\n domain: 'www.youtube.com',\n\n title: {\n selectors: [\n '.watch-title',\n 'h1.watch-title-container',\n ],\n },\n\n author: {\n selectors: [\n '.yt-user-info',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[itemProp=\"datePublished\"]', 'value'],\n ],\n\n timezone: 'GMT',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n defaultCleaner: false,\n\n selectors: [\n ['#player-api', '#eow-description'],\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '#player-api': ($node, $) => {\n const videoId = $('meta[itemProp=\"videoId\"]').attr('value');\n $node.html(`\n `\n );\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwTheguardianComExtractor = {\n domain: 'www.theguardian.com',\n\n title: {\n selectors: [\n '.content__headline',\n ],\n },\n\n author: {\n selectors: [\n 'p.byline',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n '.content__standfirst',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.content__article-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.hide-on-mobile',\n '.inline-icon',\n ],\n },\n};\n","export const WwwSbnationComExtractor = {\n domain: 'www.sbnation.com',\n\n title: {\n selectors: [\n 'h1.c-page-title',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n 'h2.c-entry-summary.p-dek',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.c-entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwBloombergComExtractor = {\n domain: 'www.bloomberg.com',\n\n title: {\n selectors: [\n // normal articles\n '.lede-headline',\n\n // /graphics/ template\n 'h1.article-title',\n\n // /news/ template\n 'h1.lede-text-only__hed',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"parsely-author\"]', 'value'],\n '.byline-details__link',\n\n // /graphics/ template\n '.bydek',\n\n // /news/ template\n '.author',\n ],\n },\n\n date_published: {\n selectors: [\n ['time.published-at', 'datetime'],\n ['time[datetime]', 'datetime'],\n ['meta[name=\"date\"]', 'value'],\n ['meta[name=\"parsely-pub-date\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article-body__content',\n\n // /graphics/ template\n 
['section.copy-block'],\n\n // /news/ template\n '.body-copy',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.inline-newsletter',\n '.page-ad',\n ],\n },\n};\n","export const WwwBustleComExtractor = {\n domain: 'www.bustle.com',\n\n title: {\n selectors: [\n 'h1.post-page__title',\n ],\n },\n\n author: {\n selectors: [\n 'div.content-meta__author',\n ],\n },\n\n date_published: {\n selectors: [\n ['time.content-meta__published-date[datetime]', 'datetime'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.post-page__body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwNprOrgExtractor = {\n domain: 'www.npr.org',\n\n title: {\n selectors: [\n 'h1',\n '.storytitle',\n ],\n },\n\n author: {\n selectors: [\n 'p.byline__name.byline__name--block',\n ],\n },\n\n date_published: {\n selectors: [\n ['.dateblock time[datetime]', 'datetime'],\n ['meta[name=\"date\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ['meta[name=\"twitter:image:src\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.storytext',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n '.bucketwrap.image': 'figure',\n '.bucketwrap.image .credit-caption': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n 'div.enlarge_measure',\n ],\n },\n};\n","export const WwwRecodeNetExtractor = {\n domain: 'www.recode.net',\n\n title: {\n selectors: [\n 'h1.c-page-title',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n 'h2.c-entry-summary.p-dek',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['figure.e-image--hero', '.c-entry-content'],\n '.c-entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const QzComExtractor = {\n domain: 'qz.com',\n\n title: {\n selectors: [\n 'header.item-header.content-width-responsive',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n '.timestamp',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['figure.featured-image', '.item-body'],\n '.item-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.article-aside',\n '.progressive-image-thumbnail',\n ],\n },\n};\n","export const WwwDmagazineComExtractor = {\n domain: 'www.dmagazine.com',\n\n title: {\n selectors: [\n 'h1.story__title',\n ],\n },\n\n author: {\n selectors: [\n '.story__info .story__info__item:first-child',\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n '.story__info',\n ],\n\n timezone: 'America/Chicago',\n },\n\n dek: {\n selectors: [\n '.story__subhead',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['article figure a:first-child', 'href'],\n ],\n },\n\n content: {\n selectors: [\n '.story__content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwReutersComExtractor = {\n domain: 'www.reuters.com',\n\n title: {\n selectors: [\n 'h1.article-headline',\n ],\n },\n\n author: {\n selectors: [\n '.author',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"og:article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '#article-text',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n '.article-subtitle': 'h4',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '#article-byline .author',\n // 'span.location',\n // 'span.articleLocation',\n ],\n },\n};\n","export const MashableComExtractor = {\n domain: 'mashable.com',\n\n title: {\n selectors: [\n 'h1.title',\n ],\n },\n\n author: {\n selectors: [\n 'span.author_name a',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"og:article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'section.article-content.blueprint',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '.image-credit': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwChicagotribuneComExtractor = {\n domain: 'www.chicagotribune.com',\n\n title: {\n selectors: [\n 'h1.trb_ar_hl_t',\n ],\n },\n\n author: {\n selectors: [\n 'span.trb_ar_by_nm_au',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[itemprop=\"datePublished\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.trb_ar_page',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwVoxComExtractor = {\n domain: 'www.vox.com',\n\n title: {\n selectors: [\n 'h1.c-page-title',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n '.p-dek',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['figure.e-image--hero', '.c-entry-content'],\n '.c-entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n 'figure .e-image__image noscript': ($node) => {\n const imgHtml = $node.html();\n $node.parents('.e-image__image').find('.c-dynamic-image').replaceWith(imgHtml);\n },\n\n 'figure .e-image__meta': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const NewsNationalgeographicComExtractor = {\n domain: 'news.nationalgeographic.com',\n\n title: {\n selectors: [\n 'h1',\n 'h1.main-title',\n ],\n },\n\n author: {\n selectors: [\n '.byline-component__contributors b span',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n format: 'ddd MMM DD HH:mm:ss zz YYYY',\n timezone: 'EST',\n },\n\n dek: {\n selectors: [\n '.article__deck',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.parsys.content', '.__image-lead__'],\n '.content',\n ],\n\n // Is there anything in the 
content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '.parsys.content': ($node, $) => {\n const $imgSrc = $node.find('.image.parbase.section')\n .find('.picturefill')\n .first()\n .data('platform-src');\n if ($imgSrc) {\n $node.prepend($(``));\n }\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.pull-quote.pull-quote--large',\n ],\n },\n};\n","export const WwwNationalgeographicComExtractor = {\n domain: 'www.nationalgeographic.com',\n\n title: {\n selectors: [\n 'h1',\n 'h1.main-title',\n ],\n },\n\n author: {\n selectors: [\n '.byline-component__contributors b span',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n '.article__deck',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.parsys.content', '.__image-lead__'],\n '.content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '.parsys.content': ($node, $) => {\n const $imageParent = $node.children().first();\n if ($imageParent.hasClass('imageGroup')) {\n const $dataAttrContainer = $imageParent.find('.media--medium__container').children().first();\n const imgPath1 = $dataAttrContainer.data('platform-image1-path');\n const imgPath2 = $dataAttrContainer.data('platform-image2-path');\n if (imgPath2 && imgPath1) {\n $node.prepend($(`
\n \n \n
`));\n }\n } else {\n const $imgSrc = $node.find('.image.parbase.section')\n .find('.picturefill')\n .first()\n .data('platform-src');\n if ($imgSrc) {\n $node.prepend($(``));\n }\n }\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.pull-quote.pull-quote--small',\n ],\n },\n};\n","export const WwwLatimesComExtractor = {\n domain: 'www.latimes.com',\n\n title: {\n selectors: [\n '.trb_ar_hl',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[itemprop=\"datePublished\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.trb_ar_main',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '.trb_ar_la': ($node) => {\n const $figure = $node.find('figure');\n $node.replaceWith($figure);\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.trb_ar_by',\n '.trb_ar_cr',\n ],\n },\n};\n","export const PagesixComExtractor = {\n domain: 'pagesix.com',\n\n supportedDomains: [\n 'nypost.com',\n ],\n\n title: {\n selectors: [\n 'h1 a',\n ],\n },\n\n author: {\n selectors: [\n '.byline',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"description\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['#featured-image-wrapper', '.entry-content'],\n '.entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable 
content? E.g., unusual lazy loaded images\n transforms: {\n '#featured-image-wrapper': 'figure',\n '.wp-caption-text': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.modal-trigger',\n ],\n },\n};\n","export const ThefederalistpapersOrgExtractor = {\n domain: 'thefederalistpapers.org',\n\n title: {\n selectors: [\n 'h1.entry-title',\n ],\n },\n\n author: {\n selectors: [\n 'main span.entry-author-name',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n ['p[style]'],\n ],\n },\n};\n","export const WwwCbssportsComExtractor = {\n domain: 'www.cbssports.com',\n\n title: {\n selectors: [\n '.article-headline',\n ],\n },\n\n author: {\n selectors: [\n '.author-name',\n ],\n },\n\n date_published: {\n selectors: [\n ['.date-original-reading-time time', 'datetime'],\n ],\n timezone: 'UTC',\n },\n\n dek: {\n selectors: [\n '.article-subline',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwMsnbcComExtractor = {\n domain: 'www.msnbc.com',\n\n title: {\n selectors: [\n 'h1',\n 'h1.is-title-pane',\n ],\n },\n\n author: {\n selectors: [\n '.author',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"DC.date.issued\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"description\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.pane-node-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '.pane-node-body': ($node, $) => {\n const [selector, attr] = WwwMsnbcComExtractor.lead_image_url.selectors[0];\n const src = $(selector).attr(attr);\n if (src) {\n $node.prepend(``);\n }\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwThepoliticalinsiderComExtractor = {\n domain: 'www.thepoliticalinsider.com',\n\n title: {\n selectors: [\n ['meta[name=\"sailthru.title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"sailthru.author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"sailthru.date\"]', 'value'],\n ],\n timezone: 'America/New_York',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'], // enter selectors\n ],\n },\n\n content: {\n selectors: [\n 'div#article-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwMentalflossComExtractor = {\n domain: 'www.mentalfloss.com',\n\n title: {\n selectors: [\n 'h1.title',\n '.title-group',\n '.inner',\n ],\n },\n\n author: {\n selectors: [\n '.field-name-field-enhanced-authors',\n ],\n },\n\n date_published: {\n selectors: [\n '.date-display-single',\n ],\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.field.field-name-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const AbcnewsGoComExtractor = {\n domain: 'abcnews.go.com',\n\n title: {\n selectors: [\n '.article-header h1',\n ],\n },\n\n author: {\n selectors: [\n '.authors',\n ],\n clean: [\n '.author-overlay',\n '.by-text',\n ],\n },\n\n date_published: {\n selectors: [\n '.timestamp',\n ],\n timezone: 'America/New_York',\n\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article-copy',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwNydailynewsComExtractor = {\n domain: 'www.nydailynews.com',\n\n title: {\n selectors: [\n 'h1#ra-headline',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"parsely-author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"sailthru.date\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'article#ra-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n 'dl#ra-tags',\n '.ra-related',\n 'a.ra-editor',\n 'dl#ra-share-bottom',\n ],\n },\n};\n","export const WwwCnbcComExtractor = {\n domain: 'www.cnbc.com',\n\n title: {\n selectors: [\n 'h1.title',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div#article_body.content',\n 'div.story',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwPopsugarComExtractor = {\n domain: 'www.popsugar.com',\n\n title: {\n selectors: [\n 'h2.post-title',\n 'title-text',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"article:author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '#content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.share-copy-title',\n '.post-tags',\n '.reactions',\n ],\n },\n};\n","export const ObserverComExtractor = {\n domain: 'observer.com',\n\n title: {\n selectors: [\n 'h1.entry-title',\n ],\n },\n\n author: {\n selectors: [\n '.author',\n '.vcard',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n 'h2.dek',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const PeopleComExtractor = {\n domain: 'people.com',\n\n title: {\n selectors: [\n ['meta[name=\"og:title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n 'a.author.url.fn',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.article-body__inner',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwUsmagazineComExtractor = {\n domain: 'www.usmagazine.com',\n\n title: {\n selectors: [\n 'header h1',\n ],\n },\n\n author: {\n selectors: [\n 'a.article-byline.tracked-offpage',\n ],\n },\n\n date_published: {\n timezone: 'America/New_York',\n\n selectors: [\n 'time.article-published-date',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.article-body-inner',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.module-related',\n ],\n },\n};\n","export const WwwRollingstoneComExtractor = {\n domain: 'www.rollingstone.com',\n\n title: {\n selectors: [\n 'h1.content-title',\n ],\n },\n\n author: {\n selectors: [\n 'a.content-author.tracked-offpage',\n ],\n },\n\n date_published: {\n selectors: [\n 'time.content-published-date',\n ],\n\n timezone: 'America/New_York',\n },\n\n dek: {\n selectors: [\n '.content-description',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.lead-container', '.article-content'],\n '.article-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.module-related',\n ],\n },\n};\n","export const twofortysevensportsComExtractor = {\n domain: '247sports.com',\n\n title: {\n selectors: [\n 'title',\n 'article header h1',\n ],\n },\n\n author: {\n selectors: [\n '.author',\n ],\n },\n\n date_published: {\n selectors: [\n ['time[data-published]', 'data-published'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'section.body.article',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const UproxxComExtractor = {\n domain: 'uproxx.com',\n\n title: {\n selectors: [\n 'div.post-top h1',\n ],\n },\n\n author: {\n selectors: [\n '.post-top .authorname',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.post-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n 'div.image': 'figure',\n 'div.image .wp-media-credit': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwEonlineComExtractor = {\n domain: 'www.eonline.com',\n\n title: {\n selectors: [\n 'h1.article__title',\n ],\n },\n\n author: {\n selectors: [\n '.entry-meta__author a',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[itemprop=\"datePublished\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.post-content section, .post-content div.post-content__image'],\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n 'div.post-content__image': 'figure',\n 'div.post-content__image .image__credits': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwMiamiheraldComExtractor = {\n domain: 'www.miamiherald.com',\n\n title: {\n selectors: [\n 'h1.title',\n ],\n },\n\n date_published: {\n selectors: [\n 'p.published-date',\n ],\n\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.dateline-storybody',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwRefinery29ComExtractor = {\n domain: 'www.refinery29.com',\n\n title: {\n selectors: [\n 'h1.title',\n ],\n },\n\n author: {\n selectors: [\n '.contributor',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"sailthru.date\"]', 'value'],\n ],\n\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.full-width-opener', '.article-content'],\n '.article-content',\n '.body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n 'div.loading noscript': ($node) => {\n const imgHtml = $node.html();\n $node.parents('.loading').replaceWith(imgHtml);\n },\n\n '.section-image': 'figure',\n\n '.section-image .content-caption': 'figcaption',\n\n '.section-text': 'p',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.story-share',\n ],\n },\n};\n","export const WwwMacrumorsComExtractor = {\n domain: 'www.macrumors.com',\n\n title: {\n selectors: [\n 'h1',\n 'h1.title',\n ],\n },\n\n author: {\n selectors: [\n '.author-url',\n ],\n },\n\n date_published: {\n selectors: [\n '.article .byline',\n ],\n\n // Wednesday January 18, 2017 11:44 am PST\n format: 'dddd MMMM D, YYYY h:mm A zz',\n\n timezone: 'America/Los_Angeles',\n },\n\n dek: {\n selectors: [\n ['meta[name=\"description\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwAndroidcentralComExtractor = {\n domain: 'www.androidcentral.com',\n\n title: {\n selectors: [\n 'h1',\n 'h1.main-title',\n ],\n },\n\n author: {\n selectors: [\n '.meta-by',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"og:description\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['.image-large', 'src'],\n ],\n },\n\n content: {\n selectors: [\n '.article-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.intro',\n 'blockquote',\n ],\n },\n};\n","export const WwwSiComExtractor = {\n domain: 'www.si.com',\n\n title: {\n selectors: [\n 'h1',\n 'h1.headline',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n '.timestamp',\n ],\n\n timezone: 'America/New_York',\n },\n\n dek: {\n selectors: [\n '.quick-hit ul',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['p', '.marquee_large_2x', '.component.image'],\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n\n noscript: ($node) => {\n const $children = $node.children();\n if ($children.length === 1 && $children.get(0).tagName === 'img') {\n return 'figure';\n }\n\n return null;\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n [\n '.inline-thumb',\n '.primary-message',\n '.description',\n '.instructions',\n ],\n ],\n },\n};\n","export const WwwRawstoryComExtractor = {\n domain: 'www.rawstory.com',\n\n title: {\n selectors: [\n '.blog-title',\n ],\n },\n\n author: {\n selectors: [\n '.blog-author a:first-of-type',\n ],\n },\n\n date_published: {\n selectors: [\n '.blog-author a:last-of-type',\n ],\n\n timezone: 'EST',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.blog-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwCnetComExtractor = {\n domain: 'www.cnet.com',\n\n title: {\n selectors: [\n ['meta[name=\"og:title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n 'a.author',\n ],\n },\n\n date_published: {\n selectors: [\n 'time',\n ],\n\n timezone: 'America/Los_Angeles',\n },\n\n dek: {\n selectors: [\n '.article-dek',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['img.__image-lead__', '.article-main-body'],\n '.article-main-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n 'figure.image': ($node) => {\n const $img = $node.find('img');\n $img.attr('width', '100%');\n $img.attr('height', '100%');\n $img.addClass('__image-lead__');\n $node.remove('.imgContainer').prepend($img);\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwCinemablendComExtractor = {\n domain: 'www.cinemablend.com',\n\n title: {\n selectors: [\n '.story_title',\n ],\n },\n\n author: {\n selectors: [\n '.author',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n\n timezone: 'EST',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div#wrap_left_content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwTodayComExtractor = {\n domain: 'www.today.com',\n\n title: {\n selectors: [\n 'h1.entry-headline',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"DC.date.issued\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.entry-container',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.label-comment',\n ],\n },\n};\n","export const WwwHowtogeekComExtractor = {\n domain: 'www.howtogeek.com',\n\n title: {\n selectors: [\n 'title',\n ],\n },\n\n author: {\n selectors: [\n '#authorinfobox a',\n ],\n },\n\n date_published: {\n selectors: [\n '#authorinfobox + div li',\n ],\n timezone: 'GMT',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.thecontent',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwAlComExtractor = {\n domain: 'www.al.com',\n\n title: {\n selectors: [\n ['meta[name=\"title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"article_author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article_date_original\"]', 'value'],\n ],\n timezone: 'EST',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwThepennyhoarderComExtractor = {\n domain: 'www.thepennyhoarder.com',\n\n title: {\n selectors: [\n ['meta[name=\"dcterms.title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n ['link[rel=\"author\"]', 'title'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.post-img', '.post-text'],\n '.post-text',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwWesternjournalismComExtractor = {\n domain: 'www.westernjournalism.com',\n\n title: {\n selectors: [\n 'title',\n 'h1.entry-title',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"DC.date.issued\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n '.subtitle',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.article-sharing.top + div',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.ad-notice-small',\n ],\n },\n};\n","export const FusionNetExtractor = {\n domain: 'fusion.net',\n\n title: {\n selectors: [\n '.post-title',\n '.single-title',\n '.headline',\n ],\n },\n\n author: {\n selectors: [\n '.show-for-medium .byline',\n ],\n },\n\n date_published: {\n selectors: [\n ['time.local-time', 'datetime'],\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.post-featured-media', '.article-content'],\n '.article-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '.fusion-youtube-oembed': 'figure',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwAmericanowComExtractor = {\n domain: 'www.americanow.com',\n\n title: {\n selectors: [\n '.title',\n ['meta[name=\"title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n '.byline',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"publish_date\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['.article-content', '.image', '.body'],\n '.body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.article-video-wrapper',\n '.show-for-small-only',\n ],\n },\n};\n","export const ScienceflyComExtractor = {\n domain: 'sciencefly.com',\n\n title: {\n selectors: [\n '.entry-title',\n '.cb-entry-title',\n '.cb-single-title',\n ],\n },\n\n author: {\n selectors: [\n 'div.cb-author',\n 'div.cb-author-title',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['div.theiaPostSlider_slides img', 'src'],\n ],\n },\n\n content: {\n selectors: [\n 'div.theiaPostSlider_slides',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const HellogigglesComExtractor = {\n domain: 'hellogiggles.com',\n\n title: {\n selectors: [\n '.title',\n ],\n },\n\n author: {\n selectors: [\n '.author-link',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const ThoughtcatalogComExtractor = {\n domain: 'thoughtcatalog.com',\n\n title: {\n selectors: [\n 'h1.title',\n ['meta[name=\"og:title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n 'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',\n 'h1.writer-name',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.entry.post',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.tc_mark',\n ],\n },\n};\n","export const WwwNjComExtractor = {\n domain: 'www.nj.com',\n\n title: {\n selectors: [\n ['meta[name=\"title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"article_author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article_date_original\"]', 'value'],\n ],\n\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.entry-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwInquisitrComExtractor = {\n domain: 'www.inquisitr.com',\n\n title: {\n selectors: [\n 'h1.entry-title.story--header--title',\n ],\n },\n\n author: {\n selectors: [\n 'div.story--header--author',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"datePublished\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'article.story',\n '.entry-content.',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.post-category',\n '.story--header--socials',\n '.story--header--content',\n ],\n },\n};\n","export const WwwNbcnewsComExtractor = {\n domain: 'www.nbcnews.com',\n\n title: {\n selectors: [\n 'div.article-hed h1',\n ],\n },\n\n author: {\n selectors: [\n 'span.byline_author',\n ],\n },\n\n date_published: {\n selectors: [\n ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'],\n '.flag_article-wrapper time',\n ],\n\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n 'div.article-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const FortuneComExtractor = {\n domain: 'fortune.com',\n\n title: {\n selectors: [\n 'h1',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n ],\n },\n\n date_published: {\n selectors: [\n '.MblGHNMJ',\n ],\n\n timezone: 'UTC',\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['picture', 'article.row'],\n 'article.row',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwLinkedinComExtractor = {\n domain: 'www.linkedin.com',\n\n title: {\n selectors: [\n '.article-title',\n 'h1',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"article:author\"]', 'value'],\n '.entity-name a[rel=author]',\n ],\n },\n\n date_published: {\n selectors: [\n ['time[itemprop=\"datePublished\"]', 'datetime'],\n ],\n\n timezone: 'America/Los_Angeles',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n ['header figure', '.prose'],\n '.prose',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.entity-image',\n ],\n },\n};\n","export const ObamawhitehouseArchivesGovExtractor = {\n domain: 'obamawhitehouse.archives.gov',\n\n supportedDomains: [\n 'whitehouse.gov',\n ],\n\n title: {\n selectors: [\n 'h1',\n '.pane-node-title',\n ],\n },\n\n author: {\n selectors: [\n '.blog-author-link',\n '.node-person-name-link',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n '.field-name-field-forall-summary',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.pane-node-field-forall-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n};\n","export const WwwOpposingviewsComExtractor = {\n domain: 'www.opposingviews.com',\n\n title: {\n selectors: [\n 'h1.title',\n ],\n },\n\n author: {\n selectors: [\n 'div.date span span a',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"publish_date\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n content: {\n selectors: [\n '.article-content',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.show-for-small-only',\n ],\n },\n};\n","import mergeSupportedDomains from 'utils/merge-supported-domains';\nimport * as CustomExtractors from './custom/index';\n\nexport default Object.keys(CustomExtractors).reduce((acc, key) => {\n const extractor = CustomExtractors[key];\n return {\n ...acc,\n ...mergeSupportedDomains(extractor),\n };\n}, {});\n","// CLEAN AUTHOR CONSTANTS\nexport const CLEAN_AUTHOR_RE = /^\\s*(posted |written )?by\\s*:?\\s*(.*)/i;\n // author = re.sub(r'^\\s*(posted |written )?by\\s*:?\\s*(.*)(?i)',\n\n// CLEAN DEK CONSTANTS\nexport const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');\n// An ordered list of meta tag names that denote likely article deks.\n// From most distinct to least distinct.\n//\n// NOTE: There are currently no meta tags that seem to provide the right\n// content consistenty enough. Two options were:\n// - og:description\n// - dc.description\n// However, these tags often have SEO-specific junk in them that's not\n// header-worthy like a dek is. Excerpt material at best.\nexport const DEK_META_TAGS = [\n];\n\n// An ordered list of Selectors to find likely article deks. 
From\n// most explicit to least explicit.\n//\n// Should be more restrictive than not, as a failed dek can be pretty\n// detrimental to the aesthetics of an article.\nexport const DEK_SELECTORS = [\n '.entry-summary',\n];\n\n// CLEAN DATE PUBLISHED CONSTANTS\nexport const MS_DATE_STRING = /^\\d{13}$/i;\nexport const SEC_DATE_STRING = /^\\d{10}$/i;\nexport const CLEAN_DATE_STRING_RE = /^\\s*published\\s*:?\\s*(.*)/i;\nexport const TIME_MERIDIAN_SPACE_RE = /(.*\\d)(am|pm)(.*)/i;\nexport const TIME_MERIDIAN_DOTS_RE = /\\.m\\./i;\nconst months = [\n 'jan',\n 'feb',\n 'mar',\n 'apr',\n 'may',\n 'jun',\n 'jul',\n 'aug',\n 'sep',\n 'oct',\n 'nov',\n 'dec',\n];\nconst allMonths = months.join('|');\nconst timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';\nconst timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';\nconst timestamp3 = '-[0-9]{3,4}$';\nexport const SPLIT_DATE_STRING =\n new RegExp(`(${timestamp1})|(${timestamp2})|(${timestamp3})|([0-9]{1,4})|(${allMonths})`, 'ig');\n\n// 2016-11-22T08:57-500\n// Check if datetime string has an offset at the end\nexport const TIME_WITH_OFFSET_RE = /-\\d{3,4}$/;\n\n// CLEAN TITLE CONSTANTS\n// A regular expression that will match separating characters on a\n// title, that usually denote breadcrumbs or something similar.\nexport const TITLE_SPLITTERS_RE = /(: | - | \\| )/g;\n\nexport const DOMAIN_ENDINGS_RE =\n new RegExp('.com$|.net$|.org$|.co.uk$', 'g');\n","import { normalizeSpaces } from 'utils/text';\nimport { CLEAN_AUTHOR_RE } from './constants';\n\n// Take an author string (like 'By David Smith ') and clean it to\n// just the name(s): 'David Smith'.\nexport default function cleanAuthor(author) {\n return normalizeSpaces(\n author.replace(CLEAN_AUTHOR_RE, '$2').trim()\n );\n}\n","import validUrl from 'valid-url';\n\nexport default function clean(leadImageUrl) {\n leadImageUrl = leadImageUrl.trim();\n if (validUrl.isWebUri(leadImageUrl)) {\n return leadImageUrl;\n }\n\n return null;\n}\n","import { stripTags } from 
'utils/dom';\nimport {\n excerptContent,\n normalizeSpaces,\n} from 'utils/text';\n\nimport { TEXT_LINK_RE } from './constants';\n\n// Take a dek HTML fragment, and return the cleaned version of it.\n// Return None if the dek wasn't good enough.\nexport default function cleanDek(dek, { $, excerpt }) {\n // Sanity check that we didn't get too short or long of a dek.\n if (dek.length > 1000 || dek.length < 5) return null;\n\n // Check that dek isn't the same as excerpt\n if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null;\n\n const dekText = stripTags(dek, $);\n\n // Plain text links shouldn't exist in the dek. If we have some, it's\n // not a good dek - bail.\n if (TEXT_LINK_RE.test(dekText)) return null;\n\n return normalizeSpaces(dekText.trim());\n}\n","import moment from 'moment-timezone';\nimport parseFormat from 'moment-parseformat';\n// Is there a compelling reason to use moment here?\n// Mostly only being used for the isValid() method,\n// but could just check for 'Invalid Date' string.\n\nimport {\n MS_DATE_STRING,\n SEC_DATE_STRING,\n CLEAN_DATE_STRING_RE,\n SPLIT_DATE_STRING,\n TIME_MERIDIAN_SPACE_RE,\n TIME_MERIDIAN_DOTS_RE,\n TIME_WITH_OFFSET_RE,\n} from './constants';\n\nexport function cleanDateString(dateString) {\n return (dateString.match(SPLIT_DATE_STRING) || [])\n .join(' ')\n .replace(TIME_MERIDIAN_DOTS_RE, 'm')\n .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')\n .replace(CLEAN_DATE_STRING_RE, '$1')\n .trim();\n}\n\nexport function createDate(dateString, timezone, format) {\n if (TIME_WITH_OFFSET_RE.test(dateString)) {\n return moment(new Date(dateString));\n }\n\n return timezone ?\n moment.tz(dateString, format || parseFormat(dateString), timezone) :\n moment(dateString, format || parseFormat(dateString));\n}\n\n// Take a date published string, and hopefully return a date out of\n// it. 
Return none if we fail.\nexport default function cleanDatePublished(dateString, { timezone, format } = {}) {\n // If string is in milliseconds or seconds, convert to int and return\n if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {\n return new Date(parseInt(dateString, 10)).toISOString();\n }\n\n let date = createDate(dateString, timezone, format);\n\n if (!date.isValid()) {\n dateString = cleanDateString(dateString);\n date = createDate(dateString, timezone, format);\n }\n\n return date.isValid() ? date.toISOString() : null;\n}\n","import {\n cleanAttributes,\n cleanHeaders,\n cleanHOnes,\n cleanImages,\n cleanTags,\n removeEmpty,\n rewriteTopLevel,\n markToKeep,\n stripJunkTags,\n makeLinksAbsolute,\n} from 'utils/dom';\n\n// Clean our article content, returning a new, cleaned node.\nexport default function extractCleanNode(\n article,\n {\n $,\n cleanConditionally = true,\n title = '',\n url = '',\n defaultCleaner = true,\n }\n) {\n // Rewrite the tag name to div if it's a top level node like body or\n // html to avoid later complications with multiple body tags.\n rewriteTopLevel(article, $);\n\n // Drop small images and spacer images\n // Only do this is defaultCleaner is set to true;\n // this can sometimes be too aggressive.\n if (defaultCleaner) cleanImages(article, $);\n\n // Mark elements to keep that would normally be removed.\n // E.g., stripJunkTags will remove iframes, so we're going to mark\n // YouTube/Vimeo videos as elements we want to keep.\n markToKeep(article, $, url);\n\n // Drop certain tags like , etc\n // This is -mostly- for cleanliness, not security.\n stripJunkTags(article, $);\n\n // H1 tags are typically the article title, which should be extracted\n // by the title extractor instead. If there's less than 3 of them (<3),\n // strip them. 
Otherwise, turn 'em into H2s.\n cleanHOnes(article, $);\n\n // Clean headers\n cleanHeaders(article, $, title);\n\n // Make links absolute\n makeLinksAbsolute(article, $, url);\n\n // We used to clean UL's and OL's here, but it was leading to\n // too many in-article lists being removed. Consider a better\n // way to detect menus particularly and remove them.\n // Also optionally running, since it can be overly aggressive.\n if (defaultCleaner) cleanTags(article, $, cleanConditionally);\n\n // Remove empty paragraph nodes\n removeEmpty(article, $);\n\n // Remove unnecessary attributes\n cleanAttributes(article, $);\n\n return article;\n}\n","import { stripTags } from 'utils/dom';\nimport { normalizeSpaces } from 'utils/text';\n\nimport { TITLE_SPLITTERS_RE } from './constants';\nimport { resolveSplitTitle } from './index';\n\nexport default function cleanTitle(title, { url, $ }) {\n // If title has |, :, or - in it, see if\n // we can clean it up.\n if (TITLE_SPLITTERS_RE.test(title)) {\n title = resolveSplitTitle(title, url);\n }\n\n // Final sanity check that we didn't get a crazy title.\n // if (title.length > 150 || title.length < 15) {\n if (title.length > 150) {\n // If we did, return h1 from the document if it exists\n const h1 = $('h1');\n if (h1.length === 1) {\n title = h1.text();\n }\n }\n\n // strip any html tags in the title text\n return normalizeSpaces(stripTags(title, $).trim());\n}\n","import URL from 'url';\nimport wuzzy from 'wuzzy';\n\nimport {\n TITLE_SPLITTERS_RE,\n DOMAIN_ENDINGS_RE,\n} from './constants';\n\nfunction extractBreadcrumbTitle(splitTitle, text) {\n // This must be a very breadcrumbed title, like:\n // The Best Gadgets on Earth : Bits : Blogs : NYTimes.com\n // NYTimes - Blogs - Bits - The Best Gadgets on Earth\n if (splitTitle.length >= 6) {\n // Look to see if we can find a breadcrumb splitter that happens\n // more than once. 
If we can, we'll be able to better pull out\n // the title.\n const termCounts = splitTitle.reduce((acc, titleText) => {\n acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;\n return acc;\n }, {});\n\n const [maxTerm, termCount] =\n Reflect.ownKeys(termCounts)\n .reduce((acc, key) => {\n if (acc[1] < termCounts[key]) {\n return [key, termCounts[key]];\n }\n\n return acc;\n }, [0, 0]);\n\n // We found a splitter that was used more than once, so it\n // is probably the breadcrumber. Split our title on that instead.\n // Note: max_term should be <= 4 characters, so that \" >> \"\n // will match, but nothing longer than that.\n if (termCount >= 2 && maxTerm.length <= 4) {\n splitTitle = text.split(maxTerm);\n }\n\n const splitEnds = [splitTitle[0], splitTitle.slice(-1)];\n const longestEnd = splitEnds.reduce((acc, end) => acc.length > end.length ? acc : end, '');\n\n if (longestEnd.length > 10) {\n return longestEnd;\n }\n\n return text;\n }\n\n return null;\n}\n\nfunction cleanDomainFromTitle(splitTitle, url) {\n // Search the ends of the title, looking for bits that fuzzy match\n // the URL too closely. If one is found, discard it and return the\n // rest.\n //\n // Strip out the big TLDs - it just makes the matching a bit more\n // accurate. 
Not the end of the world if it doesn't strip right.\n const { host } = URL.parse(url);\n const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');\n\n const startSlug = splitTitle[0].toLowerCase().replace(' ', '');\n const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);\n\n if (startSlugRatio > 0.4 && startSlug.length > 5) {\n return splitTitle.slice(2).join('');\n }\n\n const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');\n const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);\n\n if (endSlugRatio > 0.4 && endSlug.length >= 5) {\n return splitTitle.slice(0, -2).join('');\n }\n\n return null;\n}\n\n// Given a title with separators in it (colons, dashes, etc),\n// resolve whether any of the segments should be removed.\nexport default function resolveSplitTitle(title, url = '') {\n // Splits while preserving splitters, like:\n // ['The New New York', ' - ', 'The Washington Post']\n const splitTitle = title.split(TITLE_SPLITTERS_RE);\n if (splitTitle.length === 1) {\n return title;\n }\n\n let newTitle = extractBreadcrumbTitle(splitTitle, title);\n if (newTitle) return newTitle;\n\n newTitle = cleanDomainFromTitle(splitTitle, url);\n if (newTitle) return newTitle;\n\n // Fuzzy ratio didn't find anything, so this title is probably legit.\n // Just return it all.\n return title;\n}\n","import cleanAuthor from './author';\nimport cleanImage from './lead-image-url';\nimport cleanDek from './dek';\nimport cleanDatePublished from './date-published';\nimport cleanContent from './content';\nimport cleanTitle from './title';\n\nconst Cleaners = {\n author: cleanAuthor,\n lead_image_url: cleanImage,\n dek: cleanDek,\n date_published: cleanDatePublished,\n content: cleanContent,\n title: cleanTitle,\n};\n\nexport default Cleaners;\n\nexport { cleanAuthor };\nexport { cleanImage };\nexport { cleanDek };\nexport { cleanDatePublished };\nexport { cleanContent };\nexport { cleanTitle };\nexport { default as resolveSplitTitle } from 
'./resolve-split-title';\n","import {\n stripUnlikelyCandidates,\n convertToParagraphs,\n} from 'utils/dom';\n\nimport {\n scoreContent,\n findTopCandidate,\n} from './scoring';\n\n// Using a variety of scoring techniques, extract the content most\n// likely to be article text.\n//\n// If strip_unlikely_candidates is True, remove any elements that\n// match certain criteria first. (Like, does this element have a\n// classname of \"comment\")\n//\n// If weight_nodes is True, use classNames and IDs to determine the\n// worthiness of nodes.\n//\n// Returns a cheerio object $\nexport default function extractBestNode($, opts) {\n // clone the node so we can get back to our\n // initial parsed state if needed\n // TODO Do I need this? – AP\n // let $root = $.root().clone()\n\n if (opts.stripUnlikelyCandidates) {\n $ = stripUnlikelyCandidates($);\n }\n\n $ = convertToParagraphs($);\n $ = scoreContent($, opts.weightNodes);\n const $topCandidate = findTopCandidate($);\n\n return $topCandidate;\n}\n","import cheerio from 'cheerio';\n\nimport { nodeIsSufficient } from 'utils/dom';\nimport { cleanContent } from 'cleaners';\nimport { normalizeSpaces } from 'utils/text';\n\nimport extractBestNode from './extract-best-node';\n\nconst GenericContentExtractor = {\n defaultOpts: {\n stripUnlikelyCandidates: true,\n weightNodes: true,\n cleanConditionally: true,\n },\n\n // Extract the content for this resource - initially, pass in our\n // most restrictive opts which will return the highest quality\n // content. On each failure, retry with slightly more lax opts.\n //\n // :param return_type: string. If \"node\", should return the content\n // as a cheerio node rather than as an HTML string.\n //\n // Opts:\n // stripUnlikelyCandidates: Remove any elements that match\n // non-article-like criteria first.(Like, does this element\n // have a classname of \"comment\")\n //\n // weightNodes: Modify an elements score based on whether it has\n // certain classNames or IDs. 
Examples: Subtract if a node has\n // a className of 'comment', Add if a node has an ID of\n // 'entry-content'.\n //\n // cleanConditionally: Clean the node to return of some\n // superfluous content. Things like forms, ads, etc.\n extract({ $, html, title, url }, opts) {\n opts = { ...this.defaultOpts, ...opts };\n\n $ = $ || cheerio.load(html);\n\n // Cascade through our extraction-specific opts in an ordered fashion,\n // turning them off as we try to extract content.\n let node = this.getContentNode($, title, url, opts);\n\n if (nodeIsSufficient(node)) {\n return this.cleanAndReturnNode(node, $);\n }\n\n // We didn't succeed on first pass, one by one disable our\n // extraction opts and try again.\n for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) {\n opts[key] = false;\n $ = cheerio.load(html);\n\n node = this.getContentNode($, title, url, opts);\n\n if (nodeIsSufficient(node)) {\n break;\n }\n }\n\n return this.cleanAndReturnNode(node, $);\n },\n\n // Get node given current options\n getContentNode($, title, url, opts) {\n return cleanContent(\n extractBestNode($, opts),\n {\n $,\n cleanConditionally: opts.cleanConditionally,\n title,\n url,\n });\n },\n\n // Once we got here, either we're at our last-resort node, or\n // we broke early. Make sure we at least have -something- before we\n // move forward.\n cleanAndReturnNode(node, $) {\n if (!node) {\n return null;\n }\n\n return normalizeSpaces($.html(node));\n\n // if return_type == \"html\":\n // return normalize_spaces(node_to_html(node))\n // else:\n // return node\n },\n\n};\n\nexport default GenericContentExtractor;\n","// TODO: It would be great if we could merge the meta and selector lists into\n// a list of objects, because we could then rank them better. For example,\n// .hentry .entry-title is far better suited than .\n\n// An ordered list of meta tag names that denote likely article titles. All\n// attributes should be lowercase for faster case-insensitive matching. 
From\n// most distinct to least distinct.\nexport const STRONG_TITLE_META_TAGS = [\n 'tweetmeme-title',\n 'dc.title',\n 'rbtitle',\n 'headline',\n 'title',\n];\n\n// og:title is weak because it typically contains context that we don't like,\n// for example the source site's name. Gotta get that brand into facebook!\nexport const WEAK_TITLE_META_TAGS = [\n 'og:title',\n];\n\n// An ordered list of XPath Selectors to find likely article titles. From\n// most explicit to least explicit.\n//\n// Note - this does not use classes like CSS. This checks to see if the string\n// exists in the className, which is not as accurate as .className (which\n// splits on spaces/endlines), but for our purposes it's close enough. The\n// speed tradeoff is worth the accuracy hit.\nexport const STRONG_TITLE_SELECTORS = [\n '.hentry .entry-title',\n 'h1#articleHeader',\n 'h1.articleHeader',\n 'h1.article',\n '.instapaper_title',\n '#meebo-title',\n];\n\nexport const WEAK_TITLE_SELECTORS = [\n 'article h1',\n '#entry-title',\n '.entry-title',\n '#entryTitle',\n '#entrytitle',\n '.entryTitle',\n '.entrytitle',\n '#articleTitle',\n '.articleTitle',\n 'post post-title',\n 'h1.title',\n 'h2.article',\n 'h1',\n 'html head title',\n 'title',\n];\n","import { cleanTitle } from 'cleaners';\nimport {\n extractFromMeta,\n extractFromSelectors,\n} from 'utils/dom';\n\nimport {\n STRONG_TITLE_META_TAGS,\n WEAK_TITLE_META_TAGS,\n STRONG_TITLE_SELECTORS,\n WEAK_TITLE_SELECTORS,\n} from './constants';\n\nconst GenericTitleExtractor = {\n extract({ $, url, metaCache }) {\n // First, check to see if we have a matching meta tag that we can make\n // use of that is strongly associated with the headline.\n let title;\n\n title = extractFromMeta($, STRONG_TITLE_META_TAGS, metaCache);\n if (title) return cleanTitle(title, { url, $ });\n\n // Second, look through our content selectors for the most likely\n // article title that is strongly associated with the headline.\n title = extractFromSelectors($, 
STRONG_TITLE_SELECTORS);\n if (title) return cleanTitle(title, { url, $ });\n\n // Third, check for weaker meta tags that may match.\n title = extractFromMeta($, WEAK_TITLE_META_TAGS, metaCache);\n if (title) return cleanTitle(title, { url, $ });\n\n // Last, look for weaker selector tags that may match.\n title = extractFromSelectors($, WEAK_TITLE_SELECTORS);\n if (title) return cleanTitle(title, { url, $ });\n\n // If no matches, return an empty string\n return '';\n },\n};\n\nexport default GenericTitleExtractor;\n","// An ordered list of meta tag names that denote likely article authors. All\n// attributes should be lowercase for faster case-insensitive matching. From\n// most distinct to least distinct.\n//\n// Note: \"author\" is too often the -developer- of the page, so it is not\n// added here.\nexport const AUTHOR_META_TAGS = [\n 'byl',\n 'clmst',\n 'dc.author',\n 'dcsext.author',\n 'dc.creator',\n 'rbauthors',\n 'authors',\n];\n\nexport const AUTHOR_MAX_LENGTH = 300;\n\n// An ordered list of XPath Selectors to find likely article authors. From\n// most explicit to least explicit.\n//\n// Note - this does not use classes like CSS. This checks to see if the string\n// exists in the className, which is not as accurate as .className (which\n// splits on spaces/endlines), but for our purposes it's close enough. 
The\n// speed tradeoff is worth the accuracy hit.\nexport const AUTHOR_SELECTORS = [\n '.entry .entry-author',\n '.author.vcard .fn',\n '.author .vcard .fn',\n '.byline.vcard .fn',\n '.byline .vcard .fn',\n '.byline .by .author',\n '.byline .by',\n '.byline .author',\n '.post-author.vcard',\n '.post-author .vcard',\n 'a[rel=author]',\n '#by_author',\n '.by_author',\n '#entryAuthor',\n '.entryAuthor',\n '.byline a[href*=author]',\n '#author .authorname',\n '.author .authorname',\n '#author',\n '.author',\n '.articleauthor',\n '.ArticleAuthor',\n '.byline',\n];\n\n// An ordered list of Selectors to find likely article authors, with\n// regular expression for content.\nconst bylineRe = /^[\\n\\s]*By/i;\nexport const BYLINE_SELECTORS_RE = [\n ['#byline', bylineRe],\n ['.byline', bylineRe],\n];\n","import { cleanAuthor } from 'cleaners';\nimport {\n extractFromMeta,\n extractFromSelectors,\n} from 'utils/dom';\n\nimport {\n AUTHOR_META_TAGS,\n AUTHOR_MAX_LENGTH,\n AUTHOR_SELECTORS,\n BYLINE_SELECTORS_RE,\n} from './constants';\n\nconst GenericAuthorExtractor = {\n extract({ $, metaCache }) {\n let author;\n\n // First, check to see if we have a matching\n // meta tag that we can make use of.\n author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);\n if (author && author.length < AUTHOR_MAX_LENGTH) {\n return cleanAuthor(author);\n }\n\n // Second, look through our selectors looking for potential authors.\n author = extractFromSelectors($, AUTHOR_SELECTORS, 2);\n if (author && author.length < AUTHOR_MAX_LENGTH) {\n return cleanAuthor(author);\n }\n\n // Last, use our looser regular-expression based selectors for\n // potential authors.\n for (const [selector, regex] of BYLINE_SELECTORS_RE) {\n const node = $(selector);\n if (node.length === 1) {\n const text = node.text();\n if (regex.test(text)) {\n return cleanAuthor(text);\n }\n }\n }\n\n return null;\n },\n};\n\nexport default GenericAuthorExtractor;\n","// An ordered list of meta tag names that denote\n// likely 
date published dates. All attributes\n// should be lowercase for faster case-insensitive matching.\n// From most distinct to least distinct.\nexport const DATE_PUBLISHED_META_TAGS = [\n 'article:published_time',\n 'displaydate',\n 'dc.date',\n 'dc.date.issued',\n 'rbpubdate',\n 'publish_date',\n 'pub_date',\n 'pagedate',\n 'pubdate',\n 'revision_date',\n 'doc_date',\n 'date_created',\n 'content_create_date',\n 'lastmodified',\n 'created',\n 'date',\n];\n\n// An ordered list of XPath Selectors to find\n// likely date published dates. From most explicit\n// to least explicit.\nexport const DATE_PUBLISHED_SELECTORS = [\n '.hentry .dtstamp.published',\n '.hentry .published',\n '.hentry .dtstamp.updated',\n '.hentry .updated',\n '.single .published',\n '.meta .published',\n '.meta .postDate',\n '.entry-date',\n '.byline .date',\n '.postmetadata .date',\n '.article_datetime',\n '.date-header',\n '.story-date',\n '.dateStamp',\n '#story .datetime',\n '.dateline',\n '.pubdate',\n];\n\n// An ordered list of compiled regular expressions to find likely date\n// published dates from the URL. 
These should always have the first\n// reference be a date string that is parseable by dateutil.parser.parse\nconst abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';\nexport const DATE_PUBLISHED_URL_RES = [\n // /2012/01/27/ but not /2012/01/293\n new RegExp('/(20\\\\d{2}/\\\\d{2}/\\\\d{2})/', 'i'),\n // 20120127 or 20120127T but not 2012012733 or 8201201733\n // /[^0-9](20\\d{2}[01]\\d[0-3]\\d)([^0-9]|$)/i,\n // 2012-01-27\n new RegExp('(20\\\\d{2}-[01]\\\\d-[0-3]\\\\d)', 'i'),\n // /2012/jan/27/\n new RegExp(`/(20\\\\d{2}/${abbrevMonthsStr}/[0-3]\\\\d)/`, 'i'),\n];\n","import { cleanDatePublished } from 'cleaners';\nimport {\n extractFromMeta,\n extractFromSelectors,\n} from 'utils/dom';\nimport { extractFromUrl } from 'utils/text';\n\nimport {\n DATE_PUBLISHED_META_TAGS,\n DATE_PUBLISHED_SELECTORS,\n DATE_PUBLISHED_URL_RES,\n} from './constants';\n\nconst GenericDatePublishedExtractor = {\n extract({ $, url, metaCache }) {\n let datePublished;\n // First, check to see if we have a matching meta tag\n // that we can make use of.\n // Don't try cleaning tags from this string\n datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false);\n if (datePublished) return cleanDatePublished(datePublished);\n\n // Second, look through our selectors looking for potential\n // date_published's.\n datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);\n if (datePublished) return cleanDatePublished(datePublished);\n\n // Lastly, look to see if a dately string exists in the URL\n datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);\n if (datePublished) return cleanDatePublished(datePublished);\n\n return null;\n },\n};\n\nexport default GenericDatePublishedExtractor;\n","// import {\n// DEK_META_TAGS,\n// DEK_SELECTORS,\n// DEK_URL_RES,\n// } from './constants';\n\n// import { cleanDek } from 'cleaners';\n\n// import {\n// extractFromMeta,\n// extractFromSelectors,\n// } from 'utils/dom';\n\n// Currently there is only 
one selector for\n// deks. We should simply return null here\n// until we have a more robust generic option.\n// Below is the original source for this, for reference.\nconst GenericDekExtractor = {\n // extract({ $, content, metaCache }) {\n extract() {\n return null;\n },\n};\n\nexport default GenericDekExtractor;\n\n// def extract_dek(self):\n// # First, check to see if we have a matching meta tag that we can make\n// # use of.\n// dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)\n// if not dek:\n// # Second, look through our CSS/XPath selectors. This may return\n// # an HTML fragment.\n// dek = self.extract_from_selectors('dek',\n// constants.DEK_SELECTORS,\n// text_only=False)\n//\n// if dek:\n// # Make sure our dek isn't in the first few thousand characters\n// # of the content, otherwise it's just the start of the article\n// # and not a true dek.\n// content = self.extract_content()\n// content_chunk = normalize_spaces(strip_tags(content[:2000]))\n// dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.\n//\n// # 80% or greater similarity means the dek was very similar to some\n// # of the starting content, so we skip it.\n// if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:\n// return dek\n//\n// return None\n","// An ordered list of meta tag names that denote likely article leading images.\n// All attributes should be lowercase for faster case-insensitive matching.\n// From most distinct to least distinct.\nexport const LEAD_IMAGE_URL_META_TAGS = [\n 'og:image',\n 'twitter:image',\n 'image_src',\n];\n\nexport const LEAD_IMAGE_URL_SELECTORS = [\n 'link[rel=image_src]',\n];\n\nexport const POSITIVE_LEAD_IMAGE_URL_HINTS = [\n 'upload',\n 'wp-content',\n 'large',\n 'photo',\n 'wp-image',\n];\nexport const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');\n\nexport const NEGATIVE_LEAD_IMAGE_URL_HINTS = [\n 'spacer',\n 'sprite',\n 'blank',\n 'throbber',\n 'gradient',\n 'tile',\n 'bg',\n 
'background',\n 'icon',\n 'social',\n 'header',\n 'hdr',\n 'advert',\n 'spinner',\n 'loader',\n 'loading',\n 'default',\n 'rating',\n 'share',\n 'facebook',\n 'twitter',\n 'theme',\n 'promo',\n 'ads',\n 'wp-includes',\n];\nexport const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');\n\nexport const GIF_RE = /\\.gif(\\?.*)?$/i;\nexport const JPG_RE = /\\.jpe?g(\\?.*)?$/i;\n","import {\n POSITIVE_LEAD_IMAGE_URL_HINTS_RE,\n NEGATIVE_LEAD_IMAGE_URL_HINTS_RE,\n GIF_RE,\n JPG_RE,\n} from './constants';\n\nimport { PHOTO_HINTS_RE } from '../content/scoring/constants';\n\nfunction getSig($node) {\n return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`;\n}\n\n// Scores image urls based on a variety of heuristics.\nexport function scoreImageUrl(url) {\n url = url.trim();\n let score = 0;\n\n if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {\n score += 20;\n }\n\n if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {\n score -= 20;\n }\n\n // TODO: We might want to consider removing this as\n // gifs are much more common/popular than they once were\n if (GIF_RE.test(url)) {\n score -= 10;\n }\n\n if (JPG_RE.test(url)) {\n score += 10;\n }\n\n // PNGs are neutral.\n\n return score;\n}\n\n// Alt attribute usually means non-presentational image.\nexport function scoreAttr($img) {\n if ($img.attr('alt')) {\n return 5;\n }\n\n return 0;\n}\n\n// Look through our parent and grandparent for figure-like\n// container elements, give a bonus if we find them\nexport function scoreByParents($img) {\n let score = 0;\n const $figParent = $img.parents('figure').first();\n\n if ($figParent.length === 1) {\n score += 25;\n }\n\n const $parent = $img.parent();\n let $gParent;\n if ($parent.length === 1) {\n $gParent = $parent.parent();\n }\n\n [$parent, $gParent].forEach(($node) => {\n if (PHOTO_HINTS_RE.test(getSig($node))) {\n score += 15;\n }\n });\n\n return score;\n}\n\n// Look at our immediate sibling and see if it looks like it's a\n// 
caption. Bonus if so.\nexport function scoreBySibling($img) {\n let score = 0;\n const $sibling = $img.next();\n const sibling = $sibling.get(0);\n\n if (sibling && sibling.tagName.toLowerCase() === 'figcaption') {\n score += 25;\n }\n\n if (PHOTO_HINTS_RE.test(getSig($sibling))) {\n score += 15;\n }\n\n return score;\n}\n\nexport function scoreByDimensions($img) {\n let score = 0;\n\n const width = parseFloat($img.attr('width'));\n const height = parseFloat($img.attr('height'));\n const src = $img.attr('src');\n\n // Penalty for skinny images\n if (width && width <= 50) {\n score -= 50;\n }\n\n // Penalty for short images\n if (height && height <= 50) {\n score -= 50;\n }\n\n if (width && height && !src.includes('sprite')) {\n const area = width * height;\n if (area < 5000) { // Smaller than 50 x 100\n score -= 100;\n } else {\n score += Math.round(area / 1000);\n }\n }\n\n return score;\n}\n\nexport function scoreByPosition($imgs, index) {\n return ($imgs.length / 2) - index;\n}\n","import { extractFromMeta } from 'utils/dom';\nimport { cleanImage } from 'cleaners';\n\nimport {\n LEAD_IMAGE_URL_META_TAGS,\n LEAD_IMAGE_URL_SELECTORS,\n} from './constants';\n\nimport {\n scoreImageUrl,\n scoreAttr,\n scoreByParents,\n scoreBySibling,\n scoreByDimensions,\n scoreByPosition,\n} from './score-image';\n\n// Given a resource, try to find the lead image URL from within\n// it. Like content and next page extraction, uses a scoring system\n// to determine what the most likely image may be. 
Short circuits\n// on really probable things like og:image meta tags.\n//\n// Potential signals to still take advantage of:\n// * domain\n// * weird aspect ratio\nconst GenericLeadImageUrlExtractor = {\n extract({ $, content, metaCache, html }) {\n let cleanUrl;\n if (!$.browser && $('head').length === 0) {\n $('*').first().prepend(html);\n }\n\n // Check to see if we have a matching meta tag that we can make use of.\n // Moving this higher because common practice is now to use large\n // images on things like Open Graph or Twitter cards.\n // images usually have for things like Open Graph.\n const imageUrl =\n extractFromMeta(\n $,\n LEAD_IMAGE_URL_META_TAGS,\n metaCache,\n false\n );\n\n if (imageUrl) {\n cleanUrl = cleanImage(imageUrl);\n\n if (cleanUrl) return cleanUrl;\n }\n\n // Next, try to find the \"best\" image via the content.\n // We'd rather not have to fetch each image and check dimensions,\n // so try to do some analysis and determine them instead.\n const $content = $(content);\n const imgs = $('img', $content).toArray();\n const imgScores = {};\n\n imgs.forEach((img, index) => {\n const $img = $(img);\n const src = $img.attr('src');\n\n if (!src) return;\n\n let score = scoreImageUrl(src);\n score += scoreAttr($img);\n score += scoreByParents($img);\n score += scoreBySibling($img);\n score += scoreByDimensions($img);\n score += scoreByPosition(imgs, index);\n\n imgScores[src] = score;\n });\n\n const [topUrl, topScore] =\n Reflect.ownKeys(imgScores).reduce((acc, key) =>\n imgScores[key] > acc[1] ? 
[key, imgScores[key]] : acc\n , [null, 0]);\n\n if (topScore > 0) {\n cleanUrl = cleanImage(topUrl);\n\n if (cleanUrl) return cleanUrl;\n }\n\n // If nothing else worked, check to see if there are any really\n // probable nodes in the doc, like .\n for (const selector of LEAD_IMAGE_URL_SELECTORS) {\n const $node = $(selector).first();\n const src = $node.attr('src');\n if (src) {\n cleanUrl = cleanImage(src);\n if (cleanUrl) return cleanUrl;\n }\n\n const href = $node.attr('href');\n if (href) {\n cleanUrl = cleanImage(href);\n if (cleanUrl) return cleanUrl;\n }\n\n const value = $node.attr('value');\n if (value) {\n cleanUrl = cleanImage(value);\n if (cleanUrl) return cleanUrl;\n }\n }\n\n return null;\n },\n};\n\nexport default GenericLeadImageUrlExtractor;\n\n// def extract(self):\n// \"\"\"\n// # First, try to find the \"best\" image via the content.\n// # We'd rather not have to fetch each image and check dimensions,\n// # so try to do some analysis and determine them instead.\n// content = self.extractor.extract_content(return_type=\"node\")\n// imgs = content.xpath('.//img')\n// img_scores = defaultdict(int)\n// logger.debug('Scoring %d images from content', len(imgs))\n// for (i, img) in enumerate(imgs):\n// img_score = 0\n//\n// if not 'src' in img.attrib:\n// logger.debug('No src attribute found')\n// continue\n//\n// try:\n// parsed_img = urlparse(img.attrib['src'])\n// img_path = parsed_img.path.lower()\n// except ValueError:\n// logger.debug('ValueError getting img path.')\n// continue\n// logger.debug('Image path is %s', img_path)\n//\n// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):\n// logger.debug('Positive URL hints match. Adding 20.')\n// img_score += 20\n//\n// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):\n// logger.debug('Negative URL hints match. Subtracting 20.')\n// img_score -= 20\n//\n// # Gifs are more often structure than photos\n// if img_path.endswith('gif'):\n// logger.debug('gif found. 
Subtracting 10.')\n// img_score -= 10\n//\n// # JPGs are more often photographs\n// if img_path.endswith('jpg'):\n// logger.debug('jpg found. Adding 10.')\n// img_score += 10\n//\n// # PNGs are neutral.\n//\n// # Alt attribute usually means non-presentational image.\n// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:\n// logger.debug('alt attribute found. Adding 5.')\n// img_score += 5\n//\n// # Look through our parent and grandparent for figure-like\n// # container elements, give a bonus if we find them\n// parents = [img.getparent()]\n// if parents[0] is not None and parents[0].getparent() is not None:\n// parents.append(parents[0].getparent())\n// for p in parents:\n// if p.tag == 'figure':\n// logger.debug('Parent with