chore: refactor format output adjustments (#272)

adampash · web-flow · commit 9bf88b0ba30a · 2019-02-13T13:30:49.000-08:00
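I had previously implemented the `contentType` output conversion (html, text,
markdown) in an overly complicated manner, repeating it in both the generic
extractor and the root extractor. This PR cleans it up a bit: the extractors
now always return HTML content, and `src/mercury.js` converts `result.content`
to markdown or text once, just before the result is returned.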
diff --git a/dist/mercury.js b/dist/mercury.js
@@ -8,6 +8,7 @@ var _objectWithoutProperties = _interopDefault(require('@babel/runtime-corejs2/h
 var _asyncToGenerator = _interopDefault(require('@babel/runtime-corejs2/helpers/asyncToGenerator'));
 var URL = _interopDefault(require('url'));
 var cheerio = _interopDefault(require('cheerio'));
+var TurndownService = _interopDefault(require('turndown'));
 var iconv = _interopDefault(require('iconv-lite'));
 var _parseInt = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-int'));
 var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
@@ -21,7 +22,6 @@ var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
 var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
 var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
 var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
-var TurndownService = _interopDefault(require('turndown'));
 var stringDirection = _interopDefault(require('string-direction'));
 var validUrl = _interopDefault(require('valid-url'));
 var moment = _interopDefault(require('moment-timezone'));
@@ -6018,9 +6018,7 @@ var GenericExtractor = {
   },
   extract: function extract(options) {
     var html = options.html,
-        $ = options.$,
-        _options$contentType = options.contentType,
-        contentType = _options$contentType === void 0 ? 'html' : _options$contentType;
+        $ = options.$;
 
     if (html && !$) {
       var loaded = cheerio.load(html);
@@ -6054,24 +6052,13 @@ var GenericExtractor = {
         url = _this$url_and_domain.url,
         domain = _this$url_and_domain.domain;
 
-    var convertedContent;
-
-    if (contentType === 'html') {
-      convertedContent = content;
-    } else if (contentType === 'text') {
-      convertedContent = $.text(cheerio.load(content));
-    } else if (contentType === 'markdown') {
-      var turndownService = new TurndownService();
-      convertedContent = turndownService.turndown(content);
-    }
-
     return {
       title: title,
       author: author,
       date_published: date_published || null,
       dek: dek,
       lead_image_url: lead_image_url,
-      content: convertedContent,
+      content: content,
       next_page_url: next_page_url,
       url: url,
       domain: domain,
@@ -6161,9 +6148,7 @@ function select(opts) {
       type = opts.type,
       extractionOpts = opts.extractionOpts,
       _opts$extractHtml = opts.extractHtml,
-      extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml,
-      _opts$contentType = opts.contentType,
-      contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // Skip if there's not extraction for this type
+      extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's not extraction for this type
 
   if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia
   // contributors), return the string
@@ -6205,19 +6190,7 @@ function select(opts) {
     $content = Cleaners[type]($content, _objectSpread({}, opts, {
       defaultCleaner: defaultCleaner
     }));
-
-    if (contentType === 'html') {
-      return $.html($content);
-    }
-
-    if (contentType === 'text') {
-      return $.text($content);
-    }
-
-    if (contentType === 'markdown') {
-      var turndownService = new TurndownService();
-      return turndownService.turndown($.html($content));
-    }
+    return $.html($content);
   }
 
   var result; // if selector is an array (e.g., ['img', 'src']),
@@ -6270,9 +6243,7 @@ var RootExtractor = {
     var opts = arguments.length > 1 ? arguments[1] : undefined;
     var _opts = opts,
         contentOnly = _opts.contentOnly,
-        extractedTitle = _opts.extractedTitle,
-        _opts$contentType2 = _opts.contentType,
-        contentType = _opts$contentType2 === void 0 ? 'html' : _opts$contentType2; // This is the generic extractor. Run its extract method
+        extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method
 
     if (extractor.domain === '*') return extractor.extract(opts);
     opts = _objectSpread({}, opts, {
@@ -6283,8 +6254,7 @@ var RootExtractor = {
       var _content = extractResult(_objectSpread({}, opts, {
         type: 'content',
         extractHtml: true,
-        title: extractedTitle,
-        contentType: contentType
+        title: extractedTitle
       }));
 
       return {
@@ -6451,6 +6421,7 @@ var Mercury = {
           _result,
           title,
           next_page_url,
+          turndownService,
           _args = arguments;
 
       return _regeneratorRuntime.wrap(function _callee$(_context) {
@@ -6545,9 +6516,16 @@ var Mercury = {
               });
 
             case 23:
+              if (contentType === 'markdown') {
+                turndownService = new TurndownService();
+                result.content = turndownService.turndown(result.content);
+              } else if (contentType === 'text') {
+                result.content = $.text($(result.content));
+              }
+
               return _context.abrupt("return", result);
 
-            case 24:
+            case 25:
             case "end":
               return _context.stop();
           }
diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js
@@ -1,5 +1,4 @@
 import cheerio from 'cheerio';
-import TurndownService from 'turndown';
 import stringDirection from 'string-direction';
 
 import GenericContentExtractor from './content/extractor';
@@ -29,7 +28,7 @@ const GenericExtractor = {
   direction: ({ title }) => stringDirection.getDirection(title),
 
   extract(options) {
-    const { html, $, contentType = 'html' } = options;
+    const { html, $ } = options;
 
     if (html && !$) {
       const loaded = cheerio.load(html);
@@ -48,24 +47,13 @@ const GenericExtractor = {
     const direction = this.direction({ title });
     const { url, domain } = this.url_and_domain(options);
 
-    let convertedContent;
-
-    if (contentType === 'html') {
-      convertedContent = content;
-    } else if (contentType === 'text') {
-      convertedContent = $.text(cheerio.load(content));
-    } else if (contentType === 'markdown') {
-      const turndownService = new TurndownService();
-      convertedContent = turndownService.turndown(content);
-    }
-
     return {
       title,
       author,
       date_published: date_published || null,
       dek,
       lead_image_url,
-      content: convertedContent,
+      content,
       next_page_url,
       url,
       domain,
diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js
@@ -1,4 +1,3 @@
-import TurndownService from 'turndown';
 import Cleaners from 'cleaners';
 import { convertNodeTo } from 'utils/dom';
 import GenericExtractor from './generic';
@@ -67,13 +66,7 @@ function findMatchingSelector($, selectors, extractHtml) {
 }
 
 export function select(opts) {
-  const {
-    $,
-    type,
-    extractionOpts,
-    extractHtml = false,
-    contentType = 'html',
-  } = opts;
+  const { $, type, extractionOpts, extractHtml = false } = opts;
   // Skip if there's not extraction for this type
   if (!extractionOpts) return null;
 
@@ -120,16 +113,7 @@ export function select(opts) {
 
     $content = Cleaners[type]($content, { ...opts, defaultCleaner });
 
-    if (contentType === 'html') {
-      return $.html($content);
-    }
-    if (contentType === 'text') {
-      return $.text($content);
-    }
-    if (contentType === 'markdown') {
-      const turndownService = new TurndownService();
-      return turndownService.turndown($.html($content));
-    }
+    return $.html($content);
   }
 
   let result;
@@ -178,7 +162,7 @@ function extractResult(opts) {
 
 const RootExtractor = {
   extract(extractor = GenericExtractor, opts) {
-    const { contentOnly, extractedTitle, contentType = 'html' } = opts;
+    const { contentOnly, extractedTitle } = opts;
     // This is the generic extractor. Run its extract method
     if (extractor.domain === '*') return extractor.extract(opts);
 
@@ -193,7 +177,6 @@ const RootExtractor = {
         type: 'content',
         extractHtml: true,
         title: extractedTitle,
-        contentType,
       });
       return {
         content,
diff --git a/src/extractors/root-extractor.test.js b/src/extractors/root-extractor.test.js
@@ -32,73 +32,6 @@ describe('RootExtractor', () => {
 
     assert.equal(url, null);
   });
-  it('returns text content if text is passed as contentType', () => {
-    const fullUrl =
-      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
-    const html = fs.readFileSync(
-      './src/extractors/custom/nymag.com/fixtures/test.html',
-      'utf8'
-    );
-    const $ = cheerio.load(html);
-
-    const { content } = RootExtractor.extract(NYMagExtractor, {
-      url: fullUrl,
-      html,
-      $,
-      metaCache: [],
-      fallback: false,
-      contentType: 'text',
-    });
-
-    const htmlRe = /<[a-z][\s\S]*>/g;
-
-    assert.equal(htmlRe.test(content), false);
-  });
-  it('returns markdown if markdown is passed as contentType', () => {
-    const fullUrl =
-      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
-    const html = fs.readFileSync(
-      './src/extractors/custom/nymag.com/fixtures/test.html',
-      'utf8'
-    );
-    const $ = cheerio.load(html);
-
-    const { content } = RootExtractor.extract(NYMagExtractor, {
-      url: fullUrl,
-      html,
-      $,
-      metaCache: [],
-      fallback: false,
-      contentType: 'markdown',
-    });
-
-    const htmlRe = /<[a-z][\s\S]*>/;
-    const markdownRe = /\[[\w\s]+\]\(.*\)/;
-
-    assert.equal(htmlRe.test(content), false);
-    assert.equal(markdownRe.test(content), true);
-  });
-  it('also can select type on Generic Extractor', () => {
-    const fullUrl =
-      'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html';
-
-    const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
-    const $ = cheerio.load(html);
-    const { content } = RootExtractor.extract(undefined, {
-      url: fullUrl,
-      html,
-      $,
-      metaCache: [],
-      fallback: false,
-      contentType: 'markdown',
-    });
-
-    const htmlRe = /<[a-z][\s\S]*>/;
-    const markdownRe = /\[[\w\s]+\]\(.*\)/;
-
-    assert.equal(htmlRe.test(content), false);
-    assert.equal(markdownRe.test(content), true);
-  });
 });
 
 describe('cleanBySelectors($content, $, { clean })', () => {
diff --git a/src/mercury.js b/src/mercury.js
@@ -1,5 +1,6 @@
 import URL from 'url';
 import cheerio from 'cheerio';
+import TurndownService from 'turndown';
 
 import Resource from 'resource';
 import { validateUrl, Errors } from 'utils';
@@ -83,6 +84,13 @@ const Mercury = {
       };
     }
 
+    if (contentType === 'markdown') {
+      const turndownService = new TurndownService();
+      result.content = turndownService.turndown(result.content);
+    } else if (contentType === 'text') {
+      result.content = $.text($(result.content));
+    }
+
     return result;
   },
 
diff --git a/src/mercury.test.js b/src/mercury.test.js
@@ -4,6 +4,8 @@ import { Errors } from 'utils';
 import { record } from 'test-helpers';
 import Mercury from './mercury';
 
+const fs = require('fs');
+
 describe('Mercury', () => {
   const recorder = record('mercury-test');
   beforeAll(recorder.before);
@@ -92,4 +94,37 @@ describe('Mercury', () => {
       assert.equal(result.next_page_url, `${url}2`);
     });
   });
+
+  it('returns text content if text is passed as contentType', async () => {
+    const url =
+      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
+    const html = fs.readFileSync(
+      './src/extractors/custom/nymag.com/fixtures/test.html',
+      'utf8'
+    );
+    const { content } = await Mercury.parse(url, { html, contentType: 'text' });
+
+    const htmlRe = /<[a-z][\s\S]*>/g;
+
+    assert.equal(htmlRe.test(content), false);
+  });
+
+  it('returns markdown if markdown is passed as contentType', async () => {
+    const url =
+      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
+    const html = fs.readFileSync(
+      './src/extractors/custom/nymag.com/fixtures/test.html',
+      'utf8'
+    );
+    const { content } = await Mercury.parse(url, {
+      html,
+      contentType: 'markdown',
+    });
+
+    const htmlRe = /<[a-z][\s\S]*>/;
+    const markdownRe = /\[[\w\s]+\]\(.*\)/;
+
+    assert.equal(htmlRe.test(content), false);
+    assert.equal(markdownRe.test(content), true);
+  });
 });