Skip to content

Commit afbef9b

Browse files
authored
Fix Encoding on Body (#143)
* fix: check encoding on body
1 parent 9d4c883 commit afbef9b

15 files changed

+7155
-1034
lines changed

Diff for: dist/generate-custom-parser.js

+7,010-988
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: dist/generate-custom-parser.js.map

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: dist/mercury.js

+9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: dist/mercury.js.map

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: fixtures/nock/fetch-resource-test.js

+29
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: fixtures/nock/resource-test.js

+29
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: scripts/check-build.test.js

+3-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ if (process.env.CI) {
4343
assert.equal(article.title, result.title);
4444
done();
4545
}).catch((e) => {
46-
console.log('THIS WENT WRONG', e); // eslint-disable-line no-console
46+
console.log('There was an error', e.message); // eslint-disable-line no-console
47+
console.log('e.fileName', e.fileName);
48+
console.log('e.lineNumber', e.lineNumber);
4749
assert.equal(true, false);
4850
done();
4951
});

Diff for: src/resource/index.js

+21-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import cheerio from 'cheerio';
2+
import iconv from 'iconv-lite';
23

4+
import { getEncoding } from 'utils/text';
35
import { fetchResource } from './utils';
46
import {
57
normalizeMetaTags,
@@ -51,7 +53,7 @@ const Resource = {
5153
throw new Error('Content does not appear to be text.');
5254
}
5355

54-
let $ = cheerio.load(content);
56+
let $ = this.encodeDoc({ content, contentType });
5557

5658
if ($.root().children().length === 0) {
5759
throw new Error('No children, likely a bad parse.');
@@ -63,6 +65,24 @@ const Resource = {
6365

6466
return $;
6567
},
68+
69+
encodeDoc({ content, contentType }) {
70+
const encoding = getEncoding(contentType);
71+
let decodedContent = iconv.decode(content, encoding);
72+
let $ = cheerio.load(decodedContent);
73+
74+
// after first cheerio.load, check to see if encoding matches
75+
const metaContentType = $('meta[http-equiv=content-type]').attr('content');
76+
const properEncoding = getEncoding(metaContentType);
77+
78+
// if encodings in the header/body dont match, use the one in the body
79+
if (properEncoding !== encoding) {
80+
decodedContent = iconv.decode(content, properEncoding);
81+
$ = cheerio.load(decodedContent);
82+
}
83+
84+
return $;
85+
},
6686
};
6787

6888
export default Resource;

Diff for: src/resource/index.test.js

+21-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import assert from 'assert';
22
import cheerio from 'cheerio';
33
import { Errors } from 'utils';
4+
import { getEncoding } from 'utils/text';
45

56
import { record } from 'test-helpers';
67
import Resource from './index';
@@ -24,18 +25,31 @@ describe('Resource', () => {
2425

2526
assert.equal(error, Errors.badUrl);
2627
});
27-
});
2828

29-
describe('generateDoc({ body, response })', () => {
30-
it('returns a cheerio object if valid', () => {
31-
const response = { headers: { 'content-type': 'text/html' } };
29+
it('fetches with different encoding on body', async () => {
30+
const url = 'http://www.playnation.de/spiele-news/kojima-productions/hideo-kojima-reflektiert-ueber-seinen-werdegang-bei-konami-id68950.html';
31+
const $ = await Resource.create(url);
32+
const metaContentType = $('meta[http-equiv=content-type]').attr('value');
33+
34+
assert.equal(getEncoding(metaContentType), 'iso-8859-1');
35+
const encodedU = /ü/g;
36+
37+
assert.equal(encodedU.test($.html()), true);
38+
assert.equal(typeof $, 'function');
39+
});
40+
41+
it('handles special encoding', async () => {
42+
const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
43+
const $ = await Resource.create(url);
3244

33-
const body = '<div><p>Hi</p></div>';
34-
const $ = Resource.generateDoc({ body, response });
45+
const badEncodingRe = //g;
3546

36-
assert.equal($.html(), body);
47+
assert.equal(badEncodingRe.test($.html()), false);
48+
assert.equal(typeof $, 'function');
3749
});
50+
});
3851

52+
describe('generateDoc({ body, response })', () => {
3953
it('throws an error if the content is not text', () => {
4054
const response = {
4155
headers: {

Diff for: src/resource/utils/fetch-resource.js

-11
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import URL from 'url';
22
import request from 'request';
3-
import iconv from 'iconv-lite';
43
import { Errors } from 'utils';
5-
import { getEncoding } from 'utils/text';
64

75
import {
86
REQUEST_HEADERS,
@@ -17,12 +15,6 @@ function get(options) {
1715
if (err) {
1816
reject(err);
1917
} else {
20-
const encoding = getEncoding(response.headers['content-type']);
21-
22-
if (iconv.encodingExists(encoding)) {
23-
body = iconv.decode(body, encoding);
24-
}
25-
2618
resolve({ body, response });
2719
}
2820
});
@@ -97,9 +89,6 @@ export default async function fetchResource(url, parsedUrl) {
9789
url: parsedUrl.href,
9890
headers: { ...REQUEST_HEADERS },
9991
timeout: FETCH_TIMEOUT,
100-
// Don't set encoding; fixes issues
101-
// w/gzipped responses
102-
encoding: null,
10392
// Accept cookies
10493
jar: true,
10594
// Accept and decode gzip

Diff for: src/resource/utils/fetch-resource.test.js

+8-18
Original file line numberDiff line numberDiff line change
@@ -23,40 +23,30 @@ describe('fetchResource(url)', () => {
2323

2424
it('fetches nyt', async () => {
2525
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
26-
const { body } = await fetchResource(url);
26+
const { response } = await fetchResource(url);
2727

28-
assert.equal(typeof body, 'string');
28+
assert.equal(response.statusCode, 200);
2929
});
3030

3131
it('fetches domains', async () => {
3232
const url = 'http://theconcourse.deadspin.com/1786177057';
33-
const { body } = await fetchResource(url);
33+
const { response } = await fetchResource(url);
3434

35-
assert.equal(typeof body, 'string');
35+
assert.equal(response.statusCode, 200);
3636
});
3737

3838
it('fetches nyt', async () => {
3939
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
40-
const { body } = await fetchResource(url);
40+
const { response } = await fetchResource(url);
4141

42-
assert.equal(typeof body, 'string');
42+
assert.equal(response.statusCode, 200);
4343
});
4444

4545
it('handles this gzip error', async () => {
4646
const url = 'http://www.redcross.ca/blog/2016/11/photo-of-the-day--one-year-anniversary-of-the-end-of-ebola-in-sierra-leone';
47-
const { body } = await fetchResource(url);
47+
const { response } = await fetchResource(url);
4848

49-
assert.equal(typeof body, 'string');
50-
});
51-
52-
// this test addresses https://twitter.com/flikxxi/status/800074680342351872
53-
it('handles different encoding', async () => {
54-
const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
55-
const { body } = await fetchResource(url);
56-
57-
const badEncodingRe = //g;
58-
59-
assert.equal(badEncodingRe.test(body.toString()), false);
49+
assert.equal(response.statusCode, 200);
6050
});
6151
});
6252

Diff for: src/shims/iconv-lite.js

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// don't need it for already rendered text
44
const iconv = {
55
encodingExists: () => false,
6+
decode: s => s,
67
};
78

89
export default iconv;

Diff for: src/utils/text/constants.js

+1
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ export const IS_ALPHA_RE = /^[a-z]+$/i;
2222
export const IS_DIGIT_RE = /^[0-9]+$/i;
2323

2424
export const ENCODING_RE = /charset=([\w-]+)\b/;
25+
export const DEFAULT_ENCODING = 'utf-8';

Diff for: src/utils/text/get-encoding.js

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
1-
import { ENCODING_RE } from './constants';
1+
import iconv from 'iconv-lite';
2+
import { DEFAULT_ENCODING, ENCODING_RE } from './constants';
23

34
// check a string for encoding; this is
45
// used in our fetchResource function to
56
// ensure correctly encoded responses
67
export default function getEncoding(str) {
8+
let encoding = DEFAULT_ENCODING;
79
if (ENCODING_RE.test(str)) {
8-
return ENCODING_RE.exec(str)[1];
10+
const testEncode = ENCODING_RE.exec(str)[1];
11+
if (iconv.encodingExists(testEncode)) {
12+
encoding = testEncode;
13+
}
914
}
10-
11-
return null;
15+
return encoding;
1216
}

Diff for: src/utils/text/get-encoding.test.js

+13-2
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,26 @@
11
import assert from 'assert';
2+
import cheerio from 'cheerio';
23

34
import getEncoding from './get-encoding';
45

6+
// Tests are bypassed in the browser because it has an encoding
7+
// A shim is used /src/shims/iconv-lite.js to decrease load size
8+
59
describe('getEncoding(str)', () => {
10+
if (cheerio.browser) return;
11+
612
it('returns the encoding as a string', () => {
713
const contentType = 'text/html; charset=iso-8859-15';
814
assert.equal(getEncoding(contentType), 'iso-8859-15');
915
});
1016

11-
it('returns null if no encoding found', () => {
17+
it('returns utf-8 as a default if no encoding found', () => {
1218
const contentType = 'text/html';
13-
assert.equal(getEncoding(contentType), null);
19+
assert.equal(getEncoding(contentType), 'utf-8');
20+
});
21+
22+
it('returns utf-8 if there is an invalid encoding', () => {
23+
const contentType = 'text/html; charset=fake-charset';
24+
assert.equal(getEncoding(contentType), 'utf-8');
1425
});
1526
});

0 commit comments

Comments
 (0)