From 818a19b2946b1bfd6256c5414d3425d6b94c2a6e Mon Sep 17 00:00:00 2001 From: Kiko Beats Date: Sat, 1 Jul 2017 21:57:02 +0200 Subject: [PATCH] Add html sanitization Remove unnecessary rules --- index.js | 9 ++++----- package.json | 1 + src/html/index.js | 23 +++++++++++++++++++++++ src/rules/date.js | 5 ----- 4 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 src/html/index.js diff --git a/index.js b/index.js index 37d957e8a..0ea0582fe 100644 --- a/index.js +++ b/index.js @@ -1,8 +1,9 @@ 'use strict' -const rules = require('req-all')('./src/rules') const reduce = require('lodash.reduce') -const cheerio = require('cheerio') + +const rules = require('req-all')('./src/rules') +const loadHtml = require('./src/html') const isValid = result => result !== null && result !== undefined && result !== '' @@ -18,9 +19,7 @@ const getValue = ($, conditions) => { } module.exports = rawHtml => { - const html = cheerio.load(rawHtml, { - lowerCaseAttributeNames: true - }) + const html = loadHtml(rawHtml) return reduce(rules, (acc, conditions, ruleName) => { const value = getValue(html, conditions) diff --git a/package.json b/package.json index 967abf842..fb90042e3 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "lodash.reduce": "~4.6.0", "normalize-url": "~1.9.1", "req-all": "~1.0.0", + "sanitize-html": "~1.14.1", "to-title-case": "~1.0.0", "url-regex": "~4.1.1" }, diff --git a/src/html/index.js b/src/html/index.js new file mode 100644 index 000000000..8f588cf90 --- /dev/null +++ b/src/html/index.js @@ -0,0 +1,23 @@ +'use strict' + +const sanitizeHtml = require('sanitize-html') +const flow = require('lodash.flow') +const cheerio = require('cheerio') + +const sanitize = html => sanitizeHtml(html, { + allowedTags: false, + allowedAttributes: false, + transformTags: { + meta: (tagName, attribs) => { + if (attribs.name) attribs.name = attribs.name.toLowerCase() + return {tagName, attribs} + } + } +}) + +const load = cheerio.load.bind(cheerio) + +module.exports = flow([ + sanitize, + load +]) diff --git a/src/rules/date.js b/src/rules/date.js index a94917227..65018baae 100644 --- a/src/rules/date.js +++ b/src/rules/date.js @@ -36,16 +36,11 @@ const wrap = rule => $ => { module.exports = [ wrap($ => $('meta[property="article:published_time"]').attr('content')), wrap($ => $('meta[name="dc.date"]').attr('content')), - wrap($ => $('meta[name="DC.date"]').attr('content')), wrap($ => $('meta[name="dc.date.issued"]').attr('content')), - wrap($ => $('meta[name="DC.date.issued"]').attr('content')), wrap($ => $('meta[name="dc.date.created"]').attr('content')), - wrap($ => $('meta[name="DC.date.created"]').attr('content')), - wrap($ => $('meta[name="DC.Date"]').attr('content')), wrap($ => $('meta[name="date"]').attr('content')), wrap($ => $('meta[name="dcterms.date"]').attr('content')), wrap($ => $('[itemprop="datePublished"]').attr('content')), - wrap($ => $('time[itemprop*="pubDate"]').attr('datetime')), wrap($ => $('time[itemprop*="pubdate"]').attr('datetime')), wrap($ => $('[property*="dc:date"]').attr('content')), wrap($ => $('[property*="dc:created"]').attr('content')),