diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7e878ef60..9fba8ef18 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,7 +8,9 @@ Rules bundles are a collection of HTML selectors around a determinate property. ## Writing Your Own Rules -Just you need to declare your rules using the following interface: +### Get value from HTML + +Every rule receives `htmlDom` (*cheerio*) and `url` as parameters inside an object: ```js 'use strict' @@ -18,19 +20,32 @@ Just you need to declare your rules using the following interface: * **/ module.exports = () => { - return ({ + const rules = { logo: [ // They receive as parameter: // - `htmlDom`: the cheerio HTML instance. // - `url`: The input URL used for extact the content. - ({ htmlDom: $, url }) => wrap($ => $('meta[property="og:logo"]').attr('content')), - ({ htmlDom: $, url }) => wrap($ => $('meta[itemprop="logo"]').attr('content')) + ({ htmlDom: $, url }) => $('meta[property="og:logo"]').attr('content'), + ({ htmlDom: $, url }) => $('meta[itemprop="logo"]').attr('content') ] - }) + } + return rules } ``` -The order of rules are loaded are important: Just the first rule that returns a truthy value will be used. The rest rules after that will be not invoked. +You can declare any logic you need in order to determine the output. + +A set of rules under the same namespace runs in series and only the value returned by the first rule that outputs a [truthy](https://developer.mozilla.org/en-US/docs/Glossary/Falsy) value will be taken. So remember, the order is important! + +### Defining a `test` function + +You can associate a `test` function with your rule bundle: + +```js +rules.test = ({url}) => getVideoInfo(url).service === 'youtube' +``` + +The `test` function receives the same arguments as a rule. This is useful for skipping all the rules in a bundle that doesn't target a specific URL. 
## Testing your Rules diff --git a/packages/metascraper-amazon/index.js b/packages/metascraper-amazon/index.js index 1b7f610db..c967dfd8c 100644 --- a/packages/metascraper-amazon/index.js +++ b/packages/metascraper-amazon/index.js @@ -5,7 +5,6 @@ const { $filter, title, author, - createWard, createWrap, lang } = require('@metascraper/helpers') @@ -37,24 +36,28 @@ const wrapUrl = createWrap(urlFn) const wrapAuthor = createWrap(author) const wrapTitle = createWrap(title, { removeSeparator: false }) const wrapLang = createWrap(lang) -const ward = createWard(({ url }) => isValidUrl(url)) - -module.exports = () => ({ - lang: [ward(wrapLang(($, url) => getDomainLanguage(url)))], - author: [ - ward(wrapAuthor($ => $('.contributorNameID').text())), - ward(wrapAuthor($ => $('#bylineInfo').text())), - ward(wrapAuthor($ => $('#brand').text())) - ], - title: [ - ward(wrapTitle($ => $('#productTitle').text())), - ward(wrapTitle($ => $('#btAsinTitle').text())), - ward(wrapTitle($ => $filter($, $('h1.a-size-large')))), - ward(wrapTitle($ => $('#item_name').text())) - ], - publisher: [ward(() => 'Amazon')], - image: [ - ward(wrapUrl($ => $('.a-dynamic-image').attr('data-old-hires'))), - ward(wrapUrl($ => $('.a-dynamic-image').attr('src'))) - ] -}) + +module.exports = () => { + const rules = { + lang: [wrapLang(($, url) => getDomainLanguage(url))], + author: [ + wrapAuthor($ => $('.contributorNameID').text()), + wrapAuthor($ => $('#bylineInfo').text()), + wrapAuthor($ => $('#brand').text()) + ], + title: [ + wrapTitle($ => $('#productTitle').text()), + wrapTitle($ => $('#btAsinTitle').text()), + wrapTitle($ => $filter($, $('h1.a-size-large'))), + wrapTitle($ => $('#item_name').text()) + ], + publisher: [() => 'Amazon'], + image: [ + wrapUrl($ => $('.a-dynamic-image').attr('data-old-hires')), + wrapUrl($ => $('.a-dynamic-image').attr('src')) + ] + } + + rules.test = ({ url }) => isValidUrl(url) + return rules +} diff --git a/packages/metascraper-amazon/package.json 
b/packages/metascraper-amazon/package.json index 8989a3012..a3b3caaa4 100644 --- a/packages/metascraper-amazon/package.json +++ b/packages/metascraper-amazon/package.json @@ -17,7 +17,7 @@ ], "dependencies": { "@metascraper/helpers": "^5.5.4", - "memoize-one": "~5.0.4", + "memoize-one": "~5.0.5", "tldts": "~5.3.0" }, "devDependencies": { diff --git a/packages/metascraper-author/package.json b/packages/metascraper-author/package.json index 8b73a8597..e8a0834a4 100644 --- a/packages/metascraper-author/package.json +++ b/packages/metascraper-author/package.json @@ -17,7 +17,7 @@ ], "dependencies": { "@metascraper/helpers": "^5.5.4", - "lodash": "~4.17.11" + "lodash": "~4.17.12" }, "devDependencies": { "standard": "latest" diff --git a/packages/metascraper-clearbit/index.js b/packages/metascraper-clearbit/index.js index e885278ee..206a78020 100644 --- a/packages/metascraper-clearbit/index.js +++ b/packages/metascraper-clearbit/index.js @@ -27,9 +27,7 @@ const clearbit = memoizeOne(async ({ url }) => { const getClearbit = createValidator(clearbit) -module.exports = () => { - return { - logo: getClearbit({ from: 'logo' }), - publisher: getClearbit({ from: 'name', to: 'publisher' }) - } -} +module.exports = () => ({ + logo: getClearbit({ from: 'logo' }), + publisher: getClearbit({ from: 'name', to: 'publisher' }) +}) diff --git a/packages/metascraper-clearbit/package.json b/packages/metascraper-clearbit/package.json index b82652630..a8a416c63 100644 --- a/packages/metascraper-clearbit/package.json +++ b/packages/metascraper-clearbit/package.json @@ -20,7 +20,7 @@ "dependencies": { "@metascraper/helpers": "^5.5.4", "got": "~9.6.0", - "memoize-one": "~5.0.4", + "memoize-one": "~5.0.5", "tldts": "~5.3.0" }, "devDependencies": { diff --git a/packages/metascraper-helpers/index.js b/packages/metascraper-helpers/index.js index 1a67fd9ad..f64ffa9e8 100644 --- a/packages/metascraper-helpers/index.js +++ b/packages/metascraper-helpers/index.js @@ -285,12 +285,6 @@ const createWrap 
= (fn, opts) => rule => ({ htmlDom, url }) => { return fn(value, opts) } -/** - * Ward a rule only if `validator` returns `true`. - */ -const createWard = validator => fn => args => - validator(args) ? fn(args) : null - module.exports = { $filter, $jsonld, @@ -327,6 +321,5 @@ module.exports = { video, validator, createValidator, - createWrap, - createWard + createWrap } diff --git a/packages/metascraper-helpers/package.json b/packages/metascraper-helpers/package.json index a16fc91d5..33b8eabb2 100644 --- a/packages/metascraper-helpers/package.json +++ b/packages/metascraper-helpers/package.json @@ -26,7 +26,7 @@ "is-uri": "~1.2.0", "iso-639-3": "~1.2.0", "isostring": "0.0.1", - "lodash": "~4.17.11", + "lodash": "~4.17.12", "mem": "~5.1.1", "mime-types": "~2.1.24", "normalize-url": "~4.3.0", diff --git a/packages/metascraper-logo-favicon/package.json b/packages/metascraper-logo-favicon/package.json index 1ecc89379..059066a0b 100644 --- a/packages/metascraper-logo-favicon/package.json +++ b/packages/metascraper-logo-favicon/package.json @@ -19,7 +19,7 @@ "dependencies": { "@metascraper/helpers": "^5.5.4", "got": "~9.6.0", - "lodash": "~4.17.11" + "lodash": "~4.17.12" }, "devDependencies": { "coveralls": "latest", diff --git a/packages/metascraper-media-provider/package.json b/packages/metascraper-media-provider/package.json index 89fe1b9e7..00de3f22e 100644 --- a/packages/metascraper-media-provider/package.json +++ b/packages/metascraper-media-provider/package.json @@ -21,9 +21,9 @@ "@microlink/youtube-dl": "~2.0.0", "debug": "~4.1.1", "got": "~9.6.0", - "lodash": "~4.17.11", + "lodash": "~4.17.12", "luminati-tunnel": "~1.3.0", - "memoize-one": "~5.0.4" + "memoize-one": "~5.0.5" }, "devDependencies": { "coveralls": "latest", diff --git a/packages/metascraper-readability/package.json b/packages/metascraper-readability/package.json index 51942278d..81c97abe8 100644 --- a/packages/metascraper-readability/package.json +++ b/packages/metascraper-readability/package.json 
@@ -18,7 +18,7 @@ "dependencies": { "@metascraper/helpers": "^5.5.4", "jsdom": "~15.1.1", - "memoize-one": "~5.0.4", + "memoize-one": "~5.0.5", "readability": "github:mozilla/readability" }, "devDependencies": { diff --git a/packages/metascraper-soundcloud/index.js b/packages/metascraper-soundcloud/index.js index 19340b3d4..1f0cf28d6 100644 --- a/packages/metascraper-soundcloud/index.js +++ b/packages/metascraper-soundcloud/index.js @@ -4,21 +4,24 @@ const { $filter, author, description, - createWrap, - createWard + createWrap } = require('@metascraper/helpers') const memoizeOne = require('memoize-one') const { getDomain } = require('tldts') const isValidUrl = memoizeOne(url => getDomain(url) === 'soundcloud.com') -const ward = createWard(({ url }) => isValidUrl(url)) const wrapDescription = createWrap(description) const wrapAuthor = createWrap(author) -module.exports = () => ({ - author: [ward(wrapAuthor($ => $filter($, $('.soundTitle__username'))))], - description: [ - ward(wrapDescription($ => $filter($, $('.soundTitle__description')))) - ] -}) +module.exports = () => { + const rules = { + author: [wrapAuthor($ => $filter($, $('.soundTitle__username')))], + description: [ + wrapDescription($ => $filter($, $('.soundTitle__description'))) + ] + } + + rules.test = ({ url }) => isValidUrl(url) + return rules +} diff --git a/packages/metascraper-soundcloud/package.json b/packages/metascraper-soundcloud/package.json index cc81107b2..9f516ec0d 100644 --- a/packages/metascraper-soundcloud/package.json +++ b/packages/metascraper-soundcloud/package.json @@ -18,7 +18,7 @@ ], "dependencies": { "@metascraper/helpers": "^5.5.4", - "memoize-one": "~5.0.4", + "memoize-one": "~5.0.5", "tldts": "~5.3.0" }, "devDependencies": { diff --git a/packages/metascraper-title/package.json b/packages/metascraper-title/package.json index 5eafac767..4c93b5977 100644 --- a/packages/metascraper-title/package.json +++ b/packages/metascraper-title/package.json @@ -17,7 +17,7 @@ ], "dependencies": 
{ "@metascraper/helpers": "^5.5.4", - "lodash": "~4.17.11" + "lodash": "~4.17.12" }, "devDependencies": { "standard": "latest" diff --git a/packages/metascraper-uol/index.js b/packages/metascraper-uol/index.js index 8f128c301..846237d11 100644 --- a/packages/metascraper-uol/index.js +++ b/packages/metascraper-uol/index.js @@ -4,8 +4,7 @@ const { $jsonld, title, description, - createWrap, - createWard + createWrap } = require('@metascraper/helpers') const memoizeOne = require('memoize-one') const { getDomain } = require('tldts') @@ -16,20 +15,21 @@ const isValidUrl = memoizeOne(url => ROOT_DOMAINS.some(domain => getDomain(url) === domain) ) -const ward = createWard(({ url }) => isValidUrl(url)) - const wrapTitle = createWrap(title) const wrapDescription = createWrap(description) -module.exports = () => ({ - title: [ - ward(wrapTitle(($, url) => $jsonld('headline')($, url))), - ward(wrapTitle(($, url) => $jsonld('name')($, url))), - ward(wrapTitle($ => $('title').text())) - ], - description: [ - ward(wrapDescription(($, url) => $jsonld('description')($, url))) - ] -}) +module.exports = () => { + const rules = { + title: [ + wrapTitle(($, url) => $jsonld('headline')($, url)), + wrapTitle(($, url) => $jsonld('name')($, url)), + wrapTitle($ => $('title').text()) + ], + description: [wrapDescription(($, url) => $jsonld('description')($, url))] + } + + rules.test = ({ url }) => isValidUrl(url) + return rules +} module.exports.isValidUrl = isValidUrl diff --git a/packages/metascraper-uol/package.json b/packages/metascraper-uol/package.json index 9469772b6..0680c747f 100644 --- a/packages/metascraper-uol/package.json +++ b/packages/metascraper-uol/package.json @@ -19,7 +19,7 @@ ], "dependencies": { "@metascraper/helpers": "^5.5.4", - "memoize-one": "~5.0.4", + "memoize-one": "~5.0.5", "tldts": "~5.3.0" }, "devDependencies": { diff --git a/packages/metascraper-video/package.json b/packages/metascraper-video/package.json index 6f25ed389..880d61841 100644 --- 
a/packages/metascraper-video/package.json +++ b/packages/metascraper-video/package.json @@ -17,7 +17,7 @@ ], "dependencies": { "@metascraper/helpers": "^5.5.4", - "lodash": "~4.17.11" + "lodash": "~4.17.12" }, "devDependencies": { "coveralls": "latest", diff --git a/packages/metascraper-youtube/index.js b/packages/metascraper-youtube/index.js index 6044f1866..be4889006 100644 --- a/packages/metascraper-youtube/index.js +++ b/packages/metascraper-youtube/index.js @@ -4,8 +4,7 @@ const { $filter, author, description, - createWrap, - createWard + createWrap } = require('@metascraper/helpers') const isReachable = require('is-reachable') @@ -34,24 +33,28 @@ const wrapDescription = createWrap(description) const getVideoInfo = memoizeOne(getVideoId) -const isValidUrl = url => getVideoInfo(url).service === 'youtube' - -const ward = createWard(({ url }) => isValidUrl(url)) - -module.exports = () => ({ - author: [ - ward(wrapAuthor($ => $('#owner-name').text())), - ward(wrapAuthor($ => $('#channel-title').text())), - ward(wrapAuthor($ => $filter($, $('[class*="user-info" i]')))) - ], - description: [ward(wrapDescription($ => $('#description').text()))], - publisher: [ward(() => 'YouTube')], - image: [ - ward(({ htmlDom, url }) => { - const { id } = getVideoId(url) - return id && getThumbnailUrl(id) - }) - ] -}) +const isValidUrl = memoizeOne(url => getVideoInfo(url).service === 'youtube') + +module.exports = () => { + const rules = { + author: [ + wrapAuthor($ => $('#owner-name').text()), + wrapAuthor($ => $('#channel-title').text()), + wrapAuthor($ => $filter($, $('[class*="user-info" i]'))) + ], + description: [wrapDescription($ => $('#description').text())], + publisher: [() => 'YouTube'], + image: [ + ({ htmlDom, url }) => { + const { id } = getVideoId(url) + return id && getThumbnailUrl(id) + } + ] + } + + rules.test = ({ url }) => isValidUrl(url) + + return rules +} module.exports.isValidUrl = isValidUrl diff --git a/packages/metascraper-youtube/package.json 
b/packages/metascraper-youtube/package.json index a864492f0..cbc948ddc 100644 --- a/packages/metascraper-youtube/package.json +++ b/packages/metascraper-youtube/package.json @@ -20,7 +20,7 @@ "@metascraper/helpers": "^5.5.4", "get-video-id": "~3.1.3", "is-reachable": "~3.1.0", - "memoize-one": "~5.0.4", + "memoize-one": "~5.0.5", "p-locate": "~4.1.0" }, "devDependencies": { diff --git a/packages/metascraper/package.json b/packages/metascraper/package.json index 45fbb31ae..3b6ee8a94 100644 --- a/packages/metascraper/package.json +++ b/packages/metascraper/package.json @@ -60,7 +60,7 @@ "@metascraper/helpers": "^5.5.4", "cheerio": "~1.0.0-rc.2", "cheerio-advanced-selectors": "~2.0.1", - "lodash": "~4.17.11", + "lodash": "~4.17.12", "whoops": "~4.0.2", "xss": "~1.0.6" }, diff --git a/packages/metascraper/src/get-data.js b/packages/metascraper/src/get-data.js index af803fe6d..5e5f7ec27 100644 --- a/packages/metascraper/src/get-data.js +++ b/packages/metascraper/src/get-data.js @@ -11,13 +11,19 @@ const { const xss = require('xss') -const getValue = async ({ htmlDom, url, conditions, meta }) => { - const lastIndex = conditions.length +const noopTest = () => true + +const getValue = async ({ htmlDom, url, rules, meta }) => { + const lastIndex = rules.length let index = 0 let value while (isEmpty(value) && index < lastIndex) { - value = await conditions[index++]({ htmlDom, url, meta }) + const rule = rules[index++] + const test = rule.test || noopTest + if (test({ htmlDom, url, meta })) { + value = await rule({ htmlDom, url, meta }) + } } return value @@ -40,8 +46,8 @@ const escapeValue = (value, { escape }) => const getData = async ({ rules, htmlDom, url, escape }) => { const data = await Promise.all( - map(rules, async ([propName, conditions]) => { - const rawValue = await getValue({ htmlDom, url, conditions }) + map(rules, async ([propName, innerRules]) => { + const rawValue = await getValue({ htmlDom, url, rules: innerRules }) const value = isEmpty(rawValue) ? 
null : escapeValue(rawValue, { escape }) return [propName, value] }) diff --git a/packages/metascraper/src/load-rules.js b/packages/metascraper/src/load-rules.js index fba21b0b7..00c0d4911 100644 --- a/packages/metascraper/src/load-rules.js +++ b/packages/metascraper/src/load-rules.js @@ -2,14 +2,18 @@ const { has, set, concat, forEach, chain } = require('lodash') -module.exports = rules => - chain(rules) - .reduce((acc, rules) => { +module.exports = rulesBundle => + chain(rulesBundle) + .reduce((acc, { test, ...rules }) => { forEach(rules, function (innerRules, propName) { + if (test) forEach(innerRules, rule => (rule.test = test)) + set( acc, propName, - has(acc, propName) ? concat(acc[propName], innerRules) : concat(innerRules) + has(acc, propName) + ? concat(acc[propName], innerRules) + : concat(innerRules) ) return acc diff --git a/packages/metascraper/src/merge-rules.js b/packages/metascraper/src/merge-rules.js index bc72aa1a6..7cdb395c3 100644 --- a/packages/metascraper/src/merge-rules.js +++ b/packages/metascraper/src/merge-rules.js @@ -1,17 +1,25 @@ 'use strict' -const { cloneDeep, concat, first, findIndex, forEach, chain } = require('lodash') +const { + cloneDeep, + concat, + first, + findIndex, + forEach, + chain +} = require('lodash') module.exports = (rules, baseRules) => chain(rules) - .reduce((acc, rules) => { - forEach(rules, (rule, propName) => { + .reduce((acc, { test, ...rules }) => { + forEach(rules, (innerRules, propName) => { + if (test) forEach(innerRules, rule => (rule.test = test)) // find the rules associated with `propName` const index = findIndex(acc, item => first(item) === propName) // if `propName` has more rule, add the new rule from the end - if (index !== -1) acc[index][1] = concat(rule, ...acc[index][1]) + if (index !== -1) acc[index][1] = concat(innerRules, ...acc[index][1]) // otherwise, create an array of rules - else acc.push([propName, rule]) + else acc.push([propName, innerRules]) }) return acc }, cloneDeep(baseRules)) diff 
--git a/packages/metascraper/test/unit/interface.js b/packages/metascraper/test/unit/interface.js index 673d39f4b..73b8535c8 100644 --- a/packages/metascraper/test/unit/interface.js +++ b/packages/metascraper/test/unit/interface.js @@ -2,9 +2,11 @@ const should = require('should') -const metascraper = require('../..')([require('metascraper-title')()]) +const createMetascraper = require('../..') +const titleRules = require('metascraper-title')() it('url is required', async () => { + const metascraper = createMetascraper([titleRules]) try { await metascraper() } catch (err) { @@ -47,7 +49,7 @@ it('escape is enabled by default', async () => { ` - + const metascraper = createMetascraper([titleRules]) const metadata = await metascraper({ html, url: 'http://127.0.0.1:8080' @@ -84,10 +86,49 @@ it('load extra rules', async () => { const rules = [ { - foo: [() => 'bar'] + foo: [() => 'bar', () => 'barz'], + barz: [() => 'foo', () => 'foorz'] } ] + const metascraper = createMetascraper([titleRules]) const meta = await metascraper({ url, html, rules }) should(meta.foo).equal('bar') }) + +it('associate test function with rules', async () => { + const url = 'https://microlink.io' + + const html = ` + + + + + + + Document + + +
+ + + + +
+ + +

Hello World

+ + + ` + + const rulesBundle = () => { + const rules = { foo: [() => 'bar'] } + rules.test = ({ url: urlBase }) => urlBase !== url + return rules + } + + const metascraper = createMetascraper([rulesBundle()]) + const meta = await metascraper({ url, html }) + should(meta.foo).be.null() +})