Skip to content

Commit

Permalink
feat(date): add datePublished and dateModified support (#374)
Browse files Browse the repository at this point in the history
* Add published and modified dates

Signed-off-by: Stiliyan Ivanov <[email protected]>

* Fix tests

Signed-off-by: Stiliyan Ivanov <[email protected]>

* Fix more tests

Signed-off-by: Stiliyan Ivanov <[email protected]>

* Merge code

Signed-off-by: Stiliyan Ivanov <[email protected]>

* Fix modified date

Signed-off-by: Stiliyan Ivanov <[email protected]>

* New line

Signed-off-by: Stiliyan Ivanov <[email protected]>

* We need concat rules

Signed-off-by: Stiliyan Ivanov <[email protected]>

* Closer to old behaviour of date

Signed-off-by: Stiliyan Ivanov <[email protected]>

* YouTube does not seem to use <meta date>

Signed-off-by: Stiliyan Ivanov <[email protected]>

* Add date docs

Signed-off-by: Stiliyan Ivanov <[email protected]>

* Update README.md

* test: don't modify previous fixtures

* test: update snapshot

The rule:

```
toDate($ => $filter($, $('[class*="publish" i]')))
```

now takes priority over:

```
toDate($ => $('[property*="dc:date" i]').attr('content'))
```

That is acceptable in this case: it is a fuzzy (heuristic) rule, so its result is not strictly deterministic anyway.

* test: update snapshot

The rule

```
toDate($ => $('[itemprop="datepublished" i]').attr('content'))
```

now takes priority over:

```
toDate($ => $('meta[name="date" i]').attr('content'))
```

That is acceptable in this case: it is a fuzzy (heuristic) rule, so its result is not strictly deterministic anyway.

* test: update snapshot

---------

Signed-off-by: Stiliyan Ivanov <[email protected]>
Co-authored-by: Kiko Beats <[email protected]>
  • Loading branch information
madwings and Kikobeats authored Mar 11, 2023
1 parent ff0e5d8 commit d219b58
Show file tree
Hide file tree
Showing 23 changed files with 74 additions and 29 deletions.
19 changes: 19 additions & 0 deletions packages/metascraper-date/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,25 @@
```bash
$ npm install metascraper-date --save
```
## API

### metascraper-date([options])

#### options

##### datePublished

Type: `boolean`<br>
Default: `false`

Whether to return `datePublished` alongside `date`.

##### dateModified

Type: `boolean`<br>
Default: `false`

Whether to return `dateModified` alongside `date`.

## License

Expand Down
62 changes: 44 additions & 18 deletions packages/metascraper-date/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,67 @@ const { date, $filter, $jsonld, toRule } = require('@metascraper/helpers')

const toDate = toRule(date)

module.exports = () => ({
date: [
toDate($jsonld('dateModified')),
toDate($jsonld('datePublished')),
toDate($jsonld('dateCreated')),
toDate($ => $('meta[property*="updated_time" i]').attr('content')),
toDate($ => $('meta[property*="modified_time" i]').attr('content')),
toDate($ => $('meta[property*="published_time" i]').attr('content')),
toDate($ => $('meta[property*="release_date" i]').attr('content')),
const dateRules = () => {
return [
toDate($ => $('meta[name="date" i]').attr('content')),
toDate($ => $('[itemprop*="datemodified" i]').attr('content')),
toDate($ => $('[itemprop="datepublished" i]').attr('content')),
toDate($ => $('[itemprop*="date" i]').attr('content')),
toDate($ => $('time[itemprop*="date" i]').attr('datetime')),
toDate($ => $('time[datetime]').attr('datetime')),
toDate($ => $('time[datetime][pubdate]').attr('datetime')),
toDate($ => $('meta[name*="dc.date" i]').attr('content')),
toDate($ => $('meta[name*="dc.date.issued" i]').attr('content')),
toDate($ => $('meta[name*="dc.date.created" i]').attr('content')),
toDate($ => $('meta[name*="dcterms.date" i]').attr('content')),
toDate($ => $('[property*="dc:date" i]').attr('content')),
toDate($ => $('[property*="dc:created" i]').attr('content')),
toDate($ => $filter($, $('[class*="byline" i]'))),
toDate($ => $filter($, $('[class*="dateline" i]'))),
toDate($ => $filter($, $('[id*="metadata" i]'))),
toDate($ => $filter($, $('[class*="metadata" i]'))), // twitter, move into a bundle of rules
toDate($ => $filter($, $('[id*="date" i]'))),
toDate($ => $filter($, $('[class*="date" i]'))),
toDate($ => $filter($, $('[id*="publish" i]'))),
toDate($ => $filter($, $('[class*="publish" i]'))),
toDate($ => $filter($, $('[id*="post-timestamp" i]'))),
toDate($ => $filter($, $('[class*="post-timestamp" i]'))),
toDate($ => $filter($, $('[id*="post-meta" i]'))),
toDate($ => $filter($, $('[class*="post-meta" i]'))),
toDate($ => $filter($, $('[id*="time" i]'))),
toDate($ => $filter($, $('[class*="time" i]')))
]
})
}

/**
 * Build the list of rules that look for a publication/creation date.
 *
 * Sources, in the order listed: JSON-LD `datePublished` / `dateCreated`,
 * `<meta property>` tags containing `published_time` or `release_date`,
 * `itemprop="datepublished"`, `<time datetime pubdate>`, Dublin Core
 * `dc.date.issued` / `dc.date.created` / `dc:created`, and finally any
 * element whose id or class contains "publish".
 *
 * NOTE(review): presumably earlier entries win when several match —
 * confirm against how metascraper resolves a rule list.
 *
 * @returns {Array} a new array of `toDate`-wrapped rules on every call
 */
const datePublishedRules = () => {
  // Rule reading one attribute from the first element matching `selector`.
  const fromAttr = (selector, attrName) =>
    toDate($ => $(selector).attr(attrName))

  // Rule passing the elements matching `selector` through `$filter`.
  const fromElements = selector => toDate($ => $filter($, $(selector)))

  return [
    toDate($jsonld('datePublished')),
    toDate($jsonld('dateCreated')),
    fromAttr('meta[property*="published_time" i]', 'content'),
    fromAttr('meta[property*="release_date" i]', 'content'),
    fromAttr('[itemprop="datepublished" i]', 'content'),
    fromAttr('time[datetime][pubdate]', 'datetime'),
    fromAttr('meta[name*="dc.date.issued" i]', 'content'),
    fromAttr('meta[name*="dc.date.created" i]', 'content'),
    fromAttr('[property*="dc:created" i]', 'content'),
    fromElements('[id*="publish" i]'),
    fromElements('[class*="publish" i]')
  ]
}

/**
 * Build the list of rules that look for a last-modified date.
 *
 * Sources, in the order listed: JSON-LD `dateModified`, `<meta property>`
 * tags containing `updated_time` or `modified_time`, and any element whose
 * `itemprop` contains "datemodified" (all matched case-insensitively).
 *
 * @returns {Array} a new array of `toDate`-wrapped rules on every call
 */
const dateModifiedRules = () => {
  // Rule reading the `content` attribute of the first match for `selector`.
  const fromContent = selector => toDate($ => $(selector).attr('content'))

  return [
    toDate($jsonld('dateModified')),
    fromContent('meta[property*="updated_time" i]'),
    fromContent('meta[property*="modified_time" i]'),
    fromContent('[itemprop*="datemodified" i]')
  ]
}

module.exports = ({ datePublished, dateModified } = { datePublished: false, dateModified: false }) => {
const result = {
date: dateModifiedRules().concat(datePublishedRules(), dateRules())
}

if (datePublished) {
result.datePublished = datePublishedRules()
}

if (dateModified) {
result.dateModified = dateModifiedRules()
}

return result
}
2 changes: 1 addition & 1 deletion packages/metascraper-youtube/test/snapshots/index.js.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ Generated by [AVA](https://avajs.dev).
{
author: 'USAGov',
date: '2023-01-30T21:21:15.000Z',
date: '2014-06-10T00:00:00.000Z',
description: 'Meet Chris Meyer, a marine biologist and scientist from the National Museum of Natural History. He talks about the ocean and his favorite animal the cowrie…',
image: 'https://img.youtube.com/vi/EAZvxukW8kY/sddefault.jpg',
lang: 'en',
Expand Down
Binary file modified packages/metascraper-youtube/test/snapshots/index.js.snap
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'David Spark',
date: '2015-06-03T00:00:00.000Z',
date: '2015-06-03T14:06:00.000Z',
description: 'If IT is going to better align themselves with business operations, they must be able to objectively quantify the value of the cloud.',
image: 'http://images.techhive.com/images/article/2015/06/msftone_cloudperspective4-100588855-primary.idge.png',
lang: 'en',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Clare Hopping',
date: '2016-05-20T08:58:00.000Z',
date: '2016-05-20T12:00:00.000Z',
description: 'The services will enable Sabre to provide its software, data, mobile and distribution solutions to more travel companies',
image: 'http://cdn2.cloudpro.co.uk/sites/cloudprod7/files/4/29/handshake_0.jpg',
lang: 'en',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Robert Shields',
date: '2016-04-15T00:00:00.000Z',
date: '2016-04-15T20:52:00.000Z',
description: 'My healthcare data is what I want protected the most (intimate details about my family’s health, where we live, and financial information). Anything and everything a hacker could want! It is safe? As a data security professional and citizen, I know the answer is not good.',
image: 'http://images.techhive.com/images/article/2016/04/blog-31_apr15_image-1-100656409-primary.idge.jpg',
lang: 'en',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Reuters',
date: '2016-05-04T15:40:00.000Z',
date: '2016-05-04T12:00:00.000Z',
description: '‘One of the big causes for the stagnation of middle class wages is essentially because of clever computer programs,’ said David Siegel, co-chairman of Two Sigma.',
image: 'https://assets.entrepreneur.com/content/3x2/1300/20160504155601-GettyImages-174457162.jpeg',
lang: 'en',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Lydia Dishman',
date: '2016-05-24T09:42:00.000Z',
date: '2016-05-24T21:27:05.000Z',
description: 'Lack of access to capital is a big challenge, but so is the lack of access to networks and advisors.',
image: 'http://b.fastcompany.net/multisite_files/fastcompany/imagecache/620x350/poster/2016/05/3060169-poster-p-1-one-of-the-biggest-challenges-of-getting-funding-for-minority-owned-business.jpg',
lang: 'en',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Mark Emmons',
date: '2016-02-23T13:37:22.000Z',
date: '2016-02-23T12:00:00.000Z',
description: 'Here’s the thing about pendulums. They swing back and forth. It’s just what they do. Grandfather clocks. Amusement park rides. Economic trends. And throughout most of 2015, Silicon Valley observers were engaged in a spirited debate about whether or not the booming tech economy had peaked and if…',
image: 'http://www.leandatainc.com/wp-content/uploads/2016/02/Trees_blown.jpg',
lang: 'en',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Los Angeles Times',
date: '2016-04-30T00:00:00.000Z',
date: '2016-05-02T10:03:18.000Z',
description: 'Tech start-up Appthority’s office has plush conference rooms, soundproof phone booths, an enormous kitchen and a view of San Francisco Bay. It has ping-pong and foosball tables, beer on tap and 11 types of tea.',
image: 'http://www.trbimg.com/img-572421a4/turbine/la-fi-tn-tech-downturn-20160429',
lang: 'en',
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Marisa Kendall',
date: '2016-04-20T01:52:32.000Z',
date: '2016-04-20T02:05:48.000Z',
description: 'It’s no secret that few women hold positions of power at venture capital firms. A new study quantifies what many in the industry have reported anecdotally. According to CrunchBase data released Tuesday on …',
image: 'http://www.siliconbeat.com/wp-content/uploads/2016/04/Panel.jpg',
lang: 'en',
Expand Down
Binary file not shown.
3 changes: 2 additions & 1 deletion packages/metascraper/test/integration/venture-beat/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const url =

test('venture-beat', async t => {
const html = await readFile(resolve(__dirname, 'input.html'))
const metadata = await metascraper({ html, url })
const { date, ...metadata } = await metascraper({ html, url })
t.is(typeof date, 'string')
t.snapshot(metadata)
})
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ Generated by [AVA](https://avajs.dev).
{
audio: null,
author: 'Paul Sawers',
date: '2016-04-21T04:30:14.000Z',
description: 'Forter, a security-focused company that brings real-time fraud-prevention technology to online retailers, has raised $32 million to continue its global growth and expand across the U.S.',
image: 'http://1u88jj3r4db2x4txp44yqfj1.wpengine.netdna-cdn.com/wp-content/uploads/2016/04/ecommerce-780x473.jpg',
lang: 'en',
Expand Down
Binary file not shown.

0 comments on commit d219b58

Please sign in to comment.