Skip to content

Commit

Permalink
feat(video): improve iframe detection (#683)
Browse files Browse the repository at this point in the history
* refactor(media-provider): remove unnecessary dependency

* fix(audio): only consider http URLs

* fix(audio): mp4 audio only files

* feat(video): improve iframe detection

* test: update snapshot

* fix: file name

* test: update assertion

* test: update snapshot
  • Loading branch information
Kikobeats authored Dec 30, 2023
1 parent 04d321c commit 49a37e5
Show file tree
Hide file tree
Showing 9 changed files with 2,808 additions and 1,707 deletions.
2 changes: 2 additions & 0 deletions packages/metascraper-audio/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ const toAudioFromDom = toRule((domNodes, opts) => {
.get(1)
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
.replace('mpeg', 'mp3')
/* mp4 is commonly used for video */
.replace('mp4', 'mp3')
.value()
}))
.uniqWith(isEqual)
Expand Down
15 changes: 15 additions & 0 deletions packages/metascraper-audio/test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,18 @@ test('`audio > source:src` with content type and relative src', async t => {
'https://www.theverge.com/2018/1/22/16921092/audio-small'
)
})

// A `<source>` declared as `audio/mp4` is expected to be reported as the
// page audio, even though mp4 is commonly used for video.
test('`audio > source:src` with mp4 mime type', async t => {
  const pageUrl = 'https://metascraper.js.org'
  const markup = `
<audio controls>
<source type="audio/mp4" src="https://cdn.microlink.io/file-examples/sample-audio.mp4">
</audio>
`
  const scrape = createMetascraper()
  const { audio } = await scrape({ html: markup, url: pageUrl })

  t.is(audio, 'https://cdn.microlink.io/file-examples/sample-audio.mp4')
})
2 changes: 1 addition & 1 deletion packages/metascraper-helpers/test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ test('.date', t => {
t.is(date(undefined), undefined)
t.is(date(null), undefined)
t.is(date('null'), undefined)
t.is(date('Jun 20'), '2023-06-20T12:00:00.000Z')
t.is(date('Jun 20'), '2024-06-20T12:00:00.000Z')
t.is(date('Jun 20 2018'), '2018-06-20T12:00:00.000Z')
t.is(date('Jun 2018'), '2018-06-01T12:00:00.000Z')
t.is(date(2010), '2010-01-01T00:00:00.000Z')
Expand Down
3 changes: 2 additions & 1 deletion packages/metascraper-video/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"description": "Get video property from HTML markup",
"homepage": "https://github.com/microlinkhq/metascraper/packages/metascraper-video",
"version": "5.42.5",
"main": "src/index.js",
"types": "src/index.d.ts",
"main": "src/index.js",
"author": {
"email": "[email protected]",
"name": "microlink.io",
Expand All @@ -27,6 +27,7 @@
"lodash": "~4.17.21"
},
"devDependencies": {
"async-listen": "latest",
"ava": "5"
},
"engines": {
Expand Down
45 changes: 24 additions & 21 deletions packages/metascraper-video/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
const {
$jsonld,
$twitter,
loadIframe,
findRule,
has,
loadIframe,
normalizeUrl,
toRule,
url: urlFn,
video
} = require('@metascraper/helpers')

const pReflect = require('p-reflect')

const { chain, find, isEqual } = require('lodash')

const toUrl = toRule(urlFn)
Expand Down Expand Up @@ -74,26 +78,25 @@ const _getIframe = (url, $, { src }) =>

const withIframe = (rules, getIframe) =>
rules.concat(
// async ({ htmlDom: $, url }) => {
// // TODO: write a test embedding a youtube video as iframe
// const srcs = [
// ...new $('iframe[src^="http"], iframe[src^="/"]')
// .map((_, element) => $(element).attr('src'))
// .get()
// .map(src => normalizeUrl(url, src))
// ]
// if (srcs.length === 0) return
// return pReflect(
// Promise.any(
// srcs.map(async src => {
// const htmlDom = await getIframe(url, $, { src })
// const result = await findRule(audioRules, { htmlDom, url })
// if (!has(result)) throw TypeError('no result')
// return result
// })
// )
// ).then(({ value }) => value)
// },
async ({ htmlDom: $, url }) => {
const srcs = [
...new $('iframe[src^="http"], iframe[src^="/"]')
.map((_, element) => $(element).attr('src'))
.get()
.map(src => normalizeUrl(url, src))
]
if (srcs.length === 0) return
return pReflect(
Promise.any(
srcs.map(async src => {
const htmlDom = await getIframe(url, $, { src })
const result = await findRule(rules, { htmlDom, url })
if (!has(result)) throw TypeError('no result')
return result
})
)
).then(({ value }) => value)
},
async ({ htmlDom: $, url }) => {
const src = $twitter($, 'twitter:player')
return src
Expand Down
57 changes: 57 additions & 0 deletions packages/metascraper-video/test/iframe.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
'use strict'

const { default: listen } = require('async-listen')
const { createServer } = require('http')
const { promisify } = require('util')
const test = require('ava')

/**
 * Close `server` and report completion through a promise.
 *
 * `server.close` is bound to `server` before being promisified so the method
 * keeps its receiver, and the promisified function is invoked immediately:
 * returning it uncalled would leave the server open and hand the caller a
 * function instead of a promise.
 *
 * @param {object} server - object exposing a callback-style `close(cb)` method.
 * @returns {Promise<void>} settles when `server.close` invokes its callback;
 *   rejects if that callback receives an error.
 */
const closeServer = server => promisify(server.close.bind(server))()

// Build a metascraper instance whose only rule bundle is the one exported by
// `../src`; any arguments are forwarded untouched to that bundle's factory.
const createMetascraper = (...args) => {
  const metascraper = require('metascraper')
  const rulesBundle = require('../src')(...args)
  return metascraper([rulesBundle])
}

// The page embeds an iframe through an absolute http `src`; the video declared
// by the document served at that address is expected as the page video.
test('absolute http', async t => {
  const serveVideoMeta = (_, res) => {
    res.setHeader('Content-Type', 'text/html')
    res.end(
      '<meta property="og:video" content="https://cdn.microlink.io/file-examples/sample.mp4">'
    )
  }
  const server = createServer(serveVideoMeta)
  t.teardown(() => closeServer(server))

  // Port 0 lets the OS pick a free port; `listen` resolves to the address.
  const address = await listen(server, { port: 0, host: '0.0.0.0' })
  const url = address.toString()

  const scrape = createMetascraper()
  const { video } = await scrape({ html: `<iframe src="${url}">`, url })

  t.is(video, 'https://cdn.microlink.io/file-examples/sample.mp4')
})

// Both the iframe `src` and the `og:video` value are root-relative here; the
// reported video is expected to be absolute, built on the page url.
test('relative http', async t => {
  const serveRelativeVideoMeta = (_, res) => {
    res.setHeader('Content-Type', 'text/html')
    res.end('<meta property="og:video" content="/file-examples/sample.mp4">')
  }
  const server = createServer(serveRelativeVideoMeta)
  t.teardown(() => closeServer(server))

  // Port 0 lets the OS pick a free port; `listen` resolves to the address.
  const address = await listen(server, { port: 0, host: '0.0.0.0' })
  const url = address.toString()

  const scrape = createMetascraper()
  const { video } = await scrape({ html: '<iframe src="/">', url })

  t.is(video, `${url}file-examples/sample.mp4`)
})

// The embedded document advertises a `tg://` link as its video; a non-http
// value like that is expected to be discarded, leaving `video` as null.
test('ignore non http urls', async t => {
  const serveNonHttpVideoMeta = (_, res) => {
    res.setHeader('Content-Type', 'text/html')
    res.end(
      '<meta property="og:video" content="tg://join?invite=n3gS0R7pjFJhMWM0">'
    )
  }
  const server = createServer(serveNonHttpVideoMeta)
  t.teardown(() => closeServer(server))

  // Port 0 lets the OS pick a free port; `listen` resolves to the address.
  const address = await listen(server, { port: 0, host: '0.0.0.0' })
  const url = address.toString()

  const scrape = createMetascraper()
  const { video } = await scrape({ html: `<iframe src="${url}">`, url })

  t.is(video, null)
})
Loading

0 comments on commit 49a37e5

Please sign in to comment.