Skip to content

Commit

Permalink
Parse media links from JSON feed
Browse files Browse the repository at this point in the history
  • Loading branch information
GreenTeaCake committed Apr 16, 2024
1 parent f50d549 commit db737c0
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 12 deletions.
39 changes: 29 additions & 10 deletions core/loader/json-feed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import type { TextResponse } from '../download.js'
import type { OriginPost } from '../post.js'
import { createPostsPage } from '../posts-page.js'
import type { Loader } from './index.js'
import { findAnchorHrefs, findLinksByType, toTime } from './utils.js'
import { findAnchorHrefs, findLinksByType, toTime, unique } from './utils.js'

// https://www.jsonfeed.org/version/1.1/
interface JsonFeed {
Expand Down Expand Up @@ -91,19 +91,38 @@ function validate<ValidatedType>(
return true
}

const IMG_URL_REGEXP = /<img[^>]+src="?([^"\s]+)"?\s*\/?>/g

/**
 * Converts a downloaded JSON Feed document into a list of posts.
 *
 * Returns an empty list when the parsed JSON does not pass `validate`
 * against `JSON_FEED_VALIDATORS`.
 *
 * Each post's `media` is built from the item's `banner_image`, its `image`
 * and the `src` attribute of every `<img>` found in the post content, and is
 * passed through `unique` before being returned.
 *
 * @param text Downloaded response; its `parseJson()` result is validated
 *   before use.
 * @returns One post per feed item, in feed order.
 */
function parsePosts(text: TextResponse): OriginPost[] {
  let parsedJson = text.parseJson()
  if (!validate<JsonFeed>(parsedJson, JSON_FEED_VALIDATORS)) return []

  // Created on first use and shared by every item of this feed, so feeds
  // without any content never construct a parser at all.
  let parser: DOMParser | undefined

  return parsedJson.items.map(item => {
    // HTML content wins over plain text; an empty string falls through.
    let full = (item.content_html || item.content_text) ?? undefined
    let allImages: (string | null | undefined)[] = [
      item.banner_image,
      item.image
    ]

    if (full) {
      parser ??= new DOMParser()
      let fullDocument = parser.parseFromString(full, 'text/html')
      let contentImages = [...fullDocument.querySelectorAll('img')].map(
        element => element.getAttribute('src')
      )
      allImages.push(...contentImages)
    }

    return {
      full,
      intro: item.summary ?? undefined,
      media: unique(allImages),
      originId: item.id,
      publishedAt: toTime(item.date_published) ?? undefined,
      title: item.title,
      url: item.url ?? undefined
    }
  })
}

export const jsonFeed: Loader = {
Expand Down
5 changes: 3 additions & 2 deletions core/loader/rss.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ import {
findAnchorHrefs,
findImageByAttr,
findLinksByType,
toTime
toTime,
unique
} from './utils.js'

const MEDIA_NS_URI = 'http://search.yahoo.com/mrss/'
Expand All @@ -32,7 +33,7 @@ function parsePosts(text: TextResponse): OriginPost[] {

return {
full: description?.textContent ?? undefined,
media: [...new Set([...descriptionImages, ...mediaImages])],
media: unique([...descriptionImages, ...mediaImages]),
originId:
item.querySelector('guid')?.textContent ??
item.querySelector('link')!.textContent!,
Expand Down
6 changes: 6 additions & 0 deletions core/loader/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,9 @@ export function findImageByAttr(
return url ? urls.concat(url) : urls
}, [])
}

/**
 * Returns the distinct values of `collection`, skipping `null` and
 * `undefined`, in order of first occurrence.
 *
 * @param collection Any iterable of values that may contain gaps.
 * @returns A new array with each remaining value listed once.
 */
export function unique<T extends string | number = string>(
  collection: Iterable<T | null | undefined>
): T[] {
  let seen = new Set<T>()
  for (let value of collection) {
    // Loose comparison drops both null and undefined but keeps '' and 0.
    if (value != null) seen.add(value)
  }
  return Array.from(seen)
}
81 changes: 81 additions & 0 deletions core/test/loader/json-feed.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -467,3 +467,84 @@ test('validate wrong json feed format', async () => {
}
)
})

// Checks that the JSON feed loader fills each post's `media` from
// `banner_image`, `image` and the <img> tags of the post content, listing
// every URL only once.
test('parses media', async () => {
let task = createDownloadTask()
// Replace the task's `text` call so the loader receives a fixed two-item feed.
let text = spyOn(task, 'text', async () =>
exampleJson(
JSON.stringify({
...jsonStub,
items: [
// Item 1: both content fields are present and image/banner_image differ.
// The expectations below take `full` and the content image from
// content_html only (img_t.webp is never expected).
{
content_html:
'<p>HTML<img src="https://example.com/img_h.webp" /></p>',
content_text:
'<p>Text<img src="https://example.com/img_t.webp" /></p>',
date_published: '2022-01-04T00:00:00Z',
id: 'somehashid',
summary: 'summary',
title: 'title_1',
url: 'https://example.com/',
image: 'https://example.com/image.webp',
banner_image: 'https://example.com/banner_image.webp'
},
// Item 2: JSON.stringify omits the undefined content_html, so only
// content_text reaches the loader. image and banner_image share one URL,
// which the expected media lists once.
{
content_html: undefined,
content_text:
'<p><img src="https://example.com/img_0.webp">Text' +
'<img src="https://example.com/img_1.webp"></p>',
date_published: '2022-01-04T00:00:00Z',
id: 'somehashid2',
title: 'title_2',
url: 'https://example.com/2',
image: 'https://example.com/img.webp',
banner_image: 'https://example.com/img.webp'
}
]
})
)
)
let page = loaders.jsonFeed.getPosts(task, 'https://example.com/')
// Immediately after creation the page reports a loading state with no posts.
deepStrictEqual(page.get(), {
hasNext: true,
isLoading: true,
list: []
})

// Give the stubbed download time to finish.
// NOTE(review): presumably the promise-based setTimeout from
// node:timers/promises - confirm against the file's imports.
await setTimeout(10)
deepStrictEqual(page.get(), {
hasNext: false,
isLoading: false,
list: [
{
full: '<p>HTML<img src="https://example.com/img_h.webp" /></p>',
intro: 'summary',
// Expected order: banner_image, image, then images found in the content.
media: [
'https://example.com/banner_image.webp',
'https://example.com/image.webp',
'https://example.com/img_h.webp'
],
originId: 'somehashid',
publishedAt: 1641254400,
title: 'title_1',
url: 'https://example.com/'
},
{
full:
'<p><img src="https://example.com/img_0.webp">Text' +
'<img src="https://example.com/img_1.webp"></p>',
intro: undefined,
// img.webp appears once although both image and banner_image carry it.
media: [
'https://example.com/img.webp',
'https://example.com/img_0.webp',
'https://example.com/img_1.webp'
],
originId: 'somehashid2',
publishedAt: 1641254400,
title: 'title_2',
url: 'https://example.com/2'
}
]
})
// The stub was called exactly once, with the feed URL.
deepStrictEqual(text.calls, [['https://example.com/']])
})

0 comments on commit db737c0

Please sign in to comment.