Skip to content

Commit

Permalink
Parse media links from JSON feed
Browse files Browse the repository at this point in the history
  • Loading branch information
GreenTeaCake committed Apr 16, 2024
1 parent f50d549 commit 5c8107c
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 9 deletions.
35 changes: 26 additions & 9 deletions core/loader/json-feed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,19 +91,36 @@ function validate<ValidatedType>(
return true
}

/**
 * Matches `<img>` tags inside an HTML string. Capture group 1 holds the value
 * of the `src` attribute without surrounding quotes.
 *
 * - `\ssrc` demands whitespace right before the attribute name, so attributes
 *   that merely end with "src" (for example `data-src`) are not picked up.
 * - The value may be double-quoted, single-quoted or unquoted. The first
 *   captured character must not be a quote, and the value ends at the first
 *   point that is followed by whitespace or by the end of the tag.
 * - Any attributes placed after `src` are skipped by the trailing `[^>]*`.
 * - The `i` flag accepts upper-case markup such as `<IMG SRC="…">`.
 * - The `g` flag is required by callers that iterate over every match
 *   (`exec` loops or `String.prototype.matchAll`).
 */
const IMG_URL_REGEXP =
  /<img\b[^>]*?\ssrc\s*=\s*["']?([^"'\s>][^"\s>]*?)["']?(?=\s|\/?>)[^>]*>/gi

/**
 * Converts a downloaded JSON Feed response into a list of posts.
 *
 * Returns an empty list when the parsed JSON does not pass
 * `JSON_FEED_VALIDATORS`.
 *
 * For every feed item:
 * - `full` is `content_html`, falling back to `content_text` when the HTML
 *   body is missing or empty;
 * - `media` lists `banner_image`, then `image`, then the `src` of every
 *   `<img>` tag found in `full`. Empty values are dropped and duplicates are
 *   removed while keeping first-seen order.
 */
function parsePosts(text: TextResponse): OriginPost[] {
  let parsedJson = text.parseJson()
  if (!validate<JsonFeed>(parsedJson, JSON_FEED_VALIDATORS)) return []

  return parsedJson.items.map(item => {
    let full = (item.content_html || item.content_text) ?? undefined

    // Item-level images come first, images embedded into the body follow.
    let images: (null | string | undefined)[] = [item.banner_image, item.image]
    if (full) {
      // matchAll works on a copy of the regex, so the shared global
      // IMG_URL_REGEXP never carries `lastIndex` from one item to the next.
      for (let match of full.matchAll(IMG_URL_REGEXP)) {
        images.push(match[1])
      }
    }

    return {
      full,
      intro: item.summary ?? undefined,
      // Set keeps insertion order, so de-duplication preserves the ordering.
      media: [...new Set(images.filter((url): url is string => !!url))],
      originId: item.id,
      publishedAt: toTime(item.date_published) ?? undefined,
      title: item.title,
      url: item.url ?? undefined
    }
  })
}

export const jsonFeed: Loader = {
Expand Down
81 changes: 81 additions & 0 deletions core/test/loader/json-feed.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -467,3 +467,84 @@ test('validate wrong json feed format', async () => {
}
)
})

// Checks which media URLs the JSON Feed loader reports for each post:
// `banner_image`, `image` and the `src` of `<img>` tags from the post body,
// in that order and without duplicates.
test('parses media', async () => {
  let htmlBody = '<p>HTML<img src="https://example.com/img_h.webp" /></p>'
  let ignoredTextBody =
    '<p>Text<img src="https://example.com/img_t.webp" /></p>'
  let textOnlyBody =
    '<p><img src="https://example.com/img_0.webp">Text' +
    '<img src="https://example.com/img_1.webp"></p>'

  let task = createDownloadTask()
  let text = spyOn(task, 'text', async () =>
    exampleJson(
      JSON.stringify({
        ...jsonStub,
        items: [
          // Both bodies are present: only the HTML one is expected to be used.
          {
            content_html: htmlBody,
            content_text: ignoredTextBody,
            date_published: '2022-01-04T00:00:00Z',
            id: 'somehashid',
            summary: 'summary',
            title: 'title_1',
            url: 'https://example.com/',
            image: 'https://example.com/image.webp',
            banner_image: 'https://example.com/banner_image.webp'
          },
          // No HTML body; `image` and `banner_image` point to the same URL.
          {
            content_html: undefined,
            content_text: textOnlyBody,
            date_published: '2022-01-04T00:00:00Z',
            id: 'somehashid2',
            title: 'title_2',
            url: 'https://example.com/2',
            image: 'https://example.com/img.webp',
            banner_image: 'https://example.com/img.webp'
          }
        ]
      })
    )
  )

  let page = loaders.jsonFeed.getPosts(task, 'https://example.com/')

  // Nothing is loaded yet right after the request was started.
  deepStrictEqual(page.get(), {
    hasNext: true,
    isLoading: true,
    list: []
  })

  await setTimeout(10)

  let firstPost = {
    full: htmlBody,
    intro: 'summary',
    media: [
      'https://example.com/banner_image.webp',
      'https://example.com/image.webp',
      'https://example.com/img_h.webp'
    ],
    originId: 'somehashid',
    publishedAt: 1641254400,
    title: 'title_1',
    url: 'https://example.com/'
  }
  let secondPost = {
    full: textOnlyBody,
    intro: undefined,
    media: [
      'https://example.com/img.webp',
      'https://example.com/img_0.webp',
      'https://example.com/img_1.webp'
    ],
    originId: 'somehashid2',
    publishedAt: 1641254400,
    title: 'title_2',
    url: 'https://example.com/2'
  }
  deepStrictEqual(page.get(), {
    hasNext: false,
    isLoading: false,
    list: [firstPost, secondPost]
  })

  // The feed must have been requested exactly once.
  deepStrictEqual(text.calls, [['https://example.com/']])
})

0 comments on commit 5c8107c

Please sign in to comment.