diff --git a/apps/worker/src/jobs/link-unfurl.ts b/apps/worker/src/jobs/link-unfurl.ts index 2d0dc8f..182444c 100644 --- a/apps/worker/src/jobs/link-unfurl.ts +++ b/apps/worker/src/jobs/link-unfurl.ts @@ -10,8 +10,10 @@ import { channelRoom } from "@repo/realtime-types/rooms" import type { Emitter } from "@socket.io/redis-emitter" import type { Job } from "bullmq" import ogs from "open-graph-scraper" +import { logger } from "@/lib/logger" const OG_FETCH_TIMEOUT_MS = 5000 +const MAX_REDIRECTS = 5 const PRIVATE_IP_REGEX = /^(127\.|10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.|0\.|169\.254\.|::1|fc|fd|fe80)/i @@ -32,11 +34,18 @@ async function isSafeUrl(urlString: string): Promise { const addresses = await lookup(hostname, { all: true }) for (const { address } of addresses) { - if (PRIVATE_IP_REGEX.test(address)) return false + if (PRIVATE_IP_REGEX.test(address)) { + logger.warn( + { hostname, address }, + "Blocked private IP after DNS lookup" + ) + return false + } } return true - } catch { + } catch (err) { + logger.warn({ err, url: urlString }, "URL safety check failed") return false } } @@ -68,15 +77,72 @@ function matchProxyRule(originalUrl: string) { return null } +/** Follow redirects manually, validating each hop through isSafeUrl. */ +async function resolveRedirects(startUrl: string): Promise { + let current = startUrl + for (let i = 0; i < MAX_REDIRECTS; i++) { + let res: Response + try { + res = await fetch(current, { + method: "HEAD", + headers: { "user-agent": "Townhall/1.0 OGBot" }, + redirect: "manual", + signal: AbortSignal.timeout(OG_FETCH_TIMEOUT_MS), + }) + } catch (err) { + logger.warn( + { err, startUrl, current }, + "Redirect resolution fetch failed" + ) + return null + } + + if (res.status >= 300 && res.status < 400) { + const location = res.headers.get("location") + if (!location) return null + // Resolve relative Location headers against the current URL + let next: string + try { + next = new URL(location, current).toString() + } catch { + logger.warn({ location, current }, "Malformed redirect Location header") + return null + } + if (!(await isSafeUrl(next))) { + logger.warn( + { from: current, to: next }, + "Redirect target failed safety check" + ) + return null + } + current = next + continue + } + + return current + } + logger.warn({ url: startUrl }, "Too many redirects") + return null +} + async function fetchOgEmbed(url: string): Promise { const proxy = matchProxyRule(url) const fetchUrl = proxy?.fetchUrl ?? url - if (!(await isSafeUrl(fetchUrl))) return null + if (!(await isSafeUrl(fetchUrl))) { + logger.info({ url, fetchUrl }, "Skipped unsafe URL") + return null + } + + const resolvedUrl = await resolveRedirects(fetchUrl) + if (!resolvedUrl) { + logger.info({ url, fetchUrl }, "Redirect resolution failed") + return null + } try { const { error, result } = await ogs({ - url: fetchUrl, + url: resolvedUrl, timeout: OG_FETCH_TIMEOUT_MS, fetchOptions: { headers: { @@ -87,12 +153,18 @@ async function fetchOgEmbed(url: string): Promise { }) if (error || !result.success) { + logger.warn({ url, resolvedUrl, error }, "OG scrape returned no result") return null } const imageUrl = result.ogImage?.[0]?.url ?? result.twitterImage?.[0]?.url ?? undefined + logger.info( + { url, title: result.ogTitle, hasImage: !!imageUrl }, + "OG scrape succeeded" + ) + return { type: "link", url, @@ -102,7 +174,8 @@ async function fetchOgEmbed(url: string): Promise { thumbnail: imageUrl, siteName: proxy?.siteName ?? result.ogSiteName ?? undefined, } - } catch { + } catch (err) { + logger.error({ err, url, resolvedUrl }, "OG scrape threw") return null } } @@ -112,11 +185,22 @@ export function createLinkUnfurlProcessor( ) { return async (job: Job) => { const { messageId, channelId, urls } = job.data + logger.info( + { jobId: job.id, messageId, urlCount: urls.length }, + "Processing link-unfurl job" + ) + if (urls.length === 0) return const results = await Promise.all(urls.map(fetchOgEmbed)) const embeds = results.filter((e): e is Embed => e !== null) - if (embeds.length === 0) return + + if (embeds.length === 0) { + logger.info( + { jobId: job.id, messageId }, + "No embeds produced, clearing stored embeds" + ) + } const [updated] = await db .update(schema.message) @@ -124,7 +208,13 @@ export function createLinkUnfurlProcessor( .where(eq(schema.message.id, messageId)) .returning({ id: schema.message.id }) - if (!updated) return + if (!updated) { + logger.warn( + { jobId: job.id, messageId }, + "Message not found for embed update" + ) + return + } const payload: RealtimeMessageEmbedsUpdated = { channelId, @@ -133,5 +223,9 @@ export function createLinkUnfurlProcessor( } emitter.to(channelRoom(channelId)).emit("message:embeds:updated", payload) + logger.info( + { jobId: job.id, messageId, embedCount: embeds.length }, + "Embeds updated and emitted" + ) } }