diff --git a/CLAUDE.md b/CLAUDE.md
index e86a7939f6..7846bc27c9 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -195,6 +195,24 @@ Defined in root `tsconfig.json`:
 - Migrations: Drizzle Kit (`drizzle-kit`)
 - Embeddings: pgvector with 1536 dimensions
 
+### Migration discipline (read before touching `packages/api/drizzle/`)
+
+1. **Always generate via drizzle-kit.** Edit `packages/api/src/db/schema.ts` (or `packages/db/src/schema.ts` for the shared workspace), then run from the API package:
+
+   ```bash
+   cd packages/api && bun run db:generate
+   ```
+
+   Drizzle-kit emits a random-name file like `0048_loud_squirrel_girl.sql`. That random name is fine — keep it. The naming convention here is "whatever drizzle-kit gives you."
+
+2. **Do not rename a generated migration file.** The `meta/_journal.json` `tag` field, the migration SQL filename, and the snapshot filename all encode the migration identity together. Renaming any one of them (even with corresponding journal edits) makes the migration look hand-authored and creates drift that future drizzle-kit operations can mis-handle.
+
+3. **Do not hand-edit `meta/_journal.json`, `meta/*_snapshot.json`, or the generated SQL.** If the generated migration is wrong, fix the schema, delete the bad migration + snapshot + journal entry, and regenerate. Do not patch around it.
+
+4. **Collapse additive changes into one migration when they ship together** — fewer snapshot files in the diff, easier to revert as a unit. Splitting only makes sense when migrations need to land in separate releases.
+
+5. **Verify after generating.** Run `bunx drizzle-kit check` from `packages/api/` — it validates the snapshot chain is internally consistent. Run before pushing.
+
 ## EAS Build Profiles
 
 | Profile | Use | Distribution |
diff --git a/bun.lock b/bun.lock
index 95b75188be..8c7e5a5b02 100644
--- a/bun.lock
+++ b/bun.lock
@@ -473,6 +473,7 @@
         "@packrat/schemas": "workspace:*",
         "@packrat/types": "workspace:*",
         "@packrat/units": "workspace:*",
+        "@sentry/cloudflare": "^10.37.0",
         "@sinclair/typebox": "^0.34.15",
         "@types/nodemailer": "^6.4.17",
         "ai": "catalog:",
@@ -1980,7 +1981,9 @@
 
     "@sentry/cli-win32-x64": ["@sentry/cli-win32-x64@2.58.4", "", { "os": "win32", "cpu": "x64" }, "sha512-cSzN4PjM1RsCZ4pxMjI0VI7yNCkxiJ5jmWncyiwHXGiXrV1eXYdQ3n1LhUYLZ91CafyprR0OhDcE+RVZ26Qb5w=="],
 
-    "@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+    "@sentry/cloudflare": ["@sentry/cloudflare@10.53.1", "", { "dependencies": { "@opentelemetry/api": "^1.9.1", "@sentry/core": "10.53.1" }, "peerDependencies": { "@cloudflare/workers-types": "^4.x" }, "optionalPeers": ["@cloudflare/workers-types"] }, "sha512-iSohVibGRAKg7zLUflfA2ePG69Uw6bqm6iCQLM18hoG2gT4DGigaKcjJmZLTfAtW1DInMCb0DYc/mltCznxMrQ=="],
+
+    "@sentry/core": ["@sentry/core@10.53.1", "", {}, "sha512-XG4ezlkyuAPjBC5+9kXC94rXXuqYTw9NRhfaDHssbTFaGnqBR8vQX2UUgZfY7ucbeelRDGfBu1sywoU+mB04uA=="],
 
     "@sentry/hub": ["@sentry/hub@6.19.7", "", { "dependencies": { "@sentry/types": "6.19.7", "@sentry/utils": "6.19.7", "tslib": "^1.9.3" } }, "sha512-y3OtbYFAqKHCWezF0EGGr5lcyI2KbaXW2Ik7Xp8Mu9TxbSTuwTe4rTntwg8ngPjUQU3SUHzgjqVB8qjiGqFXCA=="],
 
@@ -5090,6 +5093,16 @@
 
     "@reduxjs/toolkit/immer": ["immer@11.1.8", "", {}, "sha512-/tbkHMW7y10Lx6i1crLjD4/OhNkRG+Fo7byZHtah0547nIeXYcpIXaUh0IAQY6gO5459qpGGYapcEOHtFXkIuA=="],
 
+    "@sentry-internal/browser-utils/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
+    "@sentry-internal/feedback/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
+    "@sentry-internal/replay/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
+    "@sentry-internal/replay-canvas/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
+    "@sentry/browser/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
     "@sentry/cli/https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="],
 
     "@sentry/cli/node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
@@ -5112,6 +5125,12 @@
 
     "@sentry/node/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="],
 
+    "@sentry/react/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
+    "@sentry/react-native/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
+    "@sentry/types/@sentry/core": ["@sentry/core@10.37.0", "", {}, "sha512-hkRz7S4gkKLgPf+p3XgVjVm7tAfvcEPZxeACCC6jmoeKhGkzN44nXwLiqqshJ25RMcSrhfFvJa/FlBg6zupz7g=="],
+
     "@sentry/utils/@sentry/types": ["@sentry/types@6.19.7", "", {}, "sha512-jH84pDYE+hHIbVnab3Hr+ZXr1v8QABfhx39KknxqKWr2l0oEItzepV0URvbEhB446lk/S/59230dlUUIBGsXbg=="],
 
     "@sentry/utils/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="],
diff --git a/docs/audits/2026-05-16-etl-audit.md b/docs/audits/2026-05-16-etl-audit.md
new file mode 100644
index 0000000000..84f1d69449
--- /dev/null
+++ b/docs/audits/2026-05-16-etl-audit.md
@@ -0,0 +1,183 @@
+# ETL Pipeline Audit — 2026-05-16
+
+## Summary
+
+The catalog ETL pipeline works end-to-end and has been hardened through a recent series of fixes (OOM, CPU-time budget, atomic counters, byte-range chunking), but it is not production-ready: chunking + a single shared `jobId` produces double-counted `totalProcessed`, mis-marks jobs `completed` after the first chunk finishes, and lacks any dead-letter / retry policy at the queue layer. Catastrophic per-message failures silently swallow errors in `processQueueBatch` (`try/catch` with `console.error` only), so the queue happily acks bad chunks. The retry endpoint also re-queues only the original object key, ignoring multi-chunk jobs entirely.
+
+**Top 3 risks**: (1) cross-chunk job-status race (any one chunk's completion marks the entire job `completed`), (2) consumer swallows errors so failed messages never retry/DLQ, (3) retry endpoint and stuck-job sweep are incompatible with byte-range chunking.
+
+## Architecture
+
+```
+POST /api/catalog/etl            ── api-key auth
+   │  body: { filename, chunks[], source, scraperRevision }
+   ▼
+1. INSERT etl_jobs (status='running')
+2. For each objectKey: R2.head() → split into 20 MB byte-range chunks
+3. queueCatalogETL → ETL_QUEUE.sendBatch (one message per chunk, same jobId)
+
+ETL_QUEUE (max_batch_size=1, max_concurrency=1)
+   ▼
+processQueueBatch
+   ▼
+processCatalogETL                ── per chunk
+   ├── R2.get(key, {range})      ── stream body
+   ├── if non-first chunk: GET first 4 KB → extract header → inject; skip partial row
+   ├── csv-parse stream w/ backpressure (parser.write returns false → wait 'drain')
+   ├── yield every 100 rows (setTimeout(1))
+   ├── flush at BATCH_SIZE=100 rows:
+   │     valid   → processValidItemsBatch → mergeBySku → embeddings → catalogService.upsert → updateEtlJobProgress
+   │     invalid → processLogsBatch       → invalid_item_logs        → updateEtlJobProgress
+   └── on success: UPDATE etl_jobs SET status='completed'   ◀── PROBLEM with multi-chunk jobs
+       on throw:    UPDATE etl_jobs SET status='failed' + rethrow
+```
+
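+Counters are atomic per call (`COALESCE(col, 0) + n` in SQL). Job-status updates are not: every chunk writes `status` on the shared job row independently, with no cross-chunk coordination.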
+
+## Findings
+
+### [P0] Multi-chunk jobs are marked `completed` after the first chunk finishes
+- **What**: All chunks for a single source file share one `jobId`; each chunk independently sets `status='completed'` on success.
+- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:188-191`; chunks created in `packages/api/src/routes/catalog/index.ts:182-200`.
+- **Why it matters**: A 100 MB file becomes 5 chunks → 5 messages → the first message to finish flips the job to `completed`, even though 80% of rows haven't been processed yet. The dashboard, `success_rate`, and any downstream check ("is the catalog refresh done?") fire prematurely. Subsequent chunks continue to mutate `totalProcessed/totalValid/totalInvalid`, so the row reads as `completed` with rising counters.
+- **Recommendation**: Track per-chunk completion. Two options: (a) add a `chunks_total` and `chunks_completed` column; only set `completed` when `chunks_completed = chunks_total`. (b) give each chunk its own jobId and group by a parent `batch_id`. Option (a) is the smaller change.
+
+### [P0] `processQueueBatch` swallows errors — failed chunks never retry or DLQ
+- **What**: Per-message exceptions are caught and logged but never rethrown; CF Queues auto-acks every message in the batch.
+- **Where**: `packages/api/src/services/etl/queue.ts:50-60`.
+- **Why it matters**: A transient DB error, OpenAI 429, or R2 read failure permanently loses the chunk. The job is marked `failed` (good) but the message is acked (bad) — there is no retry, no dead-letter queue, and `wrangler.jsonc` does not declare a `dead_letter_queue` or `max_retries`. Combined with the multi-chunk issue above, a single failure can corrupt the job state while other chunks succeed and mark it `completed`.
+- **Recommendation**: Rethrow in the catch (or call `message.retry()` explicitly on the specific message). Add `dead_letter_queue` and `max_retries: 3` to the ETL queue consumer in `wrangler.jsonc:76-82`. Process messages with `for...of` calling `message.ack()` / `message.retry()` explicitly so partial-batch semantics are correct even though `max_batch_size=1` today.
+
+### [P1] Retry endpoint discards multi-chunk structure
+- **What**: `POST /admin/etl/:jobId/retry` re-queues exactly one chunk built from `v2/${source}/${filename}` with no chunking.
+- **Where**: `packages/api/src/routes/admin/analytics/catalog.ts:434-450`.
+- **Why it matters**: If the original job was chunked (20 MB+ files), retry blasts the entire file at one Worker invocation, blowing past the 300s CPU-time limit that prompted the chunking work in the first place. Result: retries of any large failed job silently re-fail.
+- **Recommendation**: Re-run the same R2.head + chunk-split logic the producer endpoint uses (lines 182-200). Extract that to a shared helper so both call sites stay in sync.
+
+### [P1] Stuck-job sweep is wall-clock based and incompatible with serial chunked jobs
+- **What**: `POST /admin/etl/reset-stuck` flips any job in `running` for >30 min to `failed`.
+- **Where**: `packages/api/src/routes/admin/analytics/catalog.ts:384-403`.
+- **Why it matters**: With `max_concurrency=1` and 20 MB chunks each consuming most of a 300s CPU budget, a 500 MB file produces 25 chunks at up to ~5 minutes each → comfortably past 30 minutes. Healthy long jobs will be marked `failed`. The trigger should be "no progress for N minutes" (e.g., `totalProcessed` unchanged), not "started >30 min ago".
+- **Recommendation**: Add `lastProgressAt` updated on each `updateEtlJobProgress` call; sweep on `lastProgressAt < now - 15min`. Or check `completedAt IS NULL AND startedAt < now - 2h` for the absolute floor.
+
+### [P1] First-chunk header injection assumes the first 4 KB contains a complete header
+- **What**: For non-first chunks, the parser fetches `bytes 0-4095` and uses `headerText.split('\n')[0]` as the header row.
+- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:53-58`.
+- **Why it matters**: If the header row exceeds 4 KB (wide CSVs with 30+ columns and long names — possible here given the catalog schema has 25+ fields), `split('\n')[0]` returns a *truncated* header, so `fieldMap` silently maps the last column wrong. There is also no validation that the slice actually contained a newline before `byteEnd=4095`.
+- **Recommendation**: Loop the range request (or use a streaming `until newline` reader). At minimum, throw if no `\n` appears in the first 4 KB so the failure is loud, not silent.
+
+### [P1] Partial-row skip can drop a valid full row when chunk boundary lands on a newline
+- **What**: `skipPartialRow` discards everything up to and including the first `\n` after `byteStart`. If `byteStart` happens to be the first byte *after* a newline (i.e., the previous chunk's last byte is `\n`), the previous chunk has already processed its final row in full and this chunk correctly starts on a row boundary — but the skip logic still throws away this chunk's first whole row.
+- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:95-108`.
+- **Why it matters**: In the worst case, one whole row is dropped at every chunk boundary (data loss, not just duplication). For a 25-chunk file, that is potentially 24 lost catalog items. No test covers the boundary-aligned case.
+- **Recommendation**: When splitting chunks at line 195-196 of `routes/catalog/index.ts`, do not split on arbitrary 20 MB offsets — peek at R2 with a short range request and align `byteEnd` to a newline so the skip logic is unnecessary, *or* skip only when the previous byte (range `byteStart-1`) was non-newline.
+
+### [P1] CSV row spanning chunk boundary is never reassembled
+- **What**: A row beginning before `byteEnd` and ending after will be cut in half. The producing chunk parses a truncated row (likely fails validation); the next chunk discards the tail.
+- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:95-108` (skip logic), `routes/catalog/index.ts:182-200` (chunk creation).
+- **Why it matters**: Every chunk boundary loses (or invalidates) one row. Symptom would be `totalInvalid` rising by ~N per N-chunk job, with field-shaped errors. Severity depends on row width vs 20 MB.
+- **Recommendation**: Same as above — align chunk boundaries to row boundaries in the producer. Alternatively, the producing chunk should fetch ~64 KB beyond `byteEnd` to complete its final row, and the next chunk skip logic stays.
+
+### [P2] `console.log`/`console.error` only — no structured logging, no Sentry
+- **What**: Every log uses `console.log` with emoji prefixes; no Sentry integration in ETL paths despite Sentry being a documented monitoring tool.
+- **Where**: All ETL files; verified by `grep -rnE "Sentry|captureException" packages/api/src/services/etl/` → no results. The same search across all of `packages/api/src/` also returns no results.
+- **Why it matters**: A stuck job cannot be debugged without paging through CF Workers logs by hand. No correlation IDs (other than jobId), no per-chunk structured fields (`byteStart`, `rowsProcessed`, `elapsed_ms`), no error categorization. Failures in `processLogsBatch` are caught and `console.error`-ed without rethrow (`packages/api/src/services/etl/processLogsBatch.ts:25-27`) — invalid logs can fail to write and nobody knows.
+- **Recommendation**: Add a thin logger (`logger.info({ jobId, chunk: { byteStart, byteEnd }, event: 'chunk_start' })`). Call `Sentry.captureException(err, { tags: { jobId, objectKey } })` in the `processCatalogETL` catch block.
+
+### [P2] `processLogsBatch` swallows DB errors silently
+- **What**: Catch logs to console and returns normally — caller has no idea logs were dropped.
+- **Where**: `packages/api/src/services/etl/processLogsBatch.ts:25-27`.
+- **Why it matters**: Invalid-item logs are the *only* forensic record of what failed validation. If the INSERT fails (Neon hiccup, payload size, FK violation), we lose visibility forever. The `updateEtlJobProgress` call is also inside the try, so `totalInvalid`/`totalProcessed` will be undercounted.
+- **Recommendation**: Rethrow. Let the outer ETL catch flip the job to `failed` — the alternative is silent data quality erosion.
+
+### [P2] Embedding failure path silently drops embeddings without marking it
+- **What**: When `generateManyEmbeddings` throws, items are upserted with `embedding=undefined` (i.e., NULL) but the job still reports as fully successful.
+- **Where**: `packages/api/src/services/etl/processValidItemsBatch.ts:52-63`.
+- **Why it matters**: No metric distinguishes "successful with embeddings" from "successful but degraded". The `/admin/embeddings` route reports coverage but cannot attribute the drop to a specific job. A backfill is required to recover, and there is no automatic re-queue.
+- **Recommendation**: Add a `totalEmbeddingFailures` column on `etl_jobs`, increment it in the fallback path, and surface in the admin dashboard. Optionally enqueue the affected SKUs into `EMBEDDINGS_QUEUE` from the fallback for automatic backfill.
+
+### [P2] `parser.end()` is called inside a fire-and-forget IIFE — errors are unhandled
+- **What**: The async writer is invoked as `(async () => { ... })()` with no `.catch()`. Any stream read error or `parser.write` throw becomes an unhandled rejection.
+- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:89-117`.
+- **Why it matters**: In CF Workers, unhandled rejections can terminate the isolate. More commonly the outer `for await (const record of parser)` loop will just hang on a stalled parser if the writer rejected. The job will sit in `running` until the stuck-job sweep notices.
+- **Recommendation**: Wrap in an explicit promise: `const writerPromise = (async () => { ... })().catch(err => parser.destroy(err));` and `await writerPromise` after the `for await` loop. Surface the error to the outer catch.
+
+### [P2] `setTimeout(resolve, 1)` every 100 rows is a fragile yield mechanism
+- **What**: Used to yield to event loop / give GC a chance.
+- **Where**: `packages/api/src/services/etl/processCatalogEtl.ts:120`.
+- **Why it matters**: `setTimeout` consumes wall-clock budget. Workers have a 30s wall-clock per invocation (separate from `cpu_ms`). At 1ms × 600 yields per 60k-row chunk = 0.6s — fine today, but the comment mentions a previous "per-row yield hits the CF Worker wall-clock limit". The thresholds are tightly coupled and undocumented.
+- **Recommendation**: Replace with `await scheduler.yield()` (CF supports it) or `await new Promise(setImmediate)`-equivalent. Add a unit test that verifies a 100k-row CSV completes within wall-clock.
+
+### [P2] `BATCH_SIZE = 100` is exported but reads inconsistent with comment/runtime
+- **What**: `processCatalogEtl.ts:13` exports `BATCH_SIZE = 100`. The OpenAI embedding API used by the catalog supports 1000+ inputs per call, so this is conservative; meanwhile the queue's `batchSize` for `sendBatch` is hard-coded at 100 (`queue.ts:17`) for an unrelated reason (max batch size from CF). Reusing the same literal value `100` for two unrelated concepts is fragile: a change to one limit can easily be mistaken for, or wrongly applied to, the other.
+- **Where**: `processCatalogEtl.ts:13`, `queue.ts:17`.
+- **Recommendation**: Rename to `ITEM_FLUSH_BATCH_SIZE` and `CF_QUEUE_BATCH_SIZE`, hoist both to a shared constants file.
+
+### [P3] `mergeItemsBySku` logs change diff on every merge — unbounded console output
+- **What**: Logs a `🔄 Merged SKU` line for every SKU collision with every changed field.
+- **Where**: `packages/api/src/services/etl/mergeItemsBySku.ts:34-48`.
+- **Why it matters**: On a 500 MB CSV with many duplicate SKUs across chunks, this can produce millions of log lines, polluting CF logs and possibly hitting `logpush` quotas.
+- **Recommendation**: Aggregate into a single per-batch summary or gate behind a debug flag.
+
+### [P3] Validator: no URL scheme check, no length limits, no SKU charset rules
+- **What**: `isValidUrl` allows any `new URL()`-parseable input (e.g., `mailto:`, `javascript:`, `file:`).
+- **Where**: `packages/api/src/services/etl/CatalogItemValidator.ts:60-67`.
+- **Why it matters**: `productUrl` is rendered in the mobile app and on the guides site. A scraper bug could inject `javascript:` URLs that survive to the UI.
+- **Recommendation**: Restrict to `http:`/`https:`. Add length caps (`name` ≤ 500, `description` ≤ 50k, `sku` matches `[A-Za-z0-9_.\-/]+`).
+
+### [P3] Soft-delete is not handled by the upsert
+- **What**: `catalogItems` has no `deletedAt` column (verified — grep returns nothing). CLAUDE.md notes "Soft deletes for all user content" but catalog items are scraper-controlled, so this may be intentional. However, an item that disappears from the source CSV is never marked unavailable.
+- **Where**: `packages/api/src/db/schema.ts:132-215`; `packages/api/src/services/catalogService.ts:337-407`.
+- **Why it matters**: The catalog grows monotonically. Discontinued products keep their `availability` from the last successful upsert. There is no "items present in last job but not in this one → mark out-of-stock" reconciliation.
+- **Recommendation**: After a successful ETL, run `UPDATE catalog_items SET availability='OutOfStock' WHERE NOT EXISTS (SELECT 1 FROM catalog_item_etl_jobs WHERE catalog_item_id = id AND etl_job_id IN (last N jobs for this source))`. Or accept the limitation and document it.
+
+### [P3] No invalid-items retention policy
+- **What**: `invalid_item_logs` grows forever; no TTL/sweep.
+- **Where**: `packages/api/src/db/schema.ts:481-490`.
+- **Why it matters**: Each bad row stores `raw_data` as JSONB plus an `errors` array — a single bad upload can write hundreds of MB to Neon.
+- **Recommendation**: Add a scheduled task (or CF Cron Trigger) to drop logs >90 days.
+
+### [P3] No runbook / deploy docs
+- **What**: There is no `docs/runbooks/etl.md`. Running `grep -E "etl|ETL"` over `README.md` and `docs/` returns only stale plan files.
+- **Where**: N/A (missing).
+- **Recommendation**: Write a 1-page runbook: how to trigger an ETL, how to inspect queue depth (`wrangler queues list/info packrat-etl-queue`), how to retry a failed job, how to drain the queue (`wrangler queues consumer remove`), how to interpret `success_rate`. Reference admin endpoints `/admin/etl/*`.
+
+## Test Coverage Gaps
+
+Tests cover the happy path with mocked R2 and globally-mocked DB. The following are **not** tested:
+
+- **Byte-range chunk processing** — no test sets `byteStart`/`byteEnd` in the message. The injected-header fetch, partial-row skip, and boundary off-by-ones (P1 above) are entirely uncovered.
+- **Multi-message job (same jobId, multiple chunks)** — no integration test exercises the "two chunks complete sequentially" path, so the P0 premature-completion bug is invisible to CI.
+- **Header > 4 KB** — see P1 finding.
+- **Row spanning chunk boundary** — see P1 finding.
+- **Embedding service failure path** — `processValidItemsBatch.test` mocks the rejection but does not assert that items were upserted without embeddings (the actual fallback behavior).
+- **`processLogsBatch` DB failure** — no test for the swallowed-error case.
+- **Backpressure** — `parser.write` returning `false` and waiting on `'drain'` is not unit-testable with the current mock (whole CSV emitted in one chunk).
+- **Yield/wall-clock budget** — no test asserts a 100k-row CSV completes under wall-clock.
+- **`processQueueBatch`** — no direct test; the per-message catch-and-swallow (P0) is untested.
+- **Retry endpoint** — no integration test verifies the retry produces a new running job and a queue send.
+- **Stuck-job sweep** — no test for the 30-minute cutoff.
+- **Concurrent updates to same job row** — no race-condition test (e.g., two batches calling `updateEtlJobProgress` interleaved). Atomicity at the SQL level is good but a parallel-batch test would lock it in.
+- **`mergeItemsBySku` cross-chunk SKU collisions** — merging happens within a single batch; SKUs duplicated across batches (or across chunks) hit the DB upsert path, not the merge path. No test for that.
+- **Header injection — wrong column ordering** — what if the source CSV has a BOM, or quoted headers with commas inside?
+
+## Production Readiness Checklist
+
+- [ ] Multi-chunk job completion tracked correctly (chunks_total / chunks_completed columns) — addresses P0 #1
+- [ ] Queue consumer rethrows on per-message failure; DLQ + max_retries configured in `wrangler.jsonc` — addresses P0 #2
+- [ ] Retry endpoint chunks large files the same way the producer does — addresses P1 #1
+- [ ] Stuck-job sweep keyed on `lastProgressAt`, not `startedAt` — addresses P1 #2
+- [ ] Chunk boundaries aligned to row boundaries in the producer (or reassembly in the consumer) — addresses P1 #4 and P1 #5
+- [ ] Header injection validates first 4 KB contains a `\n`; tested with wide CSV — addresses P1 #3
+- [ ] Sentry integration in ETL paths with `jobId`/`objectKey` tags — addresses P2 #1
+- [ ] `processLogsBatch` rethrows on DB failure — addresses P2 #2
+- [ ] Embedding fallback tracked via counter and visible in admin dashboard — addresses P2 #3
+- [ ] Writer IIFE error attached to outer flow — addresses P2 #4
+- [ ] Yield mechanism uses `scheduler.yield()` and has a wall-clock test — addresses P2 #5
+- [ ] Rename ambiguous `BATCH_SIZE` constants — addresses P2 #6
+- [ ] `mergeItemsBySku` summary log instead of per-SKU — addresses P3 #1
+- [ ] Validator enforces `http(s):` scheme and length caps — addresses P3 #2
+- [ ] Discontinued-item reconciliation strategy chosen and documented — addresses P3 #3
+- [ ] `invalid_item_logs` retention policy — addresses P3 #4
+- [ ] Runbook checked in at `docs/runbooks/etl.md` — addresses P3 #5
+- [ ] Test coverage added for all gaps listed above
diff --git a/docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md b/docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md
new file mode 100644
index 0000000000..6df4f8b89c
--- /dev/null
+++ b/docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md
@@ -0,0 +1,1063 @@
+---
+title: "fix: ETL pipeline audit remediation"
+type: fix
+status: superseded
+supersededBy: docs/plans/2026-05-20-001-fix-etl-pipeline-workflows-migration-plan.md
+supersededReason: "Pivoted execution engine from Cloudflare Queues + outbox to Cloudflare Workflows on 2026-05-20. Workflows natively provides the durable-step + idempotency + retry + state semantics that ~8 of the 15 units in this plan were manually reconstructing. The audit findings about CSV correctness, validator hardening, observability, retention, and runbook remain real and carry into the successor plan; the queue-as-state-machine subplot is dropped."
+date: 2026-05-19
+deepened: 2026-05-19
+origin: docs/audits/2026-05-16-etl-audit.md
+---
+
+# fix: ETL pipeline audit remediation
+
+## Summary
+
+Remediate the catalog ETL pipeline against every finding in the 2026-05-16 audit (2 P0, 5 P1, 6 P2, 5 P3), correct two stale assumptions the audit made about Cloudflare runtime APIs, add bucket-vs-job reconciliation (both an admin-triggered tool and automatic post-job verification), and add a "re-ingest from the top" recovery path for jobs the buggy stuck-job sweep has already corrupted. Delivered as one master plan in four sequenced phases — schema + P0 blockers first, then chunking correctness, then observability + reconciliation, then hardening + runbook.
+
+---
+
+## Problem Frame
+
+The pipeline ingests scraper CSVs from R2 (`packrat-scrapy-bucket`) into Neon Postgres via a Cloudflare Queue consumer. It is currently silently incorrect: live prod admin data (192 runs / 74 failed = 38% failure rate) shows seven large jobs from 2026-05-14 marked `failed` with identical `completedAt` timestamps — the wall-clock-based stuck-job sweep firing on healthy long jobs — while the dashboard reports `successRate: 100%` on those same failed jobs. Audit `docs/audits/2026-05-16-etl-audit.md` enumerates the structural causes: a single shared `jobId` across byte-range chunks lets the first finishing chunk flip the parent job to `completed`, per-message exceptions are swallowed (no DLQ, no retry), byte-range chunk boundaries silently drop or invalidate rows that span them, retries discard chunking entirely, and there is no Sentry / structured logging anywhere in the ETL path.
+
+The user's stated concern — *"some [data] is missing or falsely labeling as success"* — is corroborated on both ends: `completed` jobs can be premature (P0 #1), and `failed` jobs can be false failures (P1 #2). Either way the catalog count `totalItemsIngested: 304,431` cannot currently be trusted.
+
+---
+
+## Requirements
+
+- R1. **No chunk causes premature job completion.** A multi-chunk job transitions to `completed` only when every chunk has succeeded.
+- R2. **Per-message queue failures retry and ultimately DLQ.** No exception thrown by chunk processing is silently swallowed.
+- R3. **Stuck-job sweep is progress-based, not wall-clock-based.** Healthy long-running jobs are not falsely marked `failed`.
+- R4. **Chunk boundaries do not drop or invalidate rows.** Every row in the source CSV is processed exactly once.
+- R5. **Retry / repair endpoints chunk the same way the producer does.** Retrying a large file does not single-shot it.
+- R6. **CSV header injection for non-first chunks is correct or fails loudly.** No silent column misalignment.
+- R7. **Every ETL job has post-ingestion verification.** R2 row count is compared to `totalProcessed` and the result is observable; significant deltas are surfaced.
+- R8. **Operators can trigger a "from scratch" repair of any historical job** without invoking the original producer endpoint.
+- R9. **Failures emit Sentry events with structured context.** Operators can debug a stuck job without paging through raw Worker logs.
+- R10. **Embedding-fallback degradation is observable.** A job that completed without embeddings is distinguishable from a fully-successful one.
+- R11. **Validator rejects unsafe URLs and oversize fields.** Mobile/web cannot be tricked into rendering `javascript:` URLs from the catalog.
+- R12. **`invalid_item_logs` retention is bounded.** A bad upload cannot fill Neon storage indefinitely.
+- R13. **A documented runbook exists for ETL operations.** A new on-caller can trigger / inspect / retry / drain without reading source.
+- R14. **Test coverage exists for every behavior in R1–R12.** Specifically including the cases the global queue-mock in `packages/api/test/setup.ts` currently hides.
+
+---
+
+## Scope Boundaries
+
+- The plan does not raise `max_concurrency` above 1 for the ETL queue. Concurrency bump is blocked on per-chunk idempotency keys that this plan introduces; the actual bump is a follow-up after this lands and bakes.
+- The plan does not add a DLQ to the embeddings queue. ETL queue DLQ only.
+- The plan does not migrate or rewrite the existing `etl_jobs` row data for the 7 historical jobs falsely marked `failed`. The repair-from-scratch endpoint introduced in U6 is the mechanism operators will use; the actual recovery run is operational, not a code unit.
+- The plan does not change the producer endpoint's authentication, the source CSV schema, or the scraper revision pinning.
+- The plan does not introduce a new ETL Worker — the current `packages/api` Elysia Worker continues to host both the HTTP routes and the queue consumer.
+- The plan does not address `apps/landing` / `apps/guides` / `apps/expo` consumers of catalog data even when bucket-vs-job reconciliation finds drift. Surfacing inconsistencies is in scope; downstream cache invalidation is not.
+
+### Deferred to Follow-Up Work
+
+- **Concurrency bump on `packrat-etl-queue` consumer**: separate PR after this plan ships and per-chunk idempotency is verified in production for ≥2 weeks.
+- **Embeddings-queue DLQ + retry policy**: separate plan; same shape as ETL DLQ work in U3, but a distinct surface.
+- **Catalog reconciliation across multiple historical jobs**: only per-job reconciliation is in scope. Historical cross-source rollup ("did we lose 5% of the catalog last quarter?") is a separate analytics workstream.
+- **Soft-delete / discontinued-item reconciliation** (audit P3 #3): documented as accepted limitation in the runbook (catalog is scraper-controlled, not user content). A future plan can add `availability='OutOfStock'` reconciliation if business requirements emerge.
+- **CLI subcommand surface in `packages/cli/src/commands/admin/etl.ts`**: U12 wires the new admin endpoints into the existing CLI command file. Broader CLI ergonomics work is out of scope.
+
+---
+
+## Context & Research
+
+### Relevant Code and Patterns
+
+- **Producer endpoint:** `packages/api/src/routes/catalog/index.ts:229-293` — `POST /catalog/etl`, R2 head + 20 MB chunking at `:253-271`. Chunk creation logic to extract into a shared helper used by U6.
+- **Queue producer:** `packages/api/src/services/etl/queue.ts:6-41` — `queueCatalogETL`; uses `sendBatch` with `batchSize: 100` (CF queue per-call cap).
+- **Queue consumer dispatch:** `packages/api/src/services/etl/queue.ts:43-61` — `processQueueBatch` with the swallowed catch at `:50-60`. **This is the core P0 #2 surface.**
+- **Per-chunk processor:** `packages/api/src/services/etl/processCatalogEtl.ts` — header injection (`:50-58`), partial-row skip (`:95-108`), batch flush (`:120-187`), per-chunk completion (`:188-191`), per-chunk failure (`:201-204`).
+- **Atomic counter pattern (mirror this):** `packages/api/src/services/etl/updateEtlJobProgress.ts:16-23` — `sql\`COALESCE(${col}, 0) + ${n}\``. New `chunks_completed` / `total_embedding_failures` increments use the same idiom; the "set status=completed when chunks_completed+1 == chunks_total" branch uses a single `UPDATE ... SET ... WHERE` with a `CASE` expression in the same transaction.
+- **Embeddings queue pattern (mirror this):** `packages/api/src/services/catalogService.ts:461-507` — consumer rethrows on failure so CF Queue retries fire. ETL consumer must adopt the same shape.
+- **Admin routing pattern:** `packages/api/src/routes/admin/index.ts:117-237` mounts the admin prefix; `:230-237` enforces `adminAuthGuard` on every sub-route. New endpoints in `packages/api/src/routes/admin/analytics/catalog.ts` inherit the guard.
+- **R2 access (S3-API not Workers binding):** `packages/api/src/services/r2-bucket.ts:193-360` — `R2BucketService({ env, bucketType: 'catalog' })` wraps `@aws-sdk/client-s3` against the R2 S3 endpoint. `r2.head(key)` and `r2.get(key, { range: { offset, length } })` are the surface. Range format `bytes=offset-(offset+length-1)` at `:675-691`.
+- **Schema location:** `packages/db/src/schema.ts:446-510` — `etlJobs`, `invalidItemLogs`, `catalogItemEtlJobs`, status enum at `:460`. **Audit cites a stale path (`packages/api/src/db/schema.ts`); the file was extracted into the `packages/db` package — see merge `b14f4dbd5`.**
+- **Drizzle migration location:** `packages/api/drizzle/NNNN_<name>.sql` + `meta/NNNN_snapshot.json` + `_journal.json`. Latest is `0047_cute_bloodscream.sql`; the new migration lands at `0048`, with an optional `0049` if `superseded_by_job_id` is split out (no split is forced by Drizzle Kit's enum-add constraint, because this plan leaves the `etl_job_status` enum untouched). Generated via `bun run --cwd packages/api db:generate`. Custom linter at `scripts/lint/check-drizzle-migrations.ts` runs in `lint:custom`.
+- **Existing ETL integration test:** `packages/api/test/etl.test.ts` — mocks `R2BucketService` per-test, uses real Postgres via wsproxy at `localhost:5434`. Setup at `packages/api/test/setup.ts:535-572` globally mocks both `queueCatalogETL` and `processQueueBatch` (lines `:544-551`) — this is precisely *why* the per-message swallow in P0 #2 is invisible to CI today, and U14 must un-mock to cover it.
+- **Wrangler config:** `packages/api/wrangler.jsonc:65-89` (prod queues) and `:161-194` (dev). Currently `max_batch_size: 1, max_concurrency: 1`, **no `dead_letter_queue`, no `max_retries`** on either consumer. Queue routing handler at `packages/api/src/index.ts:109-124`.
+- **Admin CLI surface:** `packages/cli/src/commands/admin/etl.ts` already exists. New endpoints in U6 and U12 add corresponding subcommands.
+
+### Institutional Learnings
+
+- `docs/solutions/` has no prior ETL, Cloudflare Queues, R2 byte-range, or Sentry-in-Workers learnings — only an unrelated Better Auth CLI note and an Android UI bug. This remediation is greenfield from an institutional-knowledge standpoint, which makes it a strong `/ce-compound` target after each phase ships.
+
+### External References
+
+- **Cloudflare Queues — ack/retry semantics:** `message.ack()` / `message.retry({ delaySeconds })` / `ackAll()` / `retryAll()` documented at <https://developers.cloudflare.com/queues/configuration/javascript-apis/>. Throwing fails the un-acked remainder of the batch. `retryDelaySeconds` max is 24h per <https://developers.cloudflare.com/queues/platform/limits/>.
+- **Cloudflare Queues — DLQ:** `dead_letter_queue` (string name) + `max_retries` (default 3, max 100) in the consumer block per <https://developers.cloudflare.com/queues/configuration/dead-letter-queues/>.
+- **Cloudflare Workers Scheduler:** Only `scheduler.wait(ms)` is documented at <https://developers.cloudflare.com/workers/runtime-apis/scheduler/>. **`scheduler.yield()` does not exist** — the audit P2 #5 recommendation is wrong on this. Use `await scheduler.wait(0)` instead.
+- **Wall-clock limit:** Queue consumer wall-clock cap is **15 minutes**, not 30 seconds, per <https://developers.cloudflare.com/queues/platform/limits/>. The audit's "30 s wall-clock" framing under P2 #5 is stale.
+- **Sentry on Cloudflare:** Prefer the first-party `@sentry/cloudflare` over toucan-js. Wrap via `Sentry.withSentry(optsFn, { fetch, queue })` per <https://docs.sentry.io/platforms/javascript/guides/cloudflare/>. Queue instrumentation guidance at <https://docs.sentry.io/platforms/javascript/guides/cloudflare/tracing/instrumentation/queues-module/>.
+- **Drizzle enum-add limitation:** `ALTER TYPE … ADD VALUE` inside the same transaction as code that uses the new value fails. Split migrations. Tracked at <https://github.com/drizzle-team/drizzle-orm/issues/3249>.
+- **R2 range reads with AWS SDK:** R2's S3 API fully supports the `Range` header — `GetObjectCommand({ Range: 'bytes=0-1023' })` behaves identically to S3 per <https://developers.cloudflare.com/r2/api/s3/api/>.
+
+---
+
+## Key Technical Decisions
+
+- **Track chunk completion via two new columns (`chunks_total`, `chunks_completed`) on the existing `etl_jobs` row, gated by a per-chunk idempotency table `etl_job_chunks(job_id, chunk_index, completed_at)` with PK on `(job_id, chunk_index)`.** Rationale: even at `max_concurrency: 1` today, Cloudflare Queues are *at-least-once* — a chunk whose DB writes succeed but whose ack is lost will be redelivered, which would double-increment a naive `chunks_completed = chunks_completed + 1` and either crash through `chunks_total` or transition the job to `completed` while a sibling chunk is still pending. The idempotency table makes the increment a deterministic side-effect of `INSERT … ON CONFLICT (job_id, chunk_index) DO NOTHING RETURNING 1`; the counter only bumps when the insert created a new row. This was originally scoped as a follow-up under "Deferred" but the deepening pass surfaced it as a correctness prerequisite — pulled forward into U1/U2.
+- **No new `partial` enum value on `etl_job_status`.** Embedding-fallback degradation is observable via `total_embedding_failures > 0` on a `completed` row. Adding an enum value would force the audit P2 #3 split into two migrations (Drizzle Kit limitation) and complicate every admin filter without observable benefit.
+- **Use `@sentry/cloudflare` (first-party), not toucan-js as the audit suggested.** Toucan still works but is no longer the recommended Sentry path on Workers as of 2026. `withSentry({ fetch, queue })` wraps both entry points in one call; no manual `waitUntil` plumbing needed.
+- **Use `await scheduler.wait(0)` for yielding, not the non-existent `scheduler.yield()`.** Audit P2 #5 is corrected here.
+- **Stuck-job sweep keyed on `last_progress_at < now() - interval '15 minutes'` AND `status = 'running'`,** not on `started_at`. The 15-min figure derives from the actual CF Queue consumer wall-clock cap (15 min), not the audit's stale 30 s/30 min framing. With per-chunk progress updates writing `last_progress_at`, any chunk making real progress is safe; only truly stalled jobs flip to `failed`.
+- **Row-boundary alignment happens in the producer**, not the consumer. The producer's `r2.head(key)` flow does an extra small range read on each chunk-end region (e.g., 64 KB) to find the last `\n` and emits chunks with newline-aligned `byteEnd`. This eliminates both chunk-boundary bugs — the partial-row skip and the row-spanning-chunk loss (audit P1 #3 and P1 #4) — in one place. The consumer's `skipPartialRow` logic is removed.
+- **CSV header re-read with bounded loop, not a fixed 4 KB slice.** For non-first chunks, the consumer fetches `[0, 4096)`, and if no `\n` appears, expands to `[0, 16384)`, then `[0, 65536)`. If there is still no newline, throw — the header is malformed. Eliminates the silent column misalignment from audit P1 #5.
+- **Per-chunk idempotency key is `(jobId, chunkIndex)`** — added to `CatalogETLMessage`. Even though `max_concurrency: 1` means de-facto serialization today, threading the key now unblocks the future concurrency bump without another migration.
+- **DLQ is a dedicated new queue `packrat-etl-dlq`** with a minimal consumer that captures the failure to Sentry, persists a row to a new `etl_dlq_events` table for forensics, and acks. The DLQ does *not* attempt to re-process — it's an event sink + visibility tool.
+- **Reconciliation runs as both a manual admin endpoint and an automatic post-job step, with the automatic step on its own queue.** Manual endpoint stays synchronous (operator-explicit, scoped to one job). Automatic step is dispatched as a queue message to a new `packrat-etl-reconcile-queue` on the final-chunk completion transition, *not* via `ctx.waitUntil` — `waitUntil` shares the queue invocation's wall-clock budget, which for a multi-GB CSV exceeds the 15-min cap when added on top of the chunk's own processing time. The reconcile consumer streams the file in 100 MB byte-range windows with progress checkpointed to a transient column so retries resume. The consumer's `INSERT … RETURNING` includes `verified_at IS NULL` as an idempotency gate so a redelivered reconcile message is a no-op. Warning threshold remains `> max(10, ceil(0.01 * total_processed))`.
+- **Repair-from-scratch endpoint creates a NEW `etl_jobs` row and links it to the old via a new nullable `superseded_by_job_id` column with `ON DELETE SET NULL` and a paired `superseded_at timestamp`.** No mutation of the old row's counters — preserves audit trail and lets the dashboard show "originally failed, repaired by job X". `ON DELETE SET NULL` (not `CASCADE`) so deleting one row never silently nukes a chain of repair attempts. A CHECK constraint prevents self-reference (`superseded_by_job_id != id`). The runbook procedure (U15) requires verifying R2 source presence + ETag match before invoking repair, so an overwritten source cannot silently re-ingest the wrong file.
+- **Structured logger lives at `packages/api/src/utils/logger.ts`** as a thin wrapper around `console.*` for now, accepting a `LogContext` (jobId, chunkIndex, r2Key, etc.) and emitting JSON-prefixed lines. Sentry breadcrumbs piggyback on the same call surface. Not a full logger framework — that's a separate decision.
+
+---
+
+## Open Questions
+
+### Resolved During Planning
+
+- **Should chunk-completion tracking live on `etl_jobs` columns alone, or be paired with a per-chunk idempotency table?** Resolved during deepening: both. `etl_jobs.{chunks_total, chunks_completed}` are the counters; `etl_job_chunks(job_id, chunk_index)` is the idempotency gate that makes the increment safe under at-least-once delivery. See Key Technical Decisions.
+- **Should embedding-fallback get a new enum value `partial`?** Resolved: no — use `total_embedding_failures` counter on a `completed` row.
+- **Toucan-js or `@sentry/cloudflare`?** Resolved: `@sentry/cloudflare`. See External References.
+- **Wall-clock budget for the stuck-job sweep cutoff?** Resolved: `last_progress_at < now() - interval '15 minutes'`, matching the actual queue-consumer wall-clock cap.
+- **Should the row-boundary alignment happen in producer or consumer?** Resolved: producer. Single source of truth for chunk boundaries.
+- **Should auto-reconcile use `ctx.waitUntil` or its own queue?** Resolved during deepening: dedicated queue (`packrat-etl-reconcile-queue`) with resumable byte-range streaming. `waitUntil` shares the chunk consumer's wall-clock budget, which fails at multi-GB files.
+- **Should the DLQ consumer's INSERT + status UPDATE be transactional?** Resolved during deepening: yes, single `db.transaction()`. Same for the sweep's UPDATE + sentinel-event INSERT.
+- **Should the migration split into 0048a/0048b/0048c?** Resolved during deepening: no — at ~200 rows, the single-migration approach is fine. Splitting becomes correct when `etl_jobs` exceeds ~100k rows, and the migration header carries a comment to revisit at that scale.
+
+### Deferred to Implementation
+
+- **Exact Drizzle migration sequencing within Phase 1.** All new `etl_jobs` columns, the new indexes, and the new `etl_job_chunks` and `etl_dlq_events` tables can land in a single migration `0048` since none touch the enum. Whether to split `superseded_by_job_id` (first consumed in U6) into its own migration `0049` or include it in `0048` is decided at U1 implementation. Either way the enum stays untouched in this plan.
+- **`@sentry/cloudflare` instrumentation depth for the queue consumer.** The exact `Sentry.startSpan` attributes per queue message (some attributes are conventional, some are CF-specific) get finalized when U8 lands.
+- **Sentry sampling rate** for the queue consumer. Default to `tracesSampleRate: 0.1` and tune in production; not a plan-time decision.
+- **Exact threshold for "significant" reconciliation delta** that triggers a Sentry warning vs informational event. Default: `> max(10, ceil(0.01 * total_processed))` rows of delta. Tunable in production.
+- **Cron schedule for `invalid_item_logs` retention sweep.** Daily at 09:00 UTC unless ops has a quieter window.
+
+---
+
+## High-Level Technical Design
+
+> *This illustrates the intended approach and is directional guidance for review, not implementation specification. The implementing agent should treat it as context, not code to reproduce.*
+
+```text
+Producer  ─── POST /catalog/etl ──┐
+                                  │
+                                  ▼
+        ┌─────────────────────────────────────────────┐
+        │ chunkCsvForR2(key)  (NEW shared helper)     │
+        │   1. r2.head(key) -> size                   │
+        │   2. for each 20 MB window:                 │
+        │        peek (next 64 KB) to find last '\n'  │
+        │        emit chunk with byteEnd = newline-1  │
+        │   3. tag each chunk: { jobId, chunkIndex,   │
+        │                        chunksTotal, byteRange }
+        └─────────────────────────────────────────────┘
+                                  │
+                          INSERT etl_jobs
+                          (status='running',
+                           chunks_total=N,
+                           chunks_completed=0)
+                                  │
+                          ETL_QUEUE.sendBatch(chunks)
+                                  │
+                                  ▼
+        ┌─────────────────────────────────────────────┐
+        │ processQueueBatch (REWRITE)                 │
+        │   for message of batch:                     │
+        │     try {                                   │
+        │       processCatalogETL(msg)                │
+        │       message.ack()                         │
+        │     } catch (err) {                         │
+        │       Sentry.captureException(err, {...})   │
+        │       message.retry({ delaySeconds: 30 })   │
+        │     }                                       │
+        └─────────────────────────────────────────────┘
+                                  │
+                                  ▼
+        ┌─────────────────────────────────────────────┐
+        │ processCatalogETL (per chunk)               │
+        │   r2.get(key, range) -> stream              │
+        │   if chunkIndex > 0: re-fetch header        │
+        │     (expand 4K→16K→64K, throw if no '\n')   │
+        │   parse rows (csv-parse, backpressure)      │
+        │   per 100 rows: scheduler.wait(0)           │
+        │   flush valid -> processValidItemsBatch     │
+        │     (embedding fallback increments          │
+        │      total_embedding_failures atomically)   │
+        │   flush invalid -> processLogsBatch         │
+        │     (now RETHROWS on DB failure)            │
+        │   on success:                               │
+        │     UPDATE etl_jobs                         │
+        │       SET chunks_completed = chunks_completed+1,
+        │           last_progress_at = now(),         │
+        │           status = CASE                     │
+        │             WHEN chunks_completed+1         │
+        │                  = chunks_total             │
+        │             THEN 'completed'                │
+        │             ELSE status                     │
+        │           END                               │
+        │     if completed (in same txn):             │
+        │       enqueue ReconcileMessage to           │
+        │       packrat-etl-reconcile-queue           │
+        └─────────────────────────────────────────────┘
+                                  │
+                  (on completion transition)
+                                  ▼
+        ┌─────────────────────────────────────────────┐
+        │ processReconcileBatch                       │
+        │   reconcileJob(jobId, resumeFromByte=0):    │
+        │     if verified_at IS NOT NULL: ack         │
+        │     stream 100 MB byte-range windows        │
+        │       checkpoint to                         │
+        │         verified_row_count_partial          │
+        │       if budget low: throw ResumeError      │
+        │         (consumer re-enqueues)              │
+        │     on EOF: UPDATE verified_at, count       │
+        │     if delta > threshold: Sentry warning    │
+        └─────────────────────────────────────────────┘
+                                  │
+                  (on any thrown error after retries)
+                                  ▼
+                          packrat-etl-dlq
+                                  │
+                                  ▼
+        ┌─────────────────────────────────────────────┐
+        │ dlqConsumer                                 │
+        │   Sentry.captureException                   │
+        │   INSERT etl_dlq_events                     │
+        │   ack                                       │
+        └─────────────────────────────────────────────┘
+
+Background (CF Cron):
+  stuck-job sweep: status='running' AND last_progress_at < now()-15min
+                   -> status='failed', emit Sentry warning
+  invalid-log retention: DELETE FROM invalid_item_logs WHERE created_at < now()-90d
+```
+
+---
+
+## Implementation Units
+
+### U1. Schema migration: chunk tracking, idempotency table, progress timestamp, embedding failures, reconciliation columns, DLQ events table, constraint hardening
+
+**Goal:** Add the columns, tables, indexes, and constraints that the rest of the plan reads and writes. Lands first so every subsequent unit can compile and migrate against a known schema. Single migration `0048` is acceptable at the current ~200-row scale of `etl_jobs`; splitting into multiple migrations is unnecessary engineering at this size (revisit if `etl_jobs` exceeds ~100k rows).
+
+**Requirements:** R1, R3, R7, R8, R10
+
+**Dependencies:** None
+
+**Files:**
+- Modify: `packages/db/src/schema.ts` (add columns to `etlJobs`; add new `etlJobChunks` table; add new `etlDlqEvents` table; add UNIQUE constraint to `catalogItemEtlJobs`; export all)
+- Create: `packages/api/drizzle/0048_etl_chunking_and_observability.sql`
+- Create: `packages/api/drizzle/meta/0048_snapshot.json` (generated)
+- Modify: `packages/api/drizzle/meta/_journal.json` (generated)
+- Test: `packages/api/test/db-schema-etl.test.ts` (new — schema smoke test asserting columns exist with expected defaults; uses the existing Docker Postgres wsproxy setup at `localhost:5434`)
+
+**Approach:**
+- Columns added to `etl_jobs`:
+  - `chunks_total integer` (nullable — single-chunk legacy jobs leave it null)
+  - `chunks_completed integer DEFAULT 0 NOT NULL`
+  - `last_progress_at timestamp` (nullable initially; backfilled to `started_at` for legacy rows in the same migration)
+  - `total_embedding_failures integer DEFAULT 0 NOT NULL`
+  - `verified_at timestamp` (nullable)
+  - `verified_row_count integer` (nullable)
+  - `verified_row_count_partial integer` (nullable — checkpoint for resumable reconcile in U10)
+  - `superseded_by_job_id text` (nullable, FK to `etl_jobs.id` `ON DELETE SET NULL`)
+  - `superseded_at timestamp` (nullable — paired with `superseded_by_job_id` so the timeline survives even after FK cleanup)
+  - `source_etag text` (nullable — captured on producer insert from `r2.head(objectKey).etag`; U6's repair endpoint uses this for failure-closed source verification)
+  - `source_last_modified timestamp` (nullable — same capture; redundant with etag but cheap)
+- CHECK constraints on `etl_jobs`:
+  - `etl_jobs_chunks_completed_lte_total CHECK (chunks_total IS NULL OR chunks_completed <= chunks_total)` — fail loudly on over-count.
+  - `etl_jobs_no_self_supersede CHECK (superseded_by_job_id IS NULL OR superseded_by_job_id <> id)` — prevent self-referential repair loop.
+- New indexes on `etl_jobs`:
+  - Partial: `etl_jobs_running_progress_idx` on `(status, last_progress_at)` `WHERE status = 'running'` — for the U5 stuck-job sweep.
+  - Partial: `etl_jobs_unverified_idx` on `(verified_at)` `WHERE status = 'completed' AND verified_at IS NULL` — for the U10 watchdog scan.
+  - `etl_jobs_superseded_by_idx` on `(superseded_by_job_id)` — for the admin dashboard's "is this job superseded?" lookup.
+- New table `etl_job_chunks` (per-chunk idempotency, see Key Technical Decisions):
+  - `job_id text NOT NULL` (FK to `etl_jobs.id` `ON DELETE CASCADE`)
+  - `chunk_index integer NOT NULL`
+  - `completed_at timestamp DEFAULT now() NOT NULL`
+  - `PRIMARY KEY (job_id, chunk_index)`
+- New table `etl_dlq_events`: `id text PK`, `job_id text` (FK, nullable, `ON DELETE SET NULL`), `chunk_index integer`, `message_body jsonb`, `error_message text`, `error_stack text`, `attempts integer`, `source text` (one of `consumer`, `sweep`; defaults to `consumer`), `created_at timestamp DEFAULT now() NOT NULL`. Index on `created_at`.
+- Modification to `catalog_item_etl_jobs`: add `UNIQUE (catalog_item_id, etl_job_id)` so a redelivered chunk's upsert can use `ON CONFLICT DO NOTHING` and not produce duplicate provenance rows.
+- Backfill: `UPDATE etl_jobs SET last_progress_at = started_at WHERE last_progress_at IS NULL`. Safe — `etl_jobs` is ~200 rows; sub-100ms on Neon.
+- Drizzle generator: `bun run --cwd packages/api db:generate` then verify the SQL file matches the design. **Verify Drizzle Kit emits `DEFAULT 0 NOT NULL` literally in the SQL** — Drizzle sometimes drops the SQL-side default and keeps only the JS-side, which would break inserts from in-flight old workers during a rolling deploy. **Do NOT touch the `etl_job_status` enum in this migration** — no new enum value is needed (see Key Technical Decisions).
+- Drizzle Kit does not auto-emit `CONCURRENTLY` for indexes. At 200 rows the index build is instant so `CONCURRENTLY` is nice-to-have, not blocking. If the table grows >100k rows before this lands, hand-edit the generated SQL to use `CREATE INDEX CONCURRENTLY IF NOT EXISTS` and split each index into its own statement-breakpoint block.
+
+**Patterns to follow:**
+- Existing `etl_jobs` definition at `packages/db/src/schema.ts:460-479` for column shape and import style.
+- Migration `0027_past_madrox.sql` (added `scraper_revision` + index) for the "add column + partial index" pattern.
+- `scripts/lint/check-drizzle-migrations.ts` runs in `lint:custom`; the new migration must pass it.
+
+**Test scenarios:**
+- Happy path: After migration runs against a populated test DB, all 11 new `etl_jobs` columns are present with the documented defaults; `etl_job_chunks` and `etl_dlq_events` exist; the three new partial/normal indexes are queryable (`EXPLAIN SELECT ... WHERE status='running' ...` uses the running-progress index; the unverified index serves the watchdog).
+- Happy path: `INSERT INTO etl_job_chunks (job_id, chunk_index) VALUES ('j1', 0)` succeeds; a duplicate insert returns no row via `ON CONFLICT DO NOTHING RETURNING 1` and the table still contains exactly one row.
+- Edge case: Legacy rows have `chunks_total = NULL` and `last_progress_at` backfilled to `started_at`.
+- Edge case: `chunks_completed DEFAULT 0` is correctly applied to existing rows (verify with a row that has `chunks_completed = 0` post-migration). The generated SQL must literally include `DEFAULT 0 NOT NULL` — assert via SQL `information_schema.columns`.
+- Edge case: `UNIQUE (catalog_item_id, etl_job_id)` on `catalog_item_etl_jobs` prevents a duplicate-insert (returns conflict).
+- Error path: Attempting to insert a row with `chunks_completed > chunks_total` violates the CHECK constraint and errors clearly.
+- Error path: Attempting to set `superseded_by_job_id = id` violates the no-self-supersede CHECK.
+- Error path: Re-running the migration on an already-migrated DB is a no-op (Drizzle's migration log handles this; smoke-test the up/down via `bun run --cwd packages/api db:migrate`).
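+- Edge case: Rollback cleanly drops the new columns/tables on a DB with no Phase 2+ data. Drizzle Kit generates forward-only migrations, so the rollback is a hand-written revert script rather than a generated down-migration. **Once Phase 2 ships and writes start landing in the new columns, the migration is forward-only** — document in the migration header comment.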
+
+**Verification:**
+- `bun run --cwd packages/api db:migrate` applies cleanly against a fresh Docker Postgres + against a Postgres seeded with current-prod-shape `etl_jobs` rows.
+- `bunx drizzle-kit check` (run from `packages/api/`) validates the snapshot chain is internally consistent — run this before pushing any migration change.
+- `bun lint:custom` passes on the new migration.
+- `bun test:api:unit` includes the new schema test and it passes.
+
+---
+
+### U2. P0 #1 fix: chunk-completion lifecycle in producer + consumer
+
+**Goal:** A multi-chunk job's `status` transitions to `completed` only after every chunk has finished. Premature completion eliminated.
+
+**Requirements:** R1
+
+**Dependencies:** U1
+
+**Files:**
+- Modify: `packages/api/src/routes/catalog/index.ts` (producer endpoint sets `chunks_total` on `etl_jobs` insert and tags each `CatalogETLMessage` with `chunkIndex` and `chunksTotal`)
+- Modify: `packages/api/src/services/etl/types.ts` (extend `CatalogETLMessage.data` with `chunkIndex: number` and `chunksTotal: number`; `byteStart`/`byteEnd` remain)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (rewrite the `:188-191` success-path UPDATE to use the `CASE` expression that flips status only when `chunks_completed + 1 = chunks_total`; also update `last_progress_at` on every counter write)
+- Modify: `packages/api/src/services/etl/updateEtlJobProgress.ts` (include `last_progress_at: sql\`now()\`` in the update set so every progress write refreshes the sweep timestamp)
+- Test: `packages/api/test/etl-chunk-completion.test.ts` (new)
+
+**Approach:**
+- Producer: compute `chunks` first, then `INSERT etl_jobs (..., chunks_total) VALUES (..., ${chunks.length})` — a single round-trip including `chunks_total`. Then `sendBatch` with each message carrying `chunkIndex` 0..N-1 and `chunksTotal: N`. Setting `chunks_total` in the initial INSERT (rather than a separate follow-up UPDATE) eliminates a window where a chunk consumer could observe `chunks_total IS NULL` and silently fail the `chunks_completed + 1 = chunks_total` CASE comparison.
+- Consumer success path runs inside a single Drizzle `db.transaction()`:
+  1. `INSERT INTO etl_job_chunks (job_id, chunk_index) VALUES ($1, $2) ON CONFLICT (job_id, chunk_index) DO NOTHING RETURNING 1` — the idempotency gate. If no row returned, this is a redelivery; skip the increment, ack the message, return.
+  2. If the insert created a row, run the atomic UPDATE: `UPDATE etl_jobs SET chunks_completed = chunks_completed + 1, last_progress_at = now(), status = CASE WHEN chunks_completed + 1 = chunks_total THEN 'completed' ELSE status END, completed_at = CASE WHEN chunks_completed + 1 = chunks_total THEN now() ELSE completed_at END WHERE id = $1 AND status = 'running' RETURNING status, chunks_completed, chunks_total`.
+  3. The `WHERE status = 'running'` gate prevents clobbering a row the U5 sweep has already flipped to `failed` (status-flip-flop hazard).
+  4. If the returned row shows the transition to `completed`, *and* this transaction was the one that created the chunk-row in step 1, send a message to `packrat-etl-reconcile-queue` (see U10) for the auto-reconcile.
+- On per-chunk failure: the consumer no longer flips the parent job to `failed` immediately. Instead it lets the message throw / retry. The parent job only flips to `failed` via (a) DLQ consumer when retries are exhausted, or (b) the stuck-job sweep (U5).
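+- Single-chunk legacy jobs: the `etl_job_chunks` insert still gates the increment, but the completion CASE can never match while `chunks_total IS NULL` (`chunks_completed + 1 = NULL` evaluates to NULL, so `status` stays `running`). In-flight legacy rows therefore need an effective `chunks_total = 1` — either backfilled by the U1 migration or handled as `COALESCE(chunks_total, 1)` in the CASE — to stay backwards-compatible with any in-flight legacy messages. (U1's test scenarios currently expect legacy rows to keep `chunks_total = NULL`; reconcile the two before implementing.)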
+- The CHECK constraint `chunks_completed <= chunks_total` from U1 is the loud-failure safety net — if the idempotency gate ever leaks (e.g., a code bug bypasses the chunk-table insert), the next `UPDATE` errors with a constraint violation rather than silently corrupting the counter.
+
+**Patterns to follow:**
+- Atomic SQL update idiom at `packages/api/src/services/etl/updateEtlJobProgress.ts:16-23`.
+- Drizzle transaction shape: `await db.transaction(async (tx) => { ... })`.
+
+**Test scenarios:**
+- Happy path: 5-chunk job; chunks 0..3 complete successfully → status remains `running` with `chunks_completed = 4`; chunk 4 completes → status flips to `completed`, `completed_at` set, `etl_job_chunks` has 5 rows.
+- Happy path (idempotency): Chunk 2 succeeds, ack lost, CF redelivers → second attempt's `INSERT … ON CONFLICT DO NOTHING RETURNING` returns no row → increment is skipped → `chunks_completed` increments exactly once over the two deliveries.
+- Edge case: Chunks complete out of order (chunk 3 finishes before chunk 1) → status flips only when all five have incremented; the `etl_job_chunks` rows record actual completion order.
+- Edge case: Single-chunk legacy job (`chunks_total = 1`) → flips to `completed` on its one success; `etl_job_chunks` has 1 row.
+- Edge case: Sweep flips job to `failed` mid-flight; the next chunk's UPDATE `WHERE … AND status = 'running'` returns zero rows → transaction sees the conflict, logs warning, lets the operator route to repair-from-scratch.
+- Error path: One chunk throws; other chunks succeed → parent job stays `running` while CF Queue retries the failed chunk; if retries exhaust, DLQ consumer (U3) handles state transition.
+- Error path: CHECK constraint trips (hypothetical leaked-idempotency bug) → UPDATE errors loudly, chunk retries, no silent corruption.
+- Integration: With `R2BucketService` mocked to return a small CSV split into 3 chunks via `byteRange`, the full producer→queue→consumer cycle ends in exactly one `status=completed` transition for the parent job AND exactly one reconcile message enqueued.
+- Integration (idempotency at scale): Replay every chunk message twice → `etl_job_chunks` has exactly `chunks_total` rows, counters match, status = `completed`.
+
+**Verification:**
+- Re-running `etl.test.ts` plus the new test under `bun test:api` shows no `status='completed'` write until `chunks_completed = chunks_total`.
+- A manual prod-shape replay (`POST /catalog/etl` against the dev Worker with a CSV that produces ≥3 chunks) shows that the dashboard's `successRate` stays at its running-state value until all chunks finish.
+
+---
+
+### U3. P0 #2 fix: explicit ack/retry + DLQ wiring
+
+**Goal:** No per-message exception is silently swallowed. Failures retry; exhausted retries land in a dedicated DLQ that emits Sentry events and persists for forensics.
+
+**Requirements:** R2, R9
+
+**Dependencies:** U1 (for `etl_dlq_events` table)
+
+**Files:**
+- Modify: `packages/api/src/services/etl/queue.ts` (rewrite `processQueueBatch` for explicit per-message ack/retry; remove the swallow at `:50-60`)
+- Create: `packages/api/src/services/etl/processDlqEvent.ts` (DLQ consumer; INSERT into `etl_dlq_events`, capture Sentry exception, ack)
+- Modify: `packages/api/src/index.ts` (extend the `queue()` switch at `:109-124` with arms for `packrat-etl-dlq` and `packrat-etl-dlq-dev`)
+- Modify: `packages/api/wrangler.jsonc` (declare `packrat-etl-dlq` and `packrat-etl-dlq-dev` as producer + consumer; add `dead_letter_queue: "packrat-etl-dlq"` and `max_retries: 3` to the ETL consumer block at `:78-82` and dev equivalent at `:178-182`)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (when a chunk's processing throws, also UPDATE `last_progress_at` and set a transient `last_error_at` timestamp if useful — see Approach for trade-off; primary work is removing the per-chunk `status='failed'` write at `:201-204` since the DLQ consumer is now responsible for state transition)
+- Test: `packages/api/test/etl-queue-retry.test.ts` (new — covers the global-mock blind spot in `setup.ts:544-551`)
+
+**Approach:**
+- Rewrite `processQueueBatch`:
+  ```text
+  for (const message of batch.messages) {
+    try {
+      await processCatalogETL({ message: message.body, env });
+      message.ack();
+    } catch (err) {
+      logger.error('etl.chunk.failed', { jobId, chunkIndex, err });
+      Sentry.captureException(err, { tags: { jobId, chunkIndex, r2Key }, contexts: { queue: { messageId: message.id, attempts: message.attempts } } });
+      message.retry({ delaySeconds: 30 });
+    }
+  }
+  ```
+  (Sentry wiring lives in U8; in U3 the call sites are added as no-ops that U8 fills in.)
+- DLQ consumer reads from `packrat-etl-dlq` and, inside a single `db.transaction()`, performs: (1) `INSERT INTO etl_dlq_events (… source = 'consumer')` capturing `{ jobId, chunkIndex, message_body, error_message, error_stack, attempts }`, (2) `UPDATE etl_jobs SET status = 'failed', completed_at = now() WHERE id = $1 AND status = 'running'` — the `WHERE status = 'running'` clause is the no-op gate that prevents racing the U5 sweep. `Sentry.captureException` fires *before* the transaction (so the event survives even if the DB transaction rolls back) with tags `{ jobId, chunkIndex, r2Key }`. The `error_stack` field is contractually free of raw CSV row data — only structural error messages — to avoid accidental PII capture (documented at the call site).
+- Wrangler config additions:
+  ```text
+  // producer
+  { "queue": "packrat-etl-dlq", "binding": "ETL_DLQ" }
+  // consumer
+  { "queue": "packrat-etl-dlq", "max_batch_size": 10, "max_batch_timeout": 30 }
+  // on the existing ETL consumer:
+  "dead_letter_queue": "packrat-etl-dlq",
+  "max_retries": 3
+  ```
+  Same shape applied to `*-dev` queues.
+- The removal of the per-chunk `status='failed'` write at `processCatalogEtl.ts:201-204` is critical — leaving it would race with the DLQ consumer's state transition.
+- `processCatalogETL` rethrows on any internal failure (it already does); no behavioral change other than the consumer's catch now retries instead of swallowing.
+
+**Patterns to follow:**
+- Embeddings consumer pattern at `packages/api/src/services/catalogService.ts:461-507` for the rethrow shape.
+- Existing `queue()` dispatch at `packages/api/src/index.ts:109-124` for the new DLQ arm.
+
+**Test scenarios:**
+- Happy path: Single message processes successfully → `message.ack()` called exactly once; no retry; no DLQ row.
+- Error path: Transient throw (simulated R2 5xx) → first call: `message.retry({ delaySeconds: 30 })` and no DLQ; second call succeeds → ack. Total DLQ rows = 0.
+- Error path: Permanent throw (4 attempts all fail) → exhausts `max_retries: 3` → message routed to `packrat-etl-dlq` → DLQ consumer inserts row in `etl_dlq_events` with `attempts = 4`, captures Sentry, flips `etl_jobs.status = 'failed'`.
+- Integration: Un-mock `processQueueBatch` (override `setup.ts:544-551` per-file with `vi.doUnmock`) and exercise the real consumer against an in-memory queue stub.
+- Edge case: Two messages in a batch, first throws and second succeeds (this should not happen at `max_batch_size: 1` but the code path supports it) → first retries, second acks; no cross-contamination of state.
+
+**Verification:**
+- New test passes against the rewritten `processQueueBatch` (swallowing per-message catch removed) and fails when the old swallow-and-continue catch is restored (so the test actually proves the new behavior).
+- `bun test:api` overall still green.
+- Inspecting the `packrat-etl-dlq-dev` queue depth via `wrangler queues info packrat-etl-dlq-dev` after a forced failure shows zero (because the DLQ consumer drains immediately).
+
+---
+
+### U4. Sweep cleanup: remove the broken wall-clock stuck-job sweep before U5 replaces it
+
+**Goal:** Take the existing `POST /admin/etl/reset-stuck` endpoint out of production rotation before U5's progress-based replacement lands, to stop new false-failures while the rest of Phase 2 ships.
+
+**Requirements:** R3
+
+**Dependencies:** None (independent of U1; this is a code removal)
+
+**Files:**
+- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (remove or guard the `POST /admin/etl/reset-stuck` route at `:384-409`; if removed, also remove from the OpenAPI spec)
+- Modify: `packages/cli/src/commands/admin/etl.ts` (drop any subcommand wired to the removed endpoint)
+- Test: `packages/api/test/admin-etl-routes.test.ts` (new or extend existing — assert the route returns 410 Gone or is absent)
+
+**Approach:**
+- Two options, both acceptable:
+  - **Remove the route entirely.** Anyone calling it gets a 404. Cleanest. Recommended if no automation depends on it.
+  - **Replace the route body with a 410 Gone response** that links to the runbook (added in U15) and the new sweep design from U5. Use if there's any concern about external automation calling it.
+- Existing endpoint logic at `:384-409` does `UPDATE etl_jobs SET status='failed' WHERE status='running' AND started_at < now() - interval '30 minutes'`. This is the SQL that wrongly failed the 7 jobs on 2026-05-14.
+- This unit ships before U5 lands the replacement, so for a short window there is no automated sweep at all. Acceptable because stuck-job recovery in that window is operational (U15 runbook documents the manual SQL).
+
+**Patterns to follow:**
+- Existing admin route removal pattern (none in repo as of this writing); fall back to standard Elysia route definition omission.
+
+**Test scenarios:**
+- Happy path: `POST /admin/etl/reset-stuck` returns 410 (or 404 if removed) — test asserts on the chosen behavior.
+- Edge case: Admin CLI subcommand for the old endpoint no longer exists (or returns a clear "removed, see runbook" message).
+
+**Verification:**
+- `bun test:api` passes with the new assertion.
+- Manual `curl` against dev Worker returns the chosen status code.
+
+---
+
+### U5. P1 #2 fix: progress-based stuck-job sweep
+
+**Goal:** Replace the wall-clock-based sweep with one that uses `last_progress_at` so healthy long jobs (e.g., 50,100-row `evo` file) are not falsely failed.
+
+**Requirements:** R3
+
+**Dependencies:** U1 (for `last_progress_at`), U2 (for the `last_progress_at` write-on-progress), U4 (so the old sweep is gone first)
+
+**Files:**
+- Create: `packages/api/src/services/etl/sweepStuckJobs.ts` (the sweep function — pure DB logic, no HTTP)
+- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (new `POST /admin/etl/sweep-stuck` endpoint that calls `sweepStuckJobs` and returns the affected rows; for manual triggering)
+- Modify: `packages/api/wrangler.jsonc` (declare a CF Cron Trigger for the sweep, e.g., `*/5 * * * *`)
+- Modify: `packages/api/src/index.ts` (add `scheduled()` handler that invokes `sweepStuckJobs` on the cron event; if a `scheduled` handler doesn't yet exist, add one)
+- Test: `packages/api/test/etl-stuck-job-sweep.test.ts` (new)
+
+**Approach:**
+- Sweep runs inside a single `db.transaction()`:
+  1. `UPDATE etl_jobs SET status='failed', completed_at = now() WHERE status='running' AND COALESCE(last_progress_at, started_at) < now() - interval '15 minutes' RETURNING id, source, filename, started_at, last_progress_at, chunks_total, chunks_completed`. (The `COALESCE` defends against any legacy row that somehow escaped the U1 backfill.)
+  2. For each returned row, `INSERT INTO etl_dlq_events (job_id, error_message, source) VALUES ($1, 'sweep:no_progress', 'sweep')` so the forensic table is the single source of truth for *every* failed transition — whether triggered by the consumer DLQ or by the sweep. `chunk_index = NULL` in sweep-sourced events.
+- Returned rows also feed a Sentry warning event per affected job (`level: warning`, tags `{ jobId, source: 'sweep' }`, extra includes `chunks_completed/chunks_total` so the operator immediately sees how far the job got).
+- 15-minute interval matches the CF Queue consumer wall-clock cap. Any chunk making real progress writes `last_progress_at = now()` (via U2's modification to `updateEtlJobProgress`), so this only catches truly stalled jobs.
+- CF Cron Trigger every 5 minutes (configurable via env if needed). The cron handler is idempotent — the partial index from U1 keeps the query cheap even at thousands of jobs. Wrangler config shape: `"triggers": { "crons": ["*/5 * * * *"] }` — top-level `triggers` object wrapping a `crons` array, not a bare top-level `crons` key.
+- Manual admin endpoint exists for on-demand sweep — useful during incident response.
+
+**Patterns to follow:**
+- Admin route structure at `packages/api/src/routes/admin/analytics/catalog.ts` for the new endpoint.
+- CF Cron Triggers config in `wrangler.jsonc` (the repo has none today — this is the first; reference <https://developers.cloudflare.com/workers/configuration/cron-triggers/>).
+
+**Test scenarios:**
+- Happy path: Insert a job with `status='running'`, `last_progress_at = now() - 30min` → sweep flips it to `failed`.
+- Edge case: Insert a job with `status='running'`, `last_progress_at = now() - 5min` → sweep leaves it alone (within budget).
+- Edge case: Insert a job with `last_progress_at = NULL` (somehow — legacy row that escaped backfill) → the sweep's `COALESCE(last_progress_at, started_at)` falls back to `started_at`, so the row is still evaluated (and flipped to `failed` when `started_at` is older than 15 minutes).
+- Edge case: 50,100-row job in progress — chunks write `last_progress_at = now()` every 100 rows → sweep never fires on it.
+- Integration: Cron-event simulation calls the same code path as the admin endpoint; both return identical results for the same DB state.
+- Error path: Sweep query fails (DB down) → caller observes the error; Sentry captures; cron does not silently mask.
+
+**Verification:**
+- After running the sweep against a DB with the seeded test cases, exactly the long-stalled rows are affected.
+- `bun test:api` includes the new test and passes.
+- Dev cron schedule fires (`wrangler dev --test-scheduled`) and exercises the handler.
+
+---
+
+### U6. P1 #1 fix: shared chunking helper + retry endpoint + repair-from-scratch endpoint
+
+**Goal:** Both retry and repair use the same producer chunking logic. The repair endpoint creates a brand-new `etl_jobs` row linked to the broken historical one — directly enabling the operational recovery of the 7 wrongly-`failed` jobs from 2026-05-14.
+
+**Requirements:** R5, R8
+
+**Dependencies:** U1 (for `superseded_by_job_id`), U2 (for `chunks_total` write semantics)
+
+**Files:**
+- Create: `packages/api/src/services/etl/chunkCsvForR2.ts` (extracted shared helper: takes `objectKey`, returns an array of `{ chunkIndex, chunksTotal, byteStart, byteEnd }` with newline-aligned boundaries — newline alignment itself ships in U7)
+- Modify: `packages/api/src/routes/catalog/index.ts` (replace inline chunking at `:253-271` with a call to `chunkCsvForR2`)
+- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (rewrite `POST /admin/etl/:jobId/retry` at `:413-470` to use `chunkCsvForR2`; add new `POST /admin/etl/:jobId/repair-from-scratch`)
+- Modify: `packages/api/src/services/etl/queue.ts` (extend `queueCatalogETL` to accept pre-computed chunks rather than constructing them — or accept either, with the chunk-construction path migrating to the shared helper)
+- Modify: `packages/cli/src/commands/admin/etl.ts` (add `retry <jobId>` subcommand if not present, plus new `repair-from-scratch <jobId>` subcommand)
+- Test: `packages/api/test/etl-retry-repair.test.ts` (new)
+
+**Approach:**
+- `chunkCsvForR2(objectKey, r2, options?)`: signature returns `Promise<ChunkSpec[]>`. Calls `r2.head(objectKey)`, splits into 20 MB windows. Newline-alignment lives in U7 but the shape lands here so U7 is a fill-in.
+- Retry endpoint (`POST /admin/etl/:jobId/retry`): looks up `(source, filename, scraperRevision)` from the existing job, generates a fresh `jobId`, INSERTs a new `etl_jobs` row with `chunks_total = chunkCsvForR2(...).length`, sets `superseded_by_job_id = <original jobId>` on the new row only if the original is `failed`, sends batch.
+- Repair-from-scratch (`POST /admin/etl/:jobId/repair-from-scratch`): same behavior as retry but always sets `superseded_by_job_id` and `superseded_at = now()` on the new row, and always re-reads the full file (even if the original was `completed`). Use case: an operator suspects a `completed` job is undercount; repair recreates from scratch.
+- **R2 ETag verification (failure-closed)**: before creating the new job row, both endpoints call `r2.head(objectKey)` and compare the returned `etag` (and `lastModified`) against the original job's recorded values. If the original job has no `etag` stored (legacy rows), require an explicit `?force=true` query flag. If the `etag` differs (source was overwritten by a later scrape), return 409 Conflict with a clear message naming both etags — never silently re-ingest a different file under the same path. (This implies adding `source_etag text` and `source_last_modified timestamp` to `etl_jobs` — fold into U1's column list if not already, or capture as a follow-up here.)
+- Both endpoints accept an optional `?dryRun=true` query that returns the planned chunk spec without enqueuing anything — operator preview.
+- The 7 historical jobs from 2026-05-14 will be recovered by calling repair-from-scratch on each of them once Phase 1+2 ships. U15 runbook documents the operator procedure including the ETag verification step.
+
+**Patterns to follow:**
+- Admin route structure at `packages/api/src/routes/admin/analytics/catalog.ts:178-235` for response shape.
+- Existing retry endpoint at `:413-470` for the basic flow (just don't replicate the broken single-chunk behavior).
+
+**Test scenarios:**
+- Happy path: Retry a failed job with a 50 MB source file → 3 chunks created via `chunkCsvForR2`, 3 messages sent, new `etl_jobs` row has `chunks_total = 3`, `superseded_by_job_id` matches original.
+- Happy path: Repair-from-scratch a `completed` job with apparent undercount → new job created with `superseded_by_job_id` set; original row untouched.
+- Edge case: Retry a single-chunk legacy job (file size < 20 MB) → 1 chunk, `chunks_total = 1`, behaves identically to the producer endpoint.
+- Edge case: Retry on a job whose `filename` no longer exists in R2 → endpoint returns 404 with a clear message; no new `etl_jobs` row.
+- Edge case: `?dryRun=true` returns the planned chunk spec; no DB writes, no queue sends.
+- Integration: Repair-from-scratch on a 50,100-row file (the `evo` case) produces the expected ~3 chunks, all enqueued, and after the full pipeline completes the new job's `total_processed` matches the file's actual row count.
+- Covers AE: the 7 jobs from 2026-05-14 can each be repaired by calling repair-from-scratch — verified manually post-deploy.
+
+**Verification:**
+- Both endpoints documented in the OpenAPI spec emitted by `@elysiajs/openapi`.
+- CLI subcommands invoke the endpoints with proper auth.
+- `bun test:api` passes the new integration test.
+
+---
+
+### U7. P1 #3 + P1 #4 + P1 #5 fix: row-boundary-aligned chunks + robust header injection
+
+**Goal:** No row is silently dropped, invalidated, or split across chunks. Wide-CSV headers (>4 KB) fail loudly instead of silently misaligning columns.
+
+**Requirements:** R4, R6
+
+**Dependencies:** U6 (for `chunkCsvForR2`)
+
+**Files:**
+- Modify: `packages/api/src/services/etl/chunkCsvForR2.ts` (implement newline alignment — for each 20 MB window, read the window's final 64 KB, find the last `\n`, and snap `byteEnd` (inclusive) to that newline so the next chunk's `byteStart = byteEnd + 1` is the first byte of the following row)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (remove `skipPartialRow` at `:95-108`; rewrite header injection at `:50-58` with a bounded expand loop 4K→16K→64K; throw a typed error if no newline in 64 KB)
+- Test: `packages/api/test/etl-chunk-boundaries.test.ts` (new)
+
+**Approach:**
+- Newline alignment in producer:
+  - For each chunk window `[start, start + 20MB)`:
+    - Read `[start + 20MB - 64KB, start + 20MB)`.
+    - Find the index of the last `\n` in that slice.
+    - If found: `byteEnd = (start + 20MB - 64KB) + lastNewlineIndex`. The next chunk's `byteStart = byteEnd + 1`.
+    - If not found in 64 KB (extremely unlikely with normal CSV row sizes): throw `ChunkBoundaryError` immediately, surfacing to Sentry and aborting the job creation. Caller is told the file has a row larger than 64 KB.
+  - Last chunk: `byteEnd = file.size - 1`.
+- Header re-fetch in consumer (for `chunkIndex > 0`):
+  ```text
+  let headerSlice = await r2.get(key, { range: { offset: 0, length: 4096 }}).then(b => b.text());
+  let nlIdx = headerSlice.indexOf('\n');
+  if (nlIdx === -1) {
+    headerSlice = await r2.get(key, { range: { offset: 0, length: 16384 }}).then(b => b.text());
+    nlIdx = headerSlice.indexOf('\n');
+  }
+  if (nlIdx === -1) {
+    headerSlice = await r2.get(key, { range: { offset: 0, length: 65536 }}).then(b => b.text());
+    nlIdx = headerSlice.indexOf('\n');
+  }
+  if (nlIdx === -1) throw new EtlHeaderError(`No newline in first 64 KB of ${key} — malformed header`);
+  const headerRow = headerSlice.slice(0, nlIdx);
+  ```
+- Since chunks are now newline-aligned, `skipPartialRow` is no longer needed — the consumer can stream the chunk body directly into the parser after prepending the header.
+- BOM handling: if the header slice begins with the three-byte UTF-8 BOM (`0xEF 0xBB 0xBF`), strip it before extracting the header row. Same treatment for the first chunk.
+
+**Patterns to follow:**
+- R2 byte-range read pattern at `packages/api/src/services/etl/processCatalogEtl.ts:54, 71`.
+- Typed-error pattern: extend whatever the repo uses for domain errors (typically `Error` subclasses in `packages/api/src/utils/errors.ts`).
+
+**Test scenarios:**
+- Happy path: 5 MB file, 1 chunk → no boundary logic exercised; row count matches actual.
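+- Happy path: 60 MB file, 3 chunks (or 4 if snapping each boundary back to a newline spills a small final chunk); rows of varying width; every non-final `byteEnd` lands on a `\n` and every subsequent `byteStart` is the first byte of a row; total row count across chunks = file row count.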
+- Edge case: Chunk boundary lands exactly on a newline character (`source[byteEnd] === '\n'`) → still aligned; next chunk starts on next row; no dropped row.
+- Edge case: Header row of 4500 bytes (just over 4 KB) → re-fetch expands to 16 KB, succeeds; columns mapped correctly.
+- Edge case: Header row of 50 KB (one absurdly wide CSV) → re-fetch expands to 64 KB, succeeds.
+- Edge case: BOM at start of file → stripped from header extraction in both chunk-0 and re-fetch paths.
+- Error path: File with no newline in first 64 KB → throws `EtlHeaderError`; job marked `failed` via DLQ (U3).
+- Error path: Row larger than 64 KB encountered at a chunk boundary → producer throws `ChunkBoundaryError`; no job created.
+- Integration: A real CSV from prod (anonymized fixture in `packages/api/test/fixtures/`) splits into multiple chunks; sum of consumer-reported `totalProcessed` across all chunks equals `wc -l fixture.csv - 1` (subtract header).
+- Covers AE: A 50,100-row file (the `evo` shape) ingested via the new chunking logic shows `total_processed = 50100`, `total_valid + total_invalid = 50100`, no missing rows.
+
+**Verification:**
+- Manual run on a real prod fixture file with `wc -l` cross-check matches the job's `total_processed`.
+- `bun test:api` passes the new fixture-driven test.
+- Sentry catches the malformed-header case during the next dev exercise.
+
+---
+
+### U8. Sentry wiring via `@sentry/cloudflare`
+
+**Goal:** Every uncaught exception in the API Worker — including queue-consumer paths — emits a Sentry event with structured tags. Operators can debug a stuck job without paging through raw Worker logs.
+
+**Requirements:** R9
+
+**Dependencies:** None (independent; can start in parallel with Phase 1 but lands in Phase 3)
+
+**Files:**
+- Modify: `packages/api/package.json` (add `@sentry/cloudflare` dependency; pin to a specific version)
+- Modify: `packages/api/src/index.ts` (wrap the Worker default export with `Sentry.withSentry({...}, { fetch, queue })`; pass the Sentry options factory that reads `env.SENTRY_DSN`)
+- Modify: `packages/api/src/utils/env-validation.ts` (no schema change — `SENTRY_DSN` is already declared at `:9, 94`; verify it's required vs optional and adjust accordingly so dev doesn't break without a DSN)
+- Modify: `packages/api/wrangler.jsonc` (add `upload_source_maps: true` at the top level)
+- Modify: `packages/api/src/services/etl/queue.ts` (fill in the `Sentry.captureException(...)` call site that U3 stubbed)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (Sentry breadcrumbs at chunk-start, batch-flush, and chunk-end; `Sentry.startSpan` around the chunk lifecycle)
+- Create: `packages/api/src/utils/logger.ts` (the thin structured logger — accepts `LogContext`, emits JSON-prefixed `console.log` lines, also calls `Sentry.addBreadcrumb` when Sentry is initialized)
+- Modify: All `packages/api/src/services/etl/*.ts` console calls migrated to `logger.{info,warn,error}` (mechanical change — sweeps across the ETL files)
+- Test: `packages/api/test/sentry-instrumentation.test.ts` (new — mocks `@sentry/cloudflare` and asserts captureException/breadcrumb call shape)
+
+**Approach:**
+- `Sentry.withSentry(optionsFactory, { fetch, queue })` wraps the existing default export at `packages/api/src/index.ts`. The options factory `(env) => ({ ... })` reads `env.SENTRY_DSN` and `env.ENVIRONMENT`, and sets `tracesSampleRate: 0.1`.
+- Queue consumer instrumentation per <https://docs.sentry.io/platforms/javascript/guides/cloudflare/tracing/instrumentation/queues-module/>:
+  - `Sentry.startSpan({ op: 'queue.process', name: 'etl-chunk', attributes: { 'messaging.message.id': msg.id, 'messaging.message.retry.count': msg.attempts, 'jobId': msg.body.id, 'chunkIndex': msg.body.data.chunkIndex } }, async () => { ... })`.
+  - `Sentry.captureException(err, { tags: { jobId, chunkIndex, r2Key }, contexts: { queue: { messageId, attempts } } })` inside the catch.
+- DLQ consumer (from U3) gets the same treatment.
+- `logger.ts`: ~30 lines. Functions: `info(event, ctx)`, `warn(event, ctx)`, `error(event, ctx, err?)`. Emits a JSON line; if Sentry is initialized, also calls `Sentry.addBreadcrumb({ category: event, data: ctx, level })`.
+- Source maps: `upload_source_maps: true` works with Wrangler 4.x and `compatibility_date: 2025-06-01`.
+
+**Patterns to follow:**
+- No existing Sentry initialization in `packages/api` — this is the first.
+- Reference Sentry-in-CF guidance: <https://docs.sentry.io/platforms/javascript/guides/cloudflare/>.
+
+**Test scenarios:**
+- Happy path: Successful chunk → one `startSpan` invocation, breadcrumbs at chunk-start/flush/end, no `captureException`.
+- Error path: Chunk throws → `captureException` called once with expected tags; span marks status `internal_error`.
+- Edge case: `SENTRY_DSN` empty (dev without secret) → no Sentry calls fire; logger still emits lines; no crash.
+- Edge case: Logger called before Sentry initialized (cold-start race) → graceful no-op on breadcrumb path; logger.info still emits the line.
+- Integration: A real Sentry test project receives events from `bun api` dev-server when forced failures are triggered.
+
+**Verification:**
+- Dev `bun api` cold start logs the Sentry init line.
+- A forced chunk failure produces a Sentry event visible in the project.
+- All `packages/api/src/services/etl/*.ts` files have zero `console.*` references (`grep -rn 'console\.' packages/api/src/services/etl/` returns nothing).
+
+---
+
+### U9. P2 #2 + P2 #3 + P2 #4 fix: error propagation + embedding-failure observability + IIFE error handling
+
+**Goal:** Three related but smaller correctness issues that all share the theme "errors should not vanish silently."
+
+**Requirements:** R2, R10
+
+**Dependencies:** U1 (for `total_embedding_failures`), U8 (so the new error sites can `Sentry.captureException`)
+
+**Files:**
+- Modify: `packages/api/src/services/etl/processLogsBatch.ts` (rethrow on DB failure at `:25-27`; remove the swallow)
+- Modify: `packages/api/src/services/etl/processValidItemsBatch.ts` (in the embedding-fallback path at `:52-63`, atomically increment `etl_jobs.total_embedding_failures` before upserting; surface a Sentry warning event with `jobId` and the affected SKU count; do not throw)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (wrap the writer IIFE at `:89-117` in an explicit promise: `const writerPromise = (async () => { ... })().catch(err => { parser.destroy(err); throw err; }); ...; await writerPromise` — the `.catch` must rethrow after destroying the parser, otherwise the rejection is swallowed; awaiting the promise then turns writer failures into outer-flow throws instead of unhandled rejections)
+- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (extend the admin job-list response to include `totalEmbeddingFailures` so dashboards can surface degradation)
+- Test: `packages/api/test/etl-error-propagation.test.ts` (new)
+
+**Approach:**
+- `processLogsBatch`: catch block currently logs and returns. Replace with `throw err`. The outer `processCatalogETL` catch already exists and the chunk will retry/DLQ correctly via U3.
+- Embedding fallback: at `processValidItemsBatch.ts:52-63`, on `generateManyEmbeddings` throw:
+  ```text
+  await db.update(etlJobs).set({ totalEmbeddingFailures: sql`COALESCE(${etlJobs.totalEmbeddingFailures}, 0) + ${items.length}` }).where(eq(etlJobs.id, jobId));
+  logger.warn('etl.embedding.fallback', { jobId, skuCount: items.length });
+  Sentry.captureMessage('etl.embedding.fallback', { level: 'warning', tags: { jobId }, extra: { skuCount: items.length } });
+  // continue with upsert; embedding stays NULL
+  ```
+- IIFE wrap pattern:
+  ```text
+  const writerPromise = (async () => { ... })()
+    .catch(err => { parser.destroy(err); throw err; });
+  // ... for await loop ...
+  await writerPromise;
+  ```
+  Any rejection in the writer now propagates to the outer try/catch in `processCatalogETL` and triggers retry/DLQ via U3.
+- Admin response extension: extend the existing `GET /admin/analytics/catalog/etl` route's select shape to include `totalEmbeddingFailures` and update the response Zod schema if one is declared.
+
+**Patterns to follow:**
+- Atomic update idiom at `packages/api/src/services/etl/updateEtlJobProgress.ts:16-23`.
+- Admin route response shape at `packages/api/src/routes/admin/analytics/catalog.ts:178-235`.
+
+**Test scenarios:**
+- Happy path (embedding fallback): Embedding service throws → SKUs upserted with `embedding=NULL`; `total_embedding_failures` increments by exactly `items.length`; Sentry warning event fires once per batch (not per SKU).
+- Happy path (logs rethrow): `processLogsBatch` DB INSERT fails → exception propagates to outer catch → chunk retried by CF Queue.
+- Happy path (IIFE wrap): Writer throws inside the async IIFE → parser destroyed; outer `for await` loop terminates; outer catch fires; chunk retried.
+- Edge case: Multiple consecutive embedding batches in one chunk all fall back → counter increments cumulatively; Sentry warnings fire once per batch, not once per chunk.
+- Edge case: Mixed chunk — some batches embed successfully, then the fallback kicks in for a later batch → counter increments only by the failed batch's SKU count.
+- Integration: Admin endpoint response includes `totalEmbeddingFailures` field for every job; the prod-shape dashboard query still parses cleanly.
+
+**Verification:**
+- New test passes with the rethrow / wrap / counter increments in place.
+- `bun test:api` overall green.
+- Dev admin endpoint `GET /admin/analytics/catalog/etl?limit=5` returns the new field.
+
+---
+
+### U10. Reconciliation: admin endpoint + automatic post-job verification (via dedicated queue) + CLI subcommand
+
+**Goal:** Every ETL completion writes a verification row count; operators can also trigger reconciliation on any job on demand. Surfaces the user's "missing or falsely labeling" concern as a first-class observable signal. Auto-reconciliation runs on its own queue, not via `ctx.waitUntil`, so multi-GB files do not exceed the queue invocation's 15-min wall-clock.
+
+**Requirements:** R7
+
+**Dependencies:** U1 (for `verified_at`, `verified_row_count`, `verified_row_count_partial`), U2 (for the completion transition that enqueues the reconcile message), U8 (for Sentry warnings on delta)
+
+**Files:**
+- Create: `packages/api/src/services/etl/reconcileJob.ts` (pure function: given a `jobId` and optional `resumeFromByte`, stream the R2 source in 100 MB byte-range windows, count newlines, checkpoint progress, finalize verification on EOF, return delta)
+- Create: `packages/api/src/services/etl/processReconcileBatch.ts` (queue consumer for `packrat-etl-reconcile-queue`; calls `reconcileJob`; handles retry/resume)
+- Modify: `packages/api/src/services/etl/queue.ts` (extend producer to enqueue reconcile messages; type `ReconcileMessage { jobId: string; resumeFromByte?: number }`)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` (on the final-chunk completion transition from U2, enqueue a `ReconcileMessage` to `packrat-etl-reconcile-queue` *inside the same transaction* as the status flip so a row can never transition to `completed` without an enqueued reconcile)
+- Modify: `packages/api/src/index.ts` (extend the `queue()` switch with an arm for `packrat-etl-reconcile-queue` and `packrat-etl-reconcile-queue-dev`)
+- Modify: `packages/api/wrangler.jsonc` (declare `packrat-etl-reconcile-queue` and `packrat-etl-reconcile-queue-dev` as producer + consumer with its own `dead_letter_queue: 'packrat-etl-dlq'` and `max_retries: 3`)
+- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (add `POST /admin/etl/:jobId/reconcile` — for small files (< 200 MB) it calls `reconcileJob` synchronously and returns the result inline; for larger files it enqueues to the reconcile queue with the existing job id and returns 202 Accepted)
+- Modify: `packages/cli/src/commands/admin/etl.ts` (add `reconcile <jobId>` subcommand)
+- Modify: admin list endpoint response shape (include `verifiedAt`, `verifiedRowCount`, and `verifiedRowCountPartial` so the dashboard surfaces it)
+- Test: `packages/api/test/etl-reconciliation.test.ts` (new)
+
+**Approach:**
+- `reconcileJob(jobId, resumeFromByte = 0)`:
+  1. Read `(filename, total_processed, verified_at, verified_row_count_partial)` from `etl_jobs`. If `verified_at IS NOT NULL`, return early — idempotent no-op for redelivered messages.
+  2. `r2.head(key)` → `fileSize`.
+  3. From `resumeFromByte` (or `verified_row_count_partial`'s checkpoint byte position if set), read 100 MB byte-range windows. For each window:
+     - Count `\n` bytes in the window.
+     - Add to running `lineCount`.
+     - On the last window, subtract 1 for the header row.
+     - Every 5 windows (500 MB processed) or when elapsed time > 10 min: `UPDATE etl_jobs SET verified_row_count_partial = $lineCount` (checkpoint), then throw a typed `ReconcileResumeError` carrying the current byte offset so the queue consumer can enqueue a follow-up message with `resumeFromByte` advanced. The follow-up invocation starts with a fresh wall-clock budget.
+  4. On EOF: `UPDATE etl_jobs SET verified_at = now(), verified_row_count = $lineCount, verified_row_count_partial = NULL WHERE id = $1 AND verified_at IS NULL` (idempotency gate).
+  5. Compute `delta = lineCount - total_processed`. If `abs(delta) > max(10, ceil(0.01 * lineCount))`: `Sentry.captureMessage('etl.reconciliation.delta', { level: 'warning', tags: { jobId }, extra: { delta, expected: lineCount, actual: total_processed } })`.
+  6. Return `{ jobId, expectedRowCount: lineCount, actualRowCount: total_processed, delta, withinThreshold }`.
+- `processReconcileBatch` (queue consumer):
+  - For each message: try `reconcileJob(msg.jobId, msg.resumeFromByte)` → on success `ack()`. On `ReconcileResumeError`: enqueue a new message with the advanced offset and `ack()` the current one. On any other error: `retry({ delaySeconds: 60 })`.
+- Auto-trigger: in U2's completion transaction, after the status flip to `completed`, enqueue `{ jobId, resumeFromByte: 0 }` to `packrat-etl-reconcile-queue`. Because both writes are in the same transaction, a row can never be `completed` without an enqueued reconcile.
+- Manual endpoint (`POST /admin/etl/:jobId/reconcile`):
+  - For files where `fileSize < 200 MB`: call `reconcileJob` synchronously and return the result inline.
+  - For files ≥ 200 MB: enqueue to `packrat-etl-reconcile-queue` and return 202 with a "poll the job for `verified_at`" message.
+  - Optional `?force=true` query: clear `verified_at` first and re-enqueue (operator override for a re-verify).
+- CLI subcommand: `packrat-admin etl reconcile <jobId>` → wraps the endpoint, polls until `verifiedAt` is set or timeout.
+- The 7 historical jobs from 2026-05-14 can each be reconciled retroactively via this endpoint *before* deciding to repair (U6). Confirms the suspicion that they processed partial data before being swept.
+
+**Patterns to follow:**
+- Queue consumer pattern from U3 (per-message ack/retry, DLQ wired).
+- Streaming-count pattern: `for await (const chunk of body)` and accumulate `chunk.filter(byte => byte === 0x0A).length`.
+
+**Test scenarios:**
+- Happy path: Job with `total_processed = 100`, R2 file has 101 lines (100 rows + header) → delta = 0; `verified_at` set; no Sentry warning.
+- Happy path: Job with `total_processed = 1000`, R2 file has 1006 lines (1005 rows + header) → delta = 5; within threshold; no warning.
+- Edge case: Job with `total_processed = 50000`, R2 file has 50101 lines (50100 rows + header) → delta = 100; threshold = `max(10, ceil(0.01 * 50100))` = 501; within threshold; no warning. (The 50,100 case stays informational.)
+- Edge case (the real case): Job with `total_processed = 400`, R2 file has 50101 lines (50100 rows + header) — what the `campmor`-shape failures looked like → delta = 49700; way over threshold; Sentry warning fires.
+- Edge case (resume): A 1.5 GB file forces three resume-error checkpoints; each resume picks up at the right byte offset; final `verified_row_count` matches the true row count.
+- Edge case (idempotency): A redelivered reconcile message with `resumeFromByte = 0` against a job that already has `verified_at` set — `reconcileJob` returns early without re-reading the file.
+- Error path: R2 object missing → `reconcileJob` throws a typed error; queue consumer retries with backoff; after exhausting `max_retries: 3`, the DLQ captures it.
+- Edge case: Job with `total_processed = NULL` (legacy stuck-job-sweep casualty) → reconcileJob computes delta as `expected - 0 = expected`; the warning carries useful context for diagnosing the historical job.
+- Integration: Auto-verify fires exactly once per job, enqueued atomically with the completion transition; it does not fire for intermediate chunk completions; it does not fire twice on a redelivered final chunk (idempotency comes from the `etl_job_chunks` gate in U2).
+
+**Verification:**
+- New test passes.
+- Calling the endpoint on a real dev-seeded job returns the documented shape (inline for small files, 202 + queued for large).
+- The chunk-completion transaction either commits both the status flip and the reconcile enqueue, or neither (verify with a forced enqueue failure mid-transaction).
+
+---
+
+### U11. Quality-of-life: scheduler.wait, BATCH_SIZE rename, mergeBySku log aggregation
+
+**Goal:** Three tiny correctness/cleanliness wins that share a maintenance flavor and ship together.
+
+**Requirements:** R9 (log volume), and audit P2 #5, P2 #6, P3 #1
+
+**Dependencies:** U8 (for the logger surface used by the aggregated merge summary)
+
+**Files:**
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts:120` (replace `setTimeout(resolve, 1)` with `await scheduler.wait(0)`)
+- Create: `packages/api/src/services/etl/constants.ts` (new — exports `ITEM_FLUSH_BATCH_SIZE = 100` and `CF_QUEUE_BATCH_SIZE = 100`)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts:13` and `packages/api/src/services/etl/queue.ts:17` (import from the new constants module instead of declaring inline)
+- Modify: `packages/api/src/services/etl/mergeItemsBySku.ts:34-48` (replace per-SKU `console.log` with a per-batch summary `logger.info('etl.merge.summary', { jobId, mergedSkuCount, totalChangedFields })`)
+- Test: `packages/api/test/etl-yield-and-constants.test.ts` (new — minimal; mostly behavior-preservation)
+
+**Approach:**
+- `await scheduler.wait(0)` is the documented Workers Scheduler API. `scheduler.yield()` does not exist (corrected from audit P2 #5).
+- The constants module is dead-simple — two exports — but the rename surfaces intent at the call site and ends the ambiguity the audit flagged at P2 #6.
+- The mergeBySku aggregation accumulates change counts across one batch (already a natural unit) and logs once at the end. No per-SKU lines.
+
+**Patterns to follow:**
+- Module organization mirrors `packages/api/src/services/etl/types.ts` for a constants file.
+
+**Test scenarios:**
+- Behavior preservation: A 10,000-row chunk completes at least as fast as before with `scheduler.wait(0)` (regression check, not a strict assertion).
+- Happy path (merge log): A batch with 50 SKU merges → exactly one log line emitted, summarizing the batch.
+- Edge case: A batch with 0 merges → no log line.
+
+**Verification:**
+- `grep -rnE "setTimeout\([^,]*, *1\)" packages/api/src/services/etl/` returns nothing (`-E` is required so that `\(` matches a literal parenthesis rather than opening a group).
+- `grep -rn "BATCH_SIZE\s*=" packages/api/src/services/etl/` returns only the new constants.
+- A real ETL run on dev with 1k duplicate SKUs produces 1 merge summary line, not 1000.
+
+---
+
+### U12. Validator hardening: URL scheme + length caps + SKU charset
+
+**Goal:** Eliminate the audit P3 #2 attack surface — `javascript:` URLs and oversize fields cannot enter the catalog.
+
+**Requirements:** R11
+
+**Dependencies:** None (independent; can land any time after Phase 1)
+
+**Files:**
+- Modify: `packages/api/src/services/etl/CatalogItemValidator.ts` (rewrite `isValidUrl` at `:60-67`; add length caps and SKU regex)
+- Test: `packages/api/test/etl-validator.test.ts` (new or extend existing)
+
+**Approach:**
+- `isValidUrl`: parse with `new URL()`; reject any scheme other than `http:` and `https:`. Reject URLs longer than 2048 chars.
+- Length caps (rejects, not truncates): `name ≤ 500`, `description ≤ 50000`, `brand ≤ 200`, `category ≤ 200`.
+- SKU regex: `/^[A-Za-z0-9_.\-\/]+$/`; max length 200.
+- Each rejection produces a structured invalid-item log entry with the specific reason — surfaces in the existing `/admin/etl/:jobId/failures` endpoint.
+
+**Patterns to follow:**
+- Existing validator structure at `packages/api/src/services/etl/CatalogItemValidator.ts`.
+- Invalid-log shape at `packages/api/src/services/etl/processLogsBatch.ts`.
+
+**Test scenarios:**
+- Happy path: Valid `https://example.com/product/123` URL accepted.
+- Error path: `javascript:alert(1)` URL rejected with reason `INVALID_URL_SCHEME`.
+- Error path: `mailto:foo@bar` rejected with `INVALID_URL_SCHEME`.
+- Error path: URL of 3000 chars rejected with `URL_TOO_LONG`.
+- Edge case: Name of exactly 500 chars accepted; 501 chars rejected.
+- Edge case: SKU `ABC-123_/test.sku` accepted; SKU `<script>` rejected with `INVALID_SKU_CHARS`.
+- Integration: Run the existing `etl.test.ts` fixture with a row containing a `javascript:` URL → row routed to invalid logs, no DB insert into `catalog_items`.
+
+**Verification:**
+- New test passes.
+- A real prod-shape CSV with an injected `javascript:` URL run through `bun test:api` shows the row rejected.
+
+---
+
+### U13. Retention policy: `invalid_item_logs` cron sweep
+
+**Goal:** Bounded growth of the `invalid_item_logs` table. Bad uploads cannot fill Neon storage indefinitely.
+
+**Requirements:** R12
+
+**Dependencies:** U5 (for the existing `scheduled()` handler; the retention sweep adds a second cron arm)
+
+**Files:**
+- Create: `packages/api/src/services/etl/invalidLogRetention.ts` (the sweep function)
+- Modify: `packages/api/src/index.ts` (extend the `scheduled()` handler to dispatch on cron name; add the retention sweep arm)
+- Modify: `packages/api/wrangler.jsonc` (add a daily cron trigger, e.g., `0 9 * * *` UTC)
+- Test: `packages/api/test/etl-log-retention.test.ts` (new)
+
+**Approach:**
+- Sweep: `DELETE FROM invalid_item_logs WHERE created_at < now() - interval '90 days'`. Returns the deleted count; emits a Sentry breadcrumb.
+- Cron config in `wrangler.jsonc`: `"triggers": { "crons": ["*/5 * * * *", "0 9 * * *"] }` (sweep + retention). The top-level `triggers` wrapper is required by the Wrangler schema — a bare top-level `crons` key is silently ignored. The `scheduled` handler in `packages/api/src/index.ts` dispatches on the `event.cron` string.
+- 90-day window is a default; configurable via `env.INVALID_LOG_RETENTION_DAYS` if needed.
+
+**Patterns to follow:**
+- The stuck-job sweep cron from U5 establishes the `scheduled()` handler pattern.
+
+**Test scenarios:**
+- Happy path: Insert logs at `now() - 100d` and `now() - 30d`; sweep deletes only the 100d one.
+- Edge case: Empty table → sweep deletes 0 rows; no error.
+- Edge case: `INVALID_LOG_RETENTION_DAYS=30` env override → 30d-old logs swept.
+
+**Verification:**
+- New test passes.
+- `wrangler dev --test-scheduled` exercises both cron arms.
+
+---
+
+### U14. Test gap backfill: cross-cutting tests the global mock currently hides
+
+**Goal:** Add the specific tests that the per-unit tests above couldn't cover because of `packages/api/test/setup.ts:544-551`'s global queue mock — plus a few cross-cutting integration scenarios.
+
+**Requirements:** R14
+
+**Dependencies:** U2, U3, U6, U7 (all of which introduce behavior that should be covered end-to-end)
+
+**Files:**
+- Create: `packages/api/test/etl-queue-direct.test.ts` (per-file unmock of `queueCatalogETL` and `processQueueBatch`; exercise the real consumer)
+- Create: `packages/api/test/etl-multi-chunk-integration.test.ts` (full producer→queue→consumer→DB flow for a 3-chunk job)
+- Create: `packages/api/test/etl-csv-edge-cases.test.ts` (BOM at start, quoted header with embedded commas, header with 30+ columns straddling the 4KB initial slice, row-spanning-chunk fixture)
+- Modify: `packages/api/test/setup.ts` (if needed, document the `vi.doUnmock` escape hatch in a comment so future tests don't fight the global mock blindly)
+
+**Approach:**
+- Each new test file declares `vi.doUnmock('@packrat/api/services/etl/queue')` in `beforeAll` so the real implementation is exercised.
+- Fixtures live in `packages/api/test/fixtures/etl/`:
+  - `small-1chunk.csv` — 100 rows, ~10 KB
+  - `medium-3chunk.csv` — ~50 MB synthetic, designed to split into 3 byte-range chunks with row-boundary alignment work
+  - `wide-header.csv` — header row of 6 KB (forces the 4K→16K expansion path)
+  - `bom-prefixed.csv` — starts with `0xEF 0xBB 0xBF`
+  - `quoted-header.csv` — header has `"Item,Name","Description"` quoting
+- Tests assert behaviors that map directly to audit findings:
+  - Multi-chunk completion (P0 #1): full producer→consumer for a 3-chunk file ends with one `completed` transition.
+  - Queue retry (P0 #2): forced R2 5xx on first attempt → retry → success on second.
+  - Header > 4KB (P1 #3): consumer succeeds; columns mapped correctly.
+  - Row-spanning (P1 #4 / P1 #5): no rows dropped, no rows duplicated, no rows invalidated.
+  - BOM and quoted headers: parsed correctly.
+- Concurrent updates (audit also flagged this): a test that fires two simultaneous `updateEtlJobProgress` calls for the same `jobId` from different mocked workers; asserts atomic counter increment via the existing `COALESCE` idiom.
+
+**Patterns to follow:**
+- Existing `packages/api/test/etl.test.ts` for mocking + Postgres setup.
+- Per-test mock control via `vi.mocked(...).mockImplementationOnce(...)`.
+
+**Test scenarios:**
+- (Each described above as a fixture-driven scenario.)
+
+**Verification:**
+- `bun test:api` passes.
+- `grep -rn "vi.doUnmock" packages/api/test/etl-*.test.ts` shows the un-mock is applied where needed.
+- Coverage delta is positive on `packages/api/src/services/etl/queue.ts` and `packages/api/src/services/etl/processCatalogEtl.ts`.
+
+---
+
+### U15. Runbook at `docs/runbooks/etl-pipeline.md`
+
+**Goal:** A new on-caller can trigger / inspect / retry / drain / reconcile / recover without reading source.
+
+**Requirements:** R13
+
+**Dependencies:** U3, U5, U6, U10 (all of which create the operator-facing endpoints the runbook documents)
+
+**Files:**
+- Create: `docs/runbooks/etl-pipeline.md`
+
+**Approach:**
+- Sections in the runbook:
+  1. **Architecture** — one diagram showing producer → queue → consumer → DLQ, plus the cron jobs (sweep + retention).
+  2. **How to trigger an ETL** — `curl POST /catalog/etl` with payload schema; CLI command equivalent.
+  3. **How to inspect queue depth** — `wrangler queues list` and `wrangler queues info packrat-etl-queue`; same for `packrat-etl-dlq`.
+  4. **How to retry a failed job** — `curl POST /admin/etl/:jobId/retry`; CLI `packrat-admin etl retry <jobId>`.
+  5. **How to repair a corrupted job** (the 7-job case) — `POST /admin/etl/:jobId/repair-from-scratch`; CLI `packrat-admin etl repair-from-scratch <jobId>`. Includes the explicit one-time procedure for the seven 2026-05-14 jobs (list the jobIds).
+  6. **How to reconcile** — manual endpoint + automatic behavior; how to interpret the delta.
+  7. **How to drain the queue** — `wrangler queues consumer remove`.
+  8. **How to interpret `success_rate` and `verified_row_count`** — what 100%-failed means, what missing-but-present-in-source means.
+  9. **DLQ forensics** — querying `etl_dlq_events`; replay procedure (re-enqueue via `repair-from-scratch`).
+  10. **Accepted limitations** — soft-delete / discontinued-item reconciliation is not in scope; catalog grows monotonically; document the trade-off.
+  11. **References** — link to the audit, this plan, the Cloudflare Queues docs, the Sentry project.
+
+**Patterns to follow:**
+- No existing runbook in `docs/runbooks/` (verified absent). This is the first; establishes the convention.
+
+**Test scenarios:**
+- *Test expectation: none — documentation only, no behavioral change.*
+
+**Verification:**
+- The runbook is comprehensive enough that a new on-caller can complete each documented procedure without reading source.
+- Reviewer walks through every command in dev and confirms expected output.
+
+---
+
+## System-Wide Impact
+
+- **Interaction graph:** Producer endpoint → `chunkCsvForR2` (U6/U7) → ETL queue → consumer (idempotency gate via `etl_job_chunks` then atomic completion UPDATE → enqueue reconcile message inside the same transaction) → DLQ on exhaust → DLQ consumer → `etl_dlq_events`. Reconcile queue (`packrat-etl-reconcile-queue`) → reconcile consumer (resumable byte-range streaming, checkpointed via `verified_row_count_partial`). Two new cron jobs (sweep + retention). Sweep also inserts sentinel `etl_dlq_events` so the forensic table is single-source-of-truth for every `failed` transition. Sentry now intercepts every entry point via `withSentry({ fetch, queue })`.
+- **Error propagation:** Chunk-level exceptions now propagate from inner code → `processCatalogETL` outer catch → `processQueueBatch` per-message catch → `message.retry()` → exhaustion → DLQ → `etl_dlq_events` + Sentry. The `etl_jobs.status='failed'` transition happens only at the DLQ consumer or via the progress-based sweep. Nothing else writes `failed`.
+- **State lifecycle risks:** The chunk-completion path is correct under at-least-once delivery because every increment is gated by `INSERT INTO etl_job_chunks … ON CONFLICT DO NOTHING RETURNING 1` — a redelivered chunk produces no row and skips the increment. The combined transaction (chunk-table INSERT + counter UPDATE + reconcile-message enqueue) ensures atomicity: a row can never transition to `completed` without an enqueued reconcile, and a chunk increment can never be applied without the corresponding chunk-table row. The CHECK constraint `chunks_completed <= chunks_total` is the loud-failure safety net if any code path ever bypasses the gate. Status flip-flop (sweep flips to `failed` while a chunk completes) is prevented by the `WHERE status = 'running'` clause on every status-mutating UPDATE. The U10 reconcile checkpoint via `verified_row_count_partial` enables resumable verification of files that exceed a single queue invocation.
+- **API surface parity:** Three new admin endpoints (`/admin/etl/sweep-stuck`, `/admin/etl/:jobId/repair-from-scratch`, `/admin/etl/:jobId/reconcile`), one removed (`/admin/etl/reset-stuck`), one rewritten (`/admin/etl/:jobId/retry`). All three new endpoints get CLI subcommands in `packages/cli/src/commands/admin/etl.ts`. The producer endpoint at `POST /catalog/etl` is unchanged in shape (only the chunking internals change).
+- **Integration coverage:** U14's `etl-multi-chunk-integration.test.ts` exercises the full pipeline end-to-end against the test Postgres. The global queue mock in `setup.ts:544-551` is explicitly un-mocked per-test where the real consumer matters.
+- **Unchanged invariants:** The producer `POST /catalog/etl` request body shape; the `catalog_items` upsert behavior (still SKU-keyed); the OpenAPI client generated by `@elysiajs/openapi` for non-ETL routes; the admin auth surface (`adminAuthGuard` continues to gate every new admin route); the scraper-revision pinning. No mobile or web app code is touched.
+
+---
+
+## Risks & Dependencies
+
+| Risk | Likelihood | Impact | Mitigation |
+|------|-----------|--------|------------|
+| **At-least-once chunk redelivery double-increments `chunks_completed`** before the per-chunk idempotency table exists | High (without mitigation) | High | U1 adds `etl_job_chunks(job_id, chunk_index)` PK table; U2 gates the increment on `INSERT … ON CONFLICT DO NOTHING RETURNING 1`. CHECK constraint `chunks_completed <= chunks_total` is the loud safety net |
+| **Chunk-completion UPDATE clobbers a row the sweep already failed** (status flip-flop) | Med | High | All status-mutating UPDATEs gate on `WHERE status = 'running'`. The chunk consumer also runs the completion UPDATE inside a transaction with the idempotency INSERT |
+| **U10 auto-reconcile via `ctx.waitUntil` exceeds 15-min wall-clock for multi-GB files** | High at >1 GB | High | Reconcile runs on its own `packrat-etl-reconcile-queue` with byte-range streaming + checkpointed resume via `verified_row_count_partial`. `waitUntil` is no longer used |
+| **DLQ event INSERT succeeds, status UPDATE fails — two-phase ordering bug** | Low | Med | Both writes inside one `db.transaction()` in U3. Post-deploy verification query in the runbook (`SELECT job_id FROM etl_dlq_events WHERE job_id IN (SELECT id FROM etl_jobs WHERE status != 'failed')`) alerts on inconsistency |
+| **`catalog_item_etl_jobs` accumulates duplicate provenance rows on chunk redelivery** | Med | Low | U1 adds `UNIQUE (catalog_item_id, etl_job_id)`; upserts use `ON CONFLICT DO NOTHING` |
+| **`error_stack` in `etl_dlq_events` accidentally captures CSV row data** (PII risk if scrapers ever ingest user-generated text) | Low today | Med | Documented contract at the call site: error messages MUST NOT include raw row data. U14 test asserts this property |
+| **Migration 0048 backfill blocks in-flight ETL writers during deploy** | Low (~200 rows) | Low | Single-migration approach is fine at current scale; UPDATE is sub-100ms on Neon. Comment in migration: "revisit if row count >100k" |
+| **`repair-from-scratch` re-ingests a different file than the original** because R2 source was overwritten | Low | High | U6 endpoint compares stored `source_etag` against fresh `r2.head(key).etag`; returns 409 on mismatch unless `?force=true` |
+| Drizzle Kit emits SQL without literal `DEFAULT 0 NOT NULL` (only JS-side default) breaking inserts from old workers mid-rolling-deploy | Med | High | U1 implementer hand-inspects the generated `.sql` before applying; assert via `information_schema.columns` in the schema smoke test |
+| `@sentry/cloudflare` adds bundle size that pushes the API Worker over a CF size limit | Low | Med | Sentry SDK is ~30 KB gzipped per their docs; current Worker bundle is well under the 10 MB limit; verify with `wrangler deploy --dry-run` after U8 |
+| The 7 historical jobs from 2026-05-14 cannot be repaired because their R2 source has been deleted by a separate retention policy | Low | Low | Verify R2 source presence as part of the U15 runbook procedure before invoking `repair-from-scratch`; if missing, document as accepted loss |
+| `@sentry/cloudflare` + `nodejs_compat` interaction introduces a cold-start regression | Low | Med | Measure cold-start delta against a control deploy; if regression > 50 ms, evaluate toucan-js fallback |
+| DLQ consumer fails (e.g., DB down when DLQ event arrives) | Low | Med | DLQ consumer is itself a queue consumer with `max_retries: 3` and its own DLQ semantics. Sentry capture happens before the DB write, so the event is preserved even if persistence fails. The U5 sweep is the bottom-floor safety net for any row that DLQ couldn't transition |
+| Down-migration loses Phase-2+ data after this plan ships | Cert if attempted | High | Migration is **forward-only after U2 ships** (documented in U1's test scenarios and in the migration header comment). Rollback strategy is a forward-fix migration, not a structural revert |
+| Large synthetic CSV fixture in U14 (`medium-3chunk.csv`, ~50 MB) introduces a long-running test that destabilizes CI | Low | Low | Synthesize the fixture once at test-run startup with a deterministic seed instead of checking in a 50 MB file; cap fixture size at 5 MB in test mode via env |
+
+---
+
+## Documentation / Operational Notes
+
+- The new runbook at `docs/runbooks/etl-pipeline.md` (U15) is the operator entry point; link from the README and the CLAUDE.md ETL section in a follow-up doc PR.
+- Sentry project must be provisioned (or confirmed existing) before U8 lands. `env.SENTRY_DSN` is already validated in `packages/api/src/utils/env-validation.ts:9, 94` — verify the prod and dev env have it set via `wrangler secret list`.
+- Rollout sequencing across phases is incremental: each phase's PR is independently deployable. After Phase 1 ships, observe one week of prod data to confirm no regression before merging Phase 2. After Phase 2 ships, exercise `repair-from-scratch` against the 7 historical jobs as the explicit operational validation.
+- Source maps require `upload_source_maps: true` in `wrangler.jsonc` (U8). Pair with Sentry's CLI in CI for full symbolication; otherwise stack traces in Sentry will show minified line numbers.
+- The CF Cron Trigger added in U5 is the first in this Worker. Verify it appears in `wrangler triggers` after deploy and fires on schedule (`wrangler tail --format=pretty` during the 5-minute window).
+- The 7 historical-job recovery procedure (U15 §5) is a one-time operational task; record the run in the runbook's `## Historical Recoveries` appendix.
+
+---
+
+## Phased Delivery
+
+### Phase 1 — Foundation + P0 Blockers (U1, U2, U3, U4)
+
+Lands the schema migration plus the two production blockers and removes the broken wall-clock sweep. After this phase, multi-chunk jobs cannot prematurely complete, queue failures no longer silently swallow, and the wrongly-triggering sweep is gone. Independently deployable; no operational dependency on later phases. Ship as 1–2 PRs (migration + code, or both in one).
+
+### Phase 2 — Chunking Correctness + Recovery (U5, U6, U7)
+
+Replaces the sweep with a progress-based one; introduces the shared chunking helper with newline alignment; lands the retry + repair-from-scratch endpoints. After this phase, the 7 historical jobs from 2026-05-14 can be operationally recovered (run via the U15 runbook once Phase 4 ships, or earlier from an interim written procedure). Independently deployable. 2–3 PRs.
+
+### Phase 3 — Observability + Reconciliation (U8, U9, U10, U11)
+
+Wires Sentry, fixes the silent-error paths, adds reconciliation. After this phase, every job has a verified row count, every error reaches Sentry, and the smaller correctness issues (embedding fallback, IIFE error, scheduler.wait) are resolved. 2 PRs (Sentry + the rest).
+
+### Phase 4 — Hardening + Documentation (U12, U13, U14, U15)
+
+Validator hardening, log retention, the test gap backfill, and the runbook. After this phase, the test suite covers the previously-hidden surfaces and the on-call procedure is documented. 1–2 PRs.
+
+---
+
+## Documentation Plan
+
+- `docs/runbooks/etl-pipeline.md` — created in U15.
+- `CLAUDE.md` ETL section — minor update in a Phase 4 PR to link the runbook.
+- Update the existing `docs/audits/2026-05-16-etl-audit.md` with a footer linking to this plan (so future readers know remediation is in progress / done).
+- `/ce-compound` candidates after each phase:
+  - Phase 1: "Cloudflare Queue DLQ + explicit ack/retry pattern in a CF Worker"
+  - Phase 2: "Byte-range R2 chunking with newline alignment"
+  - Phase 3: "Sentry on Cloudflare Workers via `@sentry/cloudflare` (queue + fetch)"
+  - Phase 4: "ETL operational runbook structure"
+
+---
+
+## Operational / Rollout Notes
+
+- Each phase's PR is gated on the previous phase having shipped to prod and been observed for at least 24h (one week for the Phase 1 → Phase 2 gate, per Documentation / Operational Notes). No monitoring regression → promote to next phase.
+- The 7-job recovery (operational) happens after Phase 2 lands; document the jobIds and the run in the runbook's recoveries appendix.
+- New env vars: `INVALID_LOG_RETENTION_DAYS` (optional, default 90). Add to `.env.example` in Phase 4.
+- Wrangler secrets to verify: `SENTRY_DSN`, `R2_ACCESS_KEY_ID`, `R2_SECRET_ACCESS_KEY`, `PACKRAT_SCRAPY_BUCKET_R2_BUCKET_NAME`. None new, but confirm presence before Phase 3 deploy.
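+- Rollback: each PR is independently revertable at the code level. The U1 migration is the exception once U2 ships: per Risks & Dependencies it is forward-only from that point, so any schema rollback is a forward-fix migration, not a structural revert. Before U2 ships, a revert must cleanly drop the new columns + table without affecting existing data; Drizzle Kit emits forward migrations only, so that reverting migration has to be generated and verified explicitly.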
+
+---
+
+## Sources & References
+
+- **Origin document:** `docs/audits/2026-05-16-etl-audit.md` (the ETL pipeline audit)
+- Related code:
+  - `packages/api/src/services/etl/`
+  - `packages/api/src/routes/catalog/index.ts`
+  - `packages/api/src/routes/admin/analytics/catalog.ts`
+  - `packages/api/wrangler.jsonc`
+  - `packages/db/src/schema.ts`
+  - `packages/api/test/etl.test.ts`
+  - `packages/cli/src/commands/admin/etl.ts`
+- Live prod evidence: `GET https://packrat-api.orange-frost-d665.workers.dev/api/admin/analytics/catalog/etl?limit=25` (2026-05-19; surfaced 7 wrongly-`failed` jobs at `completedAt = 2026-05-14T16:24:04.470Z`; 192 runs / 74 failed = 38% failure rate; `totalItemsIngested: 304,431`)
+- External docs:
+  - <https://developers.cloudflare.com/queues/configuration/javascript-apis/>
+  - <https://developers.cloudflare.com/queues/configuration/dead-letter-queues/>
+  - <https://developers.cloudflare.com/queues/platform/limits/>
+  - <https://developers.cloudflare.com/workers/runtime-apis/scheduler/>
+  - <https://developers.cloudflare.com/workers/configuration/cron-triggers/>
+  - <https://developers.cloudflare.com/r2/api/s3/api/>
+  - <https://docs.sentry.io/platforms/javascript/guides/cloudflare/>
+  - <https://docs.sentry.io/platforms/javascript/guides/cloudflare/tracing/instrumentation/queues-module/>
+  - <https://github.com/drizzle-team/drizzle-orm/issues/3249>
diff --git a/docs/plans/2026-05-20-001-fix-etl-pipeline-workflows-migration-plan.md b/docs/plans/2026-05-20-001-fix-etl-pipeline-workflows-migration-plan.md
new file mode 100644
index 0000000000..62ed13acf1
--- /dev/null
+++ b/docs/plans/2026-05-20-001-fix-etl-pipeline-workflows-migration-plan.md
@@ -0,0 +1,771 @@
+---
+title: "refactor: Migrate catalog ETL to Cloudflare Workflows"
+type: refactor
+status: active
+date: 2026-05-20
+origin: docs/audits/2026-05-16-etl-audit.md
+supersedes: docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md
+---
+
+# refactor: Migrate catalog ETL to Cloudflare Workflows
+
+## Summary
+
+Replace the Cloudflare-Queues-based catalog ETL pipeline with a Cloudflare Workflows-based pipeline. Workflows natively provides durable step execution, automatic memoized retries, durable state between steps, and built-in instance status — eliminating roughly half the original audit-remediation plan, which was manually reconstructing those primitives on top of Queues + Postgres. The audit's findings about CSV correctness (chunk-boundary row alignment, header injection), observability, validator hardening, log retention, and the operational runbook remain real and are addressed here. Delivered in three phases: spike + producer-side rewiring; correctness + observability; hardening + tests + docs.
+
+---
+
+## Problem Frame
+
+The catalog ETL audit at `docs/audits/2026-05-16-etl-audit.md` enumerated 16 findings (2 P0, 5 P1, 6 P2, 5 P3) and live prod data (192 runs / 74 failed = 38% failure rate; 7 large jobs falsely marked `failed` by a wall-clock-based sweep on 2026-05-14) confirmed the pipeline is silently incorrect. The first attempted remediation (the now-superseded `2026-05-19-001` plan) tried to fix the Queues-based design in place. Doc review on that plan surfaced a load-bearing P0: the design relied on atomicity between Postgres transactions and Cloudflare Queue `sendBatch` calls that the runtime cannot provide, plus the `drizzle-orm/neon-http` driver doesn't support session-bound transactions with external awaits. Resolving that fork added an outbox table, a cron dispatcher, a watchdog sweep, and a driver switch — making it ~8 units of plumbing to manually rebuild durable execution.
+
+Cloudflare Workflows ships durable execution natively: `step.do(name, fn)` is automatically memoized and retried, step results are persisted between steps (≤1 MiB each), `step.sleep` survives Worker restarts, instance state IS the job state, and the dashboard surfaces stuck/errored/complete instances without a custom sweep. The producer becomes a one-line `env.ETL_WORKFLOW.create({ params })` call; the entire chunk-completion state machine, DLQ wiring, sweep cron, and outbox plumbing collapse into Workflows-managed state.
+
+---
+
+## Requirements
+
+- R1. **Every catalog source CSV is fully ingested or fully failed-and-recoverable.** No partial completion, no premature "completed" status, no orphan rows.
+- R2. **Chunk-boundary correctness.** No row is dropped, invalidated, or duplicated at byte-range chunk boundaries. CSV headers wider than 4 KB do not silently misalign columns.
+- R3. **Operators can re-ingest any historical job from scratch** without invoking the original producer endpoint. The 7 historical jobs from 2026-05-14 are recoverable via this path.
+- R4. **Every completed ingest has post-ingestion row-count verification.** R2 source row count is compared to the ingested count and surfaced as observable signal; significant deltas emit Sentry warnings.
+- R5. **Failures are visible.** Every workflow error reaches Sentry with `jobId`, `chunkIndex`, source key, and step name. Operators can debug without reading raw Worker logs.
+- R6. **Embedding-fallback degradation is observable.** A workflow that completed without embeddings is distinguishable from a fully-successful one.
+- R7. **Validator rejects unsafe URLs and oversize fields.** Mobile/web cannot be tricked into rendering `javascript:`, IDN homograph, or RFC-1918 URLs from the catalog.
+- R8. **`invalid_item_logs` retention is bounded.** A bad upload cannot fill Neon storage indefinitely.
+- R9. **A documented runbook exists** covering trigger / inspect / retry / repair / reconcile / drain operations against Workflows.
+- R10. **Test coverage exists for every behavior above**, including the cases the legacy global queue-mock currently hides.
+
+---
+
+## Scope Boundaries
+
+- The plan does not migrate the embeddings pipeline. `EMBEDDINGS_QUEUE` continues to operate as a Cloudflare Queue with the existing producer/consumer pattern. Only the catalog ETL pipeline moves to Workflows.
+- The plan does not rewrite the existing `etl_jobs` data for the 7 historical jobs falsely marked `failed`. The re-ingest workflow is the recovery mechanism; the actual recovery run is operational, not a code unit.
+- The plan does not raise queue concurrency on `EMBEDDINGS_QUEUE` or alter its configuration.
+- The plan does not change the catalog data model, `catalogItems`, or downstream consumers (`apps/expo`, `apps/guides`, `apps/landing`).
+- The plan does not introduce a separate ETL Worker; the existing `packages/api` Worker hosts both the HTTP routes and the new Workflow binding.
+
+### Deferred to Follow-Up Work
+
+- **Soft-delete / discontinued-item reconciliation** (audit P3 #3): documented as accepted limitation in the runbook (catalog is scraper-controlled, not user content).
+- **Embeddings-queue DLQ + retry policy**: separate plan once the catalog ETL pivot is proven in production.
+- **Workflows-based scraper orchestration**: out of scope. Scrapers continue to write CSVs to `packrat-scrapy-bucket`; this plan only touches what happens after the file lands.
+- **`@sentry/cloudflare` cold-start performance regression**: measured if observed, addressed in a follow-up. Not blocking this plan.
+
+---
+
+## Context & Research
+
+### Relevant Code and Patterns
+
+- **Producer endpoint** (current): `packages/api/src/routes/catalog/index.ts:229-293` — `POST /catalog/etl`. Will be reduced to a Workflow trigger.
+- **Per-chunk processor** (current, to be replaced): `packages/api/src/services/etl/processCatalogEtl.ts` (208 lines). Its inner logic (R2 byte-range read, CSV parse, batch flush, embeddings + upsert + invalid-log handoff) becomes the body of `step.do()` calls inside the new workflow.
+- **Queue producer/consumer** (current, to be removed): `packages/api/src/services/etl/queue.ts` — `queueCatalogETL` and `processQueueBatch`. Deleted at end of Phase 1.
+- **R2 access**: `packages/api/src/services/r2-bucket.ts:193-360` — `R2BucketService` wrapping AWS S3 client. Works inside Workflows the same as inside a Worker handler. Will be spike-tested in U1.
+- **Drizzle Neon access**: `packages/api/src/db/index.ts:82-84` — `createDbClient(env)` returns the neon-http driver. Inside a `step.do()` callback, single-statement DB calls and multi-statement batched transactions both work; the issue with the Queues-based plan was awaiting *external* RPCs inside `db.transaction()`. In Workflows, each `step.do()` is its own unit of atomicity, so the driver's HTTP-batch limitation is no longer a blocker.
+- **Embeddings**: `packages/api/src/services/catalogService.ts` — `generateManyEmbeddings` + the existing `EMBEDDINGS_QUEUE` pattern. Unchanged. The ETL workflow calls this inside a `step.do()`; embedding failures increment a counter (R6) without re-firing on retry (memoization).
+- **Existing ETL test**: `packages/api/test/etl.test.ts` — integration test against real Postgres via Docker wsproxy at `localhost:5434`. New workflow-based tests follow the same fixture/mock pattern. `packages/api/test/setup.ts:544-551`'s global queue mock is removed since the queue no longer participates.
+- **Schema location**: `packages/db/src/schema.ts:446-510`. Smaller migration than the superseded plan needed — most chunk-tracking columns are absorbed into Workflows instance state.
+- **Wrangler config**: `packages/api/wrangler.jsonc` — new `workflows` binding added; the `packrat-etl-queue` producer + consumer entries removed at the end of Phase 1 once the producer cuts over.
+- **Admin routes**: `packages/api/src/routes/admin/analytics/catalog.ts` — `GET /admin/analytics/catalog/etl` continues to read from `etl_jobs`; admin retry/repair endpoints now trigger workflow instances rather than enqueue messages.
+- **Admin CLI**: `packages/cli/src/commands/admin/etl.ts` — subcommands re-target the new admin endpoints.
+
+### Institutional Learnings
+
+- `docs/solutions/` carries no prior Workflows or queue-based-ETL learnings — this is the first project Workflows footprint. After Phase 2 ships, `/ce-compound` candidates: (a) "Cloudflare Workflows step.do idempotency for batch ETL", (b) "Migration from Queues-based state machine to Workflows".
+
+### External References
+
+- **Cloudflare Workflows API** (verified 2026-05-20): <https://developers.cloudflare.com/workflows/>. `step.do(name, fn)` is idempotent + memoized by step name within an instance. Built-in retries with configurable backoff; `step.sleep` / `step.sleepUntil` for durable waits.
+- **Workflows limits** (verified 2026-05-20): <https://developers.cloudflare.com/workflows/reference/limits/>. 10,000 steps per instance (configurable to 25,000); 30s CPU per step (configurable to 5 min); wall-clock unlimited per step; step output max 1 MiB; 50,000 concurrent running instances on Paid; 1 GB state per instance.
+- **Workflows pricing**: documented but ambiguous in the public docs as of plan-write — verified in U1 spike with `wrangler workflows`/dashboard inspection at PackRat's scale (~250 jobs/day × ~3 steps/job).
+- **Cloudflare Queues** (existing, retained for `EMBEDDINGS_QUEUE` only): <https://developers.cloudflare.com/queues/configuration/javascript-apis/>.
+- **R2 S3 API compatibility**: <https://developers.cloudflare.com/r2/api/s3/api/> — range reads via AWS SDK work identically inside Workflows.
+- **Sentry on Cloudflare**: <https://docs.sentry.io/platforms/javascript/guides/cloudflare/> — first-party `@sentry/cloudflare` with `Sentry.withSentry({ fetch, queue, workflow })` wrapping pattern.
+
+---
+
+## Key Technical Decisions
+
+- **One workflow per source CSV.** `CatalogEtlWorkflow` takes `{ objectKey, source, scraperRevision }` as params; the instance ID derives from `(source, filename)` so duplicate triggers for the same file (e.g., from a producer-side retry) are no-ops via Workflows' instance-id idempotency. This subsumes the deepening pass's per-chunk idempotency table entirely.
+- **Chunks become workflow steps, not queue messages.** For each chunk index, the workflow runs `step.do(\`chunk-${i}\`, async () => processChunk(...))`. Workflows memoizes the step result, so a retry of a partially-completed workflow resumes from the last unfinished step. The audit's P0 #1 (premature completion) and P0 #2 (swallowed errors) are non-findings.
+- **No `etl_job_chunks`, `etl_outbox_messages`, `etl_dlq_events`, `chunks_total/chunks_completed/last_progress_at` columns.** Workflows instance state IS the job state. The `etl_jobs` table retains its existing shape (id, status, source, filename, started_at, completed_at, total_processed, total_valid, total_invalid, scraper_revision) plus new columns for DB-side denormalization that admin queries need: `workflow_instance_id text` (the Workflows instance id), `verified_at timestamp`, `verified_row_count integer`, `total_embedding_failures integer DEFAULT 0 NOT NULL` (R6), `superseded_by_job_id text`, `superseded_at timestamp`, `source_etag text`, and `source_last_modified timestamp`. See U2 for the full column list and migration SQL.
+- **Repair-from-scratch creates a new workflow instance with a new `(source, filename, scraperRevision)` triple keyed by a fresh nonce in the instance id**, so the original instance and the repair instance are both queryable in the Workflows dashboard and both have rows in `etl_jobs`. The `superseded_by_job_id text` column on `etl_jobs` (FK to `etl_jobs.id`, `ON DELETE SET NULL`) links them; `superseded_at timestamp` preserves the timeline even after FK cleanup. A CHECK constraint prevents self-reference.
+- **R2 source ETag captured at workflow start.** New `source_etag text` and `source_last_modified timestamp` columns on `etl_jobs`. Repair-from-scratch compares the stored ETag against fresh `r2.head().etag` and returns 409 on mismatch unless `?force=true` is supplied. For legacy rows (the 7 historical jobs), the migration SQL sets `source_etag = NULL` (the ETag is genuinely unknown at migration time and cannot be resolved inside a SQL migration). As a post-migration operational step, an operator calls `r2.head()` for each of the 7 job IDs and issues a targeted `UPDATE etl_jobs SET source_etag = $1, source_last_modified = $2 WHERE id = $3` only if the file still exists — closing the audit's source-verification gap. This procedure is documented in the U5 runbook and U8 historical-recovery appendix.
+- **Reconciliation is the final step of every workflow.** No separate queue; no `verified_row_count_partial` checkpoint column needed (a single step can run for 5 min CPU + unlimited wall-clock, which covers all realistic source sizes; if a workflow ever hits the 5-min step CPU limit, it's split into N counting steps by chunk range). Reconciliation reads the source via `r2.get(key)` and counts logical rows using `csv-parse` (not raw newline counting — this closes the audit-corrected finding about quoted multi-line CSV fields).
+- **Row-boundary alignment lives in the producer (`chunkCsvForR2` helper).** Each chunk window's `byteEnd` is snapped to the last `\n` in a small (64 KB) tail-read; chunks emit on row boundaries; the consumer no longer needs `skipPartialRow` logic. The 64 KB peek reads are parallelized with `Promise.all` so the producer-side CPU budget is not strained for multi-GB files. Resolves audit P1 #3/#4/#5.
+- **Header injection for non-first chunks uses a bounded-expand re-fetch loop** (4 KB → 16 KB → 64 KB), throwing a typed `EtlHeaderError` if no newline appears in 64 KB. Resolves audit P1 #3 silent column misalignment.
+- **Workflow retry policy** is per-step: `{ limit: 3, delay: '30s', backoff: 'exponential' }` for chunk-processing steps; `{ limit: 0 }` for reconciliation (a delta is data, not a failure — surface to Sentry and continue). On total workflow failure (all retries exhausted), Workflows marks the instance `errored` and the workflow's final cleanup step runs (it sets `etl_jobs.status='failed'` and captures a Sentry event with the full chunk failure history). No DLQ table is needed; the Workflows dashboard is the forensic surface.
+- **`@sentry/cloudflare` wraps `{ fetch, queue, workflow }`** in `packages/api/src/index.ts`. Per-step `Sentry.startSpan` for chunk processing; `Sentry.captureException` on step failure; tags include `jobId`, `workflowInstanceId`, `chunkIndex`, `r2Key`. `error_stack` is contractually free of raw CSV row data (documented at call sites; the U10 test asserts it). Sentry source-map upload via `@sentry/cli sourcemaps upload` is wired into CI as part of U6 — not just `upload_source_maps: true` in wrangler.jsonc.
+- **URL validator** (U7) restricts URLs to `http(s):` schemes, rejects IDN homograph hostnames (deny non-ASCII hostnames, or normalize via punycode and compare), and rejects RFC-1918 / loopback / link-local hostnames via a string-level pattern check on the hostname (no DNS resolution or network call). URL length is capped at 2048 characters. SKUs must match `/^[A-Za-z0-9_.\-\/]+$/` and are capped at 200 characters.
+- **`invalid_item_logs` retention runs as a scheduled Workflow** (or a CF Cron Trigger calling a deletion step). Batched DELETE: loop `DELETE FROM invalid_item_logs WHERE id IN (SELECT id FROM invalid_item_logs WHERE created_at < now() - interval '90 days' LIMIT 10000)` until 0 rows; surface Sentry warning if a single run hits a max-iteration cap.
+
+---
+
+## Open Questions
+
+### Resolved During Planning
+
+- **Queues vs Workflows for execution.** Resolved: Workflows. Eliminates ~8 units of plumbing that the prior plan needed.
+- **Per-chunk idempotency.** Resolved: free via `step.do(name, fn)` memoization. No `etl_job_chunks` table.
+- **DB+Queue atomicity.** Resolved: no longer applicable. Each `step.do()` is its own unit of durability; Workflows persists step results between steps.
+- **Drizzle driver choice.** Resolved: stay on `neon-http`. The audit-plan blocker (transactions with external awaits) doesn't apply because each step is atomic.
+- **Stuck-job sweep design.** Resolved: not needed. Workflows surfaces stuck/errored instances natively in the dashboard.
+- **DLQ design.** Resolved: not needed. Failed workflow instances are the forensic record.
+- **CSV parser for reconciliation.** Resolved: use `csv-parse` (not raw newline counting). Closes the quoted-multiline-field correctness gap.
+- **Workflow instance ID strategy.** Resolved: deterministic ID `${source}-${filename}` for first ingest (prevents duplicate triggers); repair-from-scratch uses `${source}-${filename}-repair-${nonce}`.
+- **Producer cutover strategy.** Resolved: both paths coexist during the transition. The producer endpoint accepts a `?engine=workflow|queue` query parameter (default `workflow`); operators can fall back to the queue path during a rollback window. After Phase 1 bakes for a week with no fallback usage, the queue path is removed in the Phase 1 cleanup PR (see U3).
+
+### Deferred to Implementation
+
+- **Workflows pricing at PackRat's scale.** ~250 jobs/day × ~3 chunks/job = ~750 step executions/day. U1's spike confirms cost is comfortably within Workers Paid; if not, escalate before Phase 2.
+- **Exact step CPU budget per chunk.** The 30s default is likely sufficient; if R2 + Drizzle + embeddings + upsert overruns, bump to `cpu_ms: 60000` or split chunk processing into sub-steps (parse → embed → upsert).
+- **Reconciliation step CPU budget for the largest historical files (50,100 rows / ~30 MB).** Likely <10s in CPU; verified in U1 spike.
+- **Cron trigger for retention sweep — separate Workflow vs traditional cron-handler.** Both work; choose based on Phase 3 ergonomics.
+- **Sentry sampling rate** for Workflows spans. Default `tracesSampleRate: 0.1`; tune in production.
+
+---
+
+## Output Structure
+
+    packages/api/src/
+    ├── workflows/
+    │   ├── catalog-etl-workflow.ts          (NEW — the main ETL workflow)
+    │   ├── retention-workflow.ts            (NEW — invalid_item_logs sweep)
+    │   └── shared/
+    │       ├── chunkCsvForR2.ts             (NEW — row-boundary-aligned chunking)
+    │       └── reconcileJob.ts              (NEW — final-step row count comparison)
+    ├── services/etl/
+    │   ├── CatalogItemValidator.ts          (MODIFIED — U7 hardening)
+    │   ├── mergeItemsBySku.ts               (MODIFIED — aggregate log per batch)
+    │   ├── processValidItemsBatch.ts        (MODIFIED — embedding-fallback counter)
+    │   ├── processLogsBatch.ts              (MODIFIED — rethrow on DB failure)
+    │   ├── constants.ts                     (NEW — ITEM_FLUSH_BATCH_SIZE etc.)
+    │   ├── processCatalogEtl.ts             (DELETED — superseded by workflow)
+    │   └── queue.ts                         (DELETED at end of Phase 1)
+    ├── routes/catalog/index.ts              (MODIFIED — producer triggers workflow)
+    ├── routes/admin/analytics/catalog.ts    (MODIFIED — retry/repair/reconcile route workflows)
+    ├── utils/logger.ts                      (NEW — small; structured-field wrapper)
+    └── index.ts                             (MODIFIED — withSentry + workflow export)
+
+    packages/db/src/schema.ts                (MODIFIED — new columns on etl_jobs; 8 originally scoped, slimmed to 2 — see U2)
+    packages/api/drizzle/0048_etl_workflow_columns.sql   (NEW)
+    packages/api/wrangler.jsonc              (MODIFIED — workflows binding; retire ETL_QUEUE end of Phase 1)
+    packages/cli/src/commands/admin/etl.ts   (MODIFIED — subcommands target workflow endpoints)
+    packages/api/test/                       (NEW workflow tests)
+    docs/runbooks/etl-pipeline.md            (NEW)
+
+---
+
+## High-Level Technical Design
+
+> *This illustrates the intended approach and is directional guidance for review, not implementation specification.*
+
+```text
+Producer  ─── POST /catalog/etl ──┐
+                                  │
+                                  ▼
+        ┌─────────────────────────────────────────────┐
+        │ chunkCsvForR2(key)                          │
+        │   r2.head → chunks[]                        │
+        │   parallel(Promise.all):                    │
+        │     for each window: peek tail, align       │
+        │       byteEnd to last '\n'                  │
+        └─────────────────────────────────────────────┘
+                                  │
+                          INSERT etl_jobs
+                          (status='running',
+                           source_etag, source_last_modified)
+                                  │
+                          env.ETL_WORKFLOW.create({
+                            id: `${source}-${filename}`,
+                            params: { objectKey, source,
+                                      scraperRevision, chunks,
+                                      jobId }
+                          })
+                                  │
+                                  ▼
+        ┌─────────────────────────────────────────────┐
+        │ CatalogEtlWorkflow.run(event, step):        │
+        │                                             │
+        │   for each chunk in params.chunks:          │
+        │     await step.do(`chunk-${i}`, {           │
+        │       retries: { limit: 3, delay: '30s',    │
+        │                  backoff: 'exponential' },  │
+        │       timeout: '5 minutes',                 │
+        │     }, async () => {                        │
+        │       // - r2.get(key, range)               │
+        │       // - csv-parse with backpressure      │
+        │       // - flush valid → embeddings → upsert│
+        │       // - flush invalid → logs             │
+        │       // - return { rowsProcessed,          │
+        │       //            rowsValid, rowsInvalid }│
+        │     })                                      │
+        │                                             │
+        │   await step.do('aggregate', async () => {  │
+        │     // sum chunk results, UPDATE etl_jobs   │
+        │     // SET total_processed, total_valid,    │
+        │     //     total_invalid                    │
+        │   })                                        │
+        │                                             │
+        │   await step.do('reconcile', async () => {  │
+        │     // csv-parse R2 source, count rows,     │
+        │     // UPDATE verified_at,                  │
+        │     //         verified_row_count           │
+        │     // Sentry.captureMessage on delta       │
+        │   })                                        │
+        │                                             │
+        │   await step.do('finalize', async () => {   │
+        │     // UPDATE status='completed',           │
+        │     //         completed_at=now()           │
+        │   })                                        │
+        └─────────────────────────────────────────────┘
+
+On step failure exhausting retries:
+  → Workflow instance → 'errored' state
+  → Workflows dashboard surfaces with full step history
+  → Sentry capture from a Sentry.withSentry workflow wrapper
+  → A final 'errored' lifecycle hook runs:
+    UPDATE etl_jobs SET status='failed', completed_at=now()
+
+Scheduled (CF Cron Trigger or scheduled workflow):
+  retention-workflow:
+    loop: DELETE FROM invalid_item_logs WHERE id IN (
+            SELECT id FROM invalid_item_logs
+            WHERE created_at < now() - interval '90 days'
+            LIMIT 10000
+          )
+    until 0 rows affected
+    or max iterations (Sentry warning if hit)
+```
+
+---
+
+## Implementation Units
+
+### U1. Workflows spike: R2 + Drizzle Neon + csv-parse inside step.do
+
+**Goal:** Prove the integration works at PackRat's actual scale before committing to the migration. 30-60 minutes of focused work; output is a GO/NO-GO with concrete observations.
+
+**Requirements:** Gates R1-R10 (if Workflows can't host the workload, the entire plan blocks).
+
+**Dependencies:** None
+
+**Files:**
+- Create: `packages/api/src/workflows/spike-etl-workflow.ts` (throwaway; deleted after the spike or kept as a reference fixture)
+- Modify: `packages/api/wrangler.jsonc` (add workflows binding for the spike)
+- No tests — this is a manual spike.
+
+**Approach:**
+- Build a minimal workflow with five steps:
+  1. `step.do('head', () => r2.head('v2/cotopaxi/cotopaxi_2026-05-14T16-54-05.csv'))` — verify R2 binding works inside step.do.
+  2. `step.do('chunk', () => r2.get(key, { range: { offset: 0, length: 1024*1024 } }).then(b => b.text()))` — verify byte-range read returns useable string.
+  3. `step.do('parse-and-write', async () => { /* csv-parse 100 rows, INSERT into a scratch test table via Drizzle Neon, return { rowsWritten } */ })` — verify csv-parse + Drizzle work inside step.do.
+  4. `step.sleep('rest', '5 seconds')` — verify durable sleep works.
+  5. `step.do('verify-memoization', async () => { console.log('this should fire exactly once even across retries'); return Date.now(); })` — trigger an artificial retry (throw on first call via a counter file) and verify the second attempt sees the memoized result.
+- Run via `wrangler dev --remote` (Workflows requires remote bindings) or `wrangler workflows trigger` against deployed dev environment.
+- Observe in Workflows dashboard: step durations, retry behavior, total instance latency, billing meter delta.
+- Document: pricing observed for this run, any unexpected friction, blocker confirmation/clearance.
+
+**Patterns to follow:**
+- The Workflows quickstart example pattern from <https://developers.cloudflare.com/workflows/get-started/guide/>.
+- Existing `R2BucketService` instantiation pattern at `packages/api/src/services/r2-bucket.ts:193-210`.
+- Existing `createDbClient(env)` pattern from `packages/api/src/db/index.ts:82-84`.
+
+**Test scenarios:**
+- *Test expectation: none — this is a manual spike. The workflow itself is throwaway.*
+
+**Verification:**
+- Spike workflow completes successfully end-to-end in the Workflows dashboard.
+- Memoization confirmed: the artificially-retried step shows the same return value on the second attempt.
+- Pricing observed at the dashboard's billing meter is within an order of magnitude of "negligible" for one run. (Extrapolated to 250 jobs/day, must stay clearly under any concerning threshold.)
+- Document one of: GO (proceed to U2), GO-WITH-CAVEATS (proceed but note the friction), NO-GO (fall back to the superseded plan's outbox design).
+
+---
+
+### U2. Drizzle migration 0048: workflow_instance_id + embedding-failure counter (slimmed during implementation)
+
+> **Scope adjustment (2026-05-20):** During PR review the schema additions were narrowed from 8 columns to 2. Rationale: most of the originally-scoped columns (`verified_at`, `verified_row_count`, `superseded_by_job_id`, `superseded_at`, `source_etag`, `source_last_modified`) exist to support audit findings whose consumers (U5 repair endpoint, U6 Sentry observability, the reconcile UI) ship in later PRs. Adding them now creates dead schema. Each follow-up unit adds the column it needs when it lands. The two columns kept are the ones with value from day one: `workflow_instance_id` links every new `etl_jobs` row to its CF Workflows instance for admin debugging, and `total_embedding_failures` makes embedding-fallback degradation observable in admin queries without code changes elsewhere. The text below describes the originally-scoped 8 columns for context; what actually ships is the slim version.
+
+**Goal:** Add the minimal schema columns Workflows-based execution needs for DB-side denormalization (admin queries continue to work without hitting the Workflows API for every list).
+
+**Requirements:** R1, R3, R4, R6
+
+**Dependencies:** U1 (spike must pass before committing migration to the new architecture)
+
+**Files:**
+- Modify: `packages/db/src/schema.ts` (add columns to `etlJobs`; UNIQUE constraint on `catalogItemEtlJobs`)
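+- Create: `packages/api/drizzle/0048_<drizzle-kit-generated-name>.sql` (generated via `db:generate`; keep the random name drizzle-kit emits — do not rename it to something like `0048_etl_workflow_columns.sql`)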
+- Create: `packages/api/drizzle/meta/0048_snapshot.json` (generated)
+- Modify: `packages/api/drizzle/meta/_journal.json` (generated)
+- Test: `packages/api/test/db-schema-etl.test.ts` (new — assert columns exist with expected defaults)
+
+**Approach:**
+- Columns added to `etl_jobs`:
+  - `workflow_instance_id text` (nullable — legacy queue-based rows leave NULL until repair)
+  - `verified_at timestamp` (nullable)
+  - `verified_row_count integer` (nullable)
+  - `total_embedding_failures integer DEFAULT 0 NOT NULL`
+  - `superseded_by_job_id text` (nullable, FK to `etl_jobs.id` `ON DELETE SET NULL`)
+  - `superseded_at timestamp` (nullable)
+  - `source_etag text` (nullable)
+  - `source_last_modified timestamp` (nullable)
+- CHECK constraints on `etl_jobs`:
+  - `etl_jobs_no_self_supersede CHECK (superseded_by_job_id IS NULL OR superseded_by_job_id <> id)`
+- Indexes:
+  - `etl_jobs_workflow_instance_id_idx` on `(workflow_instance_id)` — for the admin "find by workflow" lookup
+  - `etl_jobs_superseded_by_idx` on `(superseded_by_job_id)`
+- Modification to `catalog_item_etl_jobs`: add `UNIQUE (catalog_item_id, etl_job_id)` so retried upserts use `ON CONFLICT DO NOTHING`.
+- **Source-ETag backfill (one-shot, in the migration itself)**: the migration runs `DO $$ BEGIN UPDATE etl_jobs SET source_etag = NULL, source_last_modified = NULL WHERE status IN ('completed', 'failed'); END $$;`, which is effectively a no-op for the 7 historical jobs because their ETag is genuinely unknown at migration time. A companion *operational* step (in U10 runbook) calls `r2.head()` for each of the 7 job IDs and issues `UPDATE etl_jobs SET source_etag = $1, source_last_modified = $2 WHERE id = $3` ONLY IF the file still exists. That operational step is the documented forensic recovery procedure; it does not run inside the SQL migration.
+- Drizzle generator: `bun run --cwd packages/api db:generate`. Hand-verify the generated SQL emits literal `DEFAULT 0 NOT NULL` for `total_embedding_failures` (Drizzle Kit sometimes drops SQL-side defaults).
+
+**Patterns to follow:**
+- Existing `etl_jobs` definition at `packages/db/src/schema.ts:460-479`.
+- Migration `0027_past_madrox.sql` (added `scraper_revision` + index) for the "add column + index" pattern.
+
+**Test scenarios:**
+- Happy path: After migration, all 8 new columns present with documented defaults; both indexes queryable; UNIQUE constraint on `catalog_item_etl_jobs` prevents duplicate inserts.
+- Edge case: Existing rows have `workflow_instance_id = NULL`, `total_embedding_failures = 0`, `source_etag = NULL`.
+- Error path: `UPDATE etl_jobs SET superseded_by_job_id = id` violates the no-self-supersede CHECK.
+- Error path: Re-running the migration is a no-op (Drizzle's migration log handles this).
+
+**Verification:**
+- `bun run --cwd packages/api db:migrate` applies cleanly against a fresh Docker Postgres + against a Postgres seeded with current-prod-shape `etl_jobs` rows.
+- `bun lint:custom` passes.
+- `bun test:api:unit` includes the new schema test and it passes.
+
+---
+
+### U3. Define CatalogEtlWorkflow + producer cutover
+
+**Goal:** Replace `processCatalogEtl.ts` + `queue.ts` + `processQueueBatch` with a single `CatalogEtlWorkflow` class. Producer endpoint switches from `sendBatch` to `env.ETL_WORKFLOW.create()`. Old queue path coexists during transition (via `?engine=workflow|queue`).
+
+**Requirements:** R1, R3, R5
+
+**Dependencies:** U2 (schema columns must exist for workflow to write them)
+
+**Execution note:** Test-first for the workflow class itself. Write the integration test (small CSV, 3 chunks, full ingest path) before implementing the workflow body — the test acts as the executable specification of the desired behavior.
+
+**Files:**
+- Create: `packages/api/src/workflows/catalog-etl-workflow.ts` (the main workflow class)
+- Create: `packages/api/src/workflows/shared/chunkCsvForR2.ts` (row-boundary-aligned chunking; parallel peek reads via `Promise.all`; pulled forward from old plan's U6/U7)
+- Modify: `packages/api/src/index.ts` (export `CatalogEtlWorkflow`; extend the Worker module type)
+- Modify: `packages/api/wrangler.jsonc` (add `workflows` binding `ETL_WORKFLOW`; keep `packrat-etl-queue` for the coexistence window)
+- Modify: `packages/api/src/routes/catalog/index.ts` (producer accepts `?engine=workflow|queue`; default `workflow`; both paths INSERT into `etl_jobs` with `source_etag` capture)
+- Modify: `packages/api/src/services/etl/types.ts` (`CatalogEtlWorkflowParams` type)
+- Delete (Phase 1 cleanup PR, *after* coexistence window): `packages/api/src/services/etl/processCatalogEtl.ts`, `packages/api/src/services/etl/queue.ts`
+- Test: `packages/api/test/etl-workflow-integration.test.ts` (new — end-to-end test using a mocked `step` runtime)
+
+**Approach:**
+- `CatalogEtlWorkflow extends WorkflowEntrypoint<Env, CatalogEtlWorkflowParams>`:
+  ```text
+  async run(event, step) {
+    const { jobId, objectKey, chunks } = event.payload;
+
+    const chunkResults = [];
+    for (const [i, chunk] of chunks.entries()) {
+      const result = await step.do(
+        `chunk-${i}`,
+        {
+          retries: { limit: 3, delay: '30s', backoff: 'exponential' },
+          timeout: '5 minutes',
+        },
+        async () => this.processChunk(jobId, objectKey, chunk, i),
+      );
+      chunkResults.push(result);
+    }
+
+    await step.do('aggregate', async () => this.aggregateCounters(jobId, chunkResults));
+    await step.do('reconcile', async () => this.reconcile(jobId, objectKey));
+    await step.do('finalize', async () => this.finalizeJob(jobId));
+  }
+  ```
+- `processChunk` body absorbs the existing `processCatalogETL` logic: R2 byte-range read, csv-parse with backpressure, batch flush, embedding fallback path (increments `total_embedding_failures`), invalid-log handoff. Returns `{ rowsProcessed, rowsValid, rowsInvalid }` — small enough to fit in the 1 MiB step output cap.
+- Header injection for non-first chunks uses the bounded-expand re-fetch loop (4K → 16K → 64K → throw `EtlHeaderError`).
+- `chunkCsvForR2`: producer-side row-boundary alignment with parallel 64KB peek reads (closes audit P1 #3/#4/#5 + the previously-flagged producer CPU budget concern). Returns `Array<{ chunkIndex, chunksTotal, byteStart, byteEnd }>` plus the captured `etag` + `lastModified`.
+- Producer endpoint writes `etl_jobs` row with `source_etag`, `source_last_modified`, `workflow_instance_id`; then `env.ETL_WORKFLOW.create({ id: \`${source}-${filename}\`, params: { jobId, objectKey, source, scraperRevision, chunks } })`. The deterministic instance ID prevents duplicate triggers for the same file (Workflows rejects duplicate IDs).
+- Producer's `?engine=queue` branch keeps the old `queueCatalogETL` flow for rollback. Removed in the Phase 1 cleanup PR after one week of bake.
+- Test uses Workflows' test harness (`@cloudflare/vitest-pool-workers`) or mocks the `step` object directly with an in-memory implementation that exercises memoization.
+
+**Patterns to follow:**
+- Workflows quickstart: <https://developers.cloudflare.com/workflows/get-started/guide/>.
+- Existing `R2BucketService` and `createDbClient` instantiation patterns.
+- Existing CSV parse + backpressure handling in `processCatalogEtl.ts:80-130` (lifted into `processChunk`).
+
+**Test scenarios:**
+- Happy path: 3-chunk CSV (small fixture), workflow runs end-to-end, final `etl_jobs.status = 'completed'`, `total_processed = 100` (or fixture row count), `verified_at` set, `total_embedding_failures = 0`.
+- Edge case: One chunk throws a transient error; Workflows retries once and succeeds; final state correct; `aggregate` step's input includes the retried chunk's eventual success result (memoization).
+- Edge case: Embedding service throws on chunk 1's flush; `total_embedding_failures` increments by the flush size; chunk still completes (embedding fallback); workflow continues; `verified_at` set.
+- Edge case: Chunk boundary lands on a row boundary; total row count matches `wc -l` minus header.
+- Edge case: Header row >4 KB (synthetic fixture with 60 columns of long names); re-fetch expands to 16 KB; columns mapped correctly.
+- Error path: All retries on chunk 0 exhaust; workflow instance enters `errored`; lifecycle hook flips `etl_jobs.status = 'failed'`; Sentry captures with full step history.
+- Error path: Duplicate trigger for the same `(source, filename)` returns the existing instance ID; no duplicate row inserted.
+- Integration: Producer endpoint with `?engine=workflow` triggers a workflow; with `?engine=queue` triggers the legacy path. Both produce a working ingest. Compared row counts match.
+
+**Verification:**
+- Integration test passes against the test Postgres.
+- `bun api` dev server: hitting `POST /catalog/etl?engine=workflow` with a real R2 fixture triggers a visible workflow instance in `wrangler workflows instances list catalog-etl-workflow`.
+- Workflow instance completes; `etl_jobs` row reflects expected counters; Sentry event present on simulated chunk failure.
+
+---
+
+### U4. Validator hardening: scheme, IDN, SSRF, length caps, SKU charset
+
+**Goal:** Eliminate audit P3 #2 attack surface — `javascript:`, IDN homograph, RFC-1918, oversize fields cannot enter the catalog.
+
+**Requirements:** R7
+
+**Dependencies:** None (independent; can land any time)
+
+**Files:**
+- Modify: `packages/api/src/services/etl/CatalogItemValidator.ts`
+- Test: `packages/api/test/etl-validator.test.ts` (new or extend existing)
+
+**Approach:**
+- `isValidUrl`:
+  - Parse with `new URL()`.
+  - Reject scheme other than `http:` / `https:` → reason `INVALID_URL_SCHEME`.
+  - Reject length > 2048 → `URL_TOO_LONG`.
+  - Reject IDN homograph: `new URL()` serializes internationalized hostnames to ASCII punycode, so the check keys on any hostname label that starts with `xn--` (or any non-ASCII character in the raw input host). Run such labels through `punycode.toUnicode` and compare to original; reject mixed-script labels via the Unicode IDNA `getStringPrepProfile` heuristic (or a small allow-list of Latin-only scripts). Reason `INVALID_URL_HOMOGRAPH`.
+  - Reject private/loopback/link-local hostnames via string-level pattern check (no DNS resolution — that adds an unbounded fetch surface and is itself an SSRF risk): block hostname literals matching `/^(?:127\.|10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|169\.254\.|fc00:|fd00:|fe80:|localhost$|::1$)/i`. Reason `INVALID_URL_PRIVATE`.
+- Length caps: `name ≤ 500`, `description ≤ 50000`, `brand ≤ 200`, `category ≤ 200`.
+- SKU regex: `/^[A-Za-z0-9_.\-\/]+$/` max 200.
+
+**Patterns to follow:**
+- Existing validator at `packages/api/src/services/etl/CatalogItemValidator.ts`.
+- Invalid-log shape at `packages/api/src/services/etl/processLogsBatch.ts`.
+
+**Test scenarios:**
+- Happy path: `https://example.com/product/123` accepted.
+- Error path: `javascript:alert(1)` rejected (`INVALID_URL_SCHEME`).
+- Error path: `https://192.168.1.1/admin` rejected (`INVALID_URL_PRIVATE`).
+- Error path: `https://xn--pypal-4ve.com/` rejected (`INVALID_URL_HOMOGRAPH`).
+- Error path: `https://localhost/` rejected (`INVALID_URL_PRIVATE`).
+- Edge case: URL of exactly 2048 chars accepted; 2049 rejected.
+- Edge case: SKU `ABC-123_/test.sku` accepted; SKU `<script>` rejected.
+- Integration: A real prod-shape CSV with an injected `javascript:` URL run through the workflow → row in `invalid_item_logs`, no `catalog_items` insert.
+
+**Verification:**
+- New tests pass.
+- `bun test:api` overall green.
+
+---
+
+### U5. Retry, repair-from-scratch, reconcile admin endpoints (workflow-aware)
+
+**Goal:** Operators can trigger a new workflow instance from a historical `jobId` (retry), force a re-ingest with verification (repair-from-scratch), or trigger reconciliation against any job.
+
+**Requirements:** R3, R4
+
+**Dependencies:** U3 (workflow class must exist)
+
+**Files:**
+- Modify: `packages/api/src/routes/admin/analytics/catalog.ts` (rewrite `POST /admin/etl/:jobId/retry`; add `POST /admin/etl/:jobId/repair-from-scratch`; add `POST /admin/etl/:jobId/reconcile`)
+- Modify: `packages/cli/src/commands/admin/etl.ts` (add/refresh subcommands)
+- Modify: admin list endpoint response shape (include `workflowInstanceId`, `verifiedAt`, `verifiedRowCount`, `totalEmbeddingFailures`)
+- Test: `packages/api/test/etl-admin-retry-repair-reconcile.test.ts` (new)
+
+**Approach:**
+- `POST /admin/etl/:jobId/retry`: look up original `(source, filename, scraperRevision)`; verify `r2.head` of the original `filename` matches stored `source_etag` (409 on mismatch unless `?force=true`); INSERT a new `etl_jobs` row with `superseded_by_job_id = :jobId`, `superseded_at = now()`; trigger workflow with a fresh instance ID `${source}-${filename}-retry-${nonce}`.
+- `POST /admin/etl/:jobId/repair-from-scratch`: same shape as retry but always sets supersession even for `completed` jobs. Use case: an operator suspects a `completed` job undercounted its rows.
+- `POST /admin/etl/:jobId/reconcile`: synchronously reads the source via `r2.get(key)`, csv-parses + counts logical rows, updates `verified_at` + `verified_row_count` on the target job. For very large files the operator can pass `?async=true` to trigger a workflow whose only step is reconcile.
+- Both endpoints accept `?dryRun=true` returning the planned action without side effects.
+- 7-job historical recovery procedure documented in U8 runbook: for each of the 7 jobIds, operator (a) verifies R2 source still exists, (b) backfills `source_etag` via a one-time SQL UPDATE using the current `r2.head().etag`, (c) calls `POST /admin/etl/:jobId/repair-from-scratch` (no `force` needed once etag is backfilled).
+
+**Patterns to follow:**
+- Admin route structure at `packages/api/src/routes/admin/analytics/catalog.ts:178-235`.
+- Workflow trigger pattern from U3.
+
+**Test scenarios:**
+- Happy path: Retry of a `failed` job whose source still exists → 200 (ETag matches), new workflow instance triggered, new `etl_jobs` row with `superseded_by_job_id` set.
+- Happy path: Repair-from-scratch on a `completed` job → new workflow instance, supersession recorded.
+- Edge case: Retry when source has been overwritten (ETag mismatch) → 409; operator must use `?force=true`.
+- Edge case: `?dryRun=true` returns planned action; no side effects.
+- Edge case: Reconcile on a tiny job returns inline; on a synthetic 1 GB fixture with `?async=true` triggers a reconcile-only workflow.
+- Integration: Repair-from-scratch on a 50,100-row file produces a new job whose `total_processed = 50100`, `verified_row_count = 50100`.
+- Covers AE: The 7 historical jobs from 2026-05-14 are recoverable via this endpoint after the manual ETag backfill step.
+
+**Verification:**
+- Endpoints documented in OpenAPI spec via `@elysiajs/openapi`.
+- CLI subcommands invoke endpoints with proper auth.
+- `bun test:api` passes.
+
+---
+
+### U6. Observability: Sentry wiring, structured logger, error propagation fixes
+
+**Goal:** Every workflow error reaches Sentry with structured context. Embedding fallback observable via counter + Sentry breadcrumb. Internal error-propagation fixes from audit P2 #2/#3/#4.
+
+**Requirements:** R5, R6
+
+**Dependencies:** U3 (workflow class to instrument)
+
+**Files:**
+- Modify: `packages/api/package.json` (add `@sentry/cloudflare`, pin version)
+- Modify: `packages/api/src/index.ts` (wrap with `Sentry.withSentry(getOptions, { fetch, workflow, queue })`)
+- Modify: `packages/api/wrangler.jsonc` (`upload_source_maps: true`)
+- Modify: `.github/workflows/api-deploy.yml` (or equivalent) (add `@sentry/cli sourcemaps upload` step after deploy)
+- Create: `packages/api/src/utils/logger.ts` (thin wrapper: `info/warn/error(event, ctx)`; emits JSON line + Sentry breadcrumb when initialized)
+- Modify: `packages/api/src/workflows/catalog-etl-workflow.ts` (instrument each step with `Sentry.startSpan`; capture exceptions in step bodies)
+- Modify: `packages/api/src/services/etl/processLogsBatch.ts` (rethrow on DB failure — audit P2 #2)
+- Modify: `packages/api/src/services/etl/processValidItemsBatch.ts` (embedding-fallback path atomically increments `etl_jobs.total_embedding_failures`, emits Sentry warning — audit P2 #3)
+- Modify: `packages/api/src/services/etl/mergeItemsBySku.ts` (per-batch summary log instead of per-SKU — audit P3 #1)
+- Modify: All ETL files' `console.*` → `logger.*` (mechanical)
+- Modify: `packages/api/src/services/etl/processCatalogEtl.ts` *if it still exists* (writer IIFE wrap — audit P2 #4); deletion in Phase 1 cleanup makes this moot
+- Create: `packages/api/src/services/etl/constants.ts` (`ITEM_FLUSH_BATCH_SIZE = 100`, `CF_QUEUE_BATCH_SIZE = 100` — audit P2 #6)
+- Test: `packages/api/test/sentry-instrumentation.test.ts` (mock `@sentry/cloudflare`; assert capture shape)
+- Test: `packages/api/test/etl-error-propagation.test.ts` (rethrows, fallback counter increments)
+
+**Approach:**
+- Wrap the default export at `packages/api/src/index.ts` with `Sentry.withSentry(getOptions, { fetch, workflow, queue })`. Options factory reads `env.SENTRY_DSN`, `env.ENVIRONMENT`, sets `tracesSampleRate: 0.1`.
+- Workflow instrumentation: each `step.do(name, fn)` callback wraps the body in `Sentry.startSpan({ op: 'workflow.step', name, attributes: { jobId, workflowInstanceId, chunkIndex } }, ...)`. Capture errors before rethrowing.
+- Source-map upload: `@sentry/cli sourcemaps upload --release=$SENTRY_RELEASE ./dist` in CI after `wrangler deploy` — symbolicated stack traces in Sentry. Just `upload_source_maps: true` in wrangler.jsonc only ships maps to Cloudflare, not Sentry.
+- `error_stack` contract: the Sentry capture call sites use `Sentry.captureException(err, { tags: { jobId, workflowInstanceId, chunkIndex }, contexts: { ... } })` and pass error-message-only payloads — never include raw CSV row data. The U6 `sentry-instrumentation.test.ts` test asserts no row-data substrings leak into the captured payload across all error paths.
+- Compatibility flags: verify `@sentry/cloudflare`'s required flags for the chosen version against the current `wrangler.jsonc` flags. `nodejs_compat` is already set; if the chosen version requires `nodejs_compat_v2` or `nodejs_als`, add them.
+
+**Patterns to follow:**
+- Reference: <https://docs.sentry.io/platforms/javascript/guides/cloudflare/>.
+- Workflows-specific tracing: workflow-aware spans via `withSentry`'s `workflow` wrapper.
+
+**Test scenarios:**
+- Happy path: Successful workflow → one `startSpan` per step, no `captureException`.
+- Error path: A `step.do` throws → `captureException` called with `{ jobId, workflowInstanceId, chunkIndex }` tags; span marks status error; workflow retries per step retry policy.
+- Edge case: `SENTRY_DSN` empty (dev without secret) → no Sentry calls; logger still emits lines; no crash.
+- Edge case: `processLogsBatch` DB INSERT fails → exception propagates → step retried by Workflows.
+- Edge case: Embedding service throws → `total_embedding_failures` increments atomically by the batch size; `etl.embedding.fallback` Sentry warning fires once per batch.
+- Integration: A forced chunk failure in dev produces a Sentry event visible in the project with the expected tags.
+
+**Verification:**
+- `grep -rn 'console\.' packages/api/src/services/etl/ packages/api/src/workflows/` returns nothing.
+- A real `bun api` cold-start log contains the Sentry init line.
+- Sentry test project receives an event from a forced workflow failure.
+- CI sourcemaps upload step succeeds; minified frames in Sentry show original filenames.
+
+---
+
+### U7. Retention sweep: scheduled handler with batched DELETE
+
+**Goal:** Bounded growth of `invalid_item_logs`. Naive single-statement DELETE is replaced with a batched loop to survive multi-million-row pruning.
+
+**Requirements:** R8
+
+**Dependencies:** None
+
+**Files:**
+- Create: `packages/api/src/workflows/retention-workflow.ts` *(or)* `packages/api/src/services/retention/invalidLogRetention.ts` + a `scheduled()` handler arm — pick one based on Phase 3 ergonomics
+- Modify: `packages/api/src/index.ts` (`scheduled` handler dispatches on `event.cron`, or workflow trigger registered)
+- Modify: `packages/api/wrangler.jsonc` (add `"triggers": { "crons": ["0 9 * * *"] }` — top-level `triggers` wrapper, not bare `crons`)
+- Test: `packages/api/test/etl-log-retention.test.ts` (new)
+
+**Approach:**
+- Sweep: loop
+  ```text
+  DELETE FROM invalid_item_logs
+  WHERE id IN (
+    SELECT id FROM invalid_item_logs
+    WHERE created_at < now() - interval '90 days'
+    LIMIT 10000
+  );
+  ```
+  until `0 rows affected` OR `iterations >= 100`. Pause briefly between iterations (`await scheduler.wait(100)`). If max iterations hit, Sentry warning with the deleted-row count so operators know more remains.
+- 90-day window default; configurable via `env.INVALID_LOG_RETENTION_DAYS`.
+- Daily cron at 09:00 UTC.
+
+**Patterns to follow:**
+- CF Cron Triggers config: <https://developers.cloudflare.com/workers/configuration/cron-triggers/>.
+
+**Test scenarios:**
+- Happy path: Seed table with 30k rows older than 90 days and 100 rows younger → sweep deletes exactly 30k in 3 deleting iterations (a 4th iteration affects 0 rows and ends the loop), leaves 100 rows.
+- Edge case: Empty table → sweep deletes 0 rows; no error; no Sentry warning.
+- Edge case: 1.5M rows older than 90 days → sweep hits max iterations cap at 1M deleted, emits Sentry warning, leaves remaining for next run.
+- Edge case: `INVALID_LOG_RETENTION_DAYS=30` env override → 30d-old logs swept.
+
+**Verification:**
+- New test passes.
+- `wrangler dev --test-scheduled` exercises the handler; assertion via DB row count delta.
+
+---
+
+### U8. Runbook at `docs/runbooks/etl-pipeline.md`
+
+**Goal:** A new on-caller can trigger / inspect / retry / repair / reconcile / drain operations against Workflows without reading source.
+
+**Requirements:** R9
+
+**Dependencies:** U3, U5 (operator-facing endpoints must exist)
+
+**Files:**
+- Create: `docs/runbooks/etl-pipeline.md`
+
+**Approach:**
+Sections:
+1. **Architecture overview** — producer → workflow instance → step.do chunks → aggregate → reconcile → finalize, with a small Mermaid diagram.
+2. **Triggering an ETL** — `curl POST /catalog/etl` (params, auth); CLI equivalent.
+3. **Inspecting workflow status** — `wrangler workflows instances list catalog-etl-workflow`; `wrangler workflows instances describe <id>`; admin dashboard query.
+4. **Retrying a failed workflow** — `curl POST /admin/etl/:jobId/retry`; CLI `packrat-admin etl retry <jobId>`.
+5. **Repair-from-scratch** — including the explicit one-time procedure for the seven 2026-05-14 jobs (list jobIds; describe ETag backfill step; describe expected output).
+6. **Reconciliation** — manual sync endpoint vs async-workflow trigger; interpreting delta.
+7. **Draining the queue (legacy path)** — only relevant during the coexistence window; how to verify drain before removing the queue config.
+8. **DLQ alternative** — since Workflows is the forensic record, the runbook explains: "Failed workflow instances are queryable for 90 days via dashboard; `wrangler workflows instances describe <id>` shows full step history with errors."
+9. **Accepted limitations** — soft-delete/discontinued-item reconciliation is not in scope; catalog grows monotonically.
+10. **References** — link to the audit, this plan, Workflows docs, Sentry project.
+
+**Patterns to follow:**
+- First runbook in `docs/runbooks/`; establishes the convention.
+
+**Test scenarios:**
+- *Test expectation: none — documentation only.*
+
+**Verification:**
+- Reviewer walks through each documented procedure in dev and confirms expected output.
+
+---
+
+### U9. Test gap backfill
+
+**Goal:** Cover the behaviors the legacy global queue-mock hid; add fixtures for byte-range edge cases.
+
+**Requirements:** R10
+
+**Dependencies:** U3, U4, U6 (units under test must exist)
+
+**Files:**
+- Modify: `packages/api/test/setup.ts` (remove the global queue mock — `processQueueBatch` no longer exists)
+- Create: `packages/api/test/etl-workflow-multi-chunk.test.ts`
+- Create: `packages/api/test/etl-csv-edge-cases.test.ts`
+- Create: `packages/api/test/fixtures/etl/` (synthesized at test startup with deterministic seed):
+  - `small-1chunk.csv` (~10 KB)
+  - `medium-3chunk.csv` (~50 MB synthetic, splits into 3 chunks)
+  - `wide-header.csv` (6 KB header)
+  - `bom-prefixed.csv` (starts with BOM)
+  - `quoted-header.csv` (CSV-quoted commas in header)
+  - `quoted-multiline.csv` (newlines inside quoted fields — gated by U3 csv-parse reconciliation, not raw byte counting)
+
+**Approach:**
+- Each new test exercises the real workflow integration against the test Postgres + mocked `step` runtime.
+- Specific assertions:
+  - Multi-chunk workflow completes with one `status='completed'` transition.
+  - Header > 4 KB: re-fetch expands to 16 KB, columns mapped correctly.
+  - Row-spanning chunk: no rows dropped or duplicated; total row count matches `wc -l - 1`.
+  - BOM-prefixed file: stripped before header extraction.
+  - Quoted-multiline file: csv-parse counts logical rows correctly; reconcile delta = 0.
+  - Embedding fallback: `total_embedding_failures` increments; chunk completes; Sentry warning fires once per batch.
+  - Step memoization: forced retry of one chunk produces the same return value on the second attempt (mocked step runtime asserts this).
+
+**Patterns to follow:**
+- Existing `packages/api/test/etl.test.ts` for fixture setup + Docker Postgres pattern.
+- Vitest mocking conventions from `packages/api/test/setup.ts`.
+
+**Test scenarios:**
+- (Each described above.)
+
+**Verification:**
+- `bun test:api` passes.
+- Coverage delta is positive on `packages/api/src/workflows/` and the modified ETL service files.
+
+---
+
+## System-Wide Impact
+
+- **Interaction graph:** Producer endpoint → `chunkCsvForR2` (parallel peek reads) → INSERT etl_jobs with source_etag → `env.ETL_WORKFLOW.create(...)` → workflow instance runs `chunk-*` steps in order → `aggregate` → `reconcile` → `finalize`. Failed instances surface in Workflows dashboard. Sentry wraps every entry point (`fetch`, `workflow`, `queue` — the last for the unchanged embeddings queue). One scheduled cron arm for retention sweep.
+- **Error propagation:** Errors thrown inside `step.do` callbacks are captured by `Sentry.captureException`, rethrown to Workflows runtime, retried per step config; exhaustion routes the instance to `errored`; the workflow's terminal `errored` lifecycle hook flips `etl_jobs.status='failed'` and captures a final Sentry event with step history. Inner code (`processLogsBatch`, `processValidItemsBatch` embedding fallback) rethrows on DB failure so the step retries with the right backoff.
+- **State lifecycle:** Workflows step results are durably persisted and memoized by step name; retries are exactly-once-on-success. No `chunks_total/chunks_completed/last_progress_at` columns are needed because instance state is the source of truth. `etl_jobs` carries only the denormalized counters needed by admin queries.
+- **API surface parity:** Producer `POST /catalog/etl` keeps the same request body shape; accepts an additional optional `?engine=workflow|queue` parameter (default `workflow`) during the coexistence window. Admin endpoints: rewritten retry, new repair-from-scratch, new reconcile. Old endpoints (`/admin/etl/reset-stuck`) are removed in U3's PR (no replacement needed — Workflows surfaces stuck instances natively).
+- **Integration coverage:** U9 exercises the full pipeline end-to-end. The legacy global queue mock at `packages/api/test/setup.ts:544-551` is removed since the queue no longer participates in catalog ETL.
+- **Unchanged invariants:** `EMBEDDINGS_QUEUE` and `LOGS_QUEUE` configuration; `catalog_items` upsert behavior (still SKU-keyed); OpenAPI client generated by `@elysiajs/openapi` for non-ETL routes; admin auth surface (`adminAuthGuard`); scraper-revision pinning; mobile and web apps untouched.
+
+---
+
+## Risks & Dependencies
+
+| Risk | Likelihood | Impact | Mitigation |
+|------|-----------|--------|------------|
+| **Workflows pricing surprises at PackRat's scale** | Med (unknown until measured) | Med | U1 spike captures dashboard billing meter for one run; extrapolate to ~750 step-executions/day. Escalate before Phase 2 if cost trajectory exceeds Workers Paid base. |
+| **`step.do` output exceeds 1 MiB cap** for very wide chunks | Low | Med | Chunk-step returns aggregated counters only (`{ rowsProcessed, rowsValid, rowsInvalid }`), not row data. Detail rows go to `catalog_items` / `invalid_item_logs` via DB writes inside the step. |
+| **Producer's parallel newline-peek reads collide with R2 rate limits** for multi-GB files | Low | Med | `Promise.all` over ≤50 chunks × one 64KB read each = ≤50 concurrent R2 GETs; well within R2's documented per-bucket throughput. If issue surfaces, bound concurrency with a small p-limit. |
+| **`@sentry/cloudflare` requires compatibility_flags beyond `nodejs_compat`** | Low | Med | Verify against the pinned Sentry version at U6 start; add any missing flags as part of U6. |
+| **Coexistence window misuse** (operators flip `?engine=queue` after cutover) | Low | Low | Producer logs a Sentry breadcrumb on `?engine=queue` usage; runbook documents the deprecation. Cleanup PR removes the option entirely a week after cutover. |
+| **The 7 historical jobs' R2 sources have been deleted** by a separate retention policy | Low | Low | U8 runbook procedure verifies `r2.head` before invoking repair; if missing, accept as documented data loss. |
+| **Workflow instance ID collision** if the same `(source, filename)` is triggered twice (deterministic ID) | Low | Low | Workflows rejects the duplicate `create()` call; the producer endpoint catches the rejection, treats it as success, and returns the existing `jobId`. Documented behavior. |
+| **`csv-parse` reconciliation is slower than naive byte counting** for very large files | Low | Low | At ~10MB/s parse rate, a 100MB file takes ~10s — well within step CPU. If a 1 GB+ file appears, the reconcile step is split by byte range (each sub-step parses 200 MB). |
+| **Drizzle Kit emits SQL without literal `DEFAULT 0 NOT NULL`** | Med | High | U2 implementer hand-verifies generated `.sql`; schema smoke test asserts via `information_schema.columns`. |
+| **Down-migration loses Phase-2+ data** once writes start landing | Certain if attempted | High | Migration is **forward-only after U3 ships**; documented in U2's test scenarios and migration header. Rollback strategy is a forward-fix migration. |
+| **Wide-CSV fixture in U9 destabilizes CI** | Low | Low | Synthesize at test startup with deterministic seed (no checked-in large file); cap size in test mode via env. |
+
+---
+
+## Documentation / Operational Notes
+
+- The new runbook at `docs/runbooks/etl-pipeline.md` (U8) is the operator entry point.
+- Sentry project must be provisioned (or confirmed existing) before U6 lands. `env.SENTRY_DSN` is already validated in `packages/api/src/utils/env-validation.ts:9, 94` — verify the prod and dev env have it set via `wrangler secret list`.
+- Rollout sequencing:
+  - **Phase 1** ships U1 + U2 + U3. Producer accepts `?engine=workflow|queue`; default `workflow`. Coexistence window of one week. Daily `wrangler workflows instances list` check during the window.
+  - **Phase 1 cleanup PR** (one week after Phase 1): delete `processCatalogEtl.ts`, `queue.ts`, the `?engine=queue` branch, the `packrat-etl-queue` config. `setup.ts:544-551` global queue mock removed.
+  - **Phase 2** ships U4 + U5 + U6 + U7. Validator hardening, admin endpoints, observability, retention.
+  - **Phase 3** ships U8 + U9. Runbook + test backfill.
+- The 7 historical-job recovery is a one-time operational task after Phase 2; record the run in the runbook's `## Historical Recoveries` appendix.
+- New env vars: `INVALID_LOG_RETENTION_DAYS` (optional, default 90). Add to `.env.example` in Phase 3.
+
+---
+
+## Phased Delivery
+
+### Phase 1 — Workflows foundation + producer cutover (U1, U2, U3)
+
+Spike → migration → workflow class → producer accepts both engines. Independently shippable in 2-3 PRs (spike result attached to U2's PR; U2 + U3 in one PR or split). After Phase 1 bakes for one week, Phase 1 cleanup PR removes the legacy queue path entirely.
+
+### Phase 2 — Validator + admin endpoints + observability + retention (U4, U5, U6, U7)
+
+Hardening + the operator surface that lets the 7-job recovery happen. 2-3 PRs.
+
+### Phase 3 — Runbook + test backfill (U8, U9)
+
+Documentation + test coverage. 1-2 PRs.
+
+---
+
+## Documentation Plan
+
+- `docs/runbooks/etl-pipeline.md` — created in U8.
+- `CLAUDE.md` ETL section — minor update in a Phase 3 PR to link the runbook and note the Workflows architecture.
+- Update `docs/audits/2026-05-16-etl-audit.md` footer linking to this plan (so future readers know remediation went through Workflows).
+- `/ce-compound` candidates after each phase:
+  - Phase 1: "Cloudflare Workflows step.do idempotency for batch ETL"
+  - Phase 1: "Migrating a Cloudflare Queues state machine to Workflows"
+  - Phase 2: "Sentry on Cloudflare Workers via `@sentry/cloudflare` (fetch + workflow + queue)"
+  - Phase 3: "ETL operational runbook structure (Workflows edition)"
+
+---
+
+## Operational / Rollout Notes
+
+- Each phase's PR is gated on the previous phase having shipped to prod and been observed for at least 24h. Take particular care during Phase 1's coexistence window — monitor `wrangler workflows instances list` daily and confirm the workflow path is the one being exercised.
+- The 7-job recovery happens after Phase 2 lands; document the jobIds and the run in the runbook recoveries appendix.
+- New env vars: `INVALID_LOG_RETENTION_DAYS` (optional, default 90). Add to `.env.example` in Phase 3.
+- Wrangler secrets to verify: `SENTRY_DSN`, `R2_ACCESS_KEY_ID`, `R2_SECRET_ACCESS_KEY`, `PACKRAT_SCRAPY_BUCKET_R2_BUCKET_NAME`. None new; confirm presence before Phase 2 deploy via `wrangler secret list`.
+- Rollback: each PR is independently revertable until U3's cleanup. Once the legacy queue path is removed, rollback requires a forward-fix.
+
+---
+
+## Sources & References
+
+- **Origin audit:** `docs/audits/2026-05-16-etl-audit.md`
+- **Superseded plan:** `docs/plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md` (Queues + outbox design; pivoted to Workflows on 2026-05-20)
+- Related code:
+  - `packages/api/src/services/etl/`
+  - `packages/api/src/routes/catalog/index.ts`
+  - `packages/api/src/routes/admin/analytics/catalog.ts`
+  - `packages/api/wrangler.jsonc`
+  - `packages/db/src/schema.ts`
+  - `packages/api/test/etl.test.ts`
+  - `packages/cli/src/commands/admin/etl.ts`
+- Live prod evidence (pulled 2026-05-19 + 2026-05-20): `GET https://packrat-api.orange-frost-d665.workers.dev/api/admin/analytics/catalog/etl?limit=25` showed 192 runs / 74 failed, 7 jobs falsely-failed at 2026-05-14T16:24:04.470Z. Counters unchanged across the two pulls — pipeline is currently dormant.
+- External docs:
+  - <https://developers.cloudflare.com/workflows/>
+  - <https://developers.cloudflare.com/workflows/get-started/guide/>
+  - <https://developers.cloudflare.com/workflows/reference/limits/>
+  - <https://developers.cloudflare.com/queues/configuration/javascript-apis/> (for the embeddings queue, retained)
+  - <https://developers.cloudflare.com/r2/api/s3/api/>
+  - <https://docs.sentry.io/platforms/javascript/guides/cloudflare/>
+  - <https://developers.cloudflare.com/workers/configuration/cron-triggers/>
diff --git a/docs/runbooks/etl-pipeline.md b/docs/runbooks/etl-pipeline.md
new file mode 100644
index 0000000000..3390d57ba4
--- /dev/null
+++ b/docs/runbooks/etl-pipeline.md
@@ -0,0 +1,340 @@
+# Catalog ETL Pipeline — Runbook
+
+Operational guide for the Cloudflare Workflows-based catalog ingest pipeline.
+Audience: on-call engineers triaging ETL issues; scraper operators triggering
+new runs; anyone debugging why the catalog isn't updating.
+
+## Architecture at a glance
+
+```
+Scraper → R2 object (packrat-scrapy-bucket)
+                    │
+                    ▼
+POST /api/catalog/etl  ── api-key auth
+                    │
+                    ▼
+chunkCsvForR2  → newline-aligned ChunkSpec[]
+                    │
+                    ▼
+INSERT etl_jobs (status='running', workflow_instance_id)
+                    │
+                    ▼
+env.ETL_WORKFLOW.create(...)  ──► CatalogEtlWorkflow instance
+                                            │
+                              ┌─────────────┴─────────────┐
+                              ▼                           ▼
+                     step.do("chunk-N", ...) × N    (durable, memoized,
+                              │                       per-step retry 3x
+                              ▼                       exp backoff)
+                     step.do("aggregate")
+                              │
+                              ▼
+                     step.do("finalize")  → UPDATE etl_jobs
+                                            SET status='completed'
+```
+
+Two backstops:
+- **CF Cron Trigger** `0 9 * * *` runs the `scheduled` handler, which calls
+  `sweepInvalidItemLogs` to DELETE `invalid_item_logs` rows older than 90
+  days in 10k-row batches.
+- The Workflows dashboard is the **forensic record** for any errored
+  instance — no DLQ table is needed; the dashboard surfaces full step
+  history with stack traces.
+
+## What's the engine?
+
+The producer endpoint accepts `?engine=workflow|queue`. Workflow is the
+default. The queue path is kept during the coexistence window — operators
+can opt back in via `?engine=queue` if the workflow path misbehaves in
+production. Plan: delete the queue path one week after the workflow path
+has been in steady-state production use.
+
+## Triggering an ETL
+
+```bash
+# Via curl (admin API key in $PACKRAT_API_KEY)
+curl -X POST 'https://packrat-api.orange-frost-d665.workers.dev/api/catalog/etl?engine=workflow' \
+  -H "x-api-key: $PACKRAT_API_KEY" \
+  -H 'content-type: application/json' \
+  -d '{
+        "filename": "cotopaxi_2026-05-14T16-54-05.csv",
+        "chunks": ["v2/cotopaxi/cotopaxi_2026-05-14T16-54-05.csv"],
+        "source": "cotopaxi",
+        "scraperRevision": "abc123"
+      }'
+```
+
+Response:
+```json
+{
+  "message": "Catalog ETL workflow triggered",
+  "jobId": "<uuid>",
+  "engine": "workflow",
+  "workflowInstanceId": "cotopaxi-cotopaxi_2026-05-14T16-54-05.csv"
+}
+```
+
+The deterministic `workflowInstanceId` (`${source}-${filename}`) means
+duplicate triggers for the same file are rejected by the Workflows runtime
+— safe to retry the curl on network failures.
+
+## Inspecting a workflow instance
+
+```bash
+# List recent instances
+bunx wrangler workflows instances list packrat-catalog-etl
+
+# Describe one (replace <id> with workflowInstanceId or the UUID)
+bunx wrangler workflows instances describe packrat-catalog-etl <id>
+```
+
+`describe` shows:
+- Top-level status, e.g. `queued`, `running`, `paused`, `waiting`, `errored`, `terminated`, or `complete`
+- Each `chunk-N` step's start/end timestamps + output value (rowsProcessed,
+  rowsValid, rowsInvalid per chunk)
+- `aggregate` step result (the canonical totals written to `etl_jobs`)
+- `finalize` step result (status flip to `completed`)
+- For errored instances: full retry history with stack traces per attempt
+
+## Retrying a failed job
+
+```bash
+curl -X POST 'https://packrat-api.orange-frost-d665.workers.dev/api/admin/analytics/catalog/etl/<jobId>/retry' \
+  -H "Authorization: Bearer $ADMIN_JWT"
+```
+
+The retry endpoint:
+1. Looks up the original `etl_jobs` row (requires `status='failed'`)
+2. Re-chunks the source via `chunkCsvForR2` (newline-aligned)
+3. INSERTs a new `etl_jobs` row with a fresh `jobId` and a new
+   `workflowInstanceId` suffixed `-retry-<newJobId>` so duplicate retries
+   don't collide
+4. Calls `env.ETL_WORKFLOW.create(...)` with the chunks
+
+Response:
+```json
+{
+  "success": true,
+  "newJobId": "<uuid>",
+  "objectKey": "v2/cotopaxi/cotopaxi_2026-05-14T16-54-05.csv",
+  "workflowInstanceId": "cotopaxi-cotopaxi_...-retry-<newJobId>"
+}
+```
+
+Original job's `etl_jobs` row is left untouched (still `failed`); the new
+row reflects the retry. The new row's `superseded_by_job_id` is set to the
+original `jobId` (with `superseded_at = now()`) so the supersession chain
+is explicit — no manual correlation by `(source, filename)` and timestamp
+is required. Use `GET /admin/analytics/catalog/etl/:jobId` to see the full
+chain for any job.
+
+## Reconciling a job's row count
+
+After an ingest completes, you can compare the R2 source's logical row
+count against `etl_jobs.total_processed`:
+
+```bash
+curl -X POST 'https://packrat-api.orange-frost-d665.workers.dev/api/admin/analytics/catalog/etl/<jobId>/reconcile' \
+  -H "Authorization: Bearer $ADMIN_JWT"
+```
+
+The endpoint reads the entire R2 source, parses it with `csv-parse` (which
+correctly handles quoted multi-line fields, unlike raw `\n` counting), and
+writes the result to `etl_jobs.verified_row_count` + `etl_jobs.verified_at`.
+
+Response:
+```json
+{
+  "success": true,
+  "jobId": "<uuid>",
+  "expectedRowCount": 50100,
+  "actualRowCount": 50100,
+  "delta": 0
+}
+```
+
+A non-zero `delta` indicates data drift — either the source was modified
+since ingest, or the workflow dropped rows. Investigate before re-ingesting.
+
+For very large source files (>200 MB) this endpoint may exceed the fetch
+budget. Async-via-workflow is a documented follow-up.
+
+## The 7-job historical recovery procedure
+
+Seven jobs from 2026-05-14 were falsely marked `failed` by the old
+wall-clock-based stuck-job sweep. After this PR ships and is deployed:
+
+```sql
+-- List the affected jobs
+SELECT id, source, filename, total_processed, started_at, completed_at
+FROM etl_jobs
+WHERE status = 'failed'
+  AND completed_at = '2026-05-14T16:24:04.470Z';
+```
+
+For each `jobId` returned:
+
+```bash
+curl -X POST "https://packrat-api.orange-frost-d665.workers.dev/api/admin/analytics/catalog/etl/${jobId}/retry" \
+  -H "Authorization: Bearer $ADMIN_JWT"
+```
+
+Workflow instances will appear in the dashboard with names like
+`evo-evo_2026-04-27T03-25-18.csv-retry-<newJobId>`. Watch each to
+completion. Original `etl_jobs` rows stay `failed` for the audit trail;
+new rows reflect the successful re-ingest.
+
+If a source file has been overwritten since 2026-05-14, the retry will
+re-ingest the **current** content under the old `(source, filename)` —
+not the original. This is acceptable for the 7-job recovery (we want the
+latest catalog state) but operators should verify R2 contents before
+retrying if they're worried about historical accuracy. Plain `retry` does
+not check ETags; `repair-from-scratch` does (see Accepted limitations).
+
+## DLQ / forensic record
+
+There is no DLQ table. The CF Workflows dashboard is the forensic record:
+
+```bash
+# Errored instances
+bunx wrangler workflows instances list packrat-catalog-etl \
+  --status=errored
+```
+
+For each errored instance, `describe` shows the failed step, the
+exception message, and the retry attempt history. Workflows retains
+finished-instance state for a limited window (30 days on Workers Paid,
+3 days on Free — see the Workflows limits page), not indefinitely.
+
+For DB-side history: the `etl_jobs` table retains all rows indefinitely.
+A failed `etl_jobs` row is the durable record that operators see in the
+admin UI; the linked workflow instance is the executable log behind it.
+
+## Invalid item logs retention
+
+`invalid_item_logs` is swept daily at 09:00 UTC by the `scheduled`
+handler in `packages/api/src/index.ts`. Default retention is 90 days.
+The sweep loops in 10k-row batches and caps at 100 iterations (1M rows
+per run). If the cap is hit, the next run picks up the remainder.
+
+To override defaults, edit `packages/api/src/services/retention/invalidLogRetention.ts`
+constants (no env-var override yet).
+
+To manually trigger a retention sweep (dev only):
+
+```bash
+bunx wrangler dev --test-scheduled
+# In another terminal:
+curl 'http://localhost:8787/__scheduled?cron=0+9+*+*+*'
+```
+
+## Draining / disabling the queue path
+
+After the workflow path bakes in production and the queue path is
+scheduled for removal:
+
+```bash
+# Inspect the old queue and its current producer/consumer bindings
+bunx wrangler queues info packrat-etl-queue
+
+# Remove the consumer binding — NOTE: this does NOT drain messages already
+# sitting in the queue. Wait for the queue depth to reach 0 (visible in the
+# Cloudflare dashboard) before removing the consumer, or messages will be
+# lost. Only then is it safe to remove:
+bunx wrangler queues consumer remove packrat-etl-queue packrat-api
+```
+
+Then the queue path removal PR (follow-up to this work) deletes:
+- `packages/api/src/services/etl/queue.ts`
+- `packages/api/src/services/etl/processCatalogEtl.ts`
+- The `?engine=queue` branch in `packages/api/src/routes/catalog/index.ts`
+- The `packrat-etl-queue` producer + consumer entries in `wrangler.jsonc`
+- The legacy `processQueueBatch` arm in the `queue()` handler at
+  `packages/api/src/index.ts`
+- The `POST /admin/etl/reset-stuck` endpoint (the wall-clock sweep that
+  caused the 7-job false-failure incident; no longer needed with
+  Workflows owning instance lifecycle)
+
+## Interpreting admin dashboard fields
+
+`admin.packratai.com`'s catalog ETL page reads from `etl_jobs`. Field
+meanings under the Workflows architecture:
+
+| Field | Meaning |
+|---|---|
+| `status` | Mirrors the workflow instance's state. `running` = workflow still active. `completed` = `finalize` step succeeded. `failed` = workflow errored (all retries exhausted). |
+| `total_processed`, `total_valid`, `total_invalid` | Written by the workflow's `aggregate` step. These are authoritative for the workflow run — any drift from per-row counts during processing is overridden by the aggregate write. |
+| `workflow_instance_id` | NULL for legacy queue-path rows; set for workflow-path rows. Use this to find the instance in the CF dashboard. |
+| `total_embedding_failures` | Number of SKUs upserted without embeddings because `generateManyEmbeddings` threw. Non-zero indicates degradation. The catalog items themselves are present; embeddings backfill happens via the existing `/admin/embeddings` workflow. |
+| `verified_at`, `verified_row_count` | NULL until an operator runs the reconcile endpoint. When set, `verified_row_count` is the R2 source's logical CSV row count; compare to `total_processed` to detect drift. |
+| `success_rate` (computed) | Existing field — `total_valid / total_processed`. Note that a job with `status='failed'` can still show 100% if all processed rows were valid before the failure; the field is per-row, not per-job. |
+
+## Accepted limitations
+
+- **No soft-delete / discontinued-item reconciliation.** When a catalog
+  item disappears from the source CSV, its row in `catalog_items` keeps
+  the last `availability` value. The catalog grows monotonically.
+  Reconciliation strategy not in scope; documented in audit P3 #3.
+- **`success_rate` on a `failed` job can read 100%.** Dashboard quirk —
+  the field is per-row, not per-job. A job that processed 400 rows
+  successfully then errored on chunk 5 shows `success_rate: 100`
+  because the 400 were all valid. Mitigation: the quirk is documented in
+  the field table above and in the admin UI tooltip (admin app PR).
+- **Reconcile endpoint is synchronous.** Very large source files
+  (>200 MB) may exceed the fetch budget. Async-via-workflow path is a
+  documented follow-up.
+- **ETag fail-closed on repair-from-scratch (not plain retry).** The
+  `repair-from-scratch` endpoint compares the stored `source_etag` against
+  `r2.head().etag` and returns 409 on mismatch; pass `?force=true` to
+  override. The plain `retry` endpoint does not enforce ETag checks — if the
+  R2 source has been overwritten, retry re-ingests the new content. Use
+  repair-from-scratch when historical accuracy matters.
+- **Embedding failures still cost API calls on retry.** Workflows
+  memoizes step results, so a successful chunk step doesn't re-fire its
+  embedding call on a downstream failure. But a chunk that fails AT the
+  embedding call (and is then retried) calls the embedding API again.
+  The extra cost is bounded by the per-step retry limit (3).
+
+## Historical recoveries appendix
+
+Document each one-off recovery here for the audit trail.
+
+### 2026-05-14 false-failures (planned, post-merge)
+
+7 jobs were marked failed mid-flight at 2026-05-14T16:24:04.470Z by the old
+wall-clock sweep. The query that lists their job IDs and the recovery
+procedure are documented above. To be executed after this PR deploys to production.
+
+## Sentry observability
+
+`@sentry/cloudflare` wraps the worker default export via `withSentry()` and
+the `CatalogEtlWorkflow` class via `instrumentWorkflowWithSentry()` —
+configured in `packages/api/src/index.ts`. Initialization uses
+`env.SENTRY_DSN`, `env.ENVIRONMENT`, and `env.CF_VERSION_METADATA.id`
+(release tag) with a 10% trace sample rate.
+
+The structured logger (`packages/api/src/utils/logger.ts`) forwards:
+- `logger.info(event, ctx)` → `Sentry.addBreadcrumb(level=info)`
+- `logger.warn(event, ctx)` → `Sentry.addBreadcrumb(level=warning)`
+- `logger.error(event, { err })` → `Sentry.captureException(err)` with `event` + ctx fields as tags
+- `logger.error(event)` without err → `Sentry.captureMessage(event, level=error)`
+
+ctx fields become Sentry tags (strings/numbers/booleans) or extras (objects).
+
+Source maps:
+- `upload_source_maps: true` in `wrangler.jsonc` uploads sourcemaps to
+  Cloudflare on every `wrangler deploy` — unminified stack traces in
+  `wrangler tail` and the Workers dashboard
+- For Sentry-side symbolication (unminified frames in the Sentry UI),
+  run `bunx @sentry/cli sourcemaps upload --release=$(git rev-parse HEAD) packages/api/dist`
+  after deploy. Not wired into CI because there's no automated deploy
+  pipeline today; run manually post-deploy until that changes.
+
+## References
+
+- [Audit (2026-05-16)](../audits/2026-05-16-etl-audit.md) — the source-of-truth list of pre-migration issues
+- [Active plan](../plans/2026-05-20-001-fix-etl-pipeline-workflows-migration-plan.md) — the Workflows migration plan
+- [Superseded plan](../plans/2026-05-19-001-fix-etl-pipeline-audit-remediation-plan.md) — the original Queues + outbox attempt (why we pivoted)
+- [Cloudflare Workflows docs](https://developers.cloudflare.com/workflows/)
+- [Cloudflare Workflows JS API](https://developers.cloudflare.com/workflows/build/workers-api/)
+- [Sentry on Cloudflare Workers](https://docs.sentry.io/platforms/javascript/guides/cloudflare/)
diff --git a/packages/api/drizzle.config.ts b/packages/api/drizzle.config.ts
index 59d6f0c44c..25f0acefc2 100644
--- a/packages/api/drizzle.config.ts
+++ b/packages/api/drizzle.config.ts
@@ -2,6 +2,9 @@ import { nodeEnv } from '@packrat/env/node';
 import { defineConfig } from 'drizzle-kit';
 
 export default defineConfig({
+  // Points at the in-package re-export at src/db/schema.ts, which re-exports
+  // everything from @packrat/db/schema. Keeps drizzle-kit + tooling scoped
+  // to packages/api without crossing the workspace boundary at config time.
   schema: './src/db/schema.ts',
   out: './drizzle',
   dialect: 'postgresql',
diff --git a/packages/api/drizzle/0047_clear_monster_badoon.sql b/packages/api/drizzle/0047_clear_monster_badoon.sql
new file mode 100644
index 0000000000..96f951f5d3
--- /dev/null
+++ b/packages/api/drizzle/0047_clear_monster_badoon.sql
@@ -0,0 +1,12 @@
+ALTER TABLE "etl_jobs" ADD COLUMN "workflow_instance_id" text;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD COLUMN "total_embedding_failures" integer DEFAULT 0 NOT NULL;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD COLUMN "verified_at" timestamp;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD COLUMN "verified_row_count" integer;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD COLUMN "source_etag" text;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD COLUMN "source_last_modified" timestamp;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD COLUMN "superseded_by_job_id" text;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD COLUMN "superseded_at" timestamp;--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD CONSTRAINT "etl_jobs_superseded_by_job_id_etl_jobs_id_fk" FOREIGN KEY ("superseded_by_job_id") REFERENCES "public"."etl_jobs"("id") ON DELETE set null ON UPDATE no action;--> statement-breakpoint
+CREATE INDEX "etl_jobs_workflow_instance_id_idx" ON "etl_jobs" USING btree ("workflow_instance_id");--> statement-breakpoint
+CREATE INDEX "etl_jobs_superseded_by_idx" ON "etl_jobs" USING btree ("superseded_by_job_id");--> statement-breakpoint
+ALTER TABLE "etl_jobs" ADD CONSTRAINT "etl_jobs_no_self_supersede" CHECK ("etl_jobs"."superseded_by_job_id" IS NULL OR "etl_jobs"."superseded_by_job_id" <> "etl_jobs"."id");
\ No newline at end of file
diff --git a/packages/api/drizzle/meta/0047_snapshot.json b/packages/api/drizzle/meta/0047_snapshot.json
new file mode 100644
index 0000000000..c0ad26e3a6
--- /dev/null
+++ b/packages/api/drizzle/meta/0047_snapshot.json
@@ -0,0 +1,2351 @@
+{
+  "id": "79eab1cd-6669-4ece-95a3-4aecfcba8563",
+  "prevId": "1f086d6d-055d-4b37-a5d6-32b1141d2043",
+  "version": "7",
+  "dialect": "postgresql",
+  "tables": {
+    "public.account": {
+      "name": "account",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "account_id": {
+          "name": "account_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "provider_id": {
+          "name": "provider_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "access_token": {
+          "name": "access_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "refresh_token": {
+          "name": "refresh_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "id_token": {
+          "name": "id_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "access_token_expires_at": {
+          "name": "access_token_expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "refresh_token_expires_at": {
+          "name": "refresh_token_expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "scope": {
+          "name": "scope",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "password": {
+          "name": "password",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "account_userId_idx": {
+          "name": "account_userId_idx",
+          "columns": [
+            {
+              "expression": "user_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "account_user_id_users_id_fk": {
+          "name": "account_user_id_users_id_fk",
+          "tableFrom": "account",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "account_provider_account_idx": {
+          "name": "account_provider_account_idx",
+          "nullsNotDistinct": false,
+          "columns": ["provider_id", "account_id"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.catalog_item_etl_jobs": {
+      "name": "catalog_item_etl_jobs",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "catalog_item_id": {
+          "name": "catalog_item_id",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "etl_job_id": {
+          "name": "etl_job_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "catalog_item_etl_jobs_catalog_item_id_catalog_items_id_fk": {
+          "name": "catalog_item_etl_jobs_catalog_item_id_catalog_items_id_fk",
+          "tableFrom": "catalog_item_etl_jobs",
+          "tableTo": "catalog_items",
+          "columnsFrom": ["catalog_item_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "catalog_item_etl_jobs_etl_job_id_etl_jobs_id_fk": {
+          "name": "catalog_item_etl_jobs_etl_job_id_etl_jobs_id_fk",
+          "tableFrom": "catalog_item_etl_jobs",
+          "tableTo": "etl_jobs",
+          "columnsFrom": ["etl_job_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.catalog_items": {
+      "name": "catalog_items",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "product_url": {
+          "name": "product_url",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "sku": {
+          "name": "sku",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "weight": {
+          "name": "weight",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "weight_unit": {
+          "name": "weight_unit",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "description": {
+          "name": "description",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "categories": {
+          "name": "categories",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "images": {
+          "name": "images",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "brand": {
+          "name": "brand",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "rating_value": {
+          "name": "rating_value",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "color": {
+          "name": "color",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "size": {
+          "name": "size",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "price": {
+          "name": "price",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "availability": {
+          "name": "availability",
+          "type": "availability",
+          "typeSchema": "public",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "seller": {
+          "name": "seller",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "product_sku": {
+          "name": "product_sku",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "material": {
+          "name": "material",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "currency": {
+          "name": "currency",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "condition": {
+          "name": "condition",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "review_count": {
+          "name": "review_count",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "variants": {
+          "name": "variants",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "techs": {
+          "name": "techs",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "links": {
+          "name": "links",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "reviews": {
+          "name": "reviews",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "qas": {
+          "name": "qas",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "faqs": {
+          "name": "faqs",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "embedding": {
+          "name": "embedding",
+          "type": "vector(1536)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "embedding_idx": {
+          "name": "embedding_idx",
+          "columns": [
+            {
+              "expression": "embedding",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last",
+              "opclass": "vector_cosine_ops"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "hnsw",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "catalog_items_sku_unique": {
+          "name": "catalog_items_sku_unique",
+          "nullsNotDistinct": false,
+          "columns": ["sku"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.comment_likes": {
+      "name": "comment_likes",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "comment_id": {
+          "name": "comment_id",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "comment_likes_comment_id_post_comments_id_fk": {
+          "name": "comment_likes_comment_id_post_comments_id_fk",
+          "tableFrom": "comment_likes",
+          "tableTo": "post_comments",
+          "columnsFrom": ["comment_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "comment_likes_user_id_users_id_fk": {
+          "name": "comment_likes_user_id_users_id_fk",
+          "tableFrom": "comment_likes",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "comment_likes_comment_id_user_id_unique": {
+          "name": "comment_likes_comment_id_user_id_unique",
+          "nullsNotDistinct": false,
+          "columns": ["comment_id", "user_id"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.etl_jobs": {
+      "name": "etl_jobs",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "status": {
+          "name": "status",
+          "type": "etl_job_status",
+          "typeSchema": "public",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "source": {
+          "name": "source",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "filename": {
+          "name": "filename",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "started_at": {
+          "name": "started_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "completed_at": {
+          "name": "completed_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "total_processed": {
+          "name": "total_processed",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "total_valid": {
+          "name": "total_valid",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "total_invalid": {
+          "name": "total_invalid",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "scraper_revision": {
+          "name": "scraper_revision",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "workflow_instance_id": {
+          "name": "workflow_instance_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "total_embedding_failures": {
+          "name": "total_embedding_failures",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 0
+        },
+        "verified_at": {
+          "name": "verified_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "verified_row_count": {
+          "name": "verified_row_count",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "source_etag": {
+          "name": "source_etag",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "source_last_modified": {
+          "name": "source_last_modified",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "superseded_by_job_id": {
+          "name": "superseded_by_job_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "superseded_at": {
+          "name": "superseded_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        }
+      },
+      "indexes": {
+        "etl_jobs_scraper_revision_idx": {
+          "name": "etl_jobs_scraper_revision_idx",
+          "columns": [
+            {
+              "expression": "scraper_revision",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        },
+        "etl_jobs_workflow_instance_id_idx": {
+          "name": "etl_jobs_workflow_instance_id_idx",
+          "columns": [
+            {
+              "expression": "workflow_instance_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        },
+        "etl_jobs_superseded_by_idx": {
+          "name": "etl_jobs_superseded_by_idx",
+          "columns": [
+            {
+              "expression": "superseded_by_job_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "etl_jobs_superseded_by_job_id_etl_jobs_id_fk": {
+          "name": "etl_jobs_superseded_by_job_id_etl_jobs_id_fk",
+          "tableFrom": "etl_jobs",
+          "tableTo": "etl_jobs",
+          "columnsFrom": ["superseded_by_job_id"],
+          "columnsTo": ["id"],
+          "onDelete": "set null",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {
+        "etl_jobs_no_self_supersede": {
+          "name": "etl_jobs_no_self_supersede",
+          "value": "\"etl_jobs\".\"superseded_by_job_id\" IS NULL OR \"etl_jobs\".\"superseded_by_job_id\" <> \"etl_jobs\".\"id\""
+        }
+      },
+      "isRLSEnabled": false
+    },
+    "public.invalid_item_logs": {
+      "name": "invalid_item_logs",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "job_id": {
+          "name": "job_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "errors": {
+          "name": "errors",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "raw_data": {
+          "name": "raw_data",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "row_index": {
+          "name": "row_index",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "invalid_item_logs_job_id_etl_jobs_id_fk": {
+          "name": "invalid_item_logs_job_id_etl_jobs_id_fk",
+          "tableFrom": "invalid_item_logs",
+          "tableTo": "etl_jobs",
+          "columnsFrom": ["job_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.jwks": {
+      "name": "jwks",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "public_key": {
+          "name": "public_key",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "private_key": {
+          "name": "private_key",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.pack_items": {
+      "name": "pack_items",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "description": {
+          "name": "description",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "weight": {
+          "name": "weight",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "weight_unit": {
+          "name": "weight_unit",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "quantity": {
+          "name": "quantity",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 1
+        },
+        "category": {
+          "name": "category",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "consumable": {
+          "name": "consumable",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "worn": {
+          "name": "worn",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "image": {
+          "name": "image",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "notes": {
+          "name": "notes",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "pack_id": {
+          "name": "pack_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "catalog_item_id": {
+          "name": "catalog_item_id",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "deleted": {
+          "name": "deleted",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "is_ai_generated": {
+          "name": "is_ai_generated",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "template_item_id": {
+          "name": "template_item_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "embedding": {
+          "name": "embedding",
+          "type": "vector(1536)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "pack_items_embedding_idx": {
+          "name": "pack_items_embedding_idx",
+          "columns": [
+            {
+              "expression": "embedding",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last",
+              "opclass": "vector_cosine_ops"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "hnsw",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "pack_items_pack_id_packs_id_fk": {
+          "name": "pack_items_pack_id_packs_id_fk",
+          "tableFrom": "pack_items",
+          "tableTo": "packs",
+          "columnsFrom": ["pack_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "pack_items_catalog_item_id_catalog_items_id_fk": {
+          "name": "pack_items_catalog_item_id_catalog_items_id_fk",
+          "tableFrom": "pack_items",
+          "tableTo": "catalog_items",
+          "columnsFrom": ["catalog_item_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        },
+        "pack_items_user_id_users_id_fk": {
+          "name": "pack_items_user_id_users_id_fk",
+          "tableFrom": "pack_items",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        },
+        "pack_items_template_item_id_pack_template_items_id_fk": {
+          "name": "pack_items_template_item_id_pack_template_items_id_fk",
+          "tableFrom": "pack_items",
+          "tableTo": "pack_template_items",
+          "columnsFrom": ["template_item_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.pack_template_items": {
+      "name": "pack_template_items",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "description": {
+          "name": "description",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "weight": {
+          "name": "weight",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "weight_unit": {
+          "name": "weight_unit",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "quantity": {
+          "name": "quantity",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 1
+        },
+        "category": {
+          "name": "category",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "consumable": {
+          "name": "consumable",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "worn": {
+          "name": "worn",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "image": {
+          "name": "image",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "notes": {
+          "name": "notes",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "pack_template_id": {
+          "name": "pack_template_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "catalog_item_id": {
+          "name": "catalog_item_id",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "deleted": {
+          "name": "deleted",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "pack_template_items_pack_template_id_pack_templates_id_fk": {
+          "name": "pack_template_items_pack_template_id_pack_templates_id_fk",
+          "tableFrom": "pack_template_items",
+          "tableTo": "pack_templates",
+          "columnsFrom": ["pack_template_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "pack_template_items_catalog_item_id_catalog_items_id_fk": {
+          "name": "pack_template_items_catalog_item_id_catalog_items_id_fk",
+          "tableFrom": "pack_template_items",
+          "tableTo": "catalog_items",
+          "columnsFrom": ["catalog_item_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        },
+        "pack_template_items_user_id_users_id_fk": {
+          "name": "pack_template_items_user_id_users_id_fk",
+          "tableFrom": "pack_template_items",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.pack_templates": {
+      "name": "pack_templates",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "description": {
+          "name": "description",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "category": {
+          "name": "category",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "image": {
+          "name": "image",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "tags": {
+          "name": "tags",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "is_app_template": {
+          "name": "is_app_template",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "deleted": {
+          "name": "deleted",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "content_source": {
+          "name": "content_source",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "content_id": {
+          "name": "content_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "local_created_at": {
+          "name": "local_created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "local_updated_at": {
+          "name": "local_updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "pack_templates_user_id_users_id_fk": {
+          "name": "pack_templates_user_id_users_id_fk",
+          "tableFrom": "pack_templates",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.weight_history": {
+      "name": "weight_history",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "pack_id": {
+          "name": "pack_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "weight": {
+          "name": "weight",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "local_created_at": {
+          "name": "local_created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "weight_history_user_id_users_id_fk": {
+          "name": "weight_history_user_id_users_id_fk",
+          "tableFrom": "weight_history",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "weight_history_pack_id_packs_id_fk": {
+          "name": "weight_history_pack_id_packs_id_fk",
+          "tableFrom": "weight_history",
+          "tableTo": "packs",
+          "columnsFrom": ["pack_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.packs": {
+      "name": "packs",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "description": {
+          "name": "description",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "category": {
+          "name": "category",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "template_id": {
+          "name": "template_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "is_public": {
+          "name": "is_public",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "image": {
+          "name": "image",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "tags": {
+          "name": "tags",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "deleted": {
+          "name": "deleted",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "is_ai_generated": {
+          "name": "is_ai_generated",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "local_created_at": {
+          "name": "local_created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "local_updated_at": {
+          "name": "local_updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "packs_user_id_users_id_fk": {
+          "name": "packs_user_id_users_id_fk",
+          "tableFrom": "packs",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "packs_template_id_pack_templates_id_fk": {
+          "name": "packs_template_id_pack_templates_id_fk",
+          "tableFrom": "packs",
+          "tableTo": "pack_templates",
+          "columnsFrom": ["template_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.post_comments": {
+      "name": "post_comments",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "post_id": {
+          "name": "post_id",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "content": {
+          "name": "content",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "parent_comment_id": {
+          "name": "parent_comment_id",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "post_comments_post_id_posts_id_fk": {
+          "name": "post_comments_post_id_posts_id_fk",
+          "tableFrom": "post_comments",
+          "tableTo": "posts",
+          "columnsFrom": ["post_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "post_comments_user_id_users_id_fk": {
+          "name": "post_comments_user_id_users_id_fk",
+          "tableFrom": "post_comments",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "post_comments_parent_comment_id_post_comments_id_fk": {
+          "name": "post_comments_parent_comment_id_post_comments_id_fk",
+          "tableFrom": "post_comments",
+          "tableTo": "post_comments",
+          "columnsFrom": ["parent_comment_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.post_likes": {
+      "name": "post_likes",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "post_id": {
+          "name": "post_id",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "post_likes_post_id_posts_id_fk": {
+          "name": "post_likes_post_id_posts_id_fk",
+          "tableFrom": "post_likes",
+          "tableTo": "posts",
+          "columnsFrom": ["post_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "post_likes_user_id_users_id_fk": {
+          "name": "post_likes_user_id_users_id_fk",
+          "tableFrom": "post_likes",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "post_likes_post_id_user_id_unique": {
+          "name": "post_likes_post_id_user_id_unique",
+          "nullsNotDistinct": false,
+          "columns": ["post_id", "user_id"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.posts": {
+      "name": "posts",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "caption": {
+          "name": "caption",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "images": {
+          "name": "images",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "posts_user_id_users_id_fk": {
+          "name": "posts_user_id_users_id_fk",
+          "tableFrom": "posts",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.reported_content": {
+      "name": "reported_content",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "serial",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_query": {
+          "name": "user_query",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "ai_response": {
+          "name": "ai_response",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "reason": {
+          "name": "reason",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_comment": {
+          "name": "user_comment",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "status": {
+          "name": "status",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'pending'"
+        },
+        "reviewed": {
+          "name": "reviewed",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": false,
+          "default": false
+        },
+        "reviewed_by": {
+          "name": "reviewed_by",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "reviewed_at": {
+          "name": "reviewed_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "reported_content_user_id_users_id_fk": {
+          "name": "reported_content_user_id_users_id_fk",
+          "tableFrom": "reported_content",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        },
+        "reported_content_reviewed_by_users_id_fk": {
+          "name": "reported_content_reviewed_by_users_id_fk",
+          "tableFrom": "reported_content",
+          "tableTo": "users",
+          "columnsFrom": ["reviewed_by"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.session": {
+      "name": "session",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "expires_at": {
+          "name": "expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "token": {
+          "name": "token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "ip_address": {
+          "name": "ip_address",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_agent": {
+          "name": "user_agent",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "impersonated_by": {
+          "name": "impersonated_by",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        }
+      },
+      "indexes": {
+        "session_userId_idx": {
+          "name": "session_userId_idx",
+          "columns": [
+            {
+              "expression": "user_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "session_user_id_users_id_fk": {
+          "name": "session_user_id_users_id_fk",
+          "tableFrom": "session",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "session_token_unique": {
+          "name": "session_token_unique",
+          "nullsNotDistinct": false,
+          "columns": ["token"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.trail_condition_reports": {
+      "name": "trail_condition_reports",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "trail_name": {
+          "name": "trail_name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "trail_region": {
+          "name": "trail_region",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "surface": {
+          "name": "surface",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "overall_condition": {
+          "name": "overall_condition",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "hazards": {
+          "name": "hazards",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'[]'::jsonb"
+        },
+        "water_crossings": {
+          "name": "water_crossings",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 0
+        },
+        "water_crossing_difficulty": {
+          "name": "water_crossing_difficulty",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "notes": {
+          "name": "notes",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "photos": {
+          "name": "photos",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'[]'::jsonb"
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "trip_id": {
+          "name": "trip_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "deleted": {
+          "name": "deleted",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "local_created_at": {
+          "name": "local_created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "local_updated_at": {
+          "name": "local_updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "trail_condition_reports_user_id_idx": {
+          "name": "trail_condition_reports_user_id_idx",
+          "columns": [
+            {
+              "expression": "user_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        },
+        "trail_condition_reports_active_created_idx": {
+          "name": "trail_condition_reports_active_created_idx",
+          "columns": [
+            {
+              "expression": "deleted",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "created_at",
+              "isExpression": false,
+              "asc": false,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        },
+        "trail_condition_reports_trail_name_idx": {
+          "name": "trail_condition_reports_trail_name_idx",
+          "columns": [
+            {
+              "expression": "trail_name",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        },
+        "trail_condition_reports_trip_id_idx": {
+          "name": "trail_condition_reports_trip_id_idx",
+          "columns": [
+            {
+              "expression": "trip_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "where": "\"trail_condition_reports\".\"trip_id\" IS NOT NULL",
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "trail_condition_reports_user_id_users_id_fk": {
+          "name": "trail_condition_reports_user_id_users_id_fk",
+          "tableFrom": "trail_condition_reports",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "trail_condition_reports_trip_id_trips_id_fk": {
+          "name": "trail_condition_reports_trip_id_trips_id_fk",
+          "tableFrom": "trail_condition_reports",
+          "tableTo": "trips",
+          "columnsFrom": ["trip_id"],
+          "columnsTo": ["id"],
+          "onDelete": "set null",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.trips": {
+      "name": "trips",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "description": {
+          "name": "description",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "start_date": {
+          "name": "start_date",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "end_date": {
+          "name": "end_date",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "location": {
+          "name": "location",
+          "type": "jsonb",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "notes": {
+          "name": "notes",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "pack_id": {
+          "name": "pack_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "trail_osm_id": {
+          "name": "trail_osm_id",
+          "type": "bigint",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "local_created_at": {
+          "name": "local_created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "local_updated_at": {
+          "name": "local_updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "deleted": {
+          "name": "deleted",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "trips_user_id_users_id_fk": {
+          "name": "trips_user_id_users_id_fk",
+          "tableFrom": "trips",
+          "tableTo": "users",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "no action",
+          "onUpdate": "no action"
+        },
+        "trips_pack_id_packs_id_fk": {
+          "name": "trips_pack_id_packs_id_fk",
+          "tableFrom": "trips",
+          "tableTo": "packs",
+          "columnsFrom": ["pack_id"],
+          "columnsTo": ["id"],
+          "onDelete": "set null",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.users": {
+      "name": "users",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "email": {
+          "name": "email",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "email_verified": {
+          "name": "email_verified",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "image": {
+          "name": "image",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "role": {
+          "name": "role",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'USER'"
+        },
+        "banned": {
+          "name": "banned",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": false,
+          "default": false
+        },
+        "ban_reason": {
+          "name": "ban_reason",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "ban_expires": {
+          "name": "ban_expires",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "first_name": {
+          "name": "first_name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "last_name": {
+          "name": "last_name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "avatar_url": {
+          "name": "avatar_url",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "password_hash": {
+          "name": "password_hash",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "users_email_unique": {
+          "name": "users_email_unique",
+          "nullsNotDistinct": false,
+          "columns": ["email"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.verification": {
+      "name": "verification",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "identifier": {
+          "name": "identifier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "value": {
+          "name": "value",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "expires_at": {
+          "name": "expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "verification_identifier_idx": {
+          "name": "verification_identifier_idx",
+          "columns": [
+            {
+              "expression": "identifier",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    }
+  },
+  "enums": {},
+  "schemas": {},
+  "sequences": {},
+  "roles": {},
+  "policies": {},
+  "views": {},
+  "_meta": {
+    "columns": {},
+    "schemas": {},
+    "tables": {}
+  }
+}
diff --git a/packages/api/drizzle/meta/_journal.json b/packages/api/drizzle/meta/_journal.json
index ca463a5058..61c4c6ea9d 100644
--- a/packages/api/drizzle/meta/_journal.json
+++ b/packages/api/drizzle/meta/_journal.json
@@ -337,6 +337,13 @@
       "when": 1778594728740,
       "tag": "0047_cute_bloodscream",
       "breakpoints": true
+    },
+    {
+      "idx": 47,
+      "version": "7",
+      "when": 1779334995277,
+      "tag": "0047_clear_monster_badoon",
+      "breakpoints": true
     }
   ]
 }
diff --git a/packages/api/package.json b/packages/api/package.json
index 7964bb1a55..3ad6e88eac 100644
--- a/packages/api/package.json
+++ b/packages/api/package.json
@@ -50,6 +50,7 @@
     "@packrat/schemas": "workspace:*",
     "@packrat/types": "workspace:*",
     "@packrat/units": "workspace:*",
+    "@sentry/cloudflare": "^10.37.0",
     "@sinclair/typebox": "^0.34.15",
     "@types/nodemailer": "^6.4.17",
     "ai": "catalog:",
diff --git a/packages/api/src/__test-stubs__/cloudflare-workers.ts b/packages/api/src/__test-stubs__/cloudflare-workers.ts
index f4cfbe60d9..f89e0a308e 100644
--- a/packages/api/src/__test-stubs__/cloudflare-workers.ts
+++ b/packages/api/src/__test-stubs__/cloudflare-workers.ts
@@ -1,5 +1,36 @@
-/**
- * Stub for `cloudflare:workers` — used only in unit-test environments.
- * The real module is only available in the Cloudflare Workers runtime.
- */
+// Stub for `cloudflare:workers` — used only in unit-test environments.
+// The real module is only available in the Cloudflare Workers runtime.
+
 export const env = {} as Record<string, unknown>;
+
+// Workflows surface — enough for unit tests to import and instantiate.
+// Tests provide their own `step` shim and never call `run` via the real
+// workflow runtime, so these are intentionally minimal.
+
+export type WorkflowEvent<T> = {
+  payload: Readonly<T>;
+  timestamp: Date;
+  instanceId: string;
+};
+
+export type WorkflowStepConfig = {
+  retries?: { limit: number; delay: string | number; backoff?: string };
+  timeout?: string | number;
+};
+
+export interface WorkflowStep {
+  do<T>(name: string, callback: () => Promise<T>): Promise<T>;
+  do<T>(name: string, config: WorkflowStepConfig, callback: () => Promise<T>): Promise<T>;
+  sleep(name: string, duration: string | number): Promise<void>;
+  sleepUntil(name: string, timestamp: Date | number): Promise<void>;
+}
+
+export abstract class WorkflowEntrypoint<Env = unknown, T = unknown> { // test stand-in: only stores the constructor arguments
+  protected ctx: unknown; // kept opaque; the stub never inspects it
+  protected env: Env;
+  constructor(ctx: unknown, env: Env) {
+    this.ctx = ctx;
+    this.env = env;
+  }
+  abstract run(event: Readonly<WorkflowEvent<T>>, step: WorkflowStep): Promise<unknown>;
+}
diff --git a/packages/api/src/db/schema.ts b/packages/api/src/db/schema.ts
new file mode 100644
index 0000000000..c6a9fe1fcc
--- /dev/null
+++ b/packages/api/src/db/schema.ts
@@ -0,0 +1,7 @@
+// Re-export of the shared schema from @packrat/db so drizzle.config.ts can
+// point at a path inside the API package without crossing the package
+// boundary. The schema source of truth lives in packages/db/src/schema.ts;
+// this file exists purely so drizzle-kit + any drizzle-aware tooling stays
+// scoped to packages/api and doesn't break if the workspace layout changes.
+
+export * from '@packrat/db/schema';
diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts
index 8b8afb651c..316f2ea826 100644
--- a/packages/api/src/index.ts
+++ b/packages/api/src/index.ts
@@ -6,20 +6,36 @@
  * Elysia-native so Eden Treaty gets full end-to-end type safety.
  */
 
-import type { MessageBatch } from '@cloudflare/workers-types';
+import type { MessageBatch, ScheduledController } from '@cloudflare/workers-types';
 import { cors } from '@elysiajs/cors';
 import { getAuth } from '@packrat/api/auth';
 import { AppContainer } from '@packrat/api/containers';
 import { routes } from '@packrat/api/routes';
 import { CatalogService } from '@packrat/api/services';
 import { processQueueBatch } from '@packrat/api/services/etl/queue';
+import { sweepInvalidItemLogs } from '@packrat/api/services/retention/invalidLogRetention';
 import type { Env } from '@packrat/api/utils/env-validation';
 import { getEnv, setWorkerEnv } from '@packrat/api/utils/env-validation';
 import { packratOpenApi } from '@packrat/api/utils/openapi';
+import { CatalogEtlWorkflow as RawCatalogEtlWorkflow } from '@packrat/api/workflows/catalog-etl-workflow';
+import { instrumentWorkflowWithSentry, withSentry } from '@sentry/cloudflare';
 import { Elysia } from 'elysia';
 import { CloudflareAdapter } from 'elysia/adapter/cloudflare-worker';
 import type { CatalogETLMessage } from './services/etl/types';
 
+// Sentry options shared by the Worker handlers (withSentry) and the workflow
+// class (instrumentWorkflowWithSentry). Reads SENTRY_DSN, ENVIRONMENT and
+// CF_VERSION_METADATA.id (as the release) from env. tracesSampleRate is fixed
+// at 10% — observable enough for prod debugging without overwhelming the quota.
+function sentryOptions(env: Env) {
+  return {
+    dsn: env.SENTRY_DSN,
+    environment: env.ENVIRONMENT,
+    tracesSampleRate: 0.1,
+    release: env.CF_VERSION_METADATA?.id, // undefined when CF_VERSION_METADATA is absent
+  };
+}
+
 export const app = new Elysia({ adapter: CloudflareAdapter })
   .use(
     cors({
@@ -77,6 +93,14 @@ export type App = typeof app;
 
 export { AppContainer };
 
+// Wrap the workflow class with Sentry instrumentation so each step.do span
+// + any uncaught throw inside a step lands in Sentry with workflow/instance
+// context attached automatically.
+export const CatalogEtlWorkflow = instrumentWorkflowWithSentry(
+  sentryOptions,
+  RawCatalogEtlWorkflow,
+);
+
 type CfFetchFn = (
   request: Request,
   env: Env,
@@ -90,7 +114,7 @@ function enrichEnv(env: Env): Env {
   return env;
 }
 
-export default {
+const handler: ExportedHandler<Env> = {
   async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
     const e = enrichEnv(env);
     setWorkerEnv(e as unknown as Record<string, unknown>); // safe-cast: setWorkerEnv accepts Record; ValidatedEnv has no index signature by design
@@ -122,4 +146,31 @@ export default {
       throw new Error(`Unknown queue: ${batch.queue}`);
     }
   },
-} satisfies ExportedHandler<Env>;
+
+  // Cron entry point: runs the retention sweep for `0 9 * * *`; any other cron expression throws.
+  async scheduled(controller: ScheduledController, env: Env): Promise<void> {
+    const e = enrichEnv(env); // enrich once and reuse, mirroring the fetch handler
+    setWorkerEnv(e as unknown as Record<string, unknown>); // safe-cast: same as fetch handler above
+    if (controller.cron === '0 9 * * *') {
+      const result = await sweepInvalidItemLogs(e);
+      console.log(
+        `[retention] invalid_item_logs sweep: deleted=${result.deleted} ` +
+          `iterations=${result.iterations} capped=${result.capped} ` +
+          `retentionDays=${result.retentionDays}`,
+      );
+      if (result.capped) {
+        console.warn(
+          `[retention] invalid_item_logs sweep hit max-iterations cap; ` +
+            `remaining expired rows will be swept on the next run`,
+        );
+      }
+      return;
+    }
+    throw new Error(`Unknown cron: ${controller.cron}`);
+  },
+};
+
+// withSentry wraps the fetch/queue/scheduled handlers to initialize Sentry
+// on first invocation and forward uncaught exceptions to Sentry. The
+// instrumented workflow class is exported separately above.
+export default withSentry(sentryOptions, handler);
diff --git a/packages/api/src/routes/admin/analytics/catalog.ts b/packages/api/src/routes/admin/analytics/catalog.ts
index 90b901add5..8d5dff2ac8 100644
--- a/packages/api/src/routes/admin/analytics/catalog.ts
+++ b/packages/api/src/routes/admin/analytics/catalog.ts
@@ -1,22 +1,178 @@
 import { createDb } from '@packrat/api/db';
-import { queueCatalogETL } from '@packrat/api/services/etl/queue';
+import { R2BucketService } from '@packrat/api/services/r2-bucket';
 import { getEnv } from '@packrat/api/utils/env-validation';
+import type { CatalogEtlWorkflowParams } from '@packrat/api/workflows/catalog-etl-workflow';
+import { type ChunkSpec, chunkCsvForR2 } from '@packrat/api/workflows/shared/chunkCsvForR2';
 import { catalogItems, etlJobs, invalidItemLogs } from '@packrat/db';
 import {
   AdminErrorResponses,
   BrandRowSchema,
+  CatalogAuditSchema,
   CatalogOverviewSchema,
   EtlFailureSummarySchema,
   EtlJobFailuresSchema,
+  EtlReconcileSchema,
   EtlResetStuckSchema,
   EtlResponseSchema,
   EtlRetrySchema,
   PriceBucketSchema,
 } from '@packrat/schemas/admin';
+import { parse } from 'csv-parse';
 import { and, avg, count, desc, eq, gt, isNotNull, lt, max, min, sql } from 'drizzle-orm';
 import { Elysia, status } from 'elysia';
 import { z } from 'zod';
 
+type ReingestResult =
+  | {
+      success: true; // always true on this arm; the route handler returns the arm as-is
+      newJobId: string;
+      objectKey: string; // R2 key that was re-ingested
+      workflowInstanceId: string;
+    }
+  | {
+      _statusCode: 400 | 404 | 409 | 500; // HTTP status; the route handler strips it from the body
+      error: string;
+      code?: string; // machine-readable error code, e.g. 'ETL_ETAG_MISMATCH'
+    };
+
+/**
+ * Shared body for retry + repair-from-scratch admin endpoints. Resolves to the
+ * success payload or to `{ _statusCode, error, code? }`; errors thrown inside
+ * the try block are logged and mapped to a generic 500.
+ *
+ * mode: 'retry' accepts only `status='failed'` jobs; 'repair' accepts any job
+ * that is not `running`. Both set supersededByJobId/supersededAt on the new row.
+ *
+ * force=true skips the etag fail-closed check — use it once the operator has
+ * verified the live R2 content. Rows with no stored etag (legacy queue-era
+ * rows, the 2026-05-14 false-failure rows) are never etag-checked anyway.
+ */
+async function reingestJob(args: {
+  originalJobId: string;
+  mode: 'retry' | 'repair';
+  force: boolean;
+}): Promise<ReingestResult> {
+  const { originalJobId, mode, force } = args;
+  const db = createDb();
+
+  try {
+    const [original] = await db
+      .select()
+      .from(etlJobs)
+      .where(eq(etlJobs.id, originalJobId))
+      .limit(1);
+
+    if (!original) {
+      return { _statusCode: 404, error: 'ETL job not found' };
+    }
+
+    if (mode === 'retry' && original.status !== 'failed') {
+      return {
+        _statusCode: 409,
+        error:
+          original.status === 'running'
+            ? 'Job is still running — wait for it to complete before retrying'
+            : 'Only failed jobs can be retried — use repair-from-scratch for completed jobs',
+      };
+    }
+
+    if (mode === 'repair' && original.status === 'running') {
+      return {
+        _statusCode: 409,
+        error: 'Job is still running — wait for it to complete before repair',
+      };
+    }
+
+    const newJobId = crypto.randomUUID();
+    const objectKey = `v2/${original.source}/${original.filename}`;
+    const env = getEnv();
+
+    if (!env.ETL_WORKFLOW) {
+      return { _statusCode: 400, error: 'ETL_WORKFLOW is not configured' };
+    }
+
+    const r2 = new R2BucketService({ env, bucketType: 'catalog' });
+    const head = await r2.head(objectKey);
+    if (!head) {
+      return { _statusCode: 404, error: `R2 source not found at ${objectKey}` };
+    }
+
+    // ETag fail-closed: if we have a stored etag and the live etag has
+    // drifted, refuse unless the operator explicitly forces. This is the
+    // guard that stops a scraper overwrite from being silently re-applied
+    // to an old (source, filename) under the wrong audit record.
+    if (!force && original.sourceEtag !== null && original.sourceEtag !== head.etag) {
+      return {
+        _statusCode: 409,
+        error:
+          `R2 source etag has drifted (stored=${original.sourceEtag}, ` +
+          `live=${head.etag}). Pass ?force=true to re-ingest the current content.`,
+        code: 'ETL_ETAG_MISMATCH',
+      };
+    }
+
+    // NOTE(review): the etag stored on the new row comes from chunkCsvForR2,
+    // not from head() above, and the two are never compared — confirm that an
+    // overwrite landing between the two calls is acceptable.
+    const {
+      etag: liveEtag,
+      lastModified: liveLastModified,
+      chunks,
+    } = await chunkCsvForR2({ r2, objectKey });
+    const totalChunks = chunks.length;
+    const indexedChunks: ChunkSpec[] = chunks.map((c, i) => ({
+      ...c,
+      chunkIndex: i,
+      chunksTotal: totalChunks,
+    }));
+
+    // Suffix the instance ID with the mode and the new jobId so duplicate
+    // retries don't collide with the original instance or with each other.
+    const workflowInstanceId = `${original.source}-${original.filename}-${mode}-${newJobId}`;
+
+    await db.insert(etlJobs).values({
+      id: newJobId,
+      status: 'running',
+      source: original.source,
+      filename: original.filename,
+      scraperRevision: original.scraperRevision,
+      startedAt: new Date(),
+      workflowInstanceId,
+      sourceEtag: liveEtag,
+      sourceLastModified: liveLastModified,
+      supersededByJobId: originalJobId, // NOTE(review): new row points at the original — confirm direction
+      supersededAt: new Date(),
+    });
+
+    const workflowParams: CatalogEtlWorkflowParams = {
+      jobId: newJobId,
+      source: original.source,
+      scraperRevision: original.scraperRevision,
+      chunks: indexedChunks,
+    };
+
+    try {
+      await env.ETL_WORKFLOW.create({ id: workflowInstanceId, params: workflowParams });
+    } catch (enqueueErr) {
+      // Don't leave the just-inserted row stuck in 'running' with no workflow.
+      await db
+        .update(etlJobs)
+        .set({ status: 'failed', completedAt: new Date() })
+        .where(eq(etlJobs.id, newJobId));
+      throw enqueueErr;
+    }
+
+    return { success: true, newJobId, objectKey, workflowInstanceId };
+  } catch (error) {
+    console.error(`ETL ${mode} error:`, error);
+    return {
+      _statusCode: 500,
+      error: `Failed to ${mode} ETL job`,
+      code: mode === 'retry' ? 'ETL_RETRY_ERROR' : 'ETL_REPAIR_ERROR',
+    };
+  }
+}
+
 export const catalogAnalyticsRoutes = new Elysia({ prefix: '/catalog' })
   .get(
     '/overview',
@@ -409,62 +565,348 @@ export const catalogAnalyticsRoutes = new Elysia({ prefix: '/catalog' })
   )
 
   // ─── Retry a failed job ───────────────────────────────────────────────────────
+  //
+  // Re-ingests via the workflow path regardless of the original engine.
+  // Works for both legacy queue-era failures and workflow-era failures —
+  // the new instance carries chunks computed by chunkCsvForR2 so the
+  // re-ingest is row-boundary-aligned.
 
   .post(
     '/etl/:jobId/retry',
+    async ({ params, query }) => {
+      const result = await reingestJob({
+        originalJobId: params.jobId,
+        mode: 'retry',
+        force: query.force === 'true', // only the literal ?force=true enables force
+      });
+      if ('_statusCode' in result) {
+        const { _statusCode, ...body } = result;
+        return status(_statusCode, body);
+      }
+      return result;
+    },
+    {
+      params: z.object({ jobId: z.string().uuid() }),
+      query: z.object({ force: z.enum(['true', 'false']).optional() }), // z.coerce.boolean() reads "false" as true
+      response: { 200: EtlRetrySchema, ...AdminErrorResponses },
+      detail: { tags: ['Admin'], summary: 'Retry a failed ETL job via the workflow path' },
+    },
+  )
+
+  // ─── Repair-from-scratch (works on completed jobs too) ──────────────────────
+  //
+  // Same shape as retry but accepts `completed` jobs — for cases where an
+  // operator suspects the original ingest under-counted (e.g., the
+  // 2026-05-14 false-failures whose counters might be wrong even after
+  // status was correctly `completed`). Always sets superseded_by_job_id
+  // for full audit trail.
+
+  .post(
+    '/etl/:jobId/repair-from-scratch',
+    async ({ params, query }) => {
+      const result = await reingestJob({
+        originalJobId: params.jobId,
+        mode: 'repair',
+        force: query.force === 'true', // only the literal ?force=true enables force
+      });
+      if ('_statusCode' in result) {
+        const { _statusCode, ...body } = result;
+        return status(_statusCode, body);
+      }
+      return result;
+    },
+    {
+      params: z.object({ jobId: z.string().uuid() }),
+      query: z.object({ force: z.enum(['true', 'false']).optional() }), // z.coerce.boolean() reads "false" as true
+      response: { 200: EtlRetrySchema, ...AdminErrorResponses },
+      detail: {
+        tags: ['Admin'],
+        summary:
+          'Re-ingest a job from scratch via the workflow path (works on completed jobs; always supersedes)',
+      },
+    },
+  )
+
+  // ─── Reconcile a job's row count against its R2 source ───────────────────────
+  //
+  // Synchronous — counts logical CSV rows (csv-parse, not raw \n counting
+  // since quoted multi-line fields skew that) and persists the result on
+  // etl_jobs.verified_at + verified_row_count. For very large files this
+  // can be slow; an async-via-workflow path is a follow-up if needed.
+
+  .post(
+    '/etl/:jobId/reconcile',
     async ({ params }) => {
       const db = createDb();
 
       try {
-        const [original] = await db
-          .select()
-          .from(etlJobs)
-          .where(eq(etlJobs.id, params.jobId))
-          .limit(1);
-
-        if (!original) return status(404, { error: 'ETL job not found' });
-        if (original.status !== 'failed')
-          return status(409, {
-            error:
-              original.status === 'running'
-                ? 'Job is still running — wait for it to complete or reset stuck jobs first'
-                : 'Only failed jobs can be retried',
-          });
-
-        const newJobId = crypto.randomUUID();
-        const objectKey = `v2/${original.source}/${original.filename}`;
-        const env = getEnv();
+        const [job] = await db.select().from(etlJobs).where(eq(etlJobs.id, params.jobId)).limit(1);
 
-        if (!env.ETL_QUEUE) return status(400, { error: 'ETL_QUEUE is not configured' });
+        if (!job) return status(404, { error: 'ETL job not found' });
 
-        await db.insert(etlJobs).values({
-          id: newJobId,
-          status: 'running',
-          source: original.source,
-          filename: original.filename,
-          scraperRevision: original.scraperRevision,
-          startedAt: new Date(),
+        const objectKey = `v2/${job.source}/${job.filename}`;
+        const env = getEnv();
+        const r2 = new R2BucketService({ env, bucketType: 'catalog' });
+        const obj = await r2.get(objectKey);
+        if (!obj) return status(404, { error: `R2 source not found at ${objectKey}` });
+
+        const parser = parse({ relax_column_count: true, skip_empty_lines: true });
+        let totalRows = 0;
+        let isHeaderProcessed = false;
+
+        const writerPromise = (async () => {
+          const reader = obj.body.getReader();
+          const decoder = new TextDecoder();
+          try {
+            while (true) {
+              const { done, value } = await reader.read();
+              if (done) break;
+              const ok = parser.write(decoder.decode(value, { stream: true }));
+              if (!ok) {
+                await new Promise<void>((resolve) => parser.once('drain', resolve));
+              }
+            }
+          } finally {
+            reader.releaseLock();
+            parser.end();
+          }
+        })().catch((err) => {
+          parser.destroy(err instanceof Error ? err : new Error(String(err)));
+          throw err;
         });
 
         try {
-          await queueCatalogETL({ queue: env.ETL_QUEUE, chunks: [{ objectKey }], jobId: newJobId });
-        } catch (enqueueErr) {
-          await db
-            .update(etlJobs)
-            .set({ status: 'failed', completedAt: new Date() })
-            .where(eq(etlJobs.id, newJobId));
-          throw enqueueErr;
+          for await (const _record of parser) {
+            if (!isHeaderProcessed) {
+              isHeaderProcessed = true;
+              continue;
+            }
+            totalRows++;
+          }
+        } finally {
+          await writerPromise;
         }
 
-        return { success: true as const, newJobId, objectKey };
+        const expectedRowCount = totalRows;
+        const actualRowCount = job.totalProcessed;
+        const delta = actualRowCount === null ? null : expectedRowCount - actualRowCount;
+
+        await db
+          .update(etlJobs)
+          .set({
+            verifiedAt: new Date(),
+            verifiedRowCount: expectedRowCount,
+          })
+          .where(eq(etlJobs.id, params.jobId));
+
+        return {
+          success: true as const,
+          jobId: params.jobId,
+          expectedRowCount,
+          actualRowCount,
+          delta,
+        };
       } catch (error) {
-        console.error('ETL retry error:', error);
-        return status(500, { error: 'Failed to retry ETL job', code: 'ETL_RETRY_ERROR' });
+        console.error('ETL reconcile error:', error);
+        return status(500, {
+          error: 'Failed to reconcile ETL job',
+          code: 'ETL_RECONCILE_ERROR',
+        });
       }
     },
     {
       params: z.object({ jobId: z.string().uuid() }),
-      response: { 200: EtlRetrySchema, ...AdminErrorResponses },
-      detail: { tags: ['Admin'], summary: 'Retry a failed ETL job' },
+      response: { 200: EtlReconcileSchema, ...AdminErrorResponses },
+      detail: {
+        tags: ['Admin'],
+        summary: 'Count R2 source rows and persist verified_row_count on etl_jobs',
+      },
+    },
+  )
+
+  // ─── Catalog data-quality audit ────────────────────────────────────────────
+  //
+  // Per-source breakdown of catalog_items quality flags. Powers the scrapyd
+  // audit_db_catalog.py script so that scrapyd never needs DB credentials —
+  // it consumes the JSON from this endpoint and renders markdown.
+  //
+  // Flags surfaced (computed server-side from threshold constants):
+  //   decimal_bug — count of prices < $10 with 3+ decimal places
+  //   low_median — median price below $20 for a non-allowlisted source
+  //   high_null:<field> — > 30% NULL rate on a key field
+  //   bad_weight — count of weights < 1g or > 100kg
+  //   empty_name — count of empty/null names
+  //   stale — source has no completed ETL in 30+ days
+  //
+  // ?source=<name> filters to one source (faster + scoped). Omit for all sources.
+
+  .get(
+    '/etl/audit',
+    async ({ query }) => {
+      const db = createDb();
+
+      try {
+        const sourceFilter = query.source;
+
+        // Single GROUP BY query: DISTINCT ON attributes each catalog item to its most
+        // recent ingest source (via catalog_item_etl_jobs), then aggregates per source.
+        // suspicious_decimal = non-zero 3rd decimal digit (x100 non-integer, x1000 integer).
+        const rows = (await db.execute(sql`
+          WITH latest_per_item AS (
+            SELECT DISTINCT ON (cie.catalog_item_id)
+              cie.catalog_item_id,
+              j.source
+            FROM catalog_item_etl_jobs cie
+            JOIN etl_jobs j ON j.id = cie.etl_job_id
+            ORDER BY cie.catalog_item_id, cie.created_at DESC
+          ),
+          last_jobs AS (
+            SELECT DISTINCT ON (source)
+              source,
+              id AS last_id,
+              completed_at AS last_at
+            FROM etl_jobs
+            WHERE status = 'completed'
+            ORDER BY source, completed_at DESC NULLS LAST
+          )
+          SELECT
+            lpi.source,
+            COUNT(*)::int AS total_items,
+            lj.last_id,
+            lj.last_at,
+            percentile_cont(0.5) WITHIN GROUP (ORDER BY ci.price)::float AS median_price,
+            MIN(ci.price) FILTER (WHERE ci.price > 0)::float AS min_price,
+            MAX(ci.price)::float AS max_price,
+            COUNT(*) FILTER (WHERE ci.price IS NULL)::int AS null_price,
+            COUNT(*) FILTER (WHERE ci.brand IS NULL OR ci.brand = '')::int AS null_brand,
+            COUNT(*) FILTER (WHERE ci.description IS NULL OR ci.description = '')::int AS null_desc,
+            COUNT(*) FILTER (WHERE ci.weight IS NULL)::int AS null_weight,
+            COUNT(*) FILTER (
+              WHERE ci.images IS NULL OR jsonb_array_length(ci.images) = 0
+            )::int AS null_images,
+            COUNT(*) FILTER (WHERE ci.availability IS NULL)::int AS null_avail,
+            COUNT(*) FILTER (WHERE ci.name IS NULL OR ci.name = '')::int AS empty_name,
+            COUNT(*) FILTER (
+              WHERE ci.price IS NOT NULL
+                AND ci.price < 10
+                AND (ci.price * 100) <> floor(ci.price * 100)
+                AND (ci.price * 1000) = floor(ci.price * 1000)
+            )::int AS suspicious_decimal,
+            COUNT(*) FILTER (
+              WHERE ci.weight IS NOT NULL
+                AND (ci.weight < 1 OR ci.weight > 100000)
+            )::int AS suspicious_weight
+          FROM latest_per_item lpi
+          JOIN catalog_items ci ON ci.id = lpi.catalog_item_id
+          LEFT JOIN last_jobs lj ON lj.source = lpi.source
+          ${sourceFilter ? sql`WHERE lpi.source = ${sourceFilter}` : sql``}
+          GROUP BY lpi.source, lj.last_id, lj.last_at
+          ORDER BY lpi.source
+        `)) as unknown as Array<{
+          source: string;
+          total_items: number;
+          last_id: string | null;
+          last_at: Date | null;
+          median_price: number | null;
+          min_price: number | null;
+          max_price: number | null;
+          null_price: number;
+          null_brand: number;
+          null_desc: number;
+          null_weight: number;
+          null_images: number;
+          null_avail: number;
+          empty_name: number;
+          suspicious_decimal: number;
+          suspicious_weight: number;
+        }>;
+
+        const now = Date.now();
+        // Non-allowlisted sources whose median price is below $20 are flagged low_median.
+        // Allowlist matches the EXPECTED_LOW_PRICE_SOURCES constant in scrapyd's
+        // audit_r2_data.py — kept in sync manually for now.
+        const expectedLowPriceSources = new Set([
+          '3vgear',
+          'bioliteenergy',
+          'farmtofeet',
+          'kelty',
+          'darntough',
+        ]);
+        const minFillRate = 0.7;
+
+        const sources = rows.map((r) => {
+          const daysStale =
+            r.last_at !== null
+              ? Math.floor((now - new Date(r.last_at).getTime()) / (24 * 60 * 60 * 1000))
+              : null;
+          const total = r.total_items;
+          const nullRates = {
+            price: total > 0 ? r.null_price / total : 0,
+            brand: total > 0 ? r.null_brand / total : 0,
+            description: total > 0 ? r.null_desc / total : 0,
+            weight: total > 0 ? r.null_weight / total : 0,
+            images: total > 0 ? r.null_images / total : 0,
+            availability: total > 0 ? r.null_avail / total : 0,
+          };
+          const flags: string[] = [];
+          if (r.suspicious_decimal > 0) flags.push(`decimal_bug (${r.suspicious_decimal})`);
+          if (
+            r.median_price !== null &&
+            r.median_price < 20 &&
+            !expectedLowPriceSources.has(r.source)
+          ) {
+            flags.push(`low_median ($${r.median_price.toFixed(2)})`);
+          }
+          for (const [field, rate] of Object.entries(nullRates)) {
+            if (rate > 1 - minFillRate) {
+              flags.push(`high_null:${field} (${Math.round(rate * 100)}%)`);
+            }
+          }
+          if (r.suspicious_weight > 0) flags.push(`bad_weight (${r.suspicious_weight})`);
+          if (r.empty_name > 0) flags.push(`empty_name (${r.empty_name})`);
+          if (daysStale !== null && daysStale > 30) flags.push(`stale (${daysStale}d)`);
+
+          return {
+            source: r.source,
+            totalItems: total,
+            lastEtlId: r.last_id,
+            lastEtlAt: r.last_at ? new Date(r.last_at).toISOString() : null,
+            daysStale,
+            medianPrice: r.median_price,
+            minPrice: r.min_price,
+            maxPrice: r.max_price,
+            nullRates,
+            suspiciousDecimalCount: r.suspicious_decimal,
+            suspiciousWeightCount: r.suspicious_weight,
+            emptyNameCount: r.empty_name,
+            flags,
+          };
+        });
+
+        return {
+          generatedAt: new Date().toISOString(),
+          thresholds: {
+            decimalBugPriceThreshold: 10,
+            lowMedianPriceThreshold: 20,
+            minFillRate,
+            staleDaysThreshold: 30,
+            weightTooLightGrams: 1,
+            weightTooHeavyGrams: 100000,
+          },
+          sources,
+        };
+      } catch (error) {
+        console.error('Catalog audit error:', error);
+        return status(500, { error: 'Failed to generate catalog audit', code: 'AUDIT_ERROR' });
+      }
+    },
+    {
+      query: z.object({ source: z.string().optional() }),
+      response: { 200: CatalogAuditSchema, ...AdminErrorResponses },
+      detail: {
+        tags: ['Admin'],
+        summary:
+          'Per-source catalog_items data-quality audit (decimal bugs, NULL rates, staleness)',
+      },
     },
   );
diff --git a/packages/api/src/routes/catalog/index.ts b/packages/api/src/routes/catalog/index.ts
index 442a43fb95..bbcaed100c 100644
--- a/packages/api/src/routes/catalog/index.ts
+++ b/packages/api/src/routes/catalog/index.ts
@@ -6,6 +6,8 @@ import { queueCatalogETL } from '@packrat/api/services/etl/queue';
 import { R2BucketService } from '@packrat/api/services/r2-bucket';
 import { getEmbeddingText } from '@packrat/api/utils/embeddingHelper';
 import { getEnv } from '@packrat/api/utils/env-validation';
+import type { CatalogEtlWorkflowParams } from '@packrat/api/workflows/catalog-etl-workflow';
+import { type ChunkSpec, chunkCsvForR2 } from '@packrat/api/workflows/shared/chunkCsvForR2';
 import { catalogItems, etlJobs, packItems } from '@packrat/db';
 import { isString } from '@packrat/guards';
 import {
@@ -225,19 +227,107 @@ export const catalogRoutes = new Elysia({ prefix: '/catalog' })
     },
   )
 
-  // -- ETL queue (api-key auth)
+  // -- ETL trigger (api-key auth)
+  //
+  // Default engine is 'workflow' — triggers a CatalogEtlWorkflow instance
+  // per source file. The 'queue' engine routes to the legacy queue path and
+  // remains available during the coexistence window so operators can fall
+  // back if the workflow path misbehaves in production. The queue path will
+  // be removed after the workflow path bakes (per the migration plan).
   .post(
     '/etl',
-    async ({ body }) => {
+    async ({ body, query }) => {
       const { filename, chunks, source, scraperRevision } = body;
+      const engine = query.engine ?? 'workflow';
       const db = createDb();
       const env = getEnv();
+      const jobId = crypto.randomUUID();
+
+      if (engine === 'queue') {
+        if (!env.ETL_QUEUE) {
+          return status(400, { message: 'ETL_QUEUE is not configured' });
+        }
+
+        await db.insert(etlJobs).values({
+          id: jobId,
+          status: 'running',
+          source,
+          filename,
+          scraperRevision,
+          startedAt: new Date(),
+        });
+
+        const CHUNK_BYTES = 20 * 1024 * 1024;
+        const r2 = new R2BucketService({ env, bucketType: 'catalog' });
+        const queueChunks: Array<{
+          objectKey: string;
+          byteStart?: number;
+          byteEnd?: number;
+        }> = [];
+
+        for (const objectKey of chunks) {
+          const meta = await r2.head(objectKey);
+          if (!meta || meta.size <= CHUNK_BYTES) {
+            queueChunks.push({ objectKey });
+          } else {
+            const n = Math.ceil(meta.size / CHUNK_BYTES);
+            for (let i = 0; i < n; i++) {
+              queueChunks.push({
+                objectKey,
+                byteStart: i * CHUNK_BYTES,
+                byteEnd: Math.min((i + 1) * CHUNK_BYTES - 1, meta.size - 1),
+              });
+            }
+          }
+        }
 
-      if (!env.ETL_QUEUE) {
-        return status(400, { message: 'ETL_QUEUE is not configured' });
+        await queueCatalogETL({
+          queue: env.ETL_QUEUE,
+          chunks: queueChunks,
+          jobId,
+        });
+
+        return {
+          message: 'Catalog ETL job queued successfully (legacy queue path)',
+          jobId,
+          engine: 'queue' as const,
+        };
       }
 
-      const jobId = crypto.randomUUID();
+      // Workflow path (default).
+      if (!env.ETL_WORKFLOW) {
+        return status(400, { message: 'ETL_WORKFLOW is not configured' });
+      }
+
+      const r2 = new R2BucketService({ env, bucketType: 'catalog' });
+
+      // Chunk every source object up front so the workflow params carry the
+      // full plan. Single-file is the dominant case in prod (scrapers
+      // produce one CSV per run); multi-object requests bundle into one
+      // workflow instance. ETag from the first object is captured for the
+      // repair-from-scratch fail-closed verification (U5 follow-up).
+      const allChunks: ChunkSpec[] = [];
+      let firstEtag: string | null = null;
+      let firstLastModified: Date | null = null;
+      for (const objectKey of chunks) {
+        const { etag, lastModified, chunks: chunkSpecs } = await chunkCsvForR2({ r2, objectKey });
+        if (firstEtag === null) {
+          firstEtag = etag;
+          firstLastModified = lastModified;
+        }
+        allChunks.push(...chunkSpecs);
+      }
+
+      // Re-index chunkIndex / chunksTotal across the combined chunk array so
+      // step names in the workflow are globally unique within an instance.
+      const totalChunks = allChunks.length;
+      const indexedChunks: ChunkSpec[] = allChunks.map((c, i) => ({
+        ...c,
+        chunkIndex: i,
+        chunksTotal: totalChunks,
+      }));
+
+      const instanceId = `${source}-${filename}`;
 
       await db.insert(etlJobs).values({
         id: jobId,
@@ -246,48 +336,44 @@ export const catalogRoutes = new Elysia({ prefix: '/catalog' })
         filename,
         scraperRevision,
         startedAt: new Date(),
+        workflowInstanceId: instanceId,
+        sourceEtag: firstEtag,
+        sourceLastModified: firstLastModified,
       });
 
-      // Split large files into 20 MB byte-range chunks so each Worker
-      // invocation stays within the CPU time budget (~30k rows / chunk).
-      const CHUNK_BYTES = 20 * 1024 * 1024;
-      const r2 = new R2BucketService({ env, bucketType: 'catalog' });
-      const queueChunks: Array<{ objectKey: string; byteStart?: number; byteEnd?: number }> = [];
+      const params: CatalogEtlWorkflowParams = {
+        jobId,
+        source,
+        scraperRevision,
+        chunks: indexedChunks,
+      };
 
-      for (const objectKey of chunks) {
-        const meta = await r2.head(objectKey);
-        if (!meta || meta.size <= CHUNK_BYTES) {
-          queueChunks.push({ objectKey });
-        } else {
-          const n = Math.ceil(meta.size / CHUNK_BYTES);
-          for (let i = 0; i < n; i++) {
-            queueChunks.push({
-              objectKey,
-              byteStart: i * CHUNK_BYTES,
-              byteEnd: Math.min((i + 1) * CHUNK_BYTES - 1, meta.size - 1),
-            });
-          }
-        }
+      try {
+        await env.ETL_WORKFLOW.create({ id: instanceId, params });
+      } catch (err) {
+        await db
+          .update(etlJobs)
+          .set({ status: 'failed', completedAt: new Date() })
+          .where(eq(etlJobs.id, jobId));
+        throw err;
       }
 
-      await queueCatalogETL({
-        queue: env.ETL_QUEUE,
-        chunks: queueChunks,
-        jobId,
-      });
-
       return {
-        message: 'Catalog ETL job queued successfully',
+        message: 'Catalog ETL workflow triggered',
         jobId,
-        queued: true,
+        engine: 'workflow' as const,
+        workflowInstanceId: instanceId,
       };
     },
     {
       body: CatalogETLSchema,
+      query: z.object({
+        engine: z.enum(['workflow', 'queue']).optional(),
+      }),
       isValidApiKey: true,
       detail: {
         tags: ['Catalog'],
-        summary: 'Queue catalog ETL job from R2 CSV chunk files',
+        summary: 'Trigger catalog ETL ingest (Workflow by default; ?engine=queue for legacy path)',
       },
     },
   )
diff --git a/packages/api/src/services/etl/CatalogItemValidator.ts b/packages/api/src/services/etl/CatalogItemValidator.ts
index b700d74120..11af59f9d1 100644
--- a/packages/api/src/services/etl/CatalogItemValidator.ts
+++ b/packages/api/src/services/etl/CatalogItemValidator.ts
@@ -3,6 +3,28 @@ import type { NewCatalogItem } from '@packrat/db';
 import { isNumber, isString } from '@packrat/guards';
 import type { ValidationError } from '@packrat/schemas/validation';
 
+// Hostname patterns rejected by validateUrl to close the SSRF surface — any
+// future server-side fetch of a catalog URL (OG-tag generation, preview
+// rendering, etc.) cannot be tricked into hitting internal infrastructure.
+// String-level check only; no DNS resolution (which is itself an SSRF vector).
+// Covers 0.0.0.0/8, `::`, all of fc00::/7 (fc../fd..) and fe80::/10 (fe8.–feb.).
+// IPv6 hostnames are bracket-stripped before matching (URL.hostname gives `[::1]`).
+const PRIVATE_HOSTNAME_PATTERN =
+  /^(?:localhost|127\.|10\.|0\.|192\.168\.|172\.(?:1[6-9]|2\d|3[01])\.|169\.254\.|::1?$|f[cd][0-9a-f]{2}:|fe[89ab][0-9a-f]:)/i;
+
+// Length caps — chosen to accommodate the widest real-world catalog rows while
+// preventing a scraper bug or supply-chain compromise from saturating the
+// catalog with multi-MB blobs.
+const URL_MAX_LENGTH = 2048;
+const NAME_MAX_LENGTH = 500;
+const DESCRIPTION_MAX_LENGTH = 50_000;
+const BRAND_MAX_LENGTH = 200;
+const CATEGORY_MAX_LENGTH = 200;
+const SKU_MAX_LENGTH = 200;
+
+const SKU_PATTERN = /^[A-Za-z0-9_./-]+$/;
+const IPV6_BRACKET_PATTERN = /^\[(.+)\]$/;
+
 export class CatalogItemValidator {
   validateItem(item: Partial<NewCatalogItem>): ValidatedCatalogItem {
     const errors: ValidationError[] = [];
@@ -14,6 +36,12 @@ export class CatalogItemValidator {
         reason: 'Name is required and must be a non-empty string',
         value: item.name,
       });
+    } else if (item.name.length > NAME_MAX_LENGTH) {
+      errors.push({
+        field: 'name',
+        reason: `Name exceeds maximum length (${NAME_MAX_LENGTH} chars)`,
+        value: item.name,
+      });
     }
 
     if (!item.sku || !isString(item.sku) || item.sku.trim().length === 0) {
@@ -22,6 +50,18 @@ export class CatalogItemValidator {
         reason: 'SKU is required and must be a non-empty string',
         value: item.sku,
       });
+    } else if (item.sku.length > SKU_MAX_LENGTH) {
+      errors.push({
+        field: 'sku',
+        reason: `SKU exceeds maximum length (${SKU_MAX_LENGTH} chars)`,
+        value: item.sku,
+      });
+    } else if (!SKU_PATTERN.test(item.sku)) {
+      errors.push({
+        field: 'sku',
+        reason: 'SKU contains invalid characters (allowed: A-Z a-z 0-9 _ . / -)',
+        value: item.sku,
+      });
     }
 
     if (!item.productUrl || !isString(item.productUrl) || item.productUrl.trim().length === 0) {
@@ -30,17 +70,53 @@ export class CatalogItemValidator {
         reason: 'Product URL is required and must be a non-empty string',
         value: item.productUrl,
       });
+    } else if (item.productUrl.length > URL_MAX_LENGTH) {
+      errors.push({
+        field: 'productUrl',
+        reason: `Product URL exceeds maximum length (${URL_MAX_LENGTH} chars)`,
+        value: item.productUrl,
+      });
+    } else {
+      const urlError = this.validateUrl(item.productUrl);
+      if (urlError) {
+        errors.push({ field: 'productUrl', reason: urlError, value: item.productUrl });
+      }
     }
 
     // Additional validations
     // Note: weight and weightUnit are intentionally not required — clothing/footwear brands often
     // omit weight data. Items without weight are ingested but won't appear in weight comparisons.
-    if (item.productUrl && !this.isValidUrl(item.productUrl)) {
-      errors.push({
-        field: 'productUrl',
-        reason: 'Product URL must be a valid URL format',
-        value: item.productUrl,
-      });
+    if (item.description !== undefined && item.description !== null) {
+      if (isString(item.description) && item.description.length > DESCRIPTION_MAX_LENGTH) {
+        errors.push({
+          field: 'description',
+          reason: `Description exceeds maximum length (${DESCRIPTION_MAX_LENGTH} chars)`,
+          value: undefined, // omit the raw value — it can be huge
+        });
+      }
+    }
+
+    if (item.brand !== undefined && item.brand !== null) {
+      if (isString(item.brand) && item.brand.length > BRAND_MAX_LENGTH) {
+        errors.push({
+          field: 'brand',
+          reason: `Brand exceeds maximum length (${BRAND_MAX_LENGTH} chars)`,
+          value: item.brand,
+        });
+      }
+    }
+
+    if (Array.isArray(item.categories)) {
+      for (const category of item.categories) {
+        if (isString(category) && category.length > CATEGORY_MAX_LENGTH) {
+          errors.push({
+            field: 'categories',
+            reason: `Category exceeds maximum length (${CATEGORY_MAX_LENGTH} chars)`,
+            value: category,
+          });
+          break; // one error is enough; don't spam
+        }
+      }
     }
 
     if (item.price !== undefined && (!isNumber(item.price) || item.price < 0)) {
@@ -58,12 +134,47 @@ export class CatalogItemValidator {
     };
   }
 
-  private isValidUrl(url: string): boolean {
+  /**
+   * Returns null when the URL is acceptable; otherwise a reason string.
+   *
+   * Rejects:
+   * - Non-http(s) schemes (javascript:, mailto:, data:, file:, etc.)
+   * - Private/loopback/link-local hostnames (SSRF surface for any future
+   *   server-side fetch)
+   * - Hostnames containing non-ASCII characters that survive punycode
+   *   round-tripping (IDN homograph attack surface for the user-facing
+   *   catalog UI)
+   */
+  private validateUrl(url: string): string | null {
+    let parsed: URL;
     try {
-      new URL(url);
-      return true;
+      parsed = new URL(url);
     } catch {
-      return false;
+      return 'Product URL must be a valid URL format';
+    }
+
+    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
+      return `Product URL scheme must be http: or https: (got ${parsed.protocol})`;
+    }
+
+    // Strip IPv6 brackets so `[::1]` matches the IPv6 patterns and not the
+    // bracketed-string fallback.
+    const hostname = parsed.hostname.replace(IPV6_BRACKET_PATTERN, '$1');
+    if (PRIVATE_HOSTNAME_PATTERN.test(hostname)) {
+      return 'Product URL hostname must not be a private/loopback/link-local address';
+    }
+
+    // Hostnames with non-ASCII characters are IDN homograph candidates.
+    // Native URL parsing already encodes them to punycode in parsed.hostname,
+    // so non-ASCII presence here means the hostname survived encoding (rare)
+    // OR the URL was malformed in a way `new URL()` accepted. Either way,
+    // reject as a defense-in-depth measure for catalog-rendered links.
+    for (const ch of parsed.hostname) {
+      if (ch.charCodeAt(0) > 127) {
+        return 'Product URL hostname contains non-ASCII characters (IDN homograph guard)';
+      }
     }
+
+    return null;
   }
 }
diff --git a/packages/api/src/services/etl/__tests__/CatalogItemValidator.test.ts b/packages/api/src/services/etl/__tests__/CatalogItemValidator.test.ts
new file mode 100644
index 0000000000..df79a5e776
--- /dev/null
+++ b/packages/api/src/services/etl/__tests__/CatalogItemValidator.test.ts
@@ -0,0 +1,159 @@
+// Validator hardening tests — closes audit P3 #2 (the user-facing catalog
+// rendered any URL that new URL() accepted, including javascript: and
+// private IPs). These tests pin the new scheme / hostname / length
+// constraints so the attack surface cannot regress.
+
+import { CatalogItemValidator } from '@packrat/api/services/etl/CatalogItemValidator';
+import { describe, expect, it } from 'vitest';
+
+const baseItem = {
+  name: 'Test Item',
+  sku: 'SKU-1',
+  productUrl: 'https://example.com/product/1',
+};
+
+function reasonsFor(field: string, errors: { field: string; reason: string }[]): string[] {
+  return errors.flatMap((entry) => (entry.field === field ? [entry.reason] : []));
+}
+
+describe('CatalogItemValidator', () => {
+  const v = new CatalogItemValidator();
+
+  describe('URL scheme', () => {
+    it('accepts http and https URLs', () => {
+      const httpsOk = v.validateItem({ ...baseItem, productUrl: 'https://example.com/x' });
+      expect(httpsOk.isValid).toBe(true);
+
+      const httpOk = v.validateItem({ ...baseItem, productUrl: 'http://example.com/x' });
+      expect(httpOk.isValid).toBe(true);
+    });
+
+    it('rejects javascript:, mailto:, data:, file: URLs', () => {
+      for (const url of [
+        'javascript:alert(1)',
+        'mailto:foo@bar',
+        'data:text/html,x',
+        'file:///etc/passwd',
+      ]) {
+        const result = v.validateItem({ ...baseItem, productUrl: url });
+        expect(result.isValid).toBe(false);
+        expect(reasonsFor('productUrl', result.errors).join(' ')).toMatch(/scheme/i);
+      }
+    });
+  });
+
+  describe('URL hostname (SSRF guard)', () => {
+    it('rejects loopback hostnames', () => {
+      for (const url of [
+        'http://localhost/x',
+        'http://127.0.0.1/x',
+        'http://127.5.5.5/x',
+        'http://[::1]/x',
+      ]) {
+        const result = v.validateItem({ ...baseItem, productUrl: url });
+        expect(result.isValid).toBe(false);
+        expect(reasonsFor('productUrl', result.errors).join(' ')).toMatch(
+          /private|loopback|link-local/i,
+        );
+      }
+    });
+
+    it('rejects RFC-1918 private ranges', () => {
+      for (const url of [
+        'http://10.0.0.1/x',
+        'http://10.255.255.255/x',
+        'http://192.168.1.1/x',
+        'http://172.16.0.1/x',
+        'http://172.31.255.255/x',
+      ]) {
+        const result = v.validateItem({ ...baseItem, productUrl: url });
+        expect(result.isValid).toBe(false);
+        expect(reasonsFor('productUrl', result.errors).join(' ')).toMatch(
+          /private|loopback|link-local/i,
+        );
+      }
+    });
+
+    it('rejects link-local 169.254/16', () => {
+      const result = v.validateItem({ ...baseItem, productUrl: 'http://169.254.169.254/latest' });
+      expect(result.isValid).toBe(false);
+      expect(reasonsFor('productUrl', result.errors).join(' ')).toMatch(
+        /private|loopback|link-local/i,
+      );
+    });
+
+    it('allows 172.15 and 172.32 (outside the private 16-31 range)', () => {
+      const ok1 = v.validateItem({ ...baseItem, productUrl: 'http://172.15.0.1/x' });
+      expect(ok1.isValid).toBe(true);
+      const ok2 = v.validateItem({ ...baseItem, productUrl: 'http://172.32.0.1/x' });
+      expect(ok2.isValid).toBe(true);
+    });
+  });
+
+  describe('URL length cap', () => {
+    it('accepts a URL at the boundary (2048 chars)', () => {
+      const path = 'a'.repeat(2048 - 'https://example.com/'.length);
+      const url = `https://example.com/${path}`;
+      expect(url.length).toBe(2048);
+      const result = v.validateItem({ ...baseItem, productUrl: url });
+      expect(result.isValid).toBe(true);
+    });
+
+    it('rejects a URL of 2049 chars', () => {
+      const path = 'a'.repeat(2049 - 'https://example.com/'.length);
+      const url = `https://example.com/${path}`;
+      const result = v.validateItem({ ...baseItem, productUrl: url });
+      expect(result.isValid).toBe(false);
+      expect(reasonsFor('productUrl', result.errors).join(' ')).toMatch(/maximum length/i);
+    });
+  });
+
+  describe('SKU charset and length', () => {
+    it('accepts conventional SKUs (letters, digits, _ . / -)', () => {
+      const sku = 'ABC_def.123/test-9';
+      const result = v.validateItem({ ...baseItem, sku });
+      expect(result.isValid).toBe(true);
+    });
+
+    it('rejects SKUs with shell metacharacters or HTML', () => {
+      for (const sku of ['<script>', 'a"b', "a'b", 'a;b', 'a&b']) {
+        const result = v.validateItem({ ...baseItem, sku });
+        expect(result.isValid).toBe(false);
+        expect(reasonsFor('sku', result.errors).join(' ')).toMatch(/invalid characters/i);
+      }
+    });
+
+    it('rejects SKUs over 200 chars', () => {
+      const result = v.validateItem({ ...baseItem, sku: 'a'.repeat(201) });
+      expect(result.isValid).toBe(false);
+      expect(reasonsFor('sku', result.errors).join(' ')).toMatch(/maximum length/i);
+    });
+  });
+
+  describe('Length caps on prose fields', () => {
+    it('accepts a 500-char name; rejects 501', () => {
+      const ok = v.validateItem({ ...baseItem, name: 'a'.repeat(500) });
+      expect(ok.isValid).toBe(true);
+      const bad = v.validateItem({ ...baseItem, name: 'a'.repeat(501) });
+      expect(bad.isValid).toBe(false);
+    });
+
+    it('rejects description over 50,000 chars', () => {
+      const result = v.validateItem({ ...baseItem, description: 'a'.repeat(50_001) });
+      expect(result.isValid).toBe(false);
+      expect(reasonsFor('description', result.errors).join(' ')).toMatch(/maximum length/i);
+    });
+
+    it('rejects brand over 200 chars', () => {
+      const result = v.validateItem({ ...baseItem, brand: 'a'.repeat(201) });
+      expect(result.isValid).toBe(false);
+      expect(reasonsFor('brand', result.errors).join(' ')).toMatch(/maximum length/i);
+    });
+
+    it('rejects an oversized category', () => {
+      const result = v.validateItem({ ...baseItem, categories: ['ok', 'a'.repeat(201)] });
+      expect(result.isValid).toBe(false);
+      expect(reasonsFor('categories', result.errors).join(' ')).toMatch(/maximum length/i);
+    });
+  });
+});
diff --git a/packages/api/src/services/etl/processLogsBatch.ts b/packages/api/src/services/etl/processLogsBatch.ts
index cfab66517a..161be95f4d 100644
--- a/packages/api/src/services/etl/processLogsBatch.ts
+++ b/packages/api/src/services/etl/processLogsBatch.ts
@@ -1,6 +1,7 @@
+import { createDbClient } from '@packrat/api/db';
 import type { Env } from '@packrat/api/utils/env-validation';
+import { logger } from '@packrat/api/utils/logger';
 import { invalidItemLogs, type NewInvalidItemLog } from '@packrat/db';
-import { createDbClient } from '../../db';
 import { updateEtlJobProgress } from './updateEtlJobProgress';
 
 export async function processLogsBatch({
@@ -13,6 +14,7 @@ export async function processLogsBatch({
   env: Env;
 }): Promise<void> {
   const db = createDbClient(env);
+
   try {
     await db.insert(invalidItemLogs).values(logs);
     await updateEtlJobProgress(env, {
@@ -21,8 +23,13 @@ export async function processLogsBatch({
       processed: logs.length,
     });
 
-    console.log(`📝 Processed and wrote ${logs.length} invalid items for job ${jobId}`);
+    logger.info('etl.invalid_logs.persisted', { jobId, count: logs.length });
   } catch (error) {
-    console.error(`Failed to process log message:`, error);
+    // Rethrow — invalid_item_logs is the forensic record of what failed
+    // validation. Silently swallowing a DB write loss here means an
+    // operator chasing a data-quality complaint has no trail. Closes
+    // audit P2 #2.
+    logger.error('etl.invalid_logs.persist_failed', { jobId, count: logs.length, err: error });
+    throw error;
   }
 }
diff --git a/packages/api/src/services/etl/processValidItemsBatch.ts b/packages/api/src/services/etl/processValidItemsBatch.ts
index 354d777c10..9351eea03b 100644
--- a/packages/api/src/services/etl/processValidItemsBatch.ts
+++ b/packages/api/src/services/etl/processValidItemsBatch.ts
@@ -1,6 +1,9 @@
+import { createDbClient } from '@packrat/api/db';
 import { getEmbeddingText } from '@packrat/api/utils/embeddingHelper';
 import type { Env } from '@packrat/api/utils/env-validation';
-import type { NewCatalogItem } from '@packrat/db';
+import { logger } from '@packrat/api/utils/logger';
+import { etlJobs, type NewCatalogItem } from '@packrat/db';
+import { eq, sql } from 'drizzle-orm';
 import { CatalogService } from '../catalogService';
 import { generateManyEmbeddings } from '../embeddingService';
 import { mergeItemsBySku } from './mergeItemsBySku';
@@ -50,8 +53,16 @@ export async function processValidItemsBatch({
       processed: items.length,
     });
   } catch (error) {
-    console.error(`Error generating embeddings for batch ${jobId}:`, error);
-    // Fall back to processing without embeddings
+    // Embedding-fallback path. The upsert still happens (catalog gets the
+    // items minus their vectors), but we record the degradation on
+    // etl_jobs.total_embedding_failures so operators see the count via
+    // the admin endpoint without trawling logs. Closes audit P2 #3.
+    logger.warn('etl.embedding.fallback', {
+      jobId,
+      skuCount: items.length,
+      errorName: error instanceof Error ? error.name : 'unknown',
+    });
+
     const upsertedItems = await catalogService.upsertCatalogItems(mergedItems);
     await catalogService.trackEtlJob(upsertedItems, jobId);
     await updateEtlJobProgress(env, {
@@ -59,7 +70,15 @@ export async function processValidItemsBatch({
       valid: items.length,
       processed: items.length,
     });
+
+    const db = createDbClient(env);
+    await db
+      .update(etlJobs)
+      .set({
+        totalEmbeddingFailures: sql`COALESCE(${etlJobs.totalEmbeddingFailures}, 0) + ${items.length}`,
+      })
+      .where(eq(etlJobs.id, jobId));
   } finally {
-    console.log(`📦 Batch ${jobId}: Processed ${items.length} valid items`);
+    logger.info('etl.valid_items.batch_complete', { jobId, count: items.length });
   }
 }
diff --git a/packages/api/src/services/retention/__tests__/invalidLogRetention.test.ts b/packages/api/src/services/retention/__tests__/invalidLogRetention.test.ts
new file mode 100644
index 0000000000..5011e2d014
--- /dev/null
+++ b/packages/api/src/services/retention/__tests__/invalidLogRetention.test.ts
@@ -0,0 +1,90 @@
+// Unit tests for the invalid_item_logs retention sweep.
+//
+// The function's behavior with real DB rows is covered by integration tests
+// (test/etl-log-retention.test.ts). These unit tests stub createDbClient to
+// verify the loop semantics — stop on empty batch, iteration cap, and the
+// returned RetentionResult shape — without touching Postgres.
+
+import { sweepInvalidItemLogs } from '@packrat/api/services/retention/invalidLogRetention';
+import type { Env } from '@packrat/api/utils/env-validation';
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+type FakeRow = { id: number };
+
+// Hoisted state shared between the mock factory and the tests. `vi.mock` calls
+// hoist above imports, so this declaration uses `vi.hoisted` to ensure the
+// mock factory and the test code reference the same state object.
+const mockState = vi.hoisted(() => ({
+  batches: [] as FakeRow[][],
+  callCount: 0,
+}));
+
+vi.mock('@packrat/api/db', () => {
+  const mockDb = {
+    select: () => ({ from: () => ({ where: () => ({ limit: () => mockState }) }) }),
+    delete: () => ({
+      where: () => ({
+        returning: async () => {
+          const batch = mockState.batches[mockState.callCount] ?? [];
+          mockState.callCount += 1;
+          return batch;
+        },
+      }),
+    }),
+  };
+  return { createDbClient: () => mockDb };
+});
+
+// Load the scripted DELETE results and rewind the mock's batch cursor.
+function setBatches(batches: FakeRow[][]) {
+  Object.assign(mockState, { batches, callCount: 0 });
+}
+
+describe('sweepInvalidItemLogs', () => {
+  afterEach(() => {
+    setBatches([]);
+  });
+
+  it('returns deleted=0 / iterations=1 when the first batch is empty', async () => {
+    setBatches([[]]);
+    const result = await sweepInvalidItemLogs({} as Env);
+    expect(result.deleted).toBe(0);
+    expect(result.iterations).toBe(1);
+    expect(result.capped).toBe(false);
+    expect(result.retentionDays).toBe(90);
+  });
+
+  it('accumulates deletions across batches until an empty one stops the loop', async () => {
+    const fullBatch: FakeRow[] = Array.from({ length: 10_000 }, () => ({ id: 1 }));
+    setBatches([fullBatch, fullBatch, [{ id: 1 }], []]);
+
+    const result = await sweepInvalidItemLogs({} as Env);
+
+    expect(result.deleted).toBe(20_001);
+    expect(result.iterations).toBe(4);
+    expect(result.capped).toBe(false);
+  });
+
+  it('caps at maxIterations and reports capped=true', async () => {
+    const fullBatch: FakeRow[] = Array.from({ length: 100 }, () => ({ id: 1 }));
+    setBatches([fullBatch, fullBatch, fullBatch, fullBatch, fullBatch]);
+
+    const result = await sweepInvalidItemLogs({} as Env, { maxIterations: 3 });
+
+    expect(result.iterations).toBe(3);
+    expect(result.capped).toBe(true);
+    expect(result.deleted).toBe(300);
+  });
+
+  it('honors a custom retentionDays option', async () => {
+    setBatches([[]]);
+    const result = await sweepInvalidItemLogs({} as Env, { retentionDays: 30 });
+    expect(result.retentionDays).toBe(30);
+  });
+
+  it('falls back to the default retentionDays when the option is zero or negative', async () => {
+    setBatches([[]]);
+    const result = await sweepInvalidItemLogs({} as Env, { retentionDays: 0 });
+    expect(result.retentionDays).toBe(90);
+  });
+});
diff --git a/packages/api/src/services/retention/invalidLogRetention.ts b/packages/api/src/services/retention/invalidLogRetention.ts
new file mode 100644
index 0000000000..f3fbc1e89a
--- /dev/null
+++ b/packages/api/src/services/retention/invalidLogRetention.ts
@@ -0,0 +1,93 @@
+// Bounded-batch DELETE of expired invalid_item_logs.
+//
+// Each ETL run can produce thousands of invalid_item_logs rows. Left alone
+// the table grows without bound — a single bad scraper upload can be
+// hundreds of MB of jsonb. This sweep is the periodic cleanup.
+//
+// Why batched: a naive `DELETE FROM invalid_item_logs WHERE created_at < ...`
+// on a table that has been accumulating for months would acquire row-level
+// locks on millions of rows in a single statement, hit Neon's statement
+// timeout, and roll back having pruned nothing. The batched loop deletes
+// in 10k-row chunks and bails after a configurable max iteration count so
+// a runaway first-run can't monopolize the daily window.
+
+import { createDbClient } from '@packrat/api/db';
+import type { Env } from '@packrat/api/utils/env-validation';
+import { invalidItemLogs } from '@packrat/db';
+import { inArray, lt, sql } from 'drizzle-orm';
+
+const DEFAULT_RETENTION_DAYS = 90;
+const DEFAULT_BATCH_SIZE = 10_000;
+const DEFAULT_MAX_ITERATIONS = 100;
+
+export type RetentionResult = {
+  /** Total rows deleted across all iterations. */
+  deleted: number;
+  /** How many DELETE batches ran. */
+  iterations: number;
+  /** True if the run hit `maxIterations` before exhausting expired rows; caller should alert. */
+  capped: boolean;
+  /** Effective retention window applied. */
+  retentionDays: number;
+};
+
+export type RetentionOptions = {
+  retentionDays?: number;
+  batchSize?: number;
+  maxIterations?: number;
+};
+
+/**
+ * Delete invalid_item_logs older than the retention window in bounded batches.
+ *
+ * Default retention is 90 days. The default 100-iteration cap × 10k batch
+ * size = up to 1M rows per run. If the table has more expired rows than
+ * that on first execution, the function returns `capped: true` and the
+ * remainder is swept on subsequent runs.
+ */
+export async function sweepInvalidItemLogs(
+  env: Env,
+  options: RetentionOptions = {},
+): Promise<RetentionResult> {
+  const retentionDays =
+    options.retentionDays !== undefined && options.retentionDays > 0
+      ? options.retentionDays
+      : DEFAULT_RETENTION_DAYS;
+  const batchSize = options.batchSize ?? DEFAULT_BATCH_SIZE;
+  const maxIterations = options.maxIterations ?? DEFAULT_MAX_ITERATIONS;
+
+  const db = createDbClient(env);
+
+  let deleted = 0;
+  let iterations = 0;
+  let rowCount = 0;
+  const cutoff = sql`now() - (${retentionDays}::int * interval '1 day')`;
+
+  for (let i = 0; i < maxIterations; i++) {
+    iterations++;
+    // Subquery only — it is embedded in the DELETE below, never awaited on its own.
+    const selectExpired = db
+      .select({ id: invalidItemLogs.id })
+      .from(invalidItemLogs)
+      .where(lt(invalidItemLogs.createdAt, cutoff))
+      .limit(batchSize);
+    // Only the count is used, so return just ids instead of every column of each deleted row.
+    const removed = await db
+      .delete(invalidItemLogs)
+      .where(inArray(invalidItemLogs.id, selectExpired))
+      .returning({ id: invalidItemLogs.id });
+
+    rowCount = removed.length;
+    deleted += rowCount;
+    if (rowCount === 0) break;
+  }
+
+  return {
+    deleted,
+    iterations,
+    // capped only when we hit the iteration ceiling with rows still remaining;
+    // if the last batch returned 0 rows we exhausted the table (not capped).
+    capped: rowCount > 0,
+    retentionDays,
+  };
+}
diff --git a/packages/api/src/utils/__tests__/logger.test.ts b/packages/api/src/utils/__tests__/logger.test.ts
new file mode 100644
index 0000000000..038e929ab9
--- /dev/null
+++ b/packages/api/src/utils/__tests__/logger.test.ts
@@ -0,0 +1,98 @@
+// Unit tests for the structured logger.
+
+import { logger } from '@packrat/api/utils/logger';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+describe('logger', () => {
+  let logSpy: ReturnType<typeof vi.spyOn>;
+  let warnSpy: ReturnType<typeof vi.spyOn>;
+  let errorSpy: ReturnType<typeof vi.spyOn>;
+
+  beforeEach(() => {
+    logSpy = vi.spyOn(console, 'log').mockImplementation(() => undefined);
+    warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => undefined);
+    errorSpy = vi.spyOn(console, 'error').mockImplementation(() => undefined);
+  });
+
+  afterEach(() => {
+    logSpy.mockRestore();
+    warnSpy.mockRestore();
+    errorSpy.mockRestore();
+  });
+
+  function parseLastLine(spy: ReturnType<typeof vi.spyOn>): Record<string, unknown> {
+    const calls = spy.mock.calls;
+    const last = calls[calls.length - 1];
+    if (!last) throw new Error('expected console output but got none');
+    const arg = last[0];
+    if (typeof arg !== 'string') throw new Error('expected console arg to be a string');
+    return JSON.parse(arg);
+  }
+
+  describe('info', () => {
+    it('emits a JSON line with level=INFO and event', () => {
+      logger.info('etl.test');
+      expect(logSpy).toHaveBeenCalledOnce();
+      const line = parseLastLine(logSpy);
+      expect(line.level).toBe('INFO');
+      expect(line.event).toBe('etl.test');
+      expect(typeof line.ts).toBe('string');
+    });
+
+    it('merges ctx fields into the emitted line', () => {
+      logger.info('etl.test', { jobId: 'j1', count: 42 });
+      const line = parseLastLine(logSpy);
+      expect(line.jobId).toBe('j1');
+      expect(line.count).toBe(42);
+    });
+  });
+
+  describe('warn', () => {
+    it('emits to console.warn with level=WARN', () => {
+      logger.warn('etl.fallback', { jobId: 'j2' });
+      expect(warnSpy).toHaveBeenCalledOnce();
+      const line = parseLastLine(warnSpy);
+      expect(line.level).toBe('WARN');
+      expect(line.event).toBe('etl.fallback');
+      expect(line.jobId).toBe('j2');
+    });
+  });
+
+  describe('error', () => {
+    it('emits to console.error with level=ERROR', () => {
+      logger.error('etl.failed', { jobId: 'j3' });
+      expect(errorSpy).toHaveBeenCalledOnce();
+      const line = parseLastLine(errorSpy);
+      expect(line.level).toBe('ERROR');
+      expect(line.event).toBe('etl.failed');
+      expect(line.jobId).toBe('j3');
+    });
+
+    it('unpacks an Error attached as ctx.err into errorName / errorMessage / errorStack', () => {
+      const err = new Error('boom');
+      err.name = 'BoomError';
+      logger.error('etl.failed', { jobId: 'j4', err });
+      const line = parseLastLine(errorSpy);
+      expect(line.errorName).toBe('BoomError');
+      expect(line.errorMessage).toBe('boom');
+      expect(typeof line.errorStack).toBe('string');
+      // err should not appear as a raw field
+      expect(line.err).toBeUndefined();
+    });
+
+    it('coerces a non-Error err to a string errorMessage', () => {
+      logger.error('etl.failed', { err: 'plain string' });
+      const line = parseLastLine(errorSpy);
+      expect(line.errorMessage).toBe('plain string');
+      expect(line.errorName).toBeUndefined();
+    });
+
+    it('omits err-related fields when no err is provided', () => {
+      logger.error('etl.failed', { jobId: 'j5' });
+      const line = parseLastLine(errorSpy);
+      expect(line.errorName).toBeUndefined();
+      expect(line.errorMessage).toBeUndefined();
+      expect(line.errorStack).toBeUndefined();
+    });
+  });
+});
diff --git a/packages/api/src/utils/env-validation.ts b/packages/api/src/utils/env-validation.ts
index 8f65926c96..635f87e918 100644
--- a/packages/api/src/utils/env-validation.ts
+++ b/packages/api/src/utils/env-validation.ts
@@ -77,6 +77,7 @@ export const apiEnvSchema = z.object({
   ETL_QUEUE: z.unknown(),
   LOGS_QUEUE: z.unknown(),
   EMBEDDINGS_QUEUE: z.unknown(),
+  ETL_WORKFLOW: z.unknown(),
   // App container Durable Object binding (APP_CONTAINER)
   APP_CONTAINER: z.unknown(),
   // Rate limiting binding (optional — not present in local dev/test)
@@ -105,6 +106,7 @@ const testEnvSchema = apiEnvSchema.partial().extend({
   ETL_QUEUE: z.unknown().optional(),
   LOGS_QUEUE: z.unknown().optional(),
   EMBEDDINGS_QUEUE: z.unknown().optional(),
+  ETL_WORKFLOW: z.unknown().optional(),
   APP_CONTAINER: z.unknown().optional(),
   AUTH_KV: z.unknown().optional(),
 });
@@ -122,6 +124,7 @@ export type ValidatedEnv = Omit<
   | 'ETL_QUEUE'
   | 'LOGS_QUEUE'
   | 'EMBEDDINGS_QUEUE'
+  | 'ETL_WORKFLOW'
   | 'APP_CONTAINER'
   | 'TOKEN_RATE_LIMITER'
   | 'AUTH_KV'
@@ -134,6 +137,7 @@ export type ValidatedEnv = Omit<
   ETL_QUEUE: Queue;
   LOGS_QUEUE: Queue;
   EMBEDDINGS_QUEUE: Queue;
+  ETL_WORKFLOW: Workflow;
   APP_CONTAINER: DurableObjectNamespace<Container<unknown>>;
   TOKEN_RATE_LIMITER?: { limit(opts: { key: string }): Promise<{ success: boolean }> };
   OSM_HYPERDRIVE?: Hyperdrive;
@@ -174,6 +178,7 @@ function validate(rawEnv: Record<string, unknown>): ValidatedEnv {
     ETL_QUEUE: (rawEnv.ETL_QUEUE ?? validated.data.ETL_QUEUE) as Queue, // safe-cast: Cloudflare Worker binding injected by runtime
     LOGS_QUEUE: (rawEnv.LOGS_QUEUE ?? validated.data.LOGS_QUEUE) as Queue, // safe-cast: Cloudflare Worker binding injected by runtime
     EMBEDDINGS_QUEUE: (rawEnv.EMBEDDINGS_QUEUE ?? validated.data.EMBEDDINGS_QUEUE) as Queue, // safe-cast: Cloudflare Worker binding injected by runtime
+    ETL_WORKFLOW: (rawEnv.ETL_WORKFLOW ?? validated.data.ETL_WORKFLOW) as Workflow, // safe-cast: Cloudflare Worker binding injected by runtime
     // safe-cast: Cloudflare Worker binding injected by runtime
     APP_CONTAINER: (rawEnv.APP_CONTAINER ?? validated.data.APP_CONTAINER) as DurableObjectNamespace<
       Container<unknown>
diff --git a/packages/api/src/utils/logger.ts b/packages/api/src/utils/logger.ts
new file mode 100644
index 0000000000..766d1e5f44
--- /dev/null
+++ b/packages/api/src/utils/logger.ts
@@ -0,0 +1,127 @@
+// Thin structured-logger surface for the API worker.
+//
+// Two reasons this exists instead of bare console.log calls:
+//   1. Structured JSON lines are searchable in Workers logpush without
+//      regex parsing. A consistent { level, event, ...ctx } shape lets
+//      operators pivot on `event="etl.embedding.fallback"` in seconds.
+//   2. The emit() boundary forwards to @sentry/cloudflare when the SDK
+//      has been initialized by withSentry() in src/index.ts:
+//        - INFO/WARN → Sentry.addBreadcrumb (correlated with the next
+//          captureException if one fires)
+//        - ERROR with ctx.err → Sentry.captureException with tags from
+//          ctx (jobId, chunkIndex, workflowInstanceId, etc.)
+//        - ERROR without ctx.err → Sentry.captureMessage at error level
+//      isInitialized() returns false during unit tests or before withSentry
+//      runs, in which case Sentry calls are skipped silently.
+//
+// The error_stack contract: error messages MUST NOT include raw CSV row
+// data. Logger functions accept a structured `ctx` so callers pass jobId,
+// chunkIndex, etc. without smuggling row content into stringified errors.
+// To log an Error, attach it under the `err` key of ctx — the emit()
+// boundary unpacks it into errorName/errorMessage/errorStack fields and
+// forwards to Sentry.
+
+import { isNumber, isString } from '@packrat/guards';
+import { addBreadcrumb, captureException, captureMessage, isInitialized } from '@sentry/cloudflare';
+
+export type LogContext = Record<string, unknown> & { err?: unknown };
+
+type LogLevel = 'INFO' | 'WARN' | 'ERROR';
+
+type EmitArgs = { level: LogLevel; event: string; ctx?: LogContext };
+
+// Mirrors one log event into Sentry: ERROR → captureException/captureMessage, else breadcrumb.
+function forwardToSentry({ level, event, ctx }: EmitArgs): void {
+  // The Sentry SDK throws if accessed before withSentry has initialized the client
+  // (unit tests, cold start). Skip silently; the console line is the durable record.
+  if (!isInitialized()) return;
+
+  const sentryTags: Record<string, string> = {};
+  const sentryExtras: Record<string, unknown> = { event };
+  let err: unknown;
+  if (ctx) {
+    for (const [k, v] of Object.entries(ctx)) {
+      if (k === 'err') {
+        err = v;
+      } else if (isString(v) || isNumber(v) || v === true || v === false) {
+        sentryTags[k] = String(v);
+      } else {
+        sentryExtras[k] = v;
+      }
+    }
+  }
+
+  if (level === 'ERROR') {
+    // Tag both capture paths with the event, applied last so a ctx key cannot override it.
+    const tags = { ...sentryTags, event };
+    if (err !== undefined) {
+      captureException(err, { tags, extra: sentryExtras });
+    } else {
+      captureMessage(event, { level: 'error', tags, extra: sentryExtras });
+    }
+    return;
+  }
+
+  addBreadcrumb({
+    category: event,
+    level: level === 'WARN' ? 'warning' : 'info',
+    data: { ...sentryTags, ...sentryExtras },
+  });
+}
+
+// Writes one JSON log line to the console method for `level`, then forwards to Sentry.
+function emit({ level, event, ctx }: EmitArgs): void {
+  const ts = new Date().toISOString();
+  const line: Record<string, unknown> = { level, event, ts };
+  if (ctx) {
+    for (const [k, v] of Object.entries(ctx)) {
+      if (k === 'err') continue;
+      // A ctx key named level/event/ts must not clobber the envelope; keep it as ctx_<key>.
+      const reserved = k === 'level' || k === 'event' || k === 'ts';
+      line[reserved ? `ctx_${k}` : k] = v;
+    }
+    const err = ctx.err;
+    if (err !== undefined) {
+      if (err instanceof Error) {
+        line.errorName = err.name;
+        line.errorMessage = err.message;
+        if (err.stack) line.errorStack = err.stack;
+      } else {
+        line.errorMessage = String(err);
+      }
+    }
+  }
+  let out: string;
+  try {
+    out = JSON.stringify(line);
+  } catch {
+    out = JSON.stringify({ level, event, ts, serializationError: true });
+  }
+  if (level === 'ERROR') {
+    console.error(out);
+  } else if (level === 'WARN') {
+    console.warn(out);
+  } else {
+    console.log(out);
+  }
+
+  // Best-effort forward to Sentry; failures here must never break the call
+  // site (the JSON line is already on console).
+  try {
+    forwardToSentry({ level, event, ctx });
+  } catch {
+    // swallow — Sentry forwarding is observability, not correctness
+  }
+}
+
+// Public logging surface. Each method stamps its level on the event and
+// delegates to emit(); pass an Error under ctx.err to have it unpacked into
+// errorName / errorMessage / errorStack on the emitted line.
+export const logger = {
+  // level=INFO
+  info: (event: string, ctx?: LogContext): void => emit({ level: 'INFO', event, ctx }),
+  // level=WARN
+  warn: (event: string, ctx?: LogContext): void => emit({ level: 'WARN', event, ctx }),
+  // level=ERROR
+  error: (event: string, ctx?: LogContext): void => emit({ level: 'ERROR', event, ctx }),
+};
diff --git a/packages/api/src/workflows/catalog-etl-workflow.ts b/packages/api/src/workflows/catalog-etl-workflow.ts
new file mode 100644
index 0000000000..914239d303
--- /dev/null
+++ b/packages/api/src/workflows/catalog-etl-workflow.ts
@@ -0,0 +1,282 @@
+// Catalog ETL — runs as a Cloudflare Workflow.
+//
+// Replaces the Queues-based pipeline at packages/api/src/services/etl/queue.ts
+// + processCatalogEtl.ts. Workflows' durable step execution gives:
+//   - Per-step memoization (a successful step is never re-executed on retry)
+//   - Per-step retry policy (transient R2/DB/embedding failures retry with
+//     exponential backoff; persistent failures route the instance to errored)
+//   - Durable state between steps (no etl_job_chunks idempotency table needed)
+//   - Instance status as the source of truth for stuck-job detection (no
+//     wall-clock sweep cron needed)
+//
+// Counters on etl_jobs are written from the chunk steps (via existing
+// processValidItemsBatch / processLogsBatch which call updateEtlJobProgress).
+// On a step retry the underlying SKU upsert is idempotent (UNIQUE on
+// catalog_item_etl_jobs); embedding API calls and invalid_item_log inserts
+// can duplicate on retry — accepted trade-off for the simpler control flow.
+// The final aggregate step writes the authoritative totals from the
+// memoized step results.
+
+import { WorkflowEntrypoint, type WorkflowEvent, type WorkflowStep } from 'cloudflare:workers';
+import { createDbClient } from '@packrat/api/db';
+import { CatalogItemValidator } from '@packrat/api/services/etl/CatalogItemValidator';
+import { BATCH_SIZE } from '@packrat/api/services/etl/processCatalogEtl';
+import { processLogsBatch } from '@packrat/api/services/etl/processLogsBatch';
+import { processValidItemsBatch } from '@packrat/api/services/etl/processValidItemsBatch';
+import { R2BucketService } from '@packrat/api/services/r2-bucket';
+import { mapCsvRowToItem } from '@packrat/api/utils/csv-utils';
+import type { Env } from '@packrat/api/utils/env-validation';
+import { setWorkerEnv } from '@packrat/api/utils/env-validation';
+import { etlJobs, type NewCatalogItem, type NewInvalidItemLog } from '@packrat/db';
+import { parse } from 'csv-parse';
+import { eq } from 'drizzle-orm';
+import type { ChunkSpec } from './shared/chunkCsvForR2';
+
+/** Payload a CatalogEtlWorkflow instance receives as `event.payload`. */
+export type CatalogEtlWorkflowParams = {
+  // Id of the etl_jobs row whose totals and status the workflow updates.
+  jobId: string;
+  // NOTE(review): not read by the workflow in this file — presumably the catalog source name; confirm with the producer.
+  source: string;
+  // NOTE(review): not read by the workflow in this file — confirm intended use with the producer.
+  scraperRevision: string;
+  // Byte-range chunk specs; the workflow runs one step per entry, in order.
+  chunks: ChunkSpec[];
+};
+
+/** Per-chunk counters returned by processChunk and summed by the workflow. */
+export type ChunkResult = {
+  // Index of the chunk these counters belong to (copied from the ChunkSpec).
+  chunkIndex: number;
+  // Data rows read from the chunk; the header row is not counted.
+  rowsProcessed: number;
+  // Rows that passed validation and were handed to processValidItemsBatch.
+  rowsValid: number;
+  // Rows that failed validation and were handed to processLogsBatch.
+  rowsInvalid: number;
+};
+
+// Progressively larger byte windows (4, 16, then 64 KiB) read from offset 0
+// while searching for the newline that ends the CSV header row.
+const HEADER_PEEK_SIZES = [4 * 1024, 16 * 1024, 64 * 1024];
+
+/** Raised when no newline terminating the CSV header row can be located. */
+export class EtlHeaderError extends Error {
+  constructor(objectKey: string) {
+    const detail = `No newline found in the first 64 KiB of ${objectKey} — malformed CSV header.`;
+    super(detail);
+    this.name = 'EtlHeaderError';
+  }
+}
+
+/**
+ * Decode a byte stream as UTF-8 and yield it as text, one string per chunk
+ * read from the stream.
+ *
+ * Decoding is done in streaming mode so a multi-byte character split across
+ * two reads is reassembled instead of being corrupted. The reader lock is
+ * released when iteration finishes, fails, or is abandoned by the consumer.
+ */
+async function* streamToText(stream: ReadableStream<Uint8Array>) {
+  const reader = stream.getReader();
+  const decoder = new TextDecoder();
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      yield decoder.decode(value, { stream: true });
+    }
+    // Flush the decoder. In streaming mode it holds back the bytes of an
+    // incomplete trailing sequence; without this final call they would be
+    // dropped silently instead of surfacing as U+FFFD.
+    const tail = decoder.decode();
+    if (tail) yield tail;
+  } finally {
+    reader.releaseLock();
+  }
+}
+
+/**
+ * Read the first line of an R2 object (everything before the first '\n').
+ *
+ * Tries each window size in HEADER_PEEK_SIZES in turn, always reading from
+ * offset 0, and returns as soon as a window contains a newline. The returned
+ * text excludes the newline itself.
+ *
+ * Throws a plain Error when a range read returns null, and EtlHeaderError when
+ * no window contains a newline.
+ */
+async function fetchHeaderRow(r2: R2BucketService, objectKey: string): Promise<string> {
+  for (const peekLength of HEADER_PEEK_SIZES) {
+    const peek = await r2.get(objectKey, { range: { offset: 0, length: peekLength } });
+    if (!peek) throw new Error(`R2 header read returned null for ${objectKey}`);
+    const head = await peek.text();
+    const eol = head.indexOf('\n');
+    if (eol === -1) continue; // header longer than this window — widen and retry
+    return head.slice(0, eol);
+  }
+  throw new EtlHeaderError(objectKey);
+}
+
+/**
+ * Parse and validate the rows of one byte-range chunk of a catalog CSV, and
+ * hand them on for persistence.
+ *
+ * The chunk's bytes are streamed from R2 into a csv-parse parser by a
+ * background writer while this function consumes parsed records. Valid items
+ * are handed to processValidItemsBatch and invalid ones to processLogsBatch,
+ * each in batches of BATCH_SIZE, with any remainder flushed at the end.
+ *
+ * @returns counters for this chunk; `rowsProcessed` excludes the header row.
+ * @throws when an R2 read returns null, when the stream or parser fails, or
+ *   when a batch call rejects.
+ */
+export async function processChunk({
+  jobId,
+  chunk,
+  env,
+}: {
+  jobId: string;
+  chunk: ChunkSpec;
+  env: Env;
+}): Promise<ChunkResult> {
+  const r2 = new R2BucketService({ env, bucketType: 'catalog' });
+
+  // Only chunk 0 physically starts with the header row; for every other chunk
+  // the header is fetched separately and fed to the parser first.
+  const isNonFirstChunk = chunk.chunkIndex > 0;
+  const injectedHeader = isNonFirstChunk ? await fetchHeaderRow(r2, chunk.objectKey) : '';
+
+  // byteEnd is inclusive, hence the + 1.
+  const length = chunk.byteEnd - chunk.byteStart + 1;
+  const obj = await r2.get(chunk.objectKey, {
+    range: { offset: chunk.byteStart, length },
+  });
+  if (!obj) throw new Error(`R2 chunk read returned null for ${chunk.objectKey}`);
+
+  const validItemsBatch: Partial<NewCatalogItem>[] = [];
+  const invalidItemsBatch: NewInvalidItemLog[] = [];
+  const validator = new CatalogItemValidator();
+
+  const parser = parse({
+    relax_column_count: true,
+    skip_empty_lines: true,
+  });
+
+  // Backpressure wait that also wakes up when the parser is torn down, so the
+  // writer can never be left waiting for a 'drain' that will not come.
+  const waitForDrainOrClose = () =>
+    new Promise<void>((resolve) => {
+      const settle = () => {
+        parser.removeListener('drain', settle);
+        parser.removeListener('close', settle);
+        resolve();
+      };
+      parser.once('drain', settle);
+      parser.once('close', settle);
+    });
+
+  // The writer never rejects: a failure is recorded here and surfaced through
+  // the parser (destroy → the consuming loop throws). Rethrowing from the
+  // writer instead would leave an unhandled rejection whenever the consuming
+  // loop throws before `await writerPromise` is reached.
+  const writerState: { error?: Error } = {};
+  const writerPromise = (async () => {
+    if (injectedHeader) {
+      parser.write(`${injectedHeader}\n`);
+    }
+    for await (const text of streamToText(obj.body)) {
+      // The consumer failed or the parser errored: stop feeding it.
+      if (parser.destroyed) return;
+      const ok = parser.write(text);
+      if (!ok) {
+        await waitForDrainOrClose();
+      }
+    }
+    if (!parser.destroyed) parser.end();
+  })().catch((err) => {
+    writerState.error = err instanceof Error ? err : new Error(String(err));
+    parser.destroy(writerState.error);
+  });
+
+  // rowIndex counts data rows within THIS chunk (header excluded), so the
+  // rowIndex stored on invalid-item logs is chunk-relative.
+  let rowIndex = 0;
+  let rowsValid = 0;
+  let rowsInvalid = 0;
+  let fieldMap: Record<string, number> = {};
+  let isHeaderProcessed = false;
+
+  for await (const record of parser) {
+    // Yield to the event loop every 100 rows so the writer and other tasks
+    // get a turn during long synchronous parse runs.
+    if (rowIndex % 100 === 0) {
+      await new Promise((resolve) => setTimeout(resolve, 0));
+    }
+    const row = record as string[];
+
+    if (!isHeaderProcessed) {
+      // First record is always the header (real or injected): build the
+      // column-name → column-index lookup.
+      fieldMap = {};
+      for (const [idx, header] of row.entries()) {
+        fieldMap[header.trim()] = idx;
+      }
+      isHeaderProcessed = true;
+      continue;
+    }
+
+    const item = mapCsvRowToItem({ values: row, fieldMap });
+    if (item) {
+      const validated = validator.validateItem(item);
+      if (validated.isValid) {
+        validItemsBatch.push(validated.item);
+      } else {
+        invalidItemsBatch.push({
+          jobId,
+          errors: validated.errors,
+          rawData: validated.item,
+          rowIndex,
+        });
+      }
+    }
+
+    rowIndex++;
+
+    // Flush full batches; a copy is passed because the buffer is reset below.
+    if (validItemsBatch.length >= BATCH_SIZE) {
+      await processValidItemsBatch({ jobId, items: [...validItemsBatch], env });
+      rowsValid += validItemsBatch.length;
+      validItemsBatch.length = 0;
+    }
+    if (invalidItemsBatch.length >= BATCH_SIZE) {
+      await processLogsBatch({ jobId, logs: [...invalidItemsBatch], env });
+      rowsInvalid += invalidItemsBatch.length;
+      invalidItemsBatch.length = 0;
+    }
+  }
+
+  await writerPromise;
+  // Defensive: a writer failure normally surfaces through the loop above.
+  if (writerState.error) throw writerState.error;
+
+  if (validItemsBatch.length > 0) {
+    await processValidItemsBatch({ jobId, items: validItemsBatch, env });
+    rowsValid += validItemsBatch.length;
+  }
+  if (invalidItemsBatch.length > 0) {
+    await processLogsBatch({ jobId, logs: invalidItemsBatch, env });
+    rowsInvalid += invalidItemsBatch.length;
+  }
+
+  return {
+    chunkIndex: chunk.chunkIndex,
+    rowsProcessed: rowIndex,
+    rowsValid,
+    rowsInvalid,
+  };
+}
+
+/**
+ * Workflow entrypoint that ingests one catalog CSV as a sequence of steps:
+ * one `chunk-<index>` step per ChunkSpec, then `aggregate`, then `finalize`.
+ */
+export class CatalogEtlWorkflow extends WorkflowEntrypoint<Env, CatalogEtlWorkflowParams> {
+  /**
+   * Process every chunk in payload order, write the summed totals to the
+   * etl_jobs row, and mark it completed.
+   *
+   * On any failure the etl_jobs row is marked failed (best effort) and the
+   * original error is rethrown.
+   *
+   * @returns the job id plus the totals summed over all chunk results.
+   */
+  async run(
+    event: Readonly<WorkflowEvent<CatalogEtlWorkflowParams>>,
+    step: WorkflowStep,
+  ): Promise<{ jobId: string; rowsProcessed: number; rowsValid: number; rowsInvalid: number }> {
+    setWorkerEnv(this.env as unknown as Record<string, unknown>); // safe-cast: same shape as fetch handler
+    const { jobId, chunks } = event.payload;
+
+    // One step per chunk. Each step is memoized by name within the instance,
+    // so a chunk that succeeds is never re-run on a downstream step failure.
+    // Retries are bounded to 3 with exponential backoff for transient R2/DB
+    // failures; a chunk that exhausts retries marks the entire instance errored.
+    try {
+      const chunkResults: ChunkResult[] = [];
+      for (const chunk of chunks) {
+        const result = await step.do(
+          `chunk-${chunk.chunkIndex}`,
+          {
+            retries: { limit: 3, delay: '30 seconds', backoff: 'exponential' },
+            timeout: '5 minutes',
+          },
+          async () => processChunk({ jobId, chunk, env: this.env }),
+        );
+        chunkResults.push(result);
+      }
+
+      // Sum the per-chunk counters returned by the chunk steps.
+      const totals = chunkResults.reduce(
+        (acc, r) => ({
+          rowsProcessed: acc.rowsProcessed + r.rowsProcessed,
+          rowsValid: acc.rowsValid + r.rowsValid,
+          rowsInvalid: acc.rowsInvalid + r.rowsInvalid,
+        }),
+        { rowsProcessed: 0, rowsValid: 0, rowsInvalid: 0 },
+      );
+
+      // An empty chunk list is treated as a failure rather than a successful
+      // zero-row run: the throw is caught below and the job is marked failed.
+      if (chunks.length === 0) {
+        throw new Error(`Workflow ${jobId} received empty chunks array`);
+      }
+      // Aggregate step writes the canonical totals — any over-counts from chunk
+      // retries (the inner processValidItemsBatch increments are non-idempotent
+      // on retry) get overridden here. This is the authoritative count.
+      await step.do('aggregate', async () => {
+        const db = createDbClient(this.env);
+        await db
+          .update(etlJobs)
+          .set({
+            totalProcessed: totals.rowsProcessed,
+            totalValid: totals.rowsValid,
+            totalInvalid: totals.rowsInvalid,
+          })
+          .where(eq(etlJobs.id, jobId));
+      });
+
+      // Separate step so the status flip only happens after the totals are
+      // written.
+      await step.do('finalize', async () => {
+        const db = createDbClient(this.env);
+        await db
+          .update(etlJobs)
+          .set({ status: 'completed', completedAt: new Date() })
+          .where(eq(etlJobs.id, jobId));
+      });
+
+      return {
+        jobId,
+        rowsProcessed: totals.rowsProcessed,
+        rowsValid: totals.rowsValid,
+        rowsInvalid: totals.rowsInvalid,
+      };
+    } catch (err) {
+      // Best-effort: mark the DB row failed so operators aren't looking at a
+      // perpetually-running job. The workflow runtime also marks the instance
+      // errored, but that's only visible in the CF dashboard.
+      // Note this update runs outside any step.do, so it is not memoized.
+      try {
+        const db = createDbClient(this.env);
+        await db
+          .update(etlJobs)
+          .set({ status: 'failed', completedAt: new Date() })
+          .where(eq(etlJobs.id, jobId));
+      } catch {
+        // ignore — status update is best-effort; don't mask the original error
+      }
+      throw err;
+    }
+  }
+}
diff --git a/packages/api/src/workflows/shared/__tests__/chunk-csv-for-r2.test.ts b/packages/api/src/workflows/shared/__tests__/chunk-csv-for-r2.test.ts
new file mode 100644
index 0000000000..f11d04f603
--- /dev/null
+++ b/packages/api/src/workflows/shared/__tests__/chunk-csv-for-r2.test.ts
@@ -0,0 +1,172 @@
+// Unit tests for the row-boundary-aligned R2 chunker.
+//
+// The chunker is the load-bearing piece for catalog ETL correctness: any
+// off-by-one at a chunk boundary either drops a row, invalidates one, or
+// produces duplicates. These tests exercise the boundary alignment against
+// in-memory CSV fixtures so the failure modes the audit identified
+// (P1 #3, #4, #5) cannot regress silently.
+
+import {
+  ChunkBoundaryError,
+  type ChunkerR2,
+  chunkCsvForR2,
+} from '@packrat/api/workflows/shared/chunkCsvForR2';
+import { describe, expect, it } from 'vitest';
+
+const encoder = new TextEncoder();
+
+/**
+ * Build an in-memory stand-in for the R2 service that serves exactly one
+ * object: `text` encoded as UTF-8 and stored under `key`. Any other key
+ * yields null from both `head` and `get`. Also returns the encoded bytes so
+ * tests can inspect them directly.
+ */
+function fakeR2(text: string, key = 'fixture.csv'): { r2: ChunkerR2; bytes: Uint8Array } {
+  const bytes = encoder.encode(text);
+
+  // Metadata lookup: size/etag/uploaded for the single known key.
+  async function head(k: string) {
+    return k === key
+      ? ({
+          key,
+          size: bytes.length,
+          etag: 'fake-etag',
+          uploaded: new Date('2026-05-20T00:00:00Z'),
+        } as Awaited<ReturnType<ChunkerR2['head']>>)
+      : null;
+  }
+
+  // Ranged read: with no range the whole object is returned.
+  async function get(k: string, opts?: { range?: { offset: number; length: number } }) {
+    if (k !== key) return null;
+    const start = opts?.range?.offset ?? 0;
+    const span = opts?.range?.length ?? bytes.length - start;
+    const view = bytes.slice(start, start + span);
+    return {
+      size: view.length,
+      etag: 'fake-etag',
+      text: async () => new TextDecoder().decode(view),
+    } as Awaited<ReturnType<ChunkerR2['get']>>;
+  }
+
+  return { r2: { head, get } as unknown as ChunkerR2, bytes };
+}
+
+/**
+ * Generate a three-column CSV fixture: a fixed header line followed by
+ * `rowCount` data lines of the form `row-<i>,<rowWidth x's>,<i>`, every line
+ * terminated by '\n'.
+ */
+function makeCsv(rowCount: number, rowWidth = 50): string {
+  const lines = Array.from(
+    { length: rowCount },
+    (_unused, n) => `row-${n},${'x'.repeat(rowWidth)},${n}\n`,
+  );
+  return ['col1,col2,col3\n', ...lines].join('');
+}
+
+/**
+ * Return `value` unchanged unless it is exactly `undefined`, in which case
+ * throw an Error carrying `message`. Lets tests narrow away `undefined`.
+ */
+function expectDefined<T>(value: T | undefined, message: string): T {
+  if (value !== undefined) {
+    return value;
+  }
+  throw new Error(message);
+}
+
+// Boundary-alignment suite. Every case runs against the in-memory fakeR2
+// fixture, so no network or real bucket is involved.
+describe('chunkCsvForR2', () => {
+  it('returns a single chunk when the file is smaller than chunkBytes', async () => {
+    const csv = makeCsv(100);
+    const { r2 } = fakeR2(csv);
+    const result = await chunkCsvForR2({
+      r2,
+      objectKey: 'fixture.csv',
+      chunkBytes: 1024 * 1024,
+    });
+    expect(result.chunks).toHaveLength(1);
+    expect(result.chunks[0]).toMatchObject({
+      chunkIndex: 0,
+      chunksTotal: 1,
+      byteStart: 0,
+      byteEnd: result.size - 1,
+    });
+    expect(result.etag).toBe('fake-etag');
+  });
+
+  it('splits a larger file at newline boundaries', async () => {
+    const csv = makeCsv(1000, 50);
+    const { r2, bytes } = fakeR2(csv);
+    // Target ~3 chunks for a ~60KB file.
+    const result = await chunkCsvForR2({
+      r2,
+      objectKey: 'fixture.csv',
+      chunkBytes: Math.ceil(bytes.length / 3),
+      peekBytes: 256,
+    });
+
+    expect(result.chunks.length).toBeGreaterThanOrEqual(2);
+    const firstChunk = expectDefined(result.chunks[0], 'first chunk missing');
+    expect(firstChunk.chunkIndex).toBe(0);
+    const lastChunk = expectDefined(result.chunks.at(-1), 'last chunk missing');
+    expect(lastChunk.chunkIndex).toBe(result.chunks.length - 1);
+    expect(lastChunk.chunksTotal).toBe(result.chunks.length);
+    expect(lastChunk.byteEnd).toBe(bytes.length - 1);
+
+    // Every boundary byteEnd must be a newline; the byte immediately after
+    // must be the first byte of the next row.
+    for (let i = 0; i < result.chunks.length - 1; i++) {
+      const current = expectDefined(result.chunks[i], `chunk ${i} missing`);
+      const next = expectDefined(result.chunks[i + 1], `chunk ${i + 1} missing`);
+      const boundary = current.byteEnd;
+      expect(bytes[boundary]).toBe(0x0a); // '\n'
+      expect(next.byteStart).toBe(boundary + 1);
+    }
+  });
+
+  it('reassembles to the original byte content when chunks are concatenated', async () => {
+    const csv = makeCsv(500, 80);
+    const { r2, bytes } = fakeR2(csv);
+    const result = await chunkCsvForR2({
+      r2,
+      objectKey: 'fixture.csv',
+      chunkBytes: Math.ceil(bytes.length / 4),
+      peekBytes: 256,
+    });
+
+    // The chunks together must cover bytes [0, size-1] with no gaps or overlap.
+    // (Checked via the byte ranges; the chunk contents are not re-read.)
+    let cursor = 0;
+    for (const chunk of result.chunks) {
+      expect(chunk.byteStart).toBe(cursor);
+      expect(chunk.byteEnd).toBeGreaterThanOrEqual(chunk.byteStart);
+      cursor = chunk.byteEnd + 1;
+    }
+    expect(cursor).toBe(bytes.length);
+  });
+
+  it('throws for an empty R2 object (0 bytes)', async () => {
+    const { r2 } = fakeR2('');
+    await expect(chunkCsvForR2({ r2, objectKey: 'fixture.csv' })).rejects.toThrow(
+      'empty (0 bytes)',
+    );
+  });
+
+  it('throws ChunkBoundaryError when no newline is found in the peek window', async () => {
+    // A single very long row with no internal newlines forces peekBytes=256
+    // to scan a tail with no \n at all.
+    const longRow = 'x'.repeat(8 * 1024);
+    const csv = `col1\n${longRow}\n`;
+    const { r2 } = fakeR2(csv);
+
+    await expect(
+      chunkCsvForR2({ r2, objectKey: 'fixture.csv', chunkBytes: 2048, peekBytes: 256 }),
+    ).rejects.toBeInstanceOf(ChunkBoundaryError);
+  });
+
+  it('preserves a CSV row at the boundary — first row of chunk N+1 is intact', async () => {
+    const csv = makeCsv(200, 40);
+    const { r2, bytes } = fakeR2(csv);
+    const result = await chunkCsvForR2({
+      r2,
+      objectKey: 'fixture.csv',
+      chunkBytes: Math.ceil(bytes.length / 3),
+      peekBytes: 256,
+    });
+
+    // Reference rows: split the whole fixture on '\n' and drop the empty
+    // string left after the trailing newline.
+    const text = new TextDecoder().decode(bytes);
+    const allRows = text.split('\n').filter((line) => line.length > 0);
+    const headerRow = expectDefined(allRows[0], 'fixture has no header');
+    const dataRows = allRows.slice(1);
+
+    // For each non-first chunk, the bytes at byteStart..next-newline should be
+    // a complete data row (matches one of dataRows verbatim).
+    for (let i = 1; i < result.chunks.length; i++) {
+      const chunk = expectDefined(result.chunks[i], `chunk ${i} missing`);
+      const slice = new TextDecoder().decode(bytes.slice(chunk.byteStart, chunk.byteEnd + 1));
+      const firstRow = expectDefined(slice.split('\n')[0], `chunk ${i} has no first row`);
+      expect(firstRow.startsWith('row-')).toBe(true);
+      expect(dataRows).toContain(firstRow);
+      // The header must NOT appear inside a non-first chunk.
+      expect(slice).not.toContain(headerRow);
+    }
+  });
+});
diff --git a/packages/api/src/workflows/shared/chunkCsvForR2.ts b/packages/api/src/workflows/shared/chunkCsvForR2.ts
new file mode 100644
index 0000000000..2ba613611e
--- /dev/null
+++ b/packages/api/src/workflows/shared/chunkCsvForR2.ts
@@ -0,0 +1,156 @@
+// Row-boundary-aligned byte-range chunking for catalog source CSVs in R2.
+//
+// The producer endpoint and the admin retry/repair endpoints both need the
+// same chunk spec. Each non-final chunk ends on a newline byte and the next
+// chunk starts on the byte after it, so a chunk never splits a CSV row in
+// half (closes audit P1 #4 and P1 #5).
+// Peek reads are issued in parallel to keep the producer's CPU budget under
+// control on multi-GB files (closes the deepening pass concern about
+// sequential peek latency).
+
+import type { R2BucketService } from '@packrat/api/services/r2-bucket';
+
+/** One contiguous byte range of an R2 object, processed as a unit. */
+export type ChunkSpec = {
+  // Key of the R2 object this range belongs to.
+  objectKey: string;
+  // Zero-based position of this chunk within the object's chunk list.
+  chunkIndex: number;
+  // Total number of chunks planned for the object.
+  chunksTotal: number;
+  // Offset of the first byte of the chunk.
+  byteStart: number;
+  /** Inclusive end byte, matching R2 / S3 `Range: bytes=offset-end` semantics. */
+  byteEnd: number;
+};
+
+/** Chunk plan for one R2 object plus the object metadata read when planning. */
+export type ChunkCsvResult = {
+  // `etag` from the head() metadata.
+  etag: string;
+  // `uploaded` from the head() metadata.
+  lastModified: Date;
+  // Object size in bytes from the head() metadata.
+  size: number;
+  // Chunks in ascending byte order, covering the object with no gaps.
+  chunks: ChunkSpec[];
+};
+
+// The only R2BucketService methods the chunker calls (`head` and `get`);
+// narrowing the dependency keeps it easy to substitute in tests.
+export type ChunkerR2 = Pick<R2BucketService, 'head' | 'get'>;
+
+// Target chunk size when the caller does not pass `chunkBytes`.
+const DEFAULT_CHUNK_BYTES = 20 * 1024 * 1024; // 20 MiB
+// Size of the window searched for a newline before each target boundary when
+// the caller does not pass `peekBytes`.
+const DEFAULT_PEEK_BYTES = 64 * 1024; // 64 KiB
+
+/**
+ * Raised when a peek window contains no newline, so no row-aligned chunk
+ * boundary can be placed there. The message reports the window length and
+ * its end offset.
+ */
+export class ChunkBoundaryError extends Error {
+  constructor(objectKey: string, byteRange: { from: number; to: number }) {
+    const { from, to } = byteRange;
+    const scanned = to - from;
+    super(
+      `No newline found in ${scanned} bytes ending at ${to} ` +
+        `of ${objectKey} — row larger than the peek window or file is not line-oriented.`,
+    );
+    this.name = 'ChunkBoundaryError';
+  }
+}
+
+/**
+ * Plan the byte-range chunks for one R2 object.
+ *
+ * For files no larger than `chunkBytes`, returns a single chunk spanning the
+ * whole object. For larger files, splits into N chunks whose boundaries are
+ * aligned to newlines via parallel peek reads of the tail of each window.
+ * Every non-final chunk ends on a newline byte; the next chunk starts on the
+ * byte after it.
+ *
+ * Throws ChunkBoundaryError if no newline is found within `peekBytes` of any
+ * proposed boundary — caller should treat this as fatal (the source file is
+ * malformed or has a row wider than 64 KiB, both of which warrant a loud
+ * failure rather than silent row drops).
+ *
+ * Also throws RangeError when `chunkBytes` or `peekBytes` is not a positive
+ * integer, and a plain Error when the object is missing, empty, or a peek
+ * read returns null.
+ */
+export async function chunkCsvForR2({
+  r2,
+  objectKey,
+  chunkBytes = DEFAULT_CHUNK_BYTES,
+  peekBytes = DEFAULT_PEEK_BYTES,
+}: {
+  r2: ChunkerR2;
+  objectKey: string;
+  chunkBytes?: number;
+  peekBytes?: number;
+}): Promise<ChunkCsvResult> {
+  // A zero, negative or fractional size would make the boundary count below
+  // infinite/negative or yield fractional byte offsets, so reject it up front.
+  if (!Number.isInteger(chunkBytes) || chunkBytes <= 0) {
+    throw new RangeError(`chunkBytes must be a positive integer, got ${chunkBytes}`);
+  }
+  if (!Number.isInteger(peekBytes) || peekBytes <= 0) {
+    throw new RangeError(`peekBytes must be a positive integer, got ${peekBytes}`);
+  }
+
+  const meta = await r2.head(objectKey);
+  if (!meta) throw new Error(`R2 object not found: ${objectKey}`);
+
+  const size = meta.size;
+  const etag = meta.etag;
+  const lastModified = meta.uploaded;
+
+  if (size === 0) {
+    throw new Error(`R2 object ${objectKey} is empty (0 bytes) — not a valid CSV source`);
+  }
+
+  if (size <= chunkBytes) {
+    return {
+      etag,
+      lastModified,
+      size,
+      chunks: [
+        {
+          objectKey,
+          chunkIndex: 0,
+          chunksTotal: 1,
+          byteStart: 0,
+          byteEnd: size - 1,
+        },
+      ],
+    };
+  }
+
+  // Compute the candidate boundaries (the byte AFTER the last byte of each
+  // non-final chunk). The final chunk always ends at size - 1.
+  const boundaryCount = Math.ceil(size / chunkBytes) - 1;
+  const candidates: Array<{ index: number; from: number; to: number }> = [];
+  for (let i = 0; i < boundaryCount; i++) {
+    const target = (i + 1) * chunkBytes; // exclusive end of chunk i
+    // Never look back past the previous target. With peekBytes > chunkBytes
+    // the windows would otherwise overlap, two boundaries could snap to the
+    // same newline, and the chunk between them would get byteStart > byteEnd.
+    // For peekBytes <= chunkBytes this is the same window as target - peekBytes.
+    const from = Math.max(i * chunkBytes, target - peekBytes);
+    const to = Math.min(size, target);
+    candidates.push({ index: i, from, to });
+  }
+
+  // Peek reads in bounded-parallel batches of 16 to keep R2 from rate-limiting
+  // on multi-GB ingests with many chunk boundaries.
+  const PEEK_CONCURRENCY = 16;
+  const peeks: Array<{ index: number; byteEnd: number }> = [];
+  for (let i = 0; i < candidates.length; i += PEEK_CONCURRENCY) {
+    const batch = candidates.slice(i, i + PEEK_CONCURRENCY);
+    const batchResults = await Promise.all(
+      batch.map(async ({ index, from, to }) => {
+        const obj = await r2.get(objectKey, { range: { offset: from, length: to - from } });
+        if (!obj) throw new Error(`R2 peek read returned null for ${objectKey} [${from},${to})`);
+        const text = await obj.text();
+        const lastNewlineIndex = text.lastIndexOf('\n');
+        if (lastNewlineIndex === -1) {
+          throw new ChunkBoundaryError(objectKey, { from, to });
+        }
+        // TextEncoder gives byte length of the prefix — accurate for non-ASCII CSV
+        // content where char index != byte offset (e.g. accented product names).
+        // The result is the offset of the newline byte itself.
+        // NOTE(review): this assumes the window starts on a UTF-8 character
+        // boundary. If `from` lands inside a multi-byte character, text()
+        // presumably decodes the stray bytes as U+FFFD and the re-encoded
+        // length no longer matches the source bytes — consider locating the
+        // newline on the raw bytes instead; confirm against the R2 wrapper API.
+        const byteEnd = from + new TextEncoder().encode(text.slice(0, lastNewlineIndex)).byteLength;
+        return { index, byteEnd };
+      }),
+    );
+    peeks.push(...batchResults);
+  }
+
+  // Assemble the final chunk list in order. Each chunk's byteStart is the
+  // previous chunk's byteEnd + 1 (so the next chunk starts AFTER the
+  // newline at the previous boundary).
+  const sortedPeeks = peeks.sort((a, b) => a.index - b.index);
+  const chunksTotal = sortedPeeks.length + 1;
+  const chunks: ChunkSpec[] = [];
+  let byteStart = 0;
+  for (const [chunkIndex, { byteEnd }] of sortedPeeks.entries()) {
+    chunks.push({
+      objectKey,
+      chunkIndex,
+      chunksTotal,
+      byteStart,
+      byteEnd,
+    });
+    byteStart = byteEnd + 1;
+  }
+  // Final chunk runs to EOF.
+  chunks.push({
+    objectKey,
+    chunkIndex: chunksTotal - 1,
+    chunksTotal,
+    byteStart,
+    byteEnd: size - 1,
+  });
+
+  return { etag, lastModified, size, chunks };
+}
diff --git a/packages/api/test/db-schema-etl.test.ts b/packages/api/test/db-schema-etl.test.ts
new file mode 100644
index 0000000000..b972a7e450
--- /dev/null
+++ b/packages/api/test/db-schema-etl.test.ts
@@ -0,0 +1,64 @@
+// Schema smoke test for the ETL Workflows columns on etl_jobs. Runs against
+// the Docker Postgres wsproxy at localhost:5434 (docker-compose.test.yml).
+// If the proxy is down the queries throw — intentional; the test would not
+// silently skip schema drift.
+
+import { createDbClient } from '@packrat/api/db';
+import type { Env } from '@packrat/api/utils/env-validation';
+import { sql } from 'drizzle-orm';
+import { describe, expect, it } from 'vitest';
+
+// One row of the information_schema.columns query issued by describeColumns;
+// field names mirror the selected column names.
+type ColumnInfo = {
+  column_name: string;
+  data_type: string;
+  is_nullable: 'YES' | 'NO';
+  column_default: string | null;
+};
+
+// One row of the pg_indexes query issued by describeIndexes.
+type IndexInfo = { indexname: string; indexdef: string };
+
+/**
+ * List the columns of a table in the `public` schema, in ordinal order, as
+ * reported by information_schema.columns.
+ */
+async function describeColumns(table: string): Promise<ColumnInfo[]> {
+  const client = createDbClient({} as Env);
+  const rows = await client.execute(sql`
+    SELECT column_name, data_type, is_nullable, column_default
+    FROM information_schema.columns
+    WHERE table_schema = 'public' AND table_name = ${table}
+    ORDER BY ordinal_position
+  `);
+  return rows as unknown as ColumnInfo[];
+}
+
+/**
+ * List the indexes of a table in the `public` schema as reported by
+ * pg_indexes (name plus definition).
+ */
+async function describeIndexes(table: string): Promise<IndexInfo[]> {
+  const client = createDbClient({} as Env);
+  const rows = await client.execute(sql`
+    SELECT indexname, indexdef
+    FROM pg_indexes
+    WHERE schemaname = 'public' AND tablename = ${table}
+  `);
+  return rows as unknown as IndexInfo[];
+}
+
+describe('Migration 0047 — ETL workflow columns', () => {
+  // Look up a single etl_jobs column by name (undefined when absent).
+  const findEtlJobsColumn = async (name: string) => {
+    const columns = await describeColumns('etl_jobs');
+    return columns.find((candidate) => candidate.column_name === name);
+  };
+
+  it('adds workflow_instance_id as nullable text', async () => {
+    const column = await findEtlJobsColumn('workflow_instance_id');
+    expect(column).toBeDefined();
+    expect(column?.data_type).toBe('text');
+    expect(column?.is_nullable).toBe('YES');
+  });
+
+  it('adds total_embedding_failures as integer NOT NULL DEFAULT 0', async () => {
+    const column = await findEtlJobsColumn('total_embedding_failures');
+    expect(column).toBeDefined();
+    expect(column?.data_type).toBe('integer');
+    expect(column?.is_nullable).toBe('NO');
+    expect(column?.column_default).toBe('0');
+  });
+
+  it('adds the workflow_instance_id index', async () => {
+    const etlJobIndexes = await describeIndexes('etl_jobs');
+    const hasIndex = etlJobIndexes.some(
+      (entry) => entry.indexname === 'etl_jobs_workflow_instance_id_idx',
+    );
+    expect(hasIndex).toBe(true);
+  });
+});
diff --git a/packages/api/vitest.unit.config.ts b/packages/api/vitest.unit.config.ts
index d8db428f52..c6771ce8ff 100644
--- a/packages/api/vitest.unit.config.ts
+++ b/packages/api/vitest.unit.config.ts
@@ -58,6 +58,10 @@ export default defineConfig({
         'src/auth/index.ts',
         // ETL and AI utilities (defer to integration tests)
         'src/services/etl/**',
+        // CatalogEtlWorkflow needs the CF Workflows runtime for end-to-end
+        // execution; covered by integration tests in /test once Docker Postgres
+        // is wired. Sibling chunker (src/workflows/shared/) IS unit-tested.
+        'src/workflows/catalog-etl-workflow.ts',
         'src/utils/ai/**',
         // Complex orchestration services (defer to integration tests)
         'src/services/aiService.ts',
diff --git a/packages/api/wrangler.jsonc b/packages/api/wrangler.jsonc
index a3d7e913c7..4cdbd450f2 100644
--- a/packages/api/wrangler.jsonc
+++ b/packages/api/wrangler.jsonc
@@ -5,9 +5,15 @@
   // Elysia 1.4+ CloudflareAdapter requires compatibility_date >= 2025-06-01.
   "compatibility_date": "2025-06-01",
   // nodejs_compat is kept because other dependencies (bcryptjs, pg) rely on it.
-  "compatibility_flags": ["nodejs_compat"],
+  // nodejs_als is required by @sentry/cloudflare for AsyncLocalStorage-based
+  // request/workflow context propagation across awaits.
+  "compatibility_flags": ["nodejs_compat", "nodejs_als"],
   "keep_vars": true,
   "logpush": true,
+  // Generate + upload source maps to Cloudflare so Workers logs show
+  // unminified stack traces. Sentry symbolication uses a separate
+  // @sentry/cli sourcemaps upload step in CI (see deploy workflow).
+  "upload_source_maps": true,
   "version_metadata": {
     "binding": "CF_VERSION_METADATA"
   },
@@ -90,6 +96,23 @@
   "ai": {
     "binding": "AI"
   },
+  // Catalog ETL workflow — the durable execution engine for catalog ingests.
+  // The producer endpoint at POST /catalog/etl triggers a new instance per
+  // source CSV; each chunk is a step.do call inside the workflow. Replaces
+  // the previous Queues-based path (which remains available during the
+  // coexistence window via ?engine=queue on the producer endpoint).
+  "workflows": [
+    {
+      "name": "packrat-catalog-etl",
+      "binding": "ETL_WORKFLOW",
+      "class_name": "CatalogEtlWorkflow"
+    }
+  ],
+  // Daily 09:00 UTC retention sweep of invalid_item_logs (>90 days old).
+  // Handled by the `scheduled` arm in src/index.ts.
+  "triggers": {
+    "crons": ["0 9 * * *"]
+  },
   // OSM / trail database — dedicated Postgres instance with PostGIS.
   // Add a Hyperdrive binding when ready:
   //   wrangler hyperdrive create osm-db --connection-string="postgresql://..."
@@ -195,6 +218,16 @@
       "ai": {
         "binding": "AI"
       },
+      "workflows": [
+        {
+          "name": "packrat-catalog-etl-dev",
+          "binding": "ETL_WORKFLOW",
+          "class_name": "CatalogEtlWorkflow"
+        }
+      ],
+      "triggers": {
+        "crons": ["0 9 * * *"]
+      },
       "containers": [
         {
           "name": "packrat-api-container-dev",
diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts
index 6f8b9d807a..e8bc3edfe2 100644
--- a/packages/db/src/schema.ts
+++ b/packages/db/src/schema.ts
@@ -4,6 +4,7 @@ import {
   type AnyPgColumn,
   bigint,
   boolean,
+  check,
   index,
   integer,
   jsonb,
@@ -472,9 +473,42 @@ export const etlJobs = pgTable(
     totalValid: integer('total_valid'),
     totalInvalid: integer('total_invalid'),
     scraperRevision: text('scraper_revision').notNull(),
+    // Workflows-aware columns. workflowInstanceId links the row to its
+    // CatalogEtlWorkflow instance (null on legacy queue-path rows; set on
+    // workflow-path rows). totalEmbeddingFailures counts SKUs that were
+    // upserted without embeddings because generateManyEmbeddings threw —
+    // observable degradation signal for the embedding service.
+    workflowInstanceId: text('workflow_instance_id'),
+    totalEmbeddingFailures: integer('total_embedding_failures').default(0).notNull(),
+    // Post-ingestion row-count verification, written by the admin reconcile
+    // endpoint. verifiedRowCount is the logical CSV row count parsed from
+    // the R2 source; mismatches against totalProcessed indicate data drift.
+    verifiedAt: timestamp('verified_at'),
+    verifiedRowCount: integer('verified_row_count'),
+    // R2 source provenance captured at ingest time. Repair-from-scratch
+    // refuses to re-ingest when the live R2 etag no longer matches the
+    // stored value (unless overridden with ?force=true) so a scraper
+    // overwrite mid-flight can't be silently re-applied under the old
+    // (source, filename).
+    sourceEtag: text('source_etag'),
+    sourceLastModified: timestamp('source_last_modified'),
+    // Audit trail for repair-from-scratch / retry. supersededByJobId
+    // points at the ORIGINAL job (the new repair-job row carries the
+    // pointer); supersededAt is the time of supersession. CHECK
+    // constraint prevents self-reference.
+    supersededByJobId: text('superseded_by_job_id').references((): AnyPgColumn => etlJobs.id, {
+      onDelete: 'set null',
+    }),
+    supersededAt: timestamp('superseded_at'),
   },
   (table) => ({
     scraperRevisionIdx: index('etl_jobs_scraper_revision_idx').on(table.scraperRevision),
+    workflowInstanceIdIdx: index('etl_jobs_workflow_instance_id_idx').on(table.workflowInstanceId),
+    supersededByIdx: index('etl_jobs_superseded_by_idx').on(table.supersededByJobId),
+    noSelfSupersede: check(
+      'etl_jobs_no_self_supersede',
+      sql`${table.supersededByJobId} IS NULL OR ${table.supersededByJobId} <> ${table.id}`,
+    ),
   }),
 );
 
diff --git a/packages/schemas/src/admin.ts b/packages/schemas/src/admin.ts
index 17b2fe4a72..8190846245 100644
--- a/packages/schemas/src/admin.ts
+++ b/packages/schemas/src/admin.ts
@@ -230,6 +230,51 @@ export const EtlRetrySchema = z.object({
   success: z.literal(true),
   newJobId: z.string(),
   objectKey: z.string(),
+  workflowInstanceId: z.string().nullable(),
+});
+
+// Response shape for an ETL reconcile result: an expected row count next to
+// an actual count and delta, both of which may be null.
+// NOTE(review): which side is parsed from the source CSV and which comes from
+// the job counters is not visible here — confirm against the endpoint.
+export const EtlReconcileSchema = z.object({
+  success: z.literal(true),
+  jobId: z.string(),
+  expectedRowCount: z.number().int(),
+  actualRowCount: z.number().int().nullable(),
+  delta: z.number().int().nullable(),
+});
+
+// Per-source entry of the catalog audit report: item count, last-ETL
+// pointers, price statistics, per-field null rates, suspicious-value counters
+// and a list of flag strings.
+export const CatalogAuditSourceSchema = z.object({
+  source: z.string(),
+  totalItems: z.number().int(),
+  // Last-ETL fields are nullable (presumably for sources with no recorded ETL run — confirm).
+  lastEtlId: z.string().nullable(),
+  lastEtlAt: z.string().nullable(),
+  daysStale: z.number().int().nullable(),
+  medianPrice: z.number().nullable(),
+  minPrice: z.number().nullable(),
+  maxPrice: z.number().nullable(),
+  // One rate per audited field.
+  // NOTE(review): presumably fractions in [0, 1] — confirm against the producer.
+  nullRates: z.object({
+    price: z.number(),
+    brand: z.number(),
+    description: z.number(),
+    weight: z.number(),
+    images: z.number(),
+    availability: z.number(),
+  }),
+  suspiciousDecimalCount: z.number().int(),
+  suspiciousWeightCount: z.number().int(),
+  emptyNameCount: z.number().int(),
+  flags: z.array(z.string()),
+});
+
+// Top-level catalog audit report: generation timestamp (string), the numeric
+// thresholds reported alongside the results, and one
+// CatalogAuditSourceSchema entry per source.
+export const CatalogAuditSchema = z.object({
+  generatedAt: z.string(),
+  thresholds: z.object({
+    decimalBugPriceThreshold: z.number(),
+    lowMedianPriceThreshold: z.number(),
+    minFillRate: z.number(),
+    staleDaysThreshold: z.number(),
+    weightTooLightGrams: z.number(),
+    weightTooHeavyGrams: z.number(),
+  }),
+  sources: z.array(CatalogAuditSourceSchema),
 });
 
 // ─── Trails ───────────────────────────────────────────────────────────────────