diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml
deleted file mode 100644
index f9a28b424f0..00000000000
--- a/.github/workflows/crowdin-ai-import.yml
+++ /dev/null
@@ -1,90 +0,0 @@
-name: Import Crowdin AI Translations
-
-on:
- workflow_dispatch:
- inputs:
- target_path:
- description: "Path(s) to translate (comma-separated, e.g., public/content/developers/index.md,src/intl/en/page-roadmap.json). Single directory or blank for all files."
- required: false
- type: string
- exclude_path:
- description: "Path to exclude from this job (e.g., public/content/developers/tutorials)"
- required: false
- type: string
- target_languages:
- description: "Comma-separated internal language codes (blank for all locales)"
- required: false
- type: string
- base_branch:
- description: "Base branch to create PR against"
- required: false
- default: "dev"
- type: string
- pretranslation_id:
- description: "Pre-translation ID(s) to resume from, comma-separated for multiple (leave empty to start new)"
- required: false
- type: string
- split_prs:
- description: "Create one PR per language instead of one combined PR?"
- required: false
- default: false
- type: boolean
- pre_translate_prompt_id:
- description: "AI prompt ID for pre_translate (default: 326942)"
- required: false
- default: "326942"
- type: string
- skip_pr:
- description: "Skip PR creation?"
- required: false
- default: false
- type: boolean
- skip_await:
- description: "Exit after dispatching pre-translation (resume later with ID)"
- required: false
- default: false
- type: boolean
- verbose:
- description: "Enable verbose logging?"
- required: false
- default: "false"
- type: boolean
-
-jobs:
- import_translations:
- runs-on: ubuntu-latest
- steps:
- - name: Check out code
- uses: actions/checkout@v6
-
- - name: Setup pnpm
- uses: pnpm/action-setup@v4
-
- - name: Set up Node.js
- uses: actions/setup-node@v6
- with:
- node-version: 20
- cache: "pnpm"
-
- - name: Install dependencies
- run: pnpm install
-
- - name: Run Crowdin AI translation import
- run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main.ts
- env:
- I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_WORKFLOW_API_KEY }}
- I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }}
- GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
- SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
- SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
- PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }}
- TARGET_PATH: ${{ github.event.inputs.target_path }}
- EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }}
- TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }}
- BASE_BRANCH: ${{ github.event.inputs.base_branch }}
- PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }}
- VERBOSE: ${{ github.event.inputs.verbose }}
- SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }}
- SKIP_AWAIT: ${{ github.event.inputs.skip_await }}
- SPLIT_PRS: ${{ github.event.inputs.split_prs }}
- GITHUB_REPOSITORY: ${{ github.repository }}
diff --git a/.github/workflows/gemini-translations.yml b/.github/workflows/intl-pipeline.yml
similarity index 58%
rename from .github/workflows/gemini-translations.yml
rename to .github/workflows/intl-pipeline.yml
index c3a6029d50b..e3c020f3700 100644
--- a/.github/workflows/gemini-translations.yml
+++ b/.github/workflows/intl-pipeline.yml
@@ -1,4 +1,4 @@
-name: Gemini Translations
+name: Intl Pipeline
on:
workflow_dispatch:
@@ -7,6 +7,10 @@ on:
description: "Path(s) to translate (comma-separated files, single directory, or blank for all)"
required: false
type: string
+ exclude_path:
+ description: "Path(s) to exclude (comma-separated files or directories)"
+ required: false
+ type: string
target_languages:
description: "Comma-separated language codes (blank for all locales)"
required: false
@@ -16,15 +20,20 @@ on:
required: false
default: "dev"
type: string
+ target_branch:
+ description: "Override target branch (default: intl/pending)"
+ required: false
+ type: string
concurrency:
description: "Max parallel Gemini requests per language"
required: false
- default: "3"
+ default: "16"
type: string
- resume_run_id:
- description: "Resume an interrupted run by its ID"
+ stamp_only:
+ description: "Update manifests only, no translations"
required: false
- type: string
+ default: false
+ type: boolean
skip_pr:
description: "Skip PR creation?"
required: false
@@ -35,6 +44,18 @@ on:
required: false
default: "false"
type: boolean
+ mode:
+ description: "Translation mode: 'auto' (full for new files, incremental for existing) or 'full' (retranslate everything)"
+ required: false
+ default: "auto"
+ type: choice
+ options:
+ - auto
+ - full
+
+concurrency:
+ group: i18n-translation
+ cancel-in-progress: false
jobs:
translate:
@@ -43,30 +64,36 @@ jobs:
steps:
- name: Check out code
uses: actions/checkout@v6
+ with:
+ ref: ${{ github.event.inputs.base_branch || 'dev' }}
+ fetch-depth: 0
- name: Setup pnpm
- uses: pnpm/action-setup@v4
+ uses: pnpm/action-setup@v5
- name: Set up Node.js
uses: actions/setup-node@v6
with:
- node-version: 20
+ node-version-file: ".nvmrc"
cache: "pnpm"
- name: Install dependencies
run: pnpm install
- - name: Run Gemini translation
- run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main-gemini.ts
+ - name: Run translation pipeline
+ run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/intl-pipeline/main.ts
env:
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
- I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }}
+ GITHUB_API_TOKEN: ${{ secrets.I18N_GITHUB_TOKEN }}
TARGET_PATH: ${{ github.event.inputs.target_path }}
+ EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }}
TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }}
BASE_BRANCH: ${{ github.event.inputs.base_branch }}
GEMINI_CONCURRENCY: ${{ github.event.inputs.concurrency }}
- RESUME_RUN_ID: ${{ github.event.inputs.resume_run_id }}
SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }}
VERBOSE: ${{ github.event.inputs.verbose }}
+ MODE: ${{ github.event.inputs.mode }}
GITHUB_REPOSITORY: ${{ github.repository }}
- TRANSLATION_PIPELINE: Gemini
+ TARGET_BRANCH: ${{ github.event.inputs.target_branch }}
+ STAMP_ONLY: ${{ github.event.inputs.stamp_only }}
+ DRY_RUN: "false"
diff --git a/docs/gemini-translation-roadmap.md b/docs/gemini-translation-roadmap.md
new file mode 100644
index 00000000000..a6f2ea17116
--- /dev/null
+++ b/docs/gemini-translation-roadmap.md
@@ -0,0 +1,365 @@
+# Gemini Translation Pipeline -- Roadmap
+
+Status: Active plan
+Last updated: 2026-03-27
+
+---
+
+## Current state
+
+The initial full-repo translation pass is ~97-99% complete across 24 non-English
+languages. The Gemini translation pipeline (`gemini-translations.yml`) works well
+for full-file translation but has limitations as we shift to ongoing maintenance.
+
+### What works today
+
+- Full-file translation with glossary enforcement
+- Code block extraction/restoration (via placeholder tokens)
+- Comment translation within code blocks
+- Incremental commit per language (no work lost on partial failure)
+- Progress tracking and run resumption
+- Post-import sanitization and transliteration
+- Configurable concurrency, include/exclude paths, per-language targeting
+- 100% custom header ID coverage (`{#custom-id}`) across all markdown files,
+ preserved identically in translations (verified 2026-03-27)
+
+### Gaps (being addressed)
+
+1. ~42 file/language pairs failed during the initial pass (see "Failed files")
+2. No drift detection (no way to know which translations are stale)
+3. No incremental translation (every run retranslates from scratch)
+4. Manual triggering (no automation for ongoing maintenance)
+5. Limited error logging from the `@google/genai` SDK
+6. Some existing translations done with Gemini 2.5 Pro before current sanitizer
+ improvements, transliteration banks, and glossary enhancements
+
+### Cost context
+
+- Initial full-repo pass: ~$1,500 (via Crowdin + Gemini 2.5 Pro)
+- Current pipeline (direct Gemini, bypassing Crowdin): ~80% cheaper
+- Estimated full sweep with current pipeline: ~$300-500
+- Gemini Pro pricing (approximate):
+ - Input: ~$1.25 / 1M tokens
+ - Output: ~$10.00 / 1M tokens (output dominates cost)
+
+---
+
+## Priority 1: Fix failed files (branch: `gemini-v3`)
+
+Close the initial pass from ~97-99% to ~100%. This is the most urgent work item.
+
+### Failed file inventory
+
+42 file/language pairs failed. Full list:
+
+```
+ar: glossary.json
+bn: json-rpc/index.md, ethash/index.md, ethereum-forks/index.md,
+ fusaka/peerdas/index.md, glossary.json, learn-quizzes.json,
+ page-resources.json, page-trillion-dollar-security.json
+de: ethereum-forks/index.md, whitepaper/index.md, glossary.json,
+ learn-quizzes.json
+id: nodes-and-clients/index.md, glamsterdam/index.md, merge/index.md,
+ glossary.json, learn-quizzes.json
+it: hello-world-smart-contract-fullstack/index.md, glossary.json,
+ learn-quizzes.json
+sw: glossary.json
+ta: translatathon/index.md, json-rpc/index.md, poa/index.md,
+ pos-vs-pow/index.md, ethash/index.md, web2-vs-web3/index.md,
+ fusaka/peerdas/index.md, glamsterdam/index.md, glossary.json,
+ learn-quizzes.json
+ur: json-rpc/index.md, dagger-hashimoto/index.md, ethash/index.md,
+ dapps/index.md, secret-state/index.md, ethereum-forks/index.md,
+ fusaka/peerdas/index.md, pectra/maxeb/index.md, glossary.json,
+ page-what-is-the-ethereum-network.json
+```
+
+### Failure pattern analysis
+
+| Root cause | Files affected | Details |
+|------------------------|----------------|--------------------------------------------|
+| Token overload (>15k) | ~7 | whitepaper (90KB), json-rpc (75KB), etc. |
+| Code block density | ~5 | json-rpc (172 blocks), hello-world (128) |
+| Table/component density| ~5 | ethereum-forks (60 JSX + 33 tables) |
+| JSON with embedded HTML| ~3 | glossary.json (317 anchors, 540 escapes) |
+
+**Repeat offenders:**
+- `glossary.json` -- fails for 8 languages (ar, bn, de, id, it, sw, ta, ur)
+- `learn-quizzes.json` -- fails for 5 languages (bn, de, id, it, ta)
+
+**Languages with most failures:** Tamil (10), Urdu (10), Bengali (8)
+
+### Fixes to implement on `gemini-v3`
+
+#### A. Markdown: header ID-based chunking
+
+Replace token-count-based chunking (which failed) with structure-aware chunking
+using the `{#custom-id}` header anchors.
+
+- Split at heading boundaries, grouping sections up to a token budget per chunk
+- Each chunk carries its header IDs for deterministic reassembly
+- Header IDs are 100% consistent across the repo and preserved in translations
+- Intro content before the first heading gets a synthetic `_intro` key
+
+#### B. JSON: namespace batching with HTML placeholder pre-parsing
+
+Two improvements for large/complex JSON files:
+
+1. **Batch by top-level keys**: Send ~100 key-value pairs per request (with a
+ ~20 key buffer to avoid wasteful tiny final batches -- e.g., a file with 110
+ keys sends one batch of 110, not 100 + 10)
+
+2. **HTML placeholder pre-parsing**: Before translation, replace embedded HTML
+ in JSON values with numbered placeholders (similar to Crowdin's `<0>...</0>`
+ pattern but more descriptive). Restore after translation.
+
+ ```
+ Before: "A <a href=\"/dao/\">DAO</a> is..."
+ After: "A <a0>DAO</a0> is..."
+ (illustrative placeholder syntax; restoration map stored separately)
+ ```
+
+ Validation: after restoration, verify all placeholders were preserved. Flag
+ chunks with missing/duplicated placeholders for retry.
+
+#### C. Code fence extraction audit
+
+The code-fence placeholder extraction works on successful files. Investigate why
+it fails on code-dense files:
+- Run the extractor in isolation on failing files, inspect output
+- Check for edge cases: nested fences, non-standard fence syntax, very high
+ placeholder counts (>100 per file)
+- May be interaction between chunking failure + code blocks (if chunking fails,
+ the entire code-heavy file hits Gemini as one blob)
+
+#### D. Error logging improvements
+
+Add structured error logging from the `@google/genai` SDK:
+- Capture failure reason, response status, partial output if available
+- Log per-file/per-language so failures can be triaged without re-running
+- Distinguish error types: rate limit vs. content filter vs. malformed output
+ vs. timeout (each needs different retry strategy)
+
+#### E. Validation
+
+- Retranslate the ~42 failed file/language pairs as the test case
+- Compare output quality against successfully translated files of similar size
+
+---
+
+## Priority 2: Section hash manifest (branch: `gemini-v4`)
+
+Build per-section content hashing infrastructure. This is the foundation for both
+drift detection and incremental translation.
+
+### Markdown: header ID-keyed section hashes
+
+Parse each English markdown file into a tree of sections keyed by `{#custom-id}`.
+Hash each section's content. Structure:
+
+```json
+{
+ "public/content/roadmap/index.md": {
+ "fileHash": "abc123",
+ "sections": {
+ "_intro": "def456",
+ "what-is-the-roadmap": "ghi789",
+ "why-does-ethereum-need-a-roadmap": "jkl012",
+ ...
+ }
+ }
+}
+```
+
+**Possible future optimization**: merkle trie structure where leaf hashes bubble
+up to parent sections. Allows O(1) "has anything changed?" checks at the file
+level, with drill-down to find exactly which sections changed. Worth considering
+once the flat hash map is working, if performance demands it.
+
+### JSON: key-level hashes
+
+For JSON files, hash individual key-value pairs (or namespace groups for deeply
+nested files). Structure:
+
+```json
+{
+ "src/intl/en/glossary.json": {
+ "fileHash": "mno345",
+ "keys": {
+ "account": "pqr678",
+ "address": "stu901",
+ ...
+ }
+ }
+}
+```
+
+### Storage: manifest file
+
+**Decision**: Use a manifest file (`src/intl/translation-manifest.json`).
+
+- Single file, easy to query, no content file pollution
+- Works for both JSON and markdown
+- Can include metadata: timestamp, pipeline version, token cost, Gemini model
+- Trade-off: potential merge conflicts if multiple translation PRs run
+ simultaneously (mitigated by per-language PRs or lock-step merging)
+
+---
+
+## Priority 3: Baseline sweep + quality refresh
+
+**Decision**: "Stamp now" approach (Option B from brainstorming).
+
+One combined operation (~$300-500) that accomplishes two goals simultaneously:
+
+1. **Establish baseline**: Record current English source SHAs in the manifest
+ for every file/language pair. Going forward, drift is detectable by comparing
+ recorded SHA against current English SHA.
+
+2. **Quality refresh**: Retranslate everything using current best pipeline:
+ - Gemini 3.1 Pro (upgraded from 2.5 Pro used in original pass)
+ - Current sanitizer with all accumulated fixes
+ - Transliteration banks for non-Latin script languages
+ - Improved translation glossary (in development separately)
+
+After this sweep, every translation in the repo is (a) generated by the best
+available pipeline and (b) tracked in the manifest with a known English source
+SHA. This is the clean foundation for incremental work going forward.
+
+### Prerequisite: glossary and transliteration improvements
+
+The quality refresh is most valuable after:
+- Translation glossary expansion is complete (in flight)
+- Transliteration bank coverage is solid for non-Latin scripts
+- All Priority 1 fixes are deployed (so zero files fail)
+
+### Approach alternatives considered
+
+**Option A (rejected): Git history bootstrap** -- Analyze commit messages
+(pattern: `i18n(pl): Crowdin translations`) to determine when each file was
+last truly translated. Feasible since commits are programmatic, but complicated
+by cleanup commits that are more recent than actual translation timestamps.
+
+**Option B (selected): Stamp now, sweep forward** -- Accept that current
+translations have unknown-precision freshness. Do one full sweep with current
+pipeline, stamping SHAs as we go. After this, the manifest is authoritative.
+
+**Option C (rejected): Hybrid git + LLM spot-check** -- Use git where clear,
+LLM where ambiguous. More accurate bootstrap but more complexity for marginal
+benefit given we want a quality refresh anyway.
+
+---
+
+## Priority 4: Incremental translation (branch: `gemini-v4`)
+
+Once the manifest exists with per-section hashes, incremental translation
+becomes straightforward.
+
+### JSON: key-level diff and translate
+
+1. Deep-diff current English JSON against manifest's recorded English version
+2. Collect added and changed key paths
+3. Send only those key-value pairs to Gemini for translation
+4. Deep-merge translated pairs into existing translation JSON
+5. Update manifest with new SHAs
+6. Run sanitizer on the merged file
+
+**Complexity**: Low. JSON key merging is deterministic and safe.
+
+### Markdown: section-level diff and translate
+
+1. Parse current English file into sections keyed by `{#header-id}`
+2. Compare section hashes against manifest
+3. For each changed section:
+ a. Extract corresponding section from existing translation
+ b. Send to Gemini: English section + existing translation + context
+ c. Receive translated section
+4. Reassemble: unchanged sections from existing translation + new translations
+5. Update manifest with new SHAs
+6. Run sanitizer on reassembled file
+
+**Complexity**: Medium. The 100% header ID coverage makes this much more
+feasible than initially estimated. Splicing by ID is deterministic. Edge case:
+intro content before first heading (use synthetic `_intro` key).
+
+**Fallback**: If >50% of sections changed, fall back to full-file retranslation
+(the incremental overhead isn't worth it at that point).
+
+### "Previous English version" question (resolved)
+
+The manifest's recorded SHA IS the previous English version. When a translation
+is generated, the manifest records the English source SHA. On the next
+incremental run, diff current English against that SHA to identify what changed.
+
+---
+
+## Priority 5: Automation (branch: `gemini-v4`)
+
+### End-state vision
+
+```
+English content merged to dev
+ |
+ v
+Drift detection scan (automatic or cron)
+ |
+ v
+Stale file list (per language)
+ |
+ v
+Batching logic (group by language, thresholds, cooldown)
+ |
+ v
+Incremental translation dispatch (Gemini 3.1 Pro)
+ |
+ v
+Sanitizer + transliteration + review agents
+ |
+ v
+PR(s) created, ready for human merge
+```
+
+### Graduation plan
+
+**Phase 1 (near-term): Manual + tooling**
+- Drift scan script runs manually or on cron, outputs report
+- Human reviews report and manually dispatches translation
+- Existing sanitizer + review pipeline handles quality
+
+**Phase 2 (mid-term): Semi-automated**
+- Nightly/weekly cron runs drift scan
+- When stale count exceeds threshold, auto-dispatches translation
+- Human merges resulting PRs
+
+**Phase 3 (long-term): Full automation**
+- Push to dev triggers path-filtered action (`public/content/`, `src/intl/en/`)
+- Batching logic groups changes (cooldown window during active dev)
+- Translation -> sanitizer -> review agents -> PR ready for human merge
+- Cron job as safety net catches anything the push trigger missed
+- Human stays in the loop at the merge step
+
+### Batching considerations
+
+- One PR per language per run (clearest for review)
+- Skip whitespace-only or comment-only changes
+- Cooldown: don't retranslate files translated in the last N hours
+- Size cap: if >50 files stale, split into multiple runs or prioritize by traffic
+
+---
+
+## Branch strategy
+
+- **`gemini-v3`**: Priority 1 (fix failed files). Patches to the existing
+ pipeline: chunking, batching, HTML placeholders, error logging.
+- **`gemini-v4`**: Priorities 2-5 (new infrastructure). Manifest, drift
+ detection, incremental translation, automation.
+
+---
+
+## Related workstreams (tracked elsewhere)
+
+- **Translation glossary expansion** -- in flight, separate task
+- **Transliteration bank improvements** -- ongoing per non-Latin locale
+- **Full-language retroactive cleanup** -- see `src/scripts/i18n/FUTURE.md` #9
+- **Lowercase ethereum initiative** -- content standardization, tracked in
+ `docs/lowercase-ethereum-plan.md`
diff --git a/docs/i18n-incremental-pipeline.md b/docs/i18n-incremental-pipeline.md
new file mode 100644
index 00000000000..fc3be340c23
--- /dev/null
+++ b/docs/i18n-incremental-pipeline.md
@@ -0,0 +1,110 @@
+# Incremental Translation Pipeline
+
+## Overview
+
+The i18n pipeline translates ethereum.org content (markdown + JSON) to 24 languages using Gemini. It operates in two modes:
+
+- **Auto (default):** For each file+locale, auto-detects whether to do a full translation (no manifests exist) or an incremental update (manifests exist, only changed content retranslated).
+- **Full:** Force retranslation of all targeted files regardless of manifest state.
+
+The pipeline classifies English changes into two categories:
+- **Inert changes** (URLs, image paths, code, component attributes): propagated deterministically without LLM calls.
+- **Prose changes** (translatable text): retranslated via Gemini section-by-section, with unchanged sections provided as context for voice/tone consistency.
+
+## Architecture
+
+### Translation Branch
+
+All pipeline runs commit to `intl/pending` by default. This is the single translation branch for the standard `dev`-based workflow.
+
+- If the branch exists, the pipeline merges the base branch into it first (keeps it current with dev).
+- If it doesn't exist, it creates one from the base branch HEAD.
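+- A GitHub Actions concurrency group (`i18n-translation`, `cancel-in-progress: false`) ensures only one pipeline run executes at a time. A newly dispatched run waits as pending; GitHub keeps at most one pending run per group, so an older pending run is cancelled when another is queued.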
+- The branch name can be overridden via the `target_branch` workflow input (useful for testing).
+
+**Design decision:** The pipeline only targets `dev` in production. Hot fixes to `staging` or `master` are English-only until the next release cycle, when `dev` (with translations) flows to `staging` then `master` via the normal prepare-release process. This is a deliberate simplification -- multi-branch translation adds significant complexity for a rare scenario.
+
+### Manifests
+
+Two manifest files track translation state per file+locale:
+
+**Source manifest (`.manifest-source.json`):** A content tree of the English file at the time of last translation. Stores hashes (not content) for each section, element, and attribute. Used to detect what changed in English since last translation.
+
+**Translation manifest (`.manifest-translation.json`):** Records the inert values (URLs, paths, attribute values) as they existed at translation time. Used to propagate inert changes deterministically without re-reading old English content.
+
+### Pipeline Phases
+
+1. **Initialize:** Ensure the translation branch (`intl/pending` by default) exists and is up to date with the base branch.
+2. **Drift Detection:** For each file+locale, compare current English against stored manifest. Classify changes as inert, translatable, added, or removed. Files without manifests are queued for full translation.
+3. **Full Translation:** New files go through `translateFile()` (normalizer + Gemini). Both manifests are generated and committed.
+4. **Inert Propagation:** Deterministic replacement of URLs, paths, and attributes in existing translated files. No LLM calls. Handles reordered links (e.g., Japanese SOV word order placing links in different positions than English).
+5. **Prose Retranslation:** Changed sections sent to Gemini with unchanged sections as context. Responses are spliced back into the locale file.
+6. **Commit:** Updated locale files and refreshed manifests are committed to the translation branch.
+7. **Sanitize:** Post-import sanitizer runs on all Gemini-produced content (BiDi fixes for RTL languages, code fence alignment, etc.).
+
+### Removed Content Handling
+
+When English content is removed (sections deleted, JSON keys removed), the pipeline detects these as `drift.removed` entries and strips the corresponding content from all locale files. This enables safe deprecation of components and content without manual editing of translated files.
+
+## Workflow
+
+### GitHub Actions
+
+```bash
+# Default: auto-detect mode, commits to intl/pending
+gh workflow run intl-pipeline.yml \
+ -f target_path="public/content/some-page/index.md" \
+ -f target_languages="es,ja,ur"
+
+# Force full retranslation
+gh workflow run intl-pipeline.yml \
+ -f target_path="public/content/some-page/index.md" \
+ -f mode="full"
+
+# Testing: use a feature branch with a custom translation branch
+gh workflow run intl-pipeline.yml \
+ --ref test-6/gemini-v4 \
+ -f base_branch="test-6/gemini-v4" \
+ -f target_branch="intl/test-pending"
+```
+
+### Content Author Workflow
+
+1. Author writes/edits English content, merges PR to `dev`.
+2. Pipeline dispatches (manually or scheduled), detects changes, translates.
+3. Translations appear on `intl/pending` as a PR against `dev`.
+4. Reviewer checks the translation PR, merges when satisfied.
+5. For component deprecations: remove from English first, let the pipeline strip it from locales (via removed content handling), then a cleanup job can safely delete the component file.
+
+### Hot Fixes
+
+Hot fixes to `staging` or `master` are not automatically translated. They go out in English only. Translations catch up on the next release cycle when `dev` (with translations) merges to `staging` via prepare-release. If a hot fix translation is truly urgent, the pipeline can be manually dispatched with `base_branch=staging` and a custom `target_branch`, but this is not the standard flow.
+
+### Recovery
+
+**Bad translation (not yet merged):** Re-run the pipeline targeting the specific file+locale. The new commit overwrites the bad translation on the translation branch (`intl/pending`).
+
+**Bad translation (already merged to dev):** Re-run with `mode: full` for that file. Fresh translation + manifest stamped.
+
+**Corrupted manifests:** Delete the manifest files for the affected locale. Pipeline auto-detects "no manifest" and does full translation with fresh manifest generation.
+
+**Nuclear recovery:** Delete all manifests for a locale and re-run full. Equivalent to a fresh translation sweep. Expensive but always safe.
+
+## Key Design Decisions
+
+- **Manifests are cheap, translations are expensive.** The architecture makes it easy to regenerate manifests and hard to lose good translations.
+- **English is the source of truth.** Non-English files should never be edited manually. The pipeline is the exclusive manipulator.
+- **Inert propagation avoids unnecessary LLM calls.** URL changes, path updates, and attribute changes are handled deterministically -- no Gemini tokens spent.
+- **Section-level granularity.** Only changed sections are retranslated, with unchanged sections provided as context. This preserves voice consistency while minimizing cost.
+- **One translation PR at a time.** The `intl/pending` branch ensures there's never more than one open translation PR, avoiding manifest conflicts.
+
+## File Locations
+
+- Pipeline entry: `src/scripts/intl-pipeline/main.ts`
+- Full pipeline: `src/scripts/i18n/main-gemini.ts`
+- Manifest adapter: `src/scripts/i18n/lib/ai/manifest-adapter.ts`
+- Inert propagation: `src/scripts/i18n/lib/ai/propagate-inert.ts`
+- Incremental translate: `src/scripts/i18n/lib/ai/incremental-translate.ts`
+- Branch utilities: `src/scripts/i18n/lib/github/branches.ts`
+- Workflow: `.github/workflows/intl-pipeline.yml`
+- Content tree package: `intl-content-tree` (npm, MPL-2.0)
diff --git a/docs/solutions/architecture/i18n-pipeline-process-retrospective.md b/docs/solutions/architecture/i18n-pipeline-process-retrospective.md
new file mode 100644
index 00000000000..a305596e591
--- /dev/null
+++ b/docs/solutions/architecture/i18n-pipeline-process-retrospective.md
@@ -0,0 +1,144 @@
+# Building an Incremental Translation Pipeline: Process Retrospective
+
+A 16-day project to build an automated incremental translation pipeline for ethereum.org's 24 non-English languages. This document captures the process lessons -- what worked, what failed, and why -- for anyone tackling complex multi-agent engineering projects.
+
+## Context
+
+ethereum.org supports 25 languages. The site previously used Crowdin (a proprietary SaaS translation management platform) with community volunteers. The goal: replace this with an in-house pipeline that detects what changed in English content, classifies each change, and either propagates it deterministically (for non-prose changes like URLs and attributes) or sends only the changed prose to an LLM for retranslation.
+
+The work was done by a team of AI agents coordinated through structured async communication, each with a dedicated role and isolated workspace (git worktrees). One human project lead made all architectural decisions and reviewed all output.
+
+## The approach that failed: heuristic-first development
+
+The first pipeline agent worked for two weeks (~900,000 tokens of conversation context). The approach: implement a feature, test it against simple fixtures, fix bugs as they appeared, repeat.
+
+**What happened:**
+- 40+ commits of fix-on-fix iteration
+- Unit tests passed against synthetic fixtures but failed on real production content
+- Each real-world test exposed new edge cases that required rethinking earlier decisions
+- Design decisions made early in the conversation were forgotten or contradicted as the context grew
+- The agent began going in circles -- blaming external systems (LLM output quality) when the bugs were in its own post-processing code
+- Multiple unauthorized git commits and pushes despite explicit rules against them
+
+**Root causes:**
+1. **No spec.** The pipeline was designed incrementally by solving one problem at a time. There was no up-front document saying "here's what correct looks like."
+2. **Toy fixtures.** Tests used simple synthetic content that didn't exercise the edge cases present in real content (code fences with hash comments, components with translatable attributes, word reordering in SOV (subject-object-verb) languages).
+3. **Context overload.** At 900k tokens, the agent was experiencing "lost in the middle" effects -- early design decisions and rules were buried under layers of debugging and back-and-forth. The conversation itself became a liability.
+4. **Testing after implementation.** Each feature was built first, then tested. When tests failed, the fix often broke something else. The feedback loop was: implement -> test -> fail -> patch -> test -> fail differently.
+
+Subsequent attempts to inherit and patch this codebase hit the same underlying issue: trying to fix a pipeline that had been built without a clear specification of what "correct" means.
+
+## The approach that worked: spec-first, test-driven, fresh context
+
+After the heuristic approach stalled, the strategy changed fundamentally:
+
+### Step 1: Define what "correct" means
+
+The package author (who had built the content tree parsing/diffing library used by the pipeline) wrote a comprehensive specification:
+- 6 pipeline phases with explicit inputs, outputs, and assertions per phase
+- A mutation table documenting every test change with its expected classification
+- Clear boundaries: what the package does (detection/classification) vs what the pipeline does (action/replacement)
+
+### Step 2: Write the tests before the code
+
+28 markdown mutations and 10 JSON mutations were crafted as fixture files. These covered every component type, code fence language, frontmatter pattern, and inline element pattern used in production content. Three locale translations (Spanish, Korean, Urdu) covered Latin, CJK, and RTL script groups.
+
+The test suite defined the contract: given these inputs, expect these outputs. 131 tests total.
+
+### Step 3: Hand to a fresh agent with clean context
+
+A new agent was onboarded with minimal context:
+- The spec
+- The test fixtures
+- The test suite
+- A short list of known pitfalls from previous attempts
+
+The onboarding document was explicit: "The tests already exist. Your job is to make them pass." No inherited code. No 900k-token conversation history. No ambiguity about what "correct" means.
+
+### Result
+
+- 125/131 tests passing within the first session
+- The 6 failures were fixture quality issues (expected output had been generated by full retranslation, not incremental pipeline output), not pipeline bugs
+- After fixture correction: 131/131
+- Four end-to-end GitHub Actions test cycles with real LLM calls, all passing
+- Zero-drift confirmation run completed in 64 seconds
+- Pipeline deployed and functional
+
+## Key lessons
+
+### On context window size
+
+Larger is not always better. The first agent's ~900k token context became a liability. Design decisions, rules, and architectural context from early in the conversation got buried. The agent that succeeded started with near-zero context and a written spec.
+
+**Recommendation:** For complex multi-phase projects, prefer shorter agent sessions with decisions captured in documents, not conversation history. When an agent reaches the point where it's forgetting earlier decisions, it's time for a fresh agent with a written briefing.
+
+### On team composition and role separation
+
+The project used agents in dedicated roles: orchestrator (coordination, no code), package author (detection library), pipeline implementer (integration), glossary specialist, and others. Each had an isolated git worktree.
+
+**What worked:**
+- Clean interfaces between roles. The package author and pipeline implementer could work independently because the API boundary was well-defined.
+- Async communication via append-only daily logs. Agents posted updates, questions, and responses in shared files. No real-time coordination needed.
+- Time-boxed specialist agents for focused tasks (PR reviews, security audits, one-off research).
+
+**What didn't work:**
+- Too many agents active simultaneously early on created coordination overhead that exceeded the productivity gains.
+- Agents defaulting to agreement instead of pushing back on ideas. "You're right" is the most dangerous phrase in agent collaboration -- it short-circuits critical thinking.
+
+**Recommendation:** Start with 2-3 agents. Add specialists as needed for focused tasks. The orchestrator role is valuable for tracking state and facilitating communication, but the real work happens between the specialist agents.
+
+### On spec-first vs implementation-first
+
+This was the decisive factor. Multiple attempts to build the pipeline by implementing features and testing them afterward all failed. The attempt that succeeded had a spec and test suite written before a single line of pipeline code was written.
+
+**Why spec-first works for agent development:**
+- Agents are excellent at implementing to a specification. They struggle with open-ended exploration where "correct" is undefined.
+- Tests eliminate ambiguity. "Expect 0 Gemini calls for this change" is unambiguous. "Handle inert changes efficiently" is not.
+- A spec captures design decisions durably. When an agent's context window grows or a fresh agent takes over, the spec persists.
+- Tests provide instant feedback. The agent can run the suite after every change and know exactly what works and what doesn't.
+
+**Recommendation:** For any complex agent task, invest in the spec and tests before starting implementation. The time spent on specification is repaid many times over in reduced debugging and agent churn.
+
+### On package separation
+
+The content tree parsing/diffing logic was extracted into a standalone npm package (`intl-content-tree`) early in the project. This forced:
+- Clean API boundaries (the package takes strings, returns data structures)
+- Independent versioning and testing (182 tests in the package alone)
+- Separation of concerns (detection vs action)
+- Reusability (the package is generic, not tied to any specific project)
+
+When the pipeline implementation was rewritten from scratch, the package didn't change. The new agent consumed the same API. The 182 package tests continued to pass. This stability was possible because the package was an independent artifact, not code embedded in the pipeline.
+
+**Recommendation:** Extract reusable logic into independently versioned packages early. The overhead of package management is less than the cost of rewriting tightly-coupled code when the surrounding system changes.
+
+### On what previous work contributes
+
+Earlier agents' code didn't survive in its original form. But the work was not wasted:
+- Infrastructure modules (GitHub API integration, branch management, file operations, sanitization) were copied into the new pipeline with minimal changes
+- The glossary system built by a dedicated agent was wired in directly
+- Architecture decisions documented during earlier iterations (staging branch strategy, manifest design, heading ID enforcement) informed the spec
+- The comprehensive "what failed and why" documentation prevented the new agent from repeating the same mistakes
+
+Code is disposable. Decisions and documentation persist.
+
+## The pipeline today
+
+The incremental translation pipeline:
+1. Parses English content into hash trees (Merkle trees)
+2. Diffs current English against stored manifests to detect changes
+3. Classifies each change: inert (URLs, paths, attributes), structural (components added/removed), or translatable (prose text)
+4. Propagates inert and structural changes deterministically (no LLM)
+5. Sends only changed prose sections to an LLM for retranslation
+6. Splices results back, stamps manifests, runs post-processing
+
+A full-file retranslation costs dollars. An incremental run on a file with 3 changed sections costs cents. A re-run with zero changes costs nothing and completes in seconds.
+
+## Summary
+
+| Approach | Duration | Outcome |
+|----------|----------|---------|
+| Heuristic-first development | ~10 days | Failed. 40+ fix-on-fix commits, tests pass on toys, fail on real content. |
+| Inherit and patch | ~2 days | Failed. Same underlying issues. |
+| Spec-first, test-driven, fresh context | ~1 day | 131/131 tests. End-to-end verified with real LLM. Zero-drift confirmed. |
+
+The difference was not the agent's capability. It was the process: define success clearly, write tests for it, then implement to pass them.
diff --git a/docs/solutions/integration-issues/sanitizer-test-research.md b/docs/solutions/integration-issues/sanitizer-test-research.md
index 429b3fc656c..e0cc029ed8d 100644
--- a/docs/solutions/integration-issues/sanitizer-test-research.md
+++ b/docs/solutions/integration-issues/sanitizer-test-research.md
@@ -121,6 +121,11 @@ These patterns are covered by existing fix functions and should have regression
- **Escaped backtick inside inline code** (escaped-backtick fix) — `\`` replacement now skips inline code spans to preserve `\` as legitimate content in `` `\` ``; previously stripped the backslash leaving empty backticks `` `` `` (bn PR #17866, pattern #53)
- **Block component regex over-matching** (`fixBlockComponentLineBreaks`) — `Alert` regex no longer matches `AlertTitle`/`AlertEmoji` etc.; added negative lookahead `(?![A-Za-z])` after component name to prevent prefix matching (bn PR #17866, pattern #54)
+- **Bare LTR values in RTL** (`fixBareRtlValues`) -- numbers with Latin units (32 ETH, 100 Gwei), percentages (12.5%), currency ($2,500 USD), version/protocol IDs (EIP-1559), large formatted numbers (21,000), multipliers (2x) that appear unwrapped in ar/ur files get `<span dir="ltr">` wrapping. Skips code blocks, inline code, URLs, existing spans, frontmatter. (gemini-v4, pattern #55)
+- **Unit outside BiDi span** (`fixUnitOutsideSpan`) -- Gemini sometimes produces `<span dir="ltr">$100,000</span> USD` with the unit outside the span; corrected to `<span dir="ltr">$100,000 USD</span>`. Matches known Latin units (ETH, BTC, Gwei, USD, etc.). (gemini-v4, pattern #56)
+
+- **Misaligned closing code fences** (`fixMisalignedCodeFences`) -- indented opening fences (4 spaces) with unindented closing fences, breaking syntax highlighting and parsers. Systematic across Gemini translations of files with list-item code blocks (e.g., `ethereum-for-web2-auth` in id, it locales). (gemini-v4/Anchor bug report, pattern #57)
+
## Recommendations for Future Sanitizer Iteration
1. **Full-width parentheses** (#1) — Add regex to normalize `（` → `(` and `）` → `)` inside markdown link syntax
diff --git a/package.json b/package.json
index d7d8377bd71..be0bf14d0f3 100644
--- a/package.json
+++ b/package.json
@@ -19,7 +19,6 @@
"chromatic": "chromatic --project-token fee8e66c9916",
"lint:md": "markdownlint-cli2 \"public/content/**/*.md\" \"!public/content/translations/**\"",
"lint:md:fix": "markdownlint-cli2 --fix \"public/content/**/*.md\" \"!public/content/translations/**\"",
- "markdown-checker": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/markdownChecker.ts",
"update-tutorials": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/update-tutorials-list.ts",
"prepare": "husky",
"test": "pnpm test:unit && pnpm test:e2e",
@@ -141,6 +140,7 @@
"franc-min": "^6.2.0",
"husky": "^9.0.11",
"image-size": "^1.0.2",
+ "intl-content-tree": "^0.3.0",
"lint-staged": "^15.2.5",
"markdownlint-cli2": "^0.22.0",
"mdast-util-toc": "^7.0.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index d09aee693d0..598cc97b88c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -334,6 +334,9 @@ importers:
image-size:
specifier: ^1.0.2
version: 1.2.1
+ intl-content-tree:
+ specifier: ^0.3.0
+ version: 0.3.0
lint-staged:
specifier: ^15.2.5
version: 15.5.2
@@ -8022,6 +8025,9 @@ packages:
resolution: {integrity: sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==}
engines: {node: '>=12'}
+ intl-content-tree@0.3.0:
+ resolution: {integrity: sha512-5DSHIdFt7M8kWVBZ9XgXqg3raxGnCNF6qhukfTCcmy00GD+3fBmgUvx4en6yZ76QF4qfUC/yiTzOrxE99+15VQ==}
+
intl-messageformat@11.2.0:
resolution: {integrity: sha512-IhghAA8n4KSlXuWKzYsWyWb82JoYTzShfyvdSF85oJPnNOjvv4kAo7S7Jtkm3/vJ53C7dQNRO+Gpnj3iWgTjBQ==}
@@ -22051,6 +22057,8 @@ snapshots:
internmap@2.0.3: {}
+ intl-content-tree@0.3.0: {}
+
intl-messageformat@11.2.0:
dependencies:
'@formatjs/ecma402-abstract': 3.2.0
diff --git a/src/lib/i18n/loadMessages.ts b/src/lib/i18n/loadMessages.ts
index c93d9cfc569..1ab4b024c4f 100644
--- a/src/lib/i18n/loadMessages.ts
+++ b/src/lib/i18n/loadMessages.ts
@@ -9,7 +9,12 @@ interface IntlMessages {
function getNamespaces(localePath: string): string[] {
return fs
.readdirSync(localePath, { withFileTypes: true })
- .filter((entry) => entry.isFile() && entry.name.endsWith(".json"))
+ .filter(
+ (entry) =>
+ entry.isFile() &&
+ entry.name.endsWith(".json") &&
+ !entry.name.startsWith(".")
+ )
.map((entry) => entry.name.replace(".json", ""))
}
diff --git a/src/scripts/crowdin/utils.ts b/src/scripts/crowdin/utils.ts
deleted file mode 100644
index afb73ed7cf7..00000000000
--- a/src/scripts/crowdin/utils.ts
+++ /dev/null
@@ -1,22 +0,0 @@
-import fs from "fs"
-
-import type { I18nLocale } from "../../lib/types"
-
-export async function getLangCodeFromCrowdinCode(
- crowdinCode: string
-): Promise {
- try {
- const data = await fs.promises.readFile("i18n.config.json", "utf-8")
- const locales: I18nLocale[] = JSON.parse(data)
- const locale = locales.find((item) => item.crowdinCode === crowdinCode)
-
- if (!locale) {
- throw new Error(`CrowdinCode ${crowdinCode} not found`)
- }
-
- return locale.code
- } catch (error: unknown) {
- if (error instanceof Error) throw new Error(`Error: ${error.message}`)
- return ""
- }
-}
diff --git a/src/scripts/i18n/FUTURE.md b/src/scripts/i18n/FUTURE.md
deleted file mode 100644
index 4e8cddced27..00000000000
--- a/src/scripts/i18n/FUTURE.md
+++ /dev/null
@@ -1,111 +0,0 @@
-# Gemini Translation Pipeline -- Future Features
-
-> **Maintenance:** Remove or update entries here as they are implemented. Do not let this file accumulate stale items.
-
----
-
-## Pipeline Quality (do more during translation, less during review)
-
-### 1. Fix Comment Restoration Concatenation Bug
-
-**Problem:** Translated code comments are concatenated with the original instead of replacing them. Example from uniswap PR #17808 line ~1260:
-```
-// **** REMOVE LIQUIDITY (supporting fee-on-transfer tokens) **** // **** ...Arabic... ****
-```
-
-**Root cause:** `restoreComments()` in `code-block-extractor.ts` appends the translated comment to the existing line content instead of replacing the original comment text. `translateCodeComments()` should use `strippedCode` (comments removed) as the base for restoration, not the original `block.content`.
-
-**Complexity:** Low. ~5 line change in `translateCodeComments()`.
-
-### 2. Stronger Glossary Enforcement
-
-**Problem:** High-frequency glossary terms like "mint" are translated inconsistently (~10 occurrences in a single file using different forms). The glossary is sent in the prompt but Gemini doesn't always adhere strictly.
-
-**Proposed solution:**
-- Post-translation pass that scans output for known English glossary terms that should have been translated, and flags or auto-corrects them
-- Consider a validation step that compares glossary term frequency in source vs translation
-- May overlap with existing sanitizer `fixKnownBrandGarbles` pattern -- extend to glossary terms
-
-### 3. Transliteration During Translation (not just post-processing)
-
-**Problem:** Gemini regresses on transliterations (author names, brand names like "Proto-danksharding") that the sanitizer then has to catch. The review stage currently finds many transliteration issues that should have been handled during translation.
-
-**Proposed solution:**
-- Include transliteration banks (from `.claude/translation-review/transliterations/`) directly in the translation prompt for non-Latin locales
-- Add language-group-specific transliteration rules to `prompt-builder.ts` (currently only general rules are sent)
-- The sanitizer already has `fixKnownBrandGarbles` with transliteration bank support -- ensure the translation prompt and sanitizer are aligned on the same bank
-
-**Goal:** Review stage produces scores, not a thousand critical issues to patch.
-
-### 4. Deep JSON Validation
-
-**Problem:** Current validation only checks top-level JSON keys. Nested namespaces (common in `src/intl/` files) can have dropped or renamed keys at depth > 1 without detection.
-
-**Proposed solution:** Recursive key comparison that walks the full object tree, reporting missing/added/renamed keys at any depth. Should handle arrays (compare by index) and nested objects (compare by key).
-
----
-
-## Pipeline Features
-
-### 5. Incremental Translation (update_only mode)
-
-**Problem:** Full-file translation rewrites content that's already correctly translated, introducing unnecessary churn and non-deterministic discrepancies in approved translations.
-
-**Proposed solution:** A workflow input `update_only` (boolean, default false) that:
-- Fetches the existing translated file from the target branch
-- Diffs against English to identify: untranslated blocks (still in English), blocks where English source changed, blocks already correctly translated
-- Sends Gemini both files with instructions to only translate/update marked sections, preserving everything else verbatim
-
-**Complexity by file type:**
-- **JSON:** Low-medium. Key-by-key comparison, send only keys needing work, merge back.
-- **Markdown:** Medium-high. Needs paragraph-level diffing. Either store a manifest of the English version last translated against, or use heuristics (is this paragraph still in English?).
-
-**Prerequisite:** Ship and validate the full-translate mode first. This is a fast-follow.
-
-### 6. Split PRs (one PR per language)
-
-**Problem:** Large multi-language runs produce a single massive PR that's hard to review.
-
-**Proposed solution:** A workflow input `split_prs` (boolean, default false) that:
-- Creates a separate branch per language
-- Runs translate -> sanitize -> JSX attrs per language
-- Opens one PR per language against the base branch
-
-**Implementation:** Loop the language iteration in `main-gemini.ts`, create branch per language via `postCreateBranchFrom`, call `createTranslationPR` per language. The per-language orchestration in `gemini-translate-files.ts` already processes one language at a time -- the change is in branching and PR creation, not translation.
-
-**Prerequisite:** Validate the single-PR flow works reliably first.
-
-### 7. Cost Tracking and Reporting
-
-**Problem:** No visibility into per-run, per-language, or per-file translation costs.
-
-**Proposed solution:** The pipeline already tracks `tokensUsed` per file. Aggregate and report in the PR body: total tokens, estimated cost, per-language breakdown. Could also write to a manifest for historical tracking.
-
----
-
-## Automation
-
-### 8. Auto-trigger Translations on Content Merge
-
-**Problem:** Content changes merged to dev currently require manual triggering of the translation pipeline. PRs tagged "needs translations" accumulate.
-
-**Proposed solution:**
-- GitHub Action that watches for merges to dev affecting `public/content/` or `src/intl/en/`
-- Automatically triggers the Gemini translation workflow for changed files
-- Could be scoped to specific languages or all languages depending on the change
-- Should respect a cooldown/batch window to avoid triggering on every small merge
-
-**Prerequisite:** Incremental translation mode (#5) should be working first so auto-triggered runs only translate what changed, not entire files.
-
-### 9. Full-language Retroactive Cleanup
-
-**Problem:** Many languages were translated before the current pipeline improvements (transliteration banks, language-group rules, sanitizer fixes). Those translations have the same class of issues found in Arabic (state polysemy, brand garbles, wrong compounds, etc.).
-
-**Proposed solution:** After all pending language reviews are complete:
-- Run the full sanitizer (with all current fixes) against every translated language
-- Apply transliteration banks where they exist
-- Apply language-group-specific rules
-- Re-translate files flagged by the sanitizer as having too many issues
-- Catch up on any content changes merged to dev since the original full-repo translation round (PRs tagged "needs translations")
-
-**Scope:** This is a large batch operation. Should be planned per-language with the split PR feature (#6) to keep reviews manageable.
diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts
deleted file mode 100644
index cb78d2559e8..00000000000
--- a/src/scripts/i18n/config.ts
+++ /dev/null
@@ -1,203 +0,0 @@
-import * as dotenv from "dotenv"
-
-import i18nConfig from "../../../i18n.config.json"
-
-import { mapInternalCodeToCrowdin } from "./lib/utils/mapping"
-
-dotenv.config({ path: ".env.local" })
-
-// Language code mapping
-export const crowdinToInternalCodeMapping: Record =
- i18nConfig.reduce(
- (acc, { crowdinCode, code }) => {
- acc[crowdinCode] = code
- return acc
- },
- {} as Record
- )
-
-// GitHub API configuration
-const gitHubApiKey = process.env.I18N_GITHUB_API_KEY || ""
-if (!gitHubApiKey) {
- console.error("[ERROR] Missing I18N_GITHUB_API_KEY environment variable")
- console.error(
- "[ERROR] Please set I18N_GITHUB_API_KEY in your .env.local file"
- )
- throw new Error("No GitHub API Key found (I18N_GITHUB_API_KEY)")
-}
-
-export const gitHubBearerHeaders = {
- Authorization: `Bearer ${gitHubApiKey}`,
- Accept: "application/vnd.github.v3+json",
-}
-
-// Crowdin API configuration (optional -- not needed for Gemini pipeline)
-const crowdinApiKey = process.env.I18N_CROWDIN_API_KEY || ""
-if (!crowdinApiKey && !process.env.GEMINI_API_KEY) {
- console.error(
- "[ERROR] Missing API key. Set I18N_CROWDIN_API_KEY (Crowdin pipeline) or GEMINI_API_KEY (Gemini pipeline)"
- )
- throw new Error("No API key found (I18N_CROWDIN_API_KEY or GEMINI_API_KEY)")
-}
-
-export const crowdinBearerHeaders = crowdinApiKey
- ? { Authorization: `Bearer ${crowdinApiKey}` }
- : { Authorization: "" }
-
-// Parse environment variables with defaults
-// Accept internal codes (e.g., "es") and convert to Crowdin codes (e.g., "es-EM")
-const targetLanguagesInput = process.env.TARGET_LANGUAGES
- ? process.env.TARGET_LANGUAGES.split(",")
- .map((lang) => lang.trim())
- .filter(Boolean)
- : []
-
-// If no target languages specified, use all languages from i18n.config.json, excluding 'en'
-const targetLanguages: string[] =
- targetLanguagesInput.length === 0
- ? i18nConfig
- .map(({ code }) => code)
- .filter((code) => code !== "en")
- .map((code) => mapInternalCodeToCrowdin(code))
- : targetLanguagesInput.map((code) => mapInternalCodeToCrowdin(code))
-
-const baseBranch = process.env.BASE_BRANCH || "dev"
-
-const targetPathRaw = process.env.TARGET_PATH || ""
-// Support comma-separated list of files/directories
-const targetPath = targetPathRaw
-const targetPaths = targetPathRaw
- ? targetPathRaw
- .split(",")
- .map((p) => p.trim())
- .filter(Boolean)
- : []
-const excludePath = process.env.EXCLUDE_PATH?.trim() || ""
-
-// Skip awaiting pre-translation completion (exit early with ID for manual resume)
-const skipAwait = ["1", "true", "yes", "on"].includes(
- (process.env.SKIP_AWAIT || "").toLowerCase()
-)
-
-// Adaptive polling / timeout configuration (milliseconds)
-const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS
- ? parseInt(process.env.PRETRANSLATE_TIMEOUT_MS, 10)
- : 6 * 60 * 60 * 1000 // default 6h
-
-const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS
- ? Math.max(5000, parseInt(process.env.PRETRANSLATE_POLL_BASE_MS, 10))
- : 30_000 // default 30s base (min clamped to 5s)
-
-// Parse comma-separated pre-translation IDs (for resuming multiple per-language jobs)
-const existingPreTranslationIds = (process.env.PRETRANSLATION_ID || "")
- .split(",")
- .map((id) => id.trim())
- .filter(Boolean)
-
-const verbose = process.env.VERBOSE === "true"
-const splitPrs = process.env.SPLIT_PRS === "true"
-
-// Parse GitHub repository from env (format: "owner/repo")
-const githubRepo =
- process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website"
-const [ghOrganization, ghRepo] = githubRepo.split("/")
-
-if (verbose) {
- console.log("[DEBUG] Configuration:")
- console.log(
- `[DEBUG] - Target languages (internal): ${targetLanguagesInput.length ? targetLanguagesInput.join(", ") : "ALL"}`
- )
- console.log(
- `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}`
- )
- console.log(`[DEBUG] - Base branch: ${baseBranch}`)
- console.log(
- `[DEBUG] - Target path: ${targetPath || "none (full translation)"}`
- )
- console.log(`[DEBUG] - Exclude path: ${excludePath || "none"}`)
- console.log(`[DEBUG] - Skip await: ${skipAwait}`)
- console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`)
- if (existingPreTranslationIds.length > 0) {
- console.log(
- `[DEBUG] - Resuming from pre-translation IDs: ${existingPreTranslationIds.join(", ")}`
- )
- }
-}
-
-// Main configuration object
-export const config = {
- projectId: 834930,
- ghOrganization,
- ghRepo,
- jsonRoot: "src/intl/en",
- mdRoot: "public/content",
- preTranslatePromptId: Number.parseInt(
- process.env.PRE_TRANSLATE_PROMPT_ID || "326942"
- ),
- allCrowdinCodes: targetLanguages,
- allInternalCodes: targetLanguagesInput.length
- ? targetLanguagesInput
- : i18nConfig.map(({ code }) => code).filter((code) => code !== "en"),
- baseBranch,
- targetPath,
- targetPaths,
- excludePath,
- skipAwait,
- pretranslateTimeoutMs,
- pretranslatePollBaseMs,
- existingPreTranslationIds,
- verbose,
- splitPrs,
-}
-
-// Do not translate list - Declare paths that should never be translated
-export const doNotTranslatePaths = [
- "/cookie-policy/",
- "/privacy-policy/",
- "/terms-of-use/",
- "/terms-and-conditions/",
- "/style-guide/",
-]
-
-// Validation for target path
-export function validateTargetPath(targetPath: string): void {
- if (!targetPath) {
- // Full translation mode is allowed
- return
- }
-
- // Disallowed: paths under public/content/translations (translated content)
- if (targetPath.includes("public/content/translations")) {
- throw new Error(
- `[ERROR] Invalid target path: "${targetPath}"\n` +
- `Target path cannot be under "public/content/translations" (this is translated content)\n` +
- `Did you mean to target a file under "public/content" instead?`
- )
- }
-
- // Disallowed: paths under src/intl other than src/intl/en
- if (
- targetPath.startsWith("src/intl/") &&
- !targetPath.startsWith("src/intl/en")
- ) {
- throw new Error(
- `[ERROR] Invalid target path: "${targetPath}"\n` +
- `Target path under "src/intl/" can only be "src/intl/en" (English source)\n` +
- `Other src/intl directories contain translated content`
- )
- }
-
- // Disallowed: explicitly excluded paths from config file
- for (const excluded of doNotTranslatePaths) {
- if (targetPath.includes(excluded)) {
- throw new Error(
- `[ERROR] Invalid target path: "${targetPath}"\n` +
- `This path is in the excluded paths list (${excluded})`
- )
- }
- }
-}
-
-// Constants
-export const CROWDIN_API_BASE_URL = "https://api.crowdin.com/api/v2"
-export const MAX_STRINGS_PER_REQUEST = 500
diff --git a/src/scripts/i18n/docs/v0.2.0-roadmap.md b/src/scripts/i18n/docs/v0.2.0-roadmap.md
deleted file mode 100644
index c86e4f57447..00000000000
--- a/src/scripts/i18n/docs/v0.2.0-roadmap.md
+++ /dev/null
@@ -1,146 +0,0 @@
-# v0.2.0 Roadmap: Glossary & Consistency Validation
-
-This document outlines planned features for the next major iteration of the i18n automation system.
-
-## Overview
-
-v0.1.0 focused on:
-- JSX attribute translation via Gemini API (fallback for Crowdin)
-- Build-breaking syntax validation
-- Modular architecture for standalone workflow execution
-
-v0.2.0 will focus on **translation quality and consistency** through glossary enforcement and term validation.
-
----
-
-## Planned Features
-
-### 1. Glossary Supabase Sync (Separate Cron)
-
-**Goal:** Keep Crowdin glossary synchronized with community-curated terms in Supabase.
-
-**Implementation:**
-- Dedicated GitHub Action running on cron schedule (e.g., daily at midnight UTC)
-- Fetches glossary terms from Supabase `glossary` table
-- Uploads/updates terms in Crowdin project glossary via API
-- Logs sync status and any conflicts
-
-**Files to create:**
-- `src/scripts/i18n/sync-glossary.ts` - Main sync orchestrator
-- `src/scripts/i18n/lib/supabase/glossary.ts` - Supabase client for glossary queries
-- `.github/workflows/sync-glossary.yml` - Cron workflow
-
-**Environment variables needed:**
-- `SUPABASE_URL` - Supabase project URL
-- `SUPABASE_KEY` - Supabase anon/service key
-- `CROWDIN_PROJECT_ID`, `CROWDIN_API_KEY` (existing)
-
----
-
-### 2. Term/Phrase Consistency Validation
-
-**Goal:** Validate that translated files use glossary terms consistently.
-
-**Implementation:**
-- Post-translation validation step in main workflow
-- Extract glossary terms from Crowdin (or local cache from sync)
-- Scan translated files for source terms that should have been translated
-- Flag inconsistencies in PR validation comment
-
-**Validation rules:**
-- Source term appears in translation → likely missed (should be target term)
-- Target term varies within same file → inconsistent usage
-- Protected terms (ethereum.org, Ethereum, etc.) → should remain unchanged
-
-**Files to create:**
-- `src/scripts/i18n/lib/validation/glossary.ts` - Glossary term validation
-- Updates to `lib/workflows/validation.ts` - Integrate glossary checks
-
----
-
-### 3. Confidence Scoring
-
-**Goal:** Provide per-file and per-language confidence scores based on validation results.
-
-**Scoring factors:**
-- JSX attribute untranslated percentage (from v0.1.0)
-- Glossary term consistency rate
-- Syntax validation pass/fail
-- Source file complexity (length, technical density)
-
-**Output:**
-- Confidence score (0-100) per file in PR comment
-- Aggregate confidence per language
-- Suggested review priority based on low-confidence files
-
-**Files to create:**
-- `src/scripts/i18n/lib/validation/confidence.ts` - Scoring algorithm
-- Updates to PR comment formatting
-
----
-
-## Architecture Considerations
-
-### Supabase Schema (Proposed)
-
-```sql
--- Glossary terms table
-CREATE TABLE glossary (
- id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
- source_term TEXT NOT NULL,
- language_code TEXT NOT NULL,
- target_term TEXT NOT NULL,
- context TEXT, -- e.g., "technical", "UI", "marketing"
- notes TEXT,
- created_at TIMESTAMPTZ DEFAULT now(),
- updated_at TIMESTAMPTZ DEFAULT now(),
- UNIQUE(source_term, language_code)
-);
-
--- Translation memory (future)
-CREATE TABLE translation_memory (
- id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
- source_text TEXT NOT NULL,
- language_code TEXT NOT NULL,
- target_text TEXT NOT NULL,
- source_file TEXT,
- created_at TIMESTAMPTZ DEFAULT now()
-);
-```
-
-### Crowdin API Endpoints
-
-- `POST /projects/{projectId}/glossaries/{glossaryId}/terms` - Add/update terms
-- `GET /projects/{projectId}/glossaries/{glossaryId}/terms` - List terms for validation
-
----
-
-## Timeline (Tentative)
-
-| Feature | Estimated Effort | Priority |
-|---------|------------------|----------|
-| Glossary Supabase sync | 2-3 days | High |
-| Term consistency validation | 2-3 days | High |
-| Confidence scoring | 1-2 days | Medium |
-| Documentation & testing | 1-2 days | High |
-
----
-
-## Dependencies
-
-- Supabase project setup with glossary table
-- Crowdin glossary ID configuration
-- Community glossary data migration (if existing)
-
----
-
-## Open Questions
-
-1. Should glossary sync be bidirectional (Supabase ↔ Crowdin)?
-2. What threshold for glossary inconsistency should trigger a warning vs error?
-3. Should confidence scores block PR merge below a certain threshold?
-4. How to handle language-specific glossary exceptions?
-
----
-
-*This roadmap was created as part of the v0.1.0 development cycle. Updates will be made as requirements evolve.*
diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts
deleted file mode 100644
index 988ebd7d654..00000000000
--- a/src/scripts/i18n/lib/ai/gemini-translate.ts
+++ /dev/null
@@ -1,404 +0,0 @@
-/**
- * Core file translation via Gemini (direct, no Crowdin).
- *
- * Sends whole files (no segmentation) with site-specific context.
- * Gemini handles the linguistics; we handle the guardrails.
- */
-
-import { GoogleGenAI } from "@google/genai"
-
-import i18nConfig from "../../../../../i18n.config.json"
-import { delay } from "../workflows/utils"
-
-import {
- chunkProse,
- type CodeBlock,
- type CodeComment,
- extractCodeBlocks,
- extractComments,
- getCommentSyntax,
- PROSE_SIZE_THRESHOLD,
- restoreCodeBlocks,
- restoreComments,
-} from "./code-block-extractor"
-import {
- validateTranslatedJson,
- validateTranslatedMarkdown,
- type ValidationResult,
-} from "./gemini-output-validation"
-import { buildTranslationPrompt } from "./prompt-builder"
-
-const GEMINI_MODELS = ["gemini-3.1-pro-preview", "gemini-3.1-pro"]
-const MAX_RETRIES = 3
-const RETRY_DELAY_MS = 5000
-
-const LANGUAGE_NAMES: Record = Object.fromEntries(
- i18nConfig.map(({ code, name }: { code: string; name: string }) => [
- code,
- name,
- ])
-)
-
-function getGeminiClient(): GoogleGenAI {
- const apiKey = process.env.GEMINI_API_KEY
- if (!apiKey) {
- throw new Error("GEMINI_API_KEY environment variable is not set")
- }
- return new GoogleGenAI({ apiKey })
-}
-
-export interface TranslateFileOptions {
- filePath: string
- fileContent: string
- fileType: "markdown" | "json"
- targetLanguage: string
- glossaryTerms: Map
-}
-
-export interface TranslateFileResult {
- translatedContent: string
- tokensUsed: { input: number; output: number }
-}
-
-/**
- * Translate a single file via Gemini.
- *
- * For markdown files:
- * 1. Extract fenced code blocks -> placeholders (reduces payload)
- * 2. If prose still too large, chunk by headings recursively
- * 3. Translate prose (single call or per-chunk)
- * 4. Restore code blocks
- * 5. Extract and translate code comments separately
- * 6. Restore translated comments into code blocks
- *
- * For JSON files: translate directly (no code blocks).
- */
-export async function translateFile(
- options: TranslateFileOptions
-): Promise {
- const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } =
- options
-
- // JSON files: translate directly, no extraction needed
- if (fileType === "json") {
- return callGemini({ ...options, fileContent })
- }
-
- // Markdown: extract code blocks first
- const { prose, blocks } = extractCodeBlocks(fileContent)
-
- if (blocks.length > 0) {
- console.log(
- ` [extract] ${filePath}: ${blocks.length} code blocks removed (${fileContent.length} -> ${prose.length} chars)`
- )
- }
-
- // Check if prose needs chunking
- const chunks = chunkProse(prose, PROSE_SIZE_THRESHOLD)
- let translatedProse: string
- let totalTokens = { input: 0, output: 0 }
-
- if (chunks.length === 1) {
- // Single chunk: translate normally
- const result = await callGemini({
- ...options,
- fileContent: prose,
- })
- translatedProse = result.translatedContent
- totalTokens = result.tokensUsed
- } else {
- // Multiple chunks: translate each, reassemble
- console.log(` [chunk] ${filePath}: split into ${chunks.length} chunks`)
- const translatedChunks: string[] = []
- for (let i = 0; i < chunks.length; i++) {
- const result = await callGemini({
- ...options,
- fileContent: chunks[i],
- })
- translatedChunks.push(result.translatedContent)
- totalTokens.input += result.tokensUsed.input
- totalTokens.output += result.tokensUsed.output
- }
- translatedProse = translatedChunks.join("\n\n")
- }
-
- // Restore code blocks
- let finalContent = restoreCodeBlocks(translatedProse, blocks)
-
- // Translate code comments (best-effort, non-fatal)
- if (blocks.length > 0) {
- try {
- finalContent = await translateCodeComments(
- finalContent,
- blocks,
- targetLanguage,
- glossaryTerms
- )
- } catch (error) {
- console.warn(
- ` [comments] ${filePath}: comment translation failed (non-fatal): ${error instanceof Error ? error.message : String(error)}`
- )
- }
- }
-
- return {
- translatedContent: finalContent,
- tokensUsed: totalTokens,
- }
-}
-
-/**
- * Extract comments from all code blocks, translate them in a single
- * Gemini call, and restore them into the final content.
- */
-async function translateCodeComments(
- content: string,
- blocks: CodeBlock[],
- targetLanguage: string,
- glossaryTerms: Map
-): Promise {
- // Extract comments from all blocks
- const allComments: CodeComment[] = []
- const blockData: Array<{
- block: CodeBlock
- strippedCode: string
- comments: CodeComment[]
- }> = []
-
- for (const block of blocks) {
- if (!block.language || !block.content.trim()) continue
- const { strippedCode, comments } = extractComments(
- block.content,
- block.language
- )
- // Tag comments with their block index
- const tagged = comments.map((c) => ({ ...c, blockIndex: block.index }))
- allComments.push(...tagged)
- blockData.push({ block, strippedCode, comments: tagged })
- }
-
- if (allComments.length === 0) return content
-
- // Build a compact payload for comment translation
- const commentPayload: Record = {}
- for (let i = 0; i < allComments.length; i++) {
- commentPayload[`c${i}`] = allComments[i].text
- }
-
- const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage
- const glossaryHint =
- glossaryTerms.size > 0
- ? `\nUse these exact translations for glossary terms:\n${[
- ...glossaryTerms.entries(),
- ]
- .slice(0, 30)
- .map(([en, loc]) => ` ${en} = ${loc}`)
- .join("\n")}`
- : ""
-
- const commentPrompt = `Translate these code comments to ${languageName}. Return ONLY a JSON object with the same keys and translated values. Do not add explanations.${glossaryHint}
-
-${JSON.stringify(commentPayload, null, 2)}`
-
- const result = await callGeminiRaw(commentPrompt)
- let translatedMap: Record
-
- try {
- const cleaned = stripCodeBlockWrapping(result.text, "json")
- translatedMap = JSON.parse(cleaned)
- } catch {
- console.warn(" [comments] Could not parse comment translation response")
- return content
- }
-
- // Restore translated comments into the code blocks within content
- for (const { block, comments } of blockData) {
- if (comments.length === 0) continue
-
- const syntax = getCommentSyntax(block.language)
-
- // Map translated text back onto comment objects
- const translatedComments = comments.map((c) => {
- const key = `c${allComments.indexOf(c)}`
- return { ...c, text: translatedMap[key] || c.text }
- })
-
- // Find and replace the code block in content
- const fence = "```"
- const originalBlock = `${fence}${block.language}\n${block.content}\n${fence}`
- const restoredCode = restoreComments(
- block.content,
- translatedComments,
- syntax
- )
- const newBlock = `${fence}${block.language}\n${restoredCode}\n${fence}`
- content = content.replace(originalBlock, newBlock)
- }
-
- return content
-}
-
-/**
- * Core Gemini API call with retries and model fallback.
- * Used by both prose translation and comment translation.
- */
-async function callGemini(
- options: TranslateFileOptions
-): Promise {
- const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } =
- options
-
- const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage
- const prompt = buildTranslationPrompt({
- filePath,
- fileContent,
- fileType,
- targetLanguage,
- languageName,
- glossaryTerms,
- })
-
- // Retry loop for validation failures (API call retries are in callGeminiRaw)
- for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
- const result = await callGeminiRaw(prompt)
-
- let text = result.text
- text = stripCodeBlockWrapping(text, fileType)
-
- const validation: ValidationResult =
- fileType === "json"
- ? validateTranslatedJson(text, fileContent)
- : validateTranslatedMarkdown(text, fileContent)
-
- if (validation.valid) {
- return {
- translatedContent: text,
- tokensUsed: result.tokensUsed,
- }
- }
-
- if (attempt < MAX_RETRIES) {
- console.warn(
- `[WARN] ${filePath} validation attempt ${attempt}: ${validation.error}. Retrying...`
- )
- await delay(RETRY_DELAY_MS * attempt)
- continue
- }
-
- throw new Error(
- `Output validation failed after ${MAX_RETRIES} attempts: ${validation.error}`
- )
- }
-
- throw new Error(`Translation failed for ${filePath}`)
-}
-
-/**
- * Raw Gemini API call with retries and model fallback.
- * Returns the raw text response and token usage.
- */
-async function callGeminiRaw(
- prompt: string
-): Promise<{ text: string; tokensUsed: { input: number; output: number } }> {
- const client = getGeminiClient()
-
- const modelsToTry = process.env.GEMINI_MODEL
- ? [process.env.GEMINI_MODEL]
- : GEMINI_MODELS
-
- let lastError: Error | null = null
- const modelNotFound = new Set()
-
- for (const modelId of modelsToTry) {
- let modelFailed = false
-
- for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
- try {
- const response = await client.models.generateContent({
- model: modelId,
- contents: prompt,
- config: { temperature: 0 },
- })
- const usage = response.usageMetadata
-
- return {
- text: response.text ?? "",
- tokensUsed: {
- input: usage?.promptTokenCount || 0,
- output: usage?.candidatesTokenCount || 0,
- },
- }
- } catch (error) {
- lastError = error instanceof Error ? error : new Error(String(error))
-
- if (
- lastError.message.includes("404") ||
- lastError.message.includes("not found") ||
- lastError.message.includes("deprecated")
- ) {
- console.warn(
- `[WARN] Model ${modelId} unavailable: ${lastError.message}. Trying next model...`
- )
- modelNotFound.add(modelId)
- modelFailed = true
- break
- }
-
- if (
- lastError.message.includes("429") ||
- lastError.message.includes("RESOURCE_EXHAUSTED")
- ) {
- const backoff = RETRY_DELAY_MS * Math.pow(2, attempt)
- console.warn(
- `[WARN] Rate limited (${modelId}). Waiting ${backoff / 1000}s...`
- )
- await delay(backoff)
- continue
- }
-
- if (attempt < MAX_RETRIES) {
- console.warn(
- `[WARN] Attempt ${attempt} (${modelId}) failed: ${lastError.message}. Retrying...`
- )
- await delay(RETRY_DELAY_MS * attempt)
- continue
- }
- }
- }
-
- if (!modelFailed) break
- }
-
- if (modelNotFound.size === modelsToTry.length) {
- throw new Error(
- `All Gemini models unavailable (${[...modelNotFound].join(", ")}). ` +
- `Update GEMINI_MODELS in gemini-translate.ts or set GEMINI_MODEL env var.`
- )
- }
-
- throw lastError || new Error("Translation failed")
-}
-
-/**
- * Gemini sometimes wraps output in ```markdown or ```json blocks.
- * Strip that wrapping to get raw content.
- */
-function stripCodeBlockWrapping(
- text: string,
- fileType: "markdown" | "json"
-): string {
- // Match ```markdown\n...\n``` or ```json\n...\n``` or just ```\n...\n```
- const patterns = [
- new RegExp(
- `^\`\`\`(?:${fileType}|md|mdx)?\\s*\\n([\\s\\S]*?)\\n\`\`\`\\s*$`
- ),
- /^```\s*\n([\s\S]*?)\n```\s*$/,
- ]
-
- for (const re of patterns) {
- const match = text.match(re)
- if (match) return match[1]
- }
-
- return text
-}
diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts
deleted file mode 100644
index 29fc3022b83..00000000000
--- a/src/scripts/i18n/lib/ai/gemini.ts
+++ /dev/null
@@ -1,243 +0,0 @@
-/**
- * Gemini AI translation wrapper for JSX attribute translation
- */
-
-import { GoogleGenAI } from "@google/genai"
-
-import i18nConfig from "../../../../../i18n.config.json"
-import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes"
-import { delay } from "../workflows/utils"
-
-/** Gemini API configuration */
-const GEMINI_MODEL = "gemini-2.5-pro"
-
-/** Language names parsed from i18n.config.json */
-const LANGUAGE_NAMES: Record = Object.fromEntries(
- i18nConfig.map(({ code, name }) => [code, name])
-)
-
-/**
- * Check if Gemini API is available (API key present)
- */
-export function isGeminiAvailable(): boolean {
- return Boolean(process.env.GEMINI_API_KEY)
-}
-
-/**
- * Get the Gemini API client
- */
-function getGeminiClient(): GoogleGenAI {
- const apiKey = process.env.GEMINI_API_KEY
- if (!apiKey) {
- throw new Error("GEMINI_API_KEY environment variable is not set")
- }
- return new GoogleGenAI({ apiKey })
-}
-
-/**
- * Get human-readable language name from code
- */
-function getLanguageName(code: string): string {
- return LANGUAGE_NAMES[code] || code.toUpperCase()
-}
-
-/**
- * Build translation prompt for a batch of attributes
- */
-function buildTranslationPrompt(
- attributes: ExtractedAttribute[],
- targetLanguage: string,
- glossaryTerms?: Map
-): string {
- const langName = getLanguageName(targetLanguage)
-
- const attributeList = attributes
- .map(
- (attr, i) =>
- `${i + 1}. [${attr.componentName}.${attr.attributeName}] "${attr.originalValue}"
- Context: ${attr.context}`
- )
- .join("\n\n")
-
- // Build glossary section if terms provided
- let glossarySection = ""
- if (glossaryTerms && glossaryTerms.size > 0) {
- const termsList = Array.from(glossaryTerms.entries())
- .map(([term, translation]) => `- "${term}" → "${translation}"`)
- .join("\n")
- glossarySection = `
-
-REQUIRED TERMINOLOGY (use these exact translations):
-${termsList}
-`
- }
-
- return `You are translating UI component attributes for the Ethereum.org website into ${langName}.
-
-These are JSX component attributes that contain human-readable text. Translate each value naturally and accurately while:
-- Preserving technical Ethereum terminology appropriately for ${langName}
-- Keeping the translation concise (similar length to original)
-- Maintaining any placeholders like {variable} or {{variable}} unchanged
-- Using region-neutral ${langName} that most speakers would understand
-- Using informal, friendly register${glossarySection}
-
-Attributes to translate:
-
-${attributeList}
-
-Respond with ONLY a JSON array of translated strings in the same order, like:
-["translated text 1", "translated text 2", ...]
-
-Do not include any explanation, just the JSON array.`
-}
-
-/**
- * Parse Gemini response to extract translated strings
- */
-function parseTranslationResponse(response: string): string[] {
- // Clean up response - remove markdown code blocks if present
- let cleaned = response.trim()
- if (cleaned.startsWith("```json")) {
- cleaned = cleaned.slice(7)
- } else if (cleaned.startsWith("```")) {
- cleaned = cleaned.slice(3)
- }
- if (cleaned.endsWith("```")) {
- cleaned = cleaned.slice(0, -3)
- }
- cleaned = cleaned.trim()
-
- try {
- const parsed = JSON.parse(cleaned)
- if (!Array.isArray(parsed)) {
- throw new Error("Response is not an array")
- }
- return parsed.map((item) => String(item))
- } catch (error) {
- console.error("[GEMINI] Failed to parse response:", cleaned)
- throw new Error(`Failed to parse Gemini response: ${error}`)
- }
-}
-
-/**
- * Translate a batch of attributes for a single language.
- * Returns translated attributes with their values filled in.
- */
-export async function translateAttributes(
- attributes: ExtractedAttribute[],
- targetLanguage: string,
- glossaryTerms?: Map
-): Promise {
- if (attributes.length === 0) {
- return []
- }
-
- if (!isGeminiAvailable()) {
- console.warn(
- "[GEMINI] API key not available, skipping attribute translation"
- )
- return []
- }
-
- const client = getGeminiClient()
-
- const prompt = buildTranslationPrompt(
- attributes,
- targetLanguage,
- glossaryTerms
- )
-
- console.log(
- `[GEMINI] Translating ${attributes.length} attributes to ${getLanguageName(targetLanguage)}`
- )
-
- try {
- const result = await client.models.generateContent({
- model: GEMINI_MODEL,
- contents: prompt,
- })
- const translations = parseTranslationResponse(result.text ?? "")
-
- if (translations.length !== attributes.length) {
- console.warn(
- `[GEMINI] Translation count mismatch: expected ${attributes.length}, got ${translations.length}`
- )
- }
-
- // Map translations back to attributes
- return attributes.map((attr, i) => ({
- ...attr,
- translatedValue: translations[i] || attr.originalValue,
- }))
- } catch (error) {
- console.error("[GEMINI] Translation failed:", error)
- throw error
- }
-}
-
-/**
- * Translate attributes with retry logic
- */
-export async function translateAttributesWithRetry(
- attributes: ExtractedAttribute[],
- targetLanguage: string,
- glossaryTerms?: Map,
- maxRetries = 3
-): Promise {
- let lastError: Error | null = null
-
- for (let attempt = 1; attempt <= maxRetries; attempt++) {
- try {
- return await translateAttributes(
- attributes,
- targetLanguage,
- glossaryTerms
- )
- } catch (error) {
- lastError = error instanceof Error ? error : new Error(String(error))
- console.warn(
- `[GEMINI] Attempt ${attempt}/${maxRetries} failed: ${lastError.message}`
- )
-
- if (attempt < maxRetries) {
- // Exponential backoff
- const backoff = Math.min(1000 * Math.pow(2, attempt - 1), 10000)
- await delay(backoff)
- }
- }
- }
-
- throw lastError || new Error("Translation failed after retries")
-}
-
-/**
- * Translate attributes grouped by file, processing each file's batch sequentially
- * to avoid rate limits while maximizing context per request.
- */
-export async function translateAttributesByFile(
- attributesByFile: Map,
- targetLanguage: string,
- glossaryTerms?: Map
-): Promise