diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml deleted file mode 100644 index f9a28b424f0..00000000000 --- a/.github/workflows/crowdin-ai-import.yml +++ /dev/null @@ -1,90 +0,0 @@ -name: Import Crowdin AI Translations - -on: - workflow_dispatch: - inputs: - target_path: - description: "Path(s) to translate (comma-separated, e.g., public/content/developers/index.md,src/intl/en/page-roadmap.json). Single directory or blank for all files." - required: false - type: string - exclude_path: - description: "Path to exclude from this job (e.g., public/content/developers/tutorials)" - required: false - type: string - target_languages: - description: "Comma-separated internal language codes (blank for all locales)" - required: false - type: string - base_branch: - description: "Base branch to create PR against" - required: false - default: "dev" - type: string - pretranslation_id: - description: "Pre-translation ID(s) to resume from, comma-separated for multiple (leave empty to start new)" - required: false - type: string - split_prs: - description: "Create one PR per language instead of one combined PR?" - required: false - default: false - type: boolean - pre_translate_prompt_id: - description: "AI prompt ID for pre_translate (default: 326942)" - required: false - default: "326942" - type: string - skip_pr: - description: "Skip PR creation?" - required: false - default: false - type: boolean - skip_await: - description: "Exit after dispatching pre-translation (resume later with ID)" - required: false - default: false - type: boolean - verbose: - description: "Enable verbose logging?" 
- required: false - default: "false" - type: boolean - -jobs: - import_translations: - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v6 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - - - name: Set up Node.js - uses: actions/setup-node@v6 - with: - node-version: 20 - cache: "pnpm" - - - name: Install dependencies - run: pnpm install - - - name: Run Crowdin AI translation import - run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main.ts - env: - I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_WORKFLOW_API_KEY }} - I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} - GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} - PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} - TARGET_PATH: ${{ github.event.inputs.target_path }} - EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }} - TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} - BASE_BRANCH: ${{ github.event.inputs.base_branch }} - PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} - VERBOSE: ${{ github.event.inputs.verbose }} - SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }} - SKIP_AWAIT: ${{ github.event.inputs.skip_await }} - SPLIT_PRS: ${{ github.event.inputs.split_prs }} - GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/.github/workflows/gemini-translations.yml b/.github/workflows/intl-pipeline.yml similarity index 58% rename from .github/workflows/gemini-translations.yml rename to .github/workflows/intl-pipeline.yml index c3a6029d50b..e3c020f3700 100644 --- a/.github/workflows/gemini-translations.yml +++ b/.github/workflows/intl-pipeline.yml @@ -1,4 +1,4 @@ -name: Gemini Translations +name: Intl Pipeline on: workflow_dispatch: @@ -7,6 +7,10 @@ on: description: "Path(s) to translate (comma-separated files, single directory, or blank for all)" required: false type: string + exclude_path: + 
description: "Path(s) to exclude (comma-separated files or directories)" + required: false + type: string target_languages: description: "Comma-separated language codes (blank for all locales)" required: false @@ -16,15 +20,20 @@ on: required: false default: "dev" type: string + target_branch: + description: "Override target branch (default: intl/pending)" + required: false + type: string concurrency: description: "Max parallel Gemini requests per language" required: false - default: "3" + default: "16" type: string - resume_run_id: - description: "Resume an interrupted run by its ID" + stamp_only: + description: "Update manifests only, no translations" required: false - type: string + default: false + type: boolean skip_pr: description: "Skip PR creation?" required: false @@ -35,6 +44,18 @@ on: required: false default: "false" type: boolean + mode: + description: "Translation mode: 'auto' (full for new files, incremental for existing) or 'full' (retranslate everything)" + required: false + default: "auto" + type: choice + options: + - auto + - full + +concurrency: + group: i18n-translation + cancel-in-progress: false jobs: translate: @@ -43,30 +64,36 @@ jobs: steps: - name: Check out code uses: actions/checkout@v6 + with: + ref: ${{ github.event.inputs.base_branch || 'dev' }} + fetch-depth: 0 - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@v5 - name: Set up Node.js uses: actions/setup-node@v6 with: - node-version: 20 + node-version-file: ".nvmrc" cache: "pnpm" - name: Install dependencies run: pnpm install - - name: Run Gemini translation - run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main-gemini.ts + - name: Run translation pipeline + run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/intl-pipeline/main.ts env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} - I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} + GITHUB_API_TOKEN: ${{ secrets.I18N_GITHUB_TOKEN }} TARGET_PATH: ${{ github.event.inputs.target_path 
}} + EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} GEMINI_CONCURRENCY: ${{ github.event.inputs.concurrency }} - RESUME_RUN_ID: ${{ github.event.inputs.resume_run_id }} SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }} VERBOSE: ${{ github.event.inputs.verbose }} + MODE: ${{ github.event.inputs.mode }} GITHUB_REPOSITORY: ${{ github.repository }} - TRANSLATION_PIPELINE: Gemini + TARGET_BRANCH: ${{ github.event.inputs.target_branch }} + STAMP_ONLY: ${{ github.event.inputs.stamp_only }} + DRY_RUN: "false" diff --git a/docs/gemini-translation-roadmap.md b/docs/gemini-translation-roadmap.md new file mode 100644 index 00000000000..a6f2ea17116 --- /dev/null +++ b/docs/gemini-translation-roadmap.md @@ -0,0 +1,365 @@ +# Gemini Translation Pipeline -- Roadmap + +Status: Active plan +Last updated: 2026-03-27 + +--- + +## Current state + +The initial full-repo translation pass is ~97-99% complete across 24 non-English +languages. The Gemini translation pipeline (`gemini-translations.yml`) works well +for full-file translation but has limitations as we shift to ongoing maintenance. + +### What works today + +- Full-file translation with glossary enforcement +- Code block extraction/restoration (`` placeholders) +- Comment translation within code blocks +- Incremental commit per language (no work lost on partial failure) +- Progress tracking and run resumption +- Post-import sanitization and transliteration +- Configurable concurrency, include/exclude paths, per-language targeting +- 100% custom header ID coverage (`{#custom-id}`) across all markdown files, + preserved identically in translations (verified 2026-03-27) + +### Gaps (being addressed) + +1. ~42 file/language pairs failed during the initial pass (see "Failed files") +2. No drift detection (no way to know which translations are stale) +3. 
No incremental translation (every run retranslates from scratch) +4. Manual triggering (no automation for ongoing maintenance) +5. Limited error logging from the `@google/gen-ai` SDK +6. Some existing translations done with Gemini 2.5 Pro before current sanitizer + improvements, transliteration banks, and glossary enhancements + +### Cost context + +- Initial full-repo pass: ~$1,500 (via Crowdin + Gemini 2.5 Pro) +- Current pipeline (direct Gemini, bypassing Crowdin): ~80% cheaper +- Estimated full sweep with current pipeline: ~$300-500 +- Gemini Pro pricing (approximate): + - Input: ~$1.25 / 1M tokens + - Output: ~$10.00 / 1M tokens (output dominates cost) + +--- + +## Priority 1: Fix failed files (branch: `gemini-v3`) + +Close the initial pass from ~97-99% to ~100%. This is the most urgent work item. + +### Failed file inventory + +42 file/language pairs failed. Full list: + +``` +ar: glossary.json +bn: json-rpc/index.md, ethash/index.md, ethereum-forks/index.md, + fusaka/peerdas/index.md, glossary.json, learn-quizzes.json, + page-resources.json, page-trillion-dollar-security.json +de: ethereum-forks/index.md, whitepaper/index.md, glossary.json, + learn-quizzes.json +id: nodes-and-clients/index.md, glamsterdam/index.md, merge/index.md, + glossary.json, learn-quizzes.json +it: hello-world-smart-contract-fullstack/index.md, glossary.json, + learn-quizzes.json +sw: glossary.json +ta: translatathon/index.md, json-rpc/index.md, poa/index.md, + pos-vs-pow/index.md, ethash/index.md, web2-vs-web3/index.md, + fusaka/peerdas/index.md, glamsterdam/index.md, glossary.json, + learn-quizzes.json +ur: json-rpc/index.md, dagger-hashimoto/index.md, ethash/index.md, + dapps/index.md, secret-state/index.md, ethereum-forks/index.md, + fusaka/peerdas/index.md, pectra/maxeb/index.md, glossary.json, + page-what-is-the-ethereum-network.json +``` + +### Failure pattern analysis + +| Root cause | Files affected | Details | 
+|------------------------|----------------|--------------------------------------------| +| Token overload (>15k) | ~7 | whitepaper (90KB), json-rpc (75KB), etc. | +| Code block density | ~5 | json-rpc (172 blocks), hello-world (128) | +| Table/component density| ~5 | ethereum-forks (60 JSX + 33 tables) | +| JSON with embedded HTML| ~3 | glossary.json (317 anchors, 540 escapes) | + +**Repeat offenders:** +- `glossary.json` -- fails for 8 languages (ar, bn, de, id, it, sw, ta, ur) +- `learn-quizzes.json` -- fails for 5 languages (bn, de, id, it, ta) + +**Languages with most failures:** Tamil (10), Urdu (10), Bengali (8) + +### Fixes to implement on `gemini-v3` + +#### A. Markdown: header ID-based chunking + +Replace token-count-based chunking (which failed) with structure-aware chunking +using the `{#custom-id}` header anchors. + +- Split at heading boundaries, grouping sections up to a token budget per chunk +- Each chunk carries its header IDs for deterministic reassembly +- Header IDs are 100% consistent across the repo and preserved in translations +- Intro content before the first heading gets a synthetic `_intro` key + +#### B. JSON: namespace batching with HTML placeholder pre-parsing + +Two improvements for large/complex JSON files: + +1. **Batch by top-level keys**: Send ~100 key-value pairs per request (with a + ~20 key buffer to avoid wasteful tiny final batches -- e.g., a file with 110 + keys sends one batch of 110, not 100 + 10) + +2. **HTML placeholder pre-parsing**: Before translation, replace embedded HTML + in JSON values with numbered placeholders (similar to Crowdin's `<0>` + pattern but more descriptive). Restore after translation. + + ``` + Before: "A DAO is..." + After: "A DAO is..." + (with restoration map stored separately) + ``` + + Validation: after restoration, verify all placeholders were preserved. Flag + chunks with missing/duplicated placeholders for retry. + +#### C. 
Code fence extraction audit + +The `` extraction works on successful files. Investigate why +it fails on code-dense files: +- Run the extractor in isolation on failing files, inspect output +- Check for edge cases: nested fences, non-standard fence syntax, very high + placeholder counts (>100 per file) +- May be interaction between chunking failure + code blocks (if chunking fails, + the entire code-heavy file hits Gemini as one blob) + +#### D. Error logging improvements + +Add structured error logging from the `@google/gen-ai` SDK: +- Capture failure reason, response status, partial output if available +- Log per-file/per-language so failures can be triaged without re-running +- Distinguish error types: rate limit vs. content filter vs. malformed output + vs. timeout (each needs different retry strategy) + +#### E. Validation + +- Retranslate the ~42 failed file/language pairs as the test case +- Compare output quality against successfully translated files of similar size + +--- + +## Priority 2: Section hash manifest (branch: `gemini-v4`) + +Build per-section content hashing infrastructure. This is the foundation for both +drift detection and incremental translation. + +### Markdown: header ID-keyed section hashes + +Parse each English markdown file into a tree of sections keyed by `{#custom-id}`. +Hash each section's content. Structure: + +```json +{ + "public/content/roadmap/index.md": { + "fileHash": "abc123", + "sections": { + "_intro": "def456", + "what-is-the-roadmap": "ghi789", + "why-does-ethereum-need-a-roadmap": "jkl012", + ... + } + } +} +``` + +**Possible future optimization**: merkle trie structure where leaf hashes bubble +up to parent sections. Allows O(1) "has anything changed?" checks at the file +level, with drill-down to find exactly which sections changed. Worth considering +once the flat hash map is working, if performance demands it. 
+ +### JSON: key-level hashes + +For JSON files, hash individual key-value pairs (or namespace groups for deeply +nested files). Structure: + +```json +{ + "src/intl/en/glossary.json": { + "fileHash": "mno345", + "keys": { + "account": "pqr678", + "address": "stu901", + ... + } + } +} +``` + +### Storage: manifest file + +**Decision**: Use a manifest file (`src/intl/translation-manifest.json`). + +- Single file, easy to query, no content file pollution +- Works for both JSON and markdown +- Can include metadata: timestamp, pipeline version, token cost, Gemini model +- Trade-off: potential merge conflicts if multiple translation PRs run + simultaneously (mitigated by per-language PRs or lock-step merging) + +--- + +## Priority 3: Baseline sweep + quality refresh + +**Decision**: "Stamp now" approach (Option B from brainstorming). + +One combined operation (~$300-500) that accomplishes two goals simultaneously: + +1. **Establish baseline**: Record current English source SHAs in the manifest + for every file/language pair. Going forward, drift is detectable by comparing + recorded SHA against current English SHA. + +2. **Quality refresh**: Retranslate everything using current best pipeline: + - Gemini 3.1 Pro (upgraded from 2.5 Pro used in original pass) + - Current sanitizer with all accumulated fixes + - Transliteration banks for non-Latin script languages + - Improved translation glossary (in development separately) + +After this sweep, every translation in the repo is (a) generated by the best +available pipeline and (b) tracked in the manifest with a known English source +SHA. This is the clean foundation for incremental work going forward. 
+ +### Prerequisite: glossary and transliteration improvements + +The quality refresh is most valuable after: +- Translation glossary expansion is complete (in flight) +- Transliteration bank coverage is solid for non-Latin scripts +- All Priority 1 fixes are deployed (so zero files fail) + +### Approach alternatives considered + +**Option A (rejected): Git history bootstrap** -- Analyze commit messages +(pattern: `i18n(pl): Crowdin translations`) to determine when each file was +last truly translated. Feasible since commits are programmatic, but complicated +by cleanup commits that are more recent than actual translation timestamps. + +**Option B (selected): Stamp now, sweep forward** -- Accept that current +translations have unknown-precision freshness. Do one full sweep with current +pipeline, stamping SHAs as we go. After this, the manifest is authoritative. + +**Option C (rejected): Hybrid git + LLM spot-check** -- Use git where clear, +LLM where ambiguous. More accurate bootstrap but more complexity for marginal +benefit given we want a quality refresh anyway. + +--- + +## Priority 4: Incremental translation (branch: `gemini-v4`) + +Once the manifest exists with per-section hashes, incremental translation +becomes straightforward. + +### JSON: key-level diff and translate + +1. Deep-diff current English JSON against manifest's recorded English version +2. Collect added and changed key paths +3. Send only those key-value pairs to Gemini for translation +4. Deep-merge translated pairs into existing translation JSON +5. Update manifest with new SHAs +6. Run sanitizer on the merged file + +**Complexity**: Low. JSON key merging is deterministic and safe. + +### Markdown: section-level diff and translate + +1. Parse current English file into sections keyed by `{#header-id}` +2. Compare section hashes against manifest +3. For each changed section: + a. Extract corresponding section from existing translation + b. 
Send to Gemini: English section + existing translation + context + c. Receive translated section +4. Reassemble: unchanged sections from existing translation + new translations +5. Update manifest with new SHAs +6. Run sanitizer on reassembled file + +**Complexity**: Medium. The 100% header ID coverage makes this much more +feasible than initially estimated. Splicing by ID is deterministic. Edge case: +intro content before first heading (use synthetic `_intro` key). + +**Fallback**: If >50% of sections changed, fall back to full-file retranslation +(the incremental overhead isn't worth it at that point). + +### "Previous English version" question (resolved) + +The manifest's recorded SHA IS the previous English version. When a translation +is generated, the manifest records the English source SHA. On the next +incremental run, diff current English against that SHA to identify what changed. + +--- + +## Priority 5: Automation (branch: `gemini-v4`) + +### End-state vision + +``` +English content merged to dev + | + v +Drift detection scan (automatic or cron) + | + v +Stale file list (per language) + | + v +Batching logic (group by language, thresholds, cooldown) + | + v +Incremental translation dispatch (Gemini 3.1 Pro) + | + v +Sanitizer + transliteration + review agents + | + v +PR(s) created, ready for human merge +``` + +### Graduation plan + +**Phase 1 (near-term): Manual + tooling** +- Drift scan script runs manually or on cron, outputs report +- Human reviews report and manually dispatches translation +- Existing sanitizer + review pipeline handles quality + +**Phase 2 (mid-term): Semi-automated** +- Nightly/weekly cron runs drift scan +- When stale count exceeds threshold, auto-dispatches translation +- Human merges resulting PRs + +**Phase 3 (long-term): Full automation** +- Push to dev triggers path-filtered action (`public/content/`, `src/intl/en/`) +- Batching logic groups changes (cooldown window during active dev) +- Translation -> sanitizer -> review 
agents -> PR ready for human merge +- Cron job as safety net catches anything the push trigger missed +- Human stays in the loop at the merge step + +### Batching considerations + +- One PR per language per run (clearest for review) +- Skip whitespace-only or comment-only changes +- Cooldown: don't retranslate files translated in the last N hours +- Size cap: if >50 files stale, split into multiple runs or prioritize by traffic + +--- + +## Branch strategy + +- **`gemini-v3`**: Priority 1 (fix failed files). Patches to the existing + pipeline: chunking, batching, HTML placeholders, error logging. +- **`gemini-v4`**: Priorities 2-5 (new infrastructure). Manifest, drift + detection, incremental translation, automation. + +--- + +## Related workstreams (tracked elsewhere) + +- **Translation glossary expansion** -- in flight, separate task +- **Transliteration bank improvements** -- ongoing per non-Latin locale +- **Full-language retroactive cleanup** -- see `src/scripts/i18n/FUTURE.md` #9 +- **Lowercase ethereum initiative** -- content standardization, tracked in + `docs/lowercase-ethereum-plan.md` diff --git a/docs/i18n-incremental-pipeline.md b/docs/i18n-incremental-pipeline.md new file mode 100644 index 00000000000..fc3be340c23 --- /dev/null +++ b/docs/i18n-incremental-pipeline.md @@ -0,0 +1,110 @@ +# Incremental Translation Pipeline + +## Overview + +The i18n pipeline translates ethereum.org content (markdown + JSON) to 24 languages using Gemini. It operates in two modes: + +- **Auto (default):** For each file+locale, auto-detects whether to do a full translation (no manifests exist) or an incremental update (manifests exist, only changed content retranslated). +- **Full:** Force retranslation of all targeted files regardless of manifest state. + +The pipeline classifies English changes into two categories: +- **Inert changes** (URLs, image paths, code, component attributes): propagated deterministically without LLM calls. 
+- **Prose changes** (translatable text): retranslated via Gemini section-by-section, with unchanged sections provided as context for voice/tone consistency. + +## Architecture + +### Translation Branch + +All pipeline runs commit to `intl/pending` by default. This is the single translation branch for the standard `dev`-based workflow. + +- If the branch exists, the pipeline merges the base branch into it first (keeps it current with dev). +- If it doesn't exist, it creates one from the base branch HEAD. +- A GitHub Actions concurrency group ensures only one pipeline run executes at a time (one additional run can wait as pending; a newer queued run replaces any run already pending in the group). +- The branch name can be overridden via the `target_branch` workflow input (useful for testing). + +**Design decision:** The pipeline only targets `dev` in production. Hot fixes to `staging` or `master` are English-only until the next release cycle, when `dev` (with translations) flows to `staging` then `master` via the normal prepare-release process. This is a deliberate simplification -- multi-branch translation adds significant complexity for a rare scenario. + +### Manifests + +Two manifest files track translation state per file+locale: + +**Source manifest (`.manifest-source.json`):** A content tree of the English file at the time of last translation. Stores hashes (not content) for each section, element, and attribute. Used to detect what changed in English since last translation. + +**Translation manifest (`.manifest-translation.json`):** Records the inert values (URLs, paths, attribute values) as they existed at translation time. Used to propagate inert changes deterministically without re-reading old English content. + +### Pipeline Phases + +1. **Initialize:** Ensure the translation branch (`intl/pending` by default) exists and is up-to-date with base. +2. **Drift Detection:** For each file+locale, compare current English against stored manifest. Classify changes as inert, translatable, added, or removed. Files without manifests are queued for full translation. +3. 
**Full Translation:** New files go through `translateFile()` (normalizer + Gemini). Both manifests are generated and committed. +4. **Inert Propagation:** Deterministic replacement of URLs, paths, and attributes in existing translated files. No LLM calls. Handles reordered links (e.g., Japanese SOV word order placing links in different positions than English). +5. **Prose Retranslation:** Changed sections sent to Gemini with unchanged sections as context. Responses are spliced back into the locale file. +6. **Commit:** Updated locale files and refreshed manifests committed to the translation branch. +7. **Sanitize:** Post-import sanitizer runs on all Gemini-produced content (BiDi fixes for RTL languages, code fence alignment, etc.). + +### Removed Content Handling + +When English content is removed (sections deleted, JSON keys removed), the pipeline detects these as `drift.removed` entries and strips the corresponding content from all locale files. This enables safe deprecation of components and content without manual editing of translated files. + +## Workflow + +### GitHub Actions + +```bash +# Default: auto-detect mode, commits to intl/pending +gh workflow run intl-pipeline.yml \ + -f target_path="public/content/some-page/index.md" \ + -f target_languages="es,ja,ur" + +# Force full retranslation +gh workflow run intl-pipeline.yml \ + -f target_path="public/content/some-page/index.md" \ + -f mode="full" + +# Testing: use a feature branch with a custom translation branch +gh workflow run intl-pipeline.yml \ + --ref test-6/gemini-v4 \ + -f base_branch="test-6/gemini-v4" \ + -f target_branch="intl/test-pending" +``` + +### Content Author Workflow + +1. Author writes/edits English content, merges PR to `dev`. +2. Pipeline dispatches (manually or scheduled), detects changes, translates. +3. Translations appear on `intl/pending` as a PR against `dev`. +4. Reviewer checks the translation PR, merges when satisfied. +5. 
For component deprecations: remove from English first, let the pipeline strip it from locales (via removed content handling), then a cleanup job can safely delete the component file. + +### Hot Fixes + +Hot fixes to `staging` or `master` are not automatically translated. They go out English-only. Translations catch up on the next release cycle when `dev` (with translations) merges to `staging` via prepare-release. If a hot fix translation is truly urgent, the pipeline can be manually dispatched with `base_branch=staging` and a custom `target_branch`, but this is not the standard flow. + +### Recovery + +**Bad translation (not yet merged):** Re-run the pipeline targeting the specific file+locale. New commit overwrites the bad translation on the translation branch. + +**Bad translation (already merged to dev):** Re-run with `mode: full` for that file. Fresh translation + manifest stamped. + +**Corrupted manifests:** Delete the manifest files for the affected locale. Pipeline auto-detects "no manifest" and does full translation with fresh manifest generation. + +**Nuclear recovery:** Delete all manifests for a locale and re-run full. Equivalent to a fresh translation sweep. Expensive but always safe. + +## Key Design Decisions + +- **Manifests are cheap, translations are expensive.** The architecture makes it easy to regenerate manifests and hard to lose good translations. +- **English is the source of truth.** Non-English files should never be edited manually. The pipeline is the exclusive manipulator. +- **Inert propagation avoids unnecessary LLM calls.** URL changes, path updates, and attribute changes are handled deterministically -- no Gemini tokens spent. +- **Section-level granularity.** Only changed sections are retranslated, with unchanged sections provided as context. This preserves voice consistency while minimizing cost. 
+- **One translation PR at a time.** The `intl/pending` branch ensures there's never more than one open translation PR, avoiding manifest conflicts. + +## File Locations + +- Pipeline entry: `src/scripts/i18n/main-incremental.ts` +- Full pipeline: `src/scripts/i18n/main-gemini.ts` +- Manifest adapter: `src/scripts/i18n/lib/ai/manifest-adapter.ts` +- Inert propagation: `src/scripts/i18n/lib/ai/propagate-inert.ts` +- Incremental translate: `src/scripts/i18n/lib/ai/incremental-translate.ts` +- Branch utilities: `src/scripts/i18n/lib/github/branches.ts` +- Workflow: `.github/workflows/intl-pipeline.yml` +- Content tree package: `intl-content-tree` (npm, MPL-2.0) diff --git a/docs/solutions/architecture/i18n-pipeline-process-retrospective.md b/docs/solutions/architecture/i18n-pipeline-process-retrospective.md new file mode 100644 index 00000000000..a305596e591 --- /dev/null +++ b/docs/solutions/architecture/i18n-pipeline-process-retrospective.md @@ -0,0 +1,144 @@ +# Building an Incremental Translation Pipeline: Process Retrospective + +A 16-day project to build an automated incremental translation pipeline for ethereum.org's 24 non-English languages. This document captures the process lessons -- what worked, what failed, and why -- for anyone tackling complex multi-agent engineering projects. + +## Context + +ethereum.org supports 25 languages. The site previously used Crowdin (a proprietary SaaS translation management platform) with community volunteers. The goal: replace this with an in-house pipeline that detects what changed in English content, classifies each change, and either propagates it deterministically (for non-prose changes like URLs and attributes) or sends only the changed prose to an LLM for retranslation. + +The work was done by a team of AI agents coordinated through structured async communication, each with a dedicated role and isolated workspace (git worktrees). One human project lead made all architectural decisions and reviewed all output. 
+ +## The approach that failed: heuristic-first development + +The first pipeline agent worked for two weeks (~900,000 tokens of conversation context). The approach: implement a feature, test it against simple fixtures, fix bugs as they appeared, repeat. + +**What happened:** +- 40+ commits of fix-on-fix iteration +- Unit tests passed against synthetic fixtures but failed on real production content +- Each real-world test exposed new edge cases that required rethinking earlier decisions +- Design decisions made early in the conversation were forgotten or contradicted as the context grew +- The agent began going in circles -- blaming external systems (LLM output quality) when the bugs were in its own post-processing code +- Multiple unauthorized git commits and pushes despite explicit rules against them + +**Root causes:** +1. **No spec.** The pipeline was designed incrementally by solving one problem at a time. There was no up-front document saying "here's what correct looks like." +2. **Toy fixtures.** Tests used simple synthetic content that didn't exercise the edge cases present in real content (code fences with hash comments, components with translatable attributes, SOV language reordering). +3. **Context overload.** At 900k tokens, the agent was experiencing "lost in the middle" effects -- early design decisions and rules were buried under layers of debugging and back-and-forth. The conversation itself became a liability. +4. **Testing after implementation.** Each feature was built first, then tested. When tests failed, the fix often broke something else. The feedback loop was: implement -> test -> fail -> patch -> test -> fail differently. + +Subsequent attempts to inherit and patch this codebase hit the same underlying issue: trying to fix a pipeline that had been built without a clear specification of what "correct" means. 
+ +## The approach that worked: spec-first, test-driven, fresh context + +After the heuristic approach stalled, the strategy changed fundamentally: + +### Step 1: Define what "correct" means + +The package author (who had built the content tree parsing/diffing library used by the pipeline) wrote a comprehensive specification: +- 6 pipeline phases with explicit inputs, outputs, and assertions per phase +- A mutation table documenting every test change with its expected classification +- Clear boundaries: what the package does (detection/classification) vs what the pipeline does (action/replacement) + +### Step 2: Write the tests before the code + +28 markdown mutations and 10 JSON mutations were crafted as fixture files. These covered every component type, code fence language, frontmatter pattern, and inline element pattern used in production content. Three locale translations (Spanish, Korean, Urdu) covered Latin, CJK, and RTL script groups. + +The test suite defined the contract: given these inputs, expect these outputs. 131 tests total. + +### Step 3: Hand to a fresh agent with clean context + +A new agent was onboarded with minimal context: +- The spec +- The test fixtures +- The test suite +- A short list of known pitfalls from previous attempts + +The onboarding document was explicit: "The tests already exist. Your job is to make them pass." No inherited code. No 900k-token conversation history. No ambiguity about what "correct" means. + +### Result + +- 125/131 tests passing within the first session +- The 6 failures were fixture quality issues (expected output had been generated by full retranslation, not incremental pipeline output), not pipeline bugs +- After fixture correction: 131/131 +- Four end-to-end GitHub Actions test cycles with real LLM calls, all passing +- Zero-drift confirmation run completed in 64 seconds +- Pipeline deployed and functional + +## Key lessons + +### On context window size + +Larger is not always better. 
The first agent's ~900k token context became a liability. Design decisions, rules, and architectural context from early in the conversation got buried. The agent that succeeded started with near-zero context and a written spec. + +**Recommendation:** For complex multi-phase projects, prefer shorter agent sessions with decisions captured in documents, not conversation history. When an agent reaches the point where it's forgetting earlier decisions, it's time for a fresh agent with a written briefing. + +### On team composition and role separation + +The project used agents in dedicated roles: orchestrator (coordination, no code), package author (detection library), pipeline implementer (integration), glossary specialist, and others. Each had an isolated git worktree. + +**What worked:** +- Clean interfaces between roles. The package author and pipeline implementer could work independently because the API boundary was well-defined. +- Async communication via append-only daily logs. Agents posted updates, questions, and responses in shared files. No real-time coordination needed. +- Time-boxed specialist agents for focused tasks (PR reviews, security audits, one-off research). + +**What didn't work:** +- Too many agents active simultaneously early on created coordination overhead that exceeded the productivity gains. +- Agents defaulting to agreement instead of pushing back on ideas. "You're right" is the most dangerous phrase in agent collaboration -- it short-circuits critical thinking. + +**Recommendation:** Start with 2-3 agents. Add specialists as needed for focused tasks. The orchestrator role is valuable for tracking state and facilitating communication, but the real work happens between the specialist agents. + +### On spec-first vs implementation-first + +This was the decisive factor. Multiple attempts to build the pipeline by implementing features and testing them afterward all failed. 
The attempt that succeeded had a spec and test suite written before a single line of pipeline code was written. + +**Why spec-first works for agent development:** +- Agents are excellent at implementing to a specification. They struggle with open-ended exploration where "correct" is undefined. +- Tests eliminate ambiguity. "Expect 0 Gemini calls for this change" is unambiguous. "Handle inert changes efficiently" is not. +- A spec captures design decisions durably. When an agent's context window grows or a fresh agent takes over, the spec persists. +- Tests provide instant feedback. The agent can run the suite after every change and know exactly what works and what doesn't. + +**Recommendation:** For any complex agent task, invest in the spec and tests before starting implementation. The time spent on specification is repaid many times over in reduced debugging and agent churn. + +### On package separation + +The content tree parsing/diffing logic was extracted into a standalone npm package (`intl-content-tree`) early in the project. This forced: +- Clean API boundaries (the package takes strings, returns data structures) +- Independent versioning and testing (182 tests in the package alone) +- Separation of concerns (detection vs action) +- Reusability (the package is generic, not tied to any specific project) + +When the pipeline implementation was rewritten from scratch, the package didn't change. The new agent consumed the same API. The 182 package tests continued to pass. This stability was possible because the package was an independent artifact, not code embedded in the pipeline. + +**Recommendation:** Extract reusable logic into independently versioned packages early. The overhead of package management is less than the cost of rewriting tightly-coupled code when the surrounding system changes. + +### On what previous work contributes + +Earlier agents' code didn't survive in its original form. 
But the work was not wasted: +- Infrastructure modules (GitHub API integration, branch management, file operations, sanitization) were copied into the new pipeline with minimal changes +- The glossary system built by a dedicated agent was wired in directly +- Architecture decisions documented during earlier iterations (staging branch strategy, manifest design, heading ID enforcement) informed the spec +- The comprehensive "what failed and why" documentation prevented the new agent from repeating the same mistakes + +Code is disposable. Decisions and documentation persist. + +## The pipeline today + +The incremental translation pipeline: +1. Parses English content into hash trees (Merkle tries) +2. Diffs current English against stored manifests to detect changes +3. Classifies each change: inert (URLs, paths, attributes), structural (components added/removed), or translatable (prose text) +4. Propagates inert and structural changes deterministically (no LLM) +5. Sends only changed prose sections to an LLM for retranslation +6. Splices results back, stamps manifests, runs post-processing + +A full-file retranslation costs dollars. An incremental run on a file with 3 changed sections costs cents. A re-run with zero changes costs nothing and completes in seconds. + +## Summary + +| Approach | Duration | Outcome | +|----------|----------|---------| +| Heuristic-first development | ~10 days | Failed. 40+ fix-on-fix commits, tests pass on toys, fail on real content. | +| Inherit and patch | ~2 days | Failed. Same underlying issues. | +| Spec-first, test-driven, fresh context | ~1 day | 131/131 tests. End-to-end verified with real LLM. Zero-drift confirmed. | + +The difference was not the agent's capability. It was the process: define success clearly, write tests for it, then implement to pass them. 
diff --git a/docs/solutions/integration-issues/sanitizer-test-research.md b/docs/solutions/integration-issues/sanitizer-test-research.md index 429b3fc656c..e0cc029ed8d 100644 --- a/docs/solutions/integration-issues/sanitizer-test-research.md +++ b/docs/solutions/integration-issues/sanitizer-test-research.md @@ -121,6 +121,11 @@ These patterns are covered by existing fix functions and should have regression - **Escaped backtick inside inline code** (escaped-backtick fix) — `\`` replacement now skips inline code spans to preserve `\` as legitimate content in `` `\` ``; previously stripped the backslash leaving empty backticks `` `` `` (bn PR #17866, pattern #53) - **Block component regex over-matching** (`fixBlockComponentLineBreaks`) — `Alert` regex no longer matches `AlertTitle`/`AlertEmoji` etc.; added negative lookahead `(?![A-Za-z])` after component name to prevent prefix matching (bn PR #17866, pattern #54) +- **Bare LTR values in RTL** (`fixBareRtlValues`) -- numbers with Latin units (32 ETH, 100 Gwei), percentages (12.5%), currency ($2,500 USD), version/protocol IDs (EIP-1559), large formatted numbers (21,000), multipliers (2x) unwrapped in ar/ur files get `` wrapping. Skips code blocks, inline code, URLs, existing spans, frontmatter. (gemini-v4, pattern #55) +- **Unit outside BiDi span** (`fixUnitOutsideSpan`) -- Gemini sometimes produces `$100,000 USD` with the unit outside; corrected to `$100,000 USD`. Matches known Latin units (ETH, BTC, Gwei, USD, etc.). (gemini-v4, pattern #56) + +- **Misaligned closing code fences** (`fixMisalignedCodeFences`) -- indented opening fences (4 spaces) with unindented closing fences, breaking syntax highlighting and parsers. Systematic across Gemini translations of files with list-item code blocks (e.g., `ethereum-for-web2-auth` in id, it locales). (gemini-v4/Anchor bug report, pattern #57) + ## Recommendations for Future Sanitizer Iteration 1. 
**Full-width parentheses** (#1) — Add regex to normalize full-width `（` → `(` and `）` → `)` inside markdown link syntax
c93d9cfc569..1ab4b024c4f 100644 --- a/src/lib/i18n/loadMessages.ts +++ b/src/lib/i18n/loadMessages.ts @@ -9,7 +9,12 @@ interface IntlMessages { function getNamespaces(localePath: string): string[] { return fs .readdirSync(localePath, { withFileTypes: true }) - .filter((entry) => entry.isFile() && entry.name.endsWith(".json")) + .filter( + (entry) => + entry.isFile() && + entry.name.endsWith(".json") && + !entry.name.startsWith(".") + ) .map((entry) => entry.name.replace(".json", "")) } diff --git a/src/scripts/crowdin/utils.ts b/src/scripts/crowdin/utils.ts deleted file mode 100644 index afb73ed7cf7..00000000000 --- a/src/scripts/crowdin/utils.ts +++ /dev/null @@ -1,22 +0,0 @@ -import fs from "fs" - -import type { I18nLocale } from "../../lib/types" - -export async function getLangCodeFromCrowdinCode( - crowdinCode: string -): Promise { - try { - const data = await fs.promises.readFile("i18n.config.json", "utf-8") - const locales: I18nLocale[] = JSON.parse(data) - const locale = locales.find((item) => item.crowdinCode === crowdinCode) - - if (!locale) { - throw new Error(`CrowdinCode ${crowdinCode} not found`) - } - - return locale.code - } catch (error: unknown) { - if (error instanceof Error) throw new Error(`Error: ${error.message}`) - return "" - } -} diff --git a/src/scripts/i18n/FUTURE.md b/src/scripts/i18n/FUTURE.md deleted file mode 100644 index 4e8cddced27..00000000000 --- a/src/scripts/i18n/FUTURE.md +++ /dev/null @@ -1,111 +0,0 @@ -# Gemini Translation Pipeline -- Future Features - -> **Maintenance:** Remove or update entries here as they are implemented. Do not let this file accumulate stale items. - ---- - -## Pipeline Quality (do more during translation, less during review) - -### 1. Fix Comment Restoration Concatenation Bug - -**Problem:** Translated code comments are concatenated with the original instead of replacing them. 
Example from uniswap PR #17808 line ~1260: -``` -// **** REMOVE LIQUIDITY (supporting fee-on-transfer tokens) **** // **** ...Arabic... **** -``` - -**Root cause:** `restoreComments()` in `code-block-extractor.ts` appends the translated comment to the existing line content instead of replacing the original comment text. `translateCodeComments()` should use `strippedCode` (comments removed) as the base for restoration, not the original `block.content`. - -**Complexity:** Low. ~5 line change in `translateCodeComments()`. - -### 2. Stronger Glossary Enforcement - -**Problem:** High-frequency glossary terms like "mint" are translated inconsistently (~10 occurrences in a single file using different forms). The glossary is sent in the prompt but Gemini doesn't always adhere strictly. - -**Proposed solution:** -- Post-translation pass that scans output for known English glossary terms that should have been translated, and flags or auto-corrects them -- Consider a validation step that compares glossary term frequency in source vs translation -- May overlap with existing sanitizer `fixKnownBrandGarbles` pattern -- extend to glossary terms - -### 3. Transliteration During Translation (not just post-processing) - -**Problem:** Gemini regresses on transliterations (author names, brand names like "Proto-danksharding") that the sanitizer then has to catch. The review stage currently finds many transliteration issues that should have been handled during translation. 
- -**Proposed solution:** -- Include transliteration banks (from `.claude/translation-review/transliterations/`) directly in the translation prompt for non-Latin locales -- Add language-group-specific transliteration rules to `prompt-builder.ts` (currently only general rules are sent) -- The sanitizer already has `fixKnownBrandGarbles` with transliteration bank support -- ensure the translation prompt and sanitizer are aligned on the same bank - -**Goal:** Review stage produces scores, not a thousand critical issues to patch. - -### 4. Deep JSON Validation - -**Problem:** Current validation only checks top-level JSON keys. Nested namespaces (common in `src/intl/` files) can have dropped or renamed keys at depth > 1 without detection. - -**Proposed solution:** Recursive key comparison that walks the full object tree, reporting missing/added/renamed keys at any depth. Should handle arrays (compare by index) and nested objects (compare by key). - ---- - -## Pipeline Features - -### 5. Incremental Translation (update_only mode) - -**Problem:** Full-file translation rewrites content that's already correctly translated, introducing unnecessary churn and non-deterministic discrepancies in approved translations. - -**Proposed solution:** A workflow input `update_only` (boolean, default false) that: -- Fetches the existing translated file from the target branch -- Diffs against English to identify: untranslated blocks (still in English), blocks where English source changed, blocks already correctly translated -- Sends Gemini both files with instructions to only translate/update marked sections, preserving everything else verbatim - -**Complexity by file type:** -- **JSON:** Low-medium. Key-by-key comparison, send only keys needing work, merge back. -- **Markdown:** Medium-high. Needs paragraph-level diffing. Either store a manifest of the English version last translated against, or use heuristics (is this paragraph still in English?). 
- -**Prerequisite:** Ship and validate the full-translate mode first. This is a fast-follow. - -### 6. Split PRs (one PR per language) - -**Problem:** Large multi-language runs produce a single massive PR that's hard to review. - -**Proposed solution:** A workflow input `split_prs` (boolean, default false) that: -- Creates a separate branch per language -- Runs translate -> sanitize -> JSX attrs per language -- Opens one PR per language against the base branch - -**Implementation:** Loop the language iteration in `main-gemini.ts`, create branch per language via `postCreateBranchFrom`, call `createTranslationPR` per language. The per-language orchestration in `gemini-translate-files.ts` already processes one language at a time -- the change is in branching and PR creation, not translation. - -**Prerequisite:** Validate the single-PR flow works reliably first. - -### 7. Cost Tracking and Reporting - -**Problem:** No visibility into per-run, per-language, or per-file translation costs. - -**Proposed solution:** The pipeline already tracks `tokensUsed` per file. Aggregate and report in the PR body: total tokens, estimated cost, per-language breakdown. Could also write to a manifest for historical tracking. - ---- - -## Automation - -### 8. Auto-trigger Translations on Content Merge - -**Problem:** Content changes merged to dev currently require manual triggering of the translation pipeline. PRs tagged "needs translations" accumulate. - -**Proposed solution:** -- GitHub Action that watches for merges to dev affecting `public/content/` or `src/intl/en/` -- Automatically triggers the Gemini translation workflow for changed files -- Could be scoped to specific languages or all languages depending on the change -- Should respect a cooldown/batch window to avoid triggering on every small merge - -**Prerequisite:** Incremental translation mode (#5) should be working first so auto-triggered runs only translate what changed, not entire files. - -### 9. 
Full-language Retroactive Cleanup - -**Problem:** Many languages were translated before the current pipeline improvements (transliteration banks, language-group rules, sanitizer fixes). Those translations have the same class of issues found in Arabic (state polysemy, brand garbles, wrong compounds, etc.). - -**Proposed solution:** After all pending language reviews are complete: -- Run the full sanitizer (with all current fixes) against every translated language -- Apply transliteration banks where they exist -- Apply language-group-specific rules -- Re-translate files flagged by the sanitizer as having too many issues -- Catch up on any content changes merged to dev since the original full-repo translation round (PRs tagged "needs translations") - -**Scope:** This is a large batch operation. Should be planned per-language with the split PR feature (#6) to keep reviews manageable. diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts deleted file mode 100644 index cb78d2559e8..00000000000 --- a/src/scripts/i18n/config.ts +++ /dev/null @@ -1,203 +0,0 @@ -import * as dotenv from "dotenv" - -import i18nConfig from "../../../i18n.config.json" - -import { mapInternalCodeToCrowdin } from "./lib/utils/mapping" - -dotenv.config({ path: ".env.local" }) - -// Language code mapping -export const crowdinToInternalCodeMapping: Record = - i18nConfig.reduce( - (acc, { crowdinCode, code }) => { - acc[crowdinCode] = code - return acc - }, - {} as Record - ) - -// GitHub API configuration -const gitHubApiKey = process.env.I18N_GITHUB_API_KEY || "" -if (!gitHubApiKey) { - console.error("[ERROR] Missing I18N_GITHUB_API_KEY environment variable") - console.error( - "[ERROR] Please set I18N_GITHUB_API_KEY in your .env.local file" - ) - throw new Error("No GitHub API Key found (I18N_GITHUB_API_KEY)") -} - -export const gitHubBearerHeaders = { - Authorization: `Bearer ${gitHubApiKey}`, - Accept: "application/vnd.github.v3+json", -} - -// Crowdin API configuration (optional 
-- not needed for Gemini pipeline) -const crowdinApiKey = process.env.I18N_CROWDIN_API_KEY || "" -if (!crowdinApiKey && !process.env.GEMINI_API_KEY) { - console.error( - "[ERROR] Missing API key. Set I18N_CROWDIN_API_KEY (Crowdin pipeline) or GEMINI_API_KEY (Gemini pipeline)" - ) - throw new Error("No API key found (I18N_CROWDIN_API_KEY or GEMINI_API_KEY)") -} - -export const crowdinBearerHeaders = crowdinApiKey - ? { Authorization: `Bearer ${crowdinApiKey}` } - : { Authorization: "" } - -// Parse environment variables with defaults -// Accept internal codes (e.g., "es") and convert to Crowdin codes (e.g., "es-EM") -const targetLanguagesInput = process.env.TARGET_LANGUAGES - ? process.env.TARGET_LANGUAGES.split(",") - .map((lang) => lang.trim()) - .filter(Boolean) - : [] - -// If no target languages specified, use all languages from i18n.config.json, excluding 'en' -const targetLanguages: string[] = - targetLanguagesInput.length === 0 - ? i18nConfig - .map(({ code }) => code) - .filter((code) => code !== "en") - .map((code) => mapInternalCodeToCrowdin(code)) - : targetLanguagesInput.map((code) => mapInternalCodeToCrowdin(code)) - -const baseBranch = process.env.BASE_BRANCH || "dev" - -const targetPathRaw = process.env.TARGET_PATH || "" -// Support comma-separated list of files/directories -const targetPath = targetPathRaw -const targetPaths = targetPathRaw - ? targetPathRaw - .split(",") - .map((p) => p.trim()) - .filter(Boolean) - : [] -const excludePath = process.env.EXCLUDE_PATH?.trim() || "" - -// Skip awaiting pre-translation completion (exit early with ID for manual resume) -const skipAwait = ["1", "true", "yes", "on"].includes( - (process.env.SKIP_AWAIT || "").toLowerCase() -) - -// Adaptive polling / timeout configuration (milliseconds) -const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS - ? 
parseInt(process.env.PRETRANSLATE_TIMEOUT_MS, 10) - : 6 * 60 * 60 * 1000 // default 6h - -const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS - ? Math.max(5000, parseInt(process.env.PRETRANSLATE_POLL_BASE_MS, 10)) - : 30_000 // default 30s base (min clamped to 5s) - -// Parse comma-separated pre-translation IDs (for resuming multiple per-language jobs) -const existingPreTranslationIds = (process.env.PRETRANSLATION_ID || "") - .split(",") - .map((id) => id.trim()) - .filter(Boolean) - -const verbose = process.env.VERBOSE === "true" -const splitPrs = process.env.SPLIT_PRS === "true" - -// Parse GitHub repository from env (format: "owner/repo") -const githubRepo = - process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website" -const [ghOrganization, ghRepo] = githubRepo.split("/") - -if (verbose) { - console.log("[DEBUG] Configuration:") - console.log( - `[DEBUG] - Target languages (internal): ${targetLanguagesInput.length ? targetLanguagesInput.join(", ") : "ALL"}` - ) - console.log( - `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}` - ) - console.log(`[DEBUG] - Base branch: ${baseBranch}`) - console.log( - `[DEBUG] - Target path: ${targetPath || "none (full translation)"}` - ) - console.log(`[DEBUG] - Exclude path: ${excludePath || "none"}`) - console.log(`[DEBUG] - Skip await: ${skipAwait}`) - console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) - if (existingPreTranslationIds.length > 0) { - console.log( - `[DEBUG] - Resuming from pre-translation IDs: ${existingPreTranslationIds.join(", ")}` - ) - } -} - -// Main configuration object -export const config = { - projectId: 834930, - ghOrganization, - ghRepo, - jsonRoot: "src/intl/en", - mdRoot: "public/content", - preTranslatePromptId: Number.parseInt( - process.env.PRE_TRANSLATE_PROMPT_ID || "326942" - ), - allCrowdinCodes: targetLanguages, - allInternalCodes: targetLanguagesInput.length - ? 
targetLanguagesInput - : i18nConfig.map(({ code }) => code).filter((code) => code !== "en"), - baseBranch, - targetPath, - targetPaths, - excludePath, - skipAwait, - pretranslateTimeoutMs, - pretranslatePollBaseMs, - existingPreTranslationIds, - verbose, - splitPrs, -} - -// Do not translate list - Declare paths that should never be translated -export const doNotTranslatePaths = [ - "/cookie-policy/", - "/privacy-policy/", - "/terms-of-use/", - "/terms-and-conditions/", - "/style-guide/", -] - -// Validation for target path -export function validateTargetPath(targetPath: string): void { - if (!targetPath) { - // Full translation mode is allowed - return - } - - // Disallowed: paths under public/content/translations (translated content) - if (targetPath.includes("public/content/translations")) { - throw new Error( - `[ERROR] Invalid target path: "${targetPath}"\n` + - `Target path cannot be under "public/content/translations" (this is translated content)\n` + - `Did you mean to target a file under "public/content" instead?` - ) - } - - // Disallowed: paths under src/intl other than src/intl/en - if ( - targetPath.startsWith("src/intl/") && - !targetPath.startsWith("src/intl/en") - ) { - throw new Error( - `[ERROR] Invalid target path: "${targetPath}"\n` + - `Target path under "src/intl/" can only be "src/intl/en" (English source)\n` + - `Other src/intl directories contain translated content` - ) - } - - // Disallowed: explicitly excluded paths from config file - for (const excluded of doNotTranslatePaths) { - if (targetPath.includes(excluded)) { - throw new Error( - `[ERROR] Invalid target path: "${targetPath}"\n` + - `This path is in the excluded paths list (${excluded})` - ) - } - } -} - -// Constants -export const CROWDIN_API_BASE_URL = "https://api.crowdin.com/api/v2" -export const MAX_STRINGS_PER_REQUEST = 500 diff --git a/src/scripts/i18n/docs/v0.2.0-roadmap.md b/src/scripts/i18n/docs/v0.2.0-roadmap.md deleted file mode 100644 index c86e4f57447..00000000000 
--- a/src/scripts/i18n/docs/v0.2.0-roadmap.md +++ /dev/null @@ -1,146 +0,0 @@ -# v0.2.0 Roadmap: Glossary & Consistency Validation - -This document outlines planned features for the next major iteration of the i18n automation system. - -## Overview - -v0.1.0 focused on: -- JSX attribute translation via Gemini API (fallback for Crowdin) -- Build-breaking syntax validation -- Modular architecture for standalone workflow execution - -v0.2.0 will focus on **translation quality and consistency** through glossary enforcement and term validation. - ---- - -## Planned Features - -### 1. Glossary Supabase Sync (Separate Cron) - -**Goal:** Keep Crowdin glossary synchronized with community-curated terms in Supabase. - -**Implementation:** -- Dedicated GitHub Action running on cron schedule (e.g., daily at midnight UTC) -- Fetches glossary terms from Supabase `glossary` table -- Uploads/updates terms in Crowdin project glossary via API -- Logs sync status and any conflicts - -**Files to create:** -- `src/scripts/i18n/sync-glossary.ts` - Main sync orchestrator -- `src/scripts/i18n/lib/supabase/glossary.ts` - Supabase client for glossary queries -- `.github/workflows/sync-glossary.yml` - Cron workflow - -**Environment variables needed:** -- `SUPABASE_URL` - Supabase project URL -- `SUPABASE_KEY` - Supabase anon/service key -- `CROWDIN_PROJECT_ID`, `CROWDIN_API_KEY` (existing) - ---- - -### 2. Term/Phrase Consistency Validation - -**Goal:** Validate that translated files use glossary terms consistently. - -**Implementation:** -- Post-translation validation step in main workflow -- Extract glossary terms from Crowdin (or local cache from sync) -- Scan translated files for source terms that should have been translated -- Flag inconsistencies in PR validation comment - -**Validation rules:** -- Source term appears in translation → likely missed (should be target term) -- Target term varies within same file → inconsistent usage -- Protected terms (ethereum.org, Ethereum, etc.) 
→ should remain unchanged - -**Files to create:** -- `src/scripts/i18n/lib/validation/glossary.ts` - Glossary term validation -- Updates to `lib/workflows/validation.ts` - Integrate glossary checks - ---- - -### 3. Confidence Scoring - -**Goal:** Provide per-file and per-language confidence scores based on validation results. - -**Scoring factors:** -- JSX attribute untranslated percentage (from v0.1.0) -- Glossary term consistency rate -- Syntax validation pass/fail -- Source file complexity (length, technical density) - -**Output:** -- Confidence score (0-100) per file in PR comment -- Aggregate confidence per language -- Suggested review priority based on low-confidence files - -**Files to create:** -- `src/scripts/i18n/lib/validation/confidence.ts` - Scoring algorithm -- Updates to PR comment formatting - ---- - -## Architecture Considerations - -### Supabase Schema (Proposed) - -```sql --- Glossary terms table -CREATE TABLE glossary ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - source_term TEXT NOT NULL, - language_code TEXT NOT NULL, - target_term TEXT NOT NULL, - context TEXT, -- e.g., "technical", "UI", "marketing" - notes TEXT, - created_at TIMESTAMPTZ DEFAULT now(), - updated_at TIMESTAMPTZ DEFAULT now(), - UNIQUE(source_term, language_code) -); - --- Translation memory (future) -CREATE TABLE translation_memory ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - source_text TEXT NOT NULL, - language_code TEXT NOT NULL, - target_text TEXT NOT NULL, - source_file TEXT, - created_at TIMESTAMPTZ DEFAULT now() -); -``` - -### Crowdin API Endpoints - -- `POST /projects/{projectId}/glossaries/{glossaryId}/terms` - Add/update terms -- `GET /projects/{projectId}/glossaries/{glossaryId}/terms` - List terms for validation - ---- - -## Timeline (Tentative) - -| Feature | Estimated Effort | Priority | -|---------|------------------|----------| -| Glossary Supabase sync | 2-3 days | High | -| Term consistency validation | 2-3 days | High | -| Confidence scoring 
| 1-2 days | Medium | -| Documentation & testing | 1-2 days | High | - ---- - -## Dependencies - -- Supabase project setup with glossary table -- Crowdin glossary ID configuration -- Community glossary data migration (if existing) - ---- - -## Open Questions - -1. Should glossary sync be bidirectional (Supabase ↔ Crowdin)? -2. What threshold for glossary inconsistency should trigger a warning vs error? -3. Should confidence scores block PR merge below a certain threshold? -4. How to handle language-specific glossary exceptions? - ---- - -*This roadmap was created as part of the v0.1.0 development cycle. Updates will be made as requirements evolve.* diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts deleted file mode 100644 index 988ebd7d654..00000000000 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ /dev/null @@ -1,404 +0,0 @@ -/** - * Core file translation via Gemini (direct, no Crowdin). - * - * Sends whole files (no segmentation) with site-specific context. - * Gemini handles the linguistics; we handle the guardrails. 
- */ - -import { GoogleGenAI } from "@google/genai" - -import i18nConfig from "../../../../../i18n.config.json" -import { delay } from "../workflows/utils" - -import { - chunkProse, - type CodeBlock, - type CodeComment, - extractCodeBlocks, - extractComments, - getCommentSyntax, - PROSE_SIZE_THRESHOLD, - restoreCodeBlocks, - restoreComments, -} from "./code-block-extractor" -import { - validateTranslatedJson, - validateTranslatedMarkdown, - type ValidationResult, -} from "./gemini-output-validation" -import { buildTranslationPrompt } from "./prompt-builder" - -const GEMINI_MODELS = ["gemini-3.1-pro-preview", "gemini-3.1-pro"] -const MAX_RETRIES = 3 -const RETRY_DELAY_MS = 5000 - -const LANGUAGE_NAMES: Record = Object.fromEntries( - i18nConfig.map(({ code, name }: { code: string; name: string }) => [ - code, - name, - ]) -) - -function getGeminiClient(): GoogleGenAI { - const apiKey = process.env.GEMINI_API_KEY - if (!apiKey) { - throw new Error("GEMINI_API_KEY environment variable is not set") - } - return new GoogleGenAI({ apiKey }) -} - -export interface TranslateFileOptions { - filePath: string - fileContent: string - fileType: "markdown" | "json" - targetLanguage: string - glossaryTerms: Map -} - -export interface TranslateFileResult { - translatedContent: string - tokensUsed: { input: number; output: number } -} - -/** - * Translate a single file via Gemini. - * - * For markdown files: - * 1. Extract fenced code blocks -> placeholders (reduces payload) - * 2. If prose still too large, chunk by headings recursively - * 3. Translate prose (single call or per-chunk) - * 4. Restore code blocks - * 5. Extract and translate code comments separately - * 6. Restore translated comments into code blocks - * - * For JSON files: translate directly (no code blocks). 
- */ -export async function translateFile( - options: TranslateFileOptions -): Promise { - const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } = - options - - // JSON files: translate directly, no extraction needed - if (fileType === "json") { - return callGemini({ ...options, fileContent }) - } - - // Markdown: extract code blocks first - const { prose, blocks } = extractCodeBlocks(fileContent) - - if (blocks.length > 0) { - console.log( - ` [extract] ${filePath}: ${blocks.length} code blocks removed (${fileContent.length} -> ${prose.length} chars)` - ) - } - - // Check if prose needs chunking - const chunks = chunkProse(prose, PROSE_SIZE_THRESHOLD) - let translatedProse: string - let totalTokens = { input: 0, output: 0 } - - if (chunks.length === 1) { - // Single chunk: translate normally - const result = await callGemini({ - ...options, - fileContent: prose, - }) - translatedProse = result.translatedContent - totalTokens = result.tokensUsed - } else { - // Multiple chunks: translate each, reassemble - console.log(` [chunk] ${filePath}: split into ${chunks.length} chunks`) - const translatedChunks: string[] = [] - for (let i = 0; i < chunks.length; i++) { - const result = await callGemini({ - ...options, - fileContent: chunks[i], - }) - translatedChunks.push(result.translatedContent) - totalTokens.input += result.tokensUsed.input - totalTokens.output += result.tokensUsed.output - } - translatedProse = translatedChunks.join("\n\n") - } - - // Restore code blocks - let finalContent = restoreCodeBlocks(translatedProse, blocks) - - // Translate code comments (best-effort, non-fatal) - if (blocks.length > 0) { - try { - finalContent = await translateCodeComments( - finalContent, - blocks, - targetLanguage, - glossaryTerms - ) - } catch (error) { - console.warn( - ` [comments] ${filePath}: comment translation failed (non-fatal): ${error instanceof Error ? 
error.message : String(error)}` - ) - } - } - - return { - translatedContent: finalContent, - tokensUsed: totalTokens, - } -} - -/** - * Extract comments from all code blocks, translate them in a single - * Gemini call, and restore them into the final content. - */ -async function translateCodeComments( - content: string, - blocks: CodeBlock[], - targetLanguage: string, - glossaryTerms: Map -): Promise { - // Extract comments from all blocks - const allComments: CodeComment[] = [] - const blockData: Array<{ - block: CodeBlock - strippedCode: string - comments: CodeComment[] - }> = [] - - for (const block of blocks) { - if (!block.language || !block.content.trim()) continue - const { strippedCode, comments } = extractComments( - block.content, - block.language - ) - // Tag comments with their block index - const tagged = comments.map((c) => ({ ...c, blockIndex: block.index })) - allComments.push(...tagged) - blockData.push({ block, strippedCode, comments: tagged }) - } - - if (allComments.length === 0) return content - - // Build a compact payload for comment translation - const commentPayload: Record = {} - for (let i = 0; i < allComments.length; i++) { - commentPayload[`c${i}`] = allComments[i].text - } - - const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage - const glossaryHint = - glossaryTerms.size > 0 - ? `\nUse these exact translations for glossary terms:\n${[ - ...glossaryTerms.entries(), - ] - .slice(0, 30) - .map(([en, loc]) => ` ${en} = ${loc}`) - .join("\n")}` - : "" - - const commentPrompt = `Translate these code comments to ${languageName}. Return ONLY a JSON object with the same keys and translated values. 
Do not add explanations.${glossaryHint} - -${JSON.stringify(commentPayload, null, 2)}` - - const result = await callGeminiRaw(commentPrompt) - let translatedMap: Record - - try { - const cleaned = stripCodeBlockWrapping(result.text, "json") - translatedMap = JSON.parse(cleaned) - } catch { - console.warn(" [comments] Could not parse comment translation response") - return content - } - - // Restore translated comments into the code blocks within content - for (const { block, comments } of blockData) { - if (comments.length === 0) continue - - const syntax = getCommentSyntax(block.language) - - // Map translated text back onto comment objects - const translatedComments = comments.map((c) => { - const key = `c${allComments.indexOf(c)}` - return { ...c, text: translatedMap[key] || c.text } - }) - - // Find and replace the code block in content - const fence = "```" - const originalBlock = `${fence}${block.language}\n${block.content}\n${fence}` - const restoredCode = restoreComments( - block.content, - translatedComments, - syntax - ) - const newBlock = `${fence}${block.language}\n${restoredCode}\n${fence}` - content = content.replace(originalBlock, newBlock) - } - - return content -} - -/** - * Core Gemini API call with retries and model fallback. - * Used by both prose translation and comment translation. 
- */ -async function callGemini( - options: TranslateFileOptions -): Promise { - const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } = - options - - const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage - const prompt = buildTranslationPrompt({ - filePath, - fileContent, - fileType, - targetLanguage, - languageName, - glossaryTerms, - }) - - // Retry loop for validation failures (API call retries are in callGeminiRaw) - for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { - const result = await callGeminiRaw(prompt) - - let text = result.text - text = stripCodeBlockWrapping(text, fileType) - - const validation: ValidationResult = - fileType === "json" - ? validateTranslatedJson(text, fileContent) - : validateTranslatedMarkdown(text, fileContent) - - if (validation.valid) { - return { - translatedContent: text, - tokensUsed: result.tokensUsed, - } - } - - if (attempt < MAX_RETRIES) { - console.warn( - `[WARN] ${filePath} validation attempt ${attempt}: ${validation.error}. Retrying...` - ) - await delay(RETRY_DELAY_MS * attempt) - continue - } - - throw new Error( - `Output validation failed after ${MAX_RETRIES} attempts: ${validation.error}` - ) - } - - throw new Error(`Translation failed for ${filePath}`) -} - -/** - * Raw Gemini API call with retries and model fallback. - * Returns the raw text response and token usage. - */ -async function callGeminiRaw( - prompt: string -): Promise<{ text: string; tokensUsed: { input: number; output: number } }> { - const client = getGeminiClient() - - const modelsToTry = process.env.GEMINI_MODEL - ? 
[process.env.GEMINI_MODEL] - : GEMINI_MODELS - - let lastError: Error | null = null - const modelNotFound = new Set() - - for (const modelId of modelsToTry) { - let modelFailed = false - - for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { - try { - const response = await client.models.generateContent({ - model: modelId, - contents: prompt, - config: { temperature: 0 }, - }) - const usage = response.usageMetadata - - return { - text: response.text ?? "", - tokensUsed: { - input: usage?.promptTokenCount || 0, - output: usage?.candidatesTokenCount || 0, - }, - } - } catch (error) { - lastError = error instanceof Error ? error : new Error(String(error)) - - if ( - lastError.message.includes("404") || - lastError.message.includes("not found") || - lastError.message.includes("deprecated") - ) { - console.warn( - `[WARN] Model ${modelId} unavailable: ${lastError.message}. Trying next model...` - ) - modelNotFound.add(modelId) - modelFailed = true - break - } - - if ( - lastError.message.includes("429") || - lastError.message.includes("RESOURCE_EXHAUSTED") - ) { - const backoff = RETRY_DELAY_MS * Math.pow(2, attempt) - console.warn( - `[WARN] Rate limited (${modelId}). Waiting ${backoff / 1000}s...` - ) - await delay(backoff) - continue - } - - if (attempt < MAX_RETRIES) { - console.warn( - `[WARN] Attempt ${attempt} (${modelId}) failed: ${lastError.message}. Retrying...` - ) - await delay(RETRY_DELAY_MS * attempt) - continue - } - } - } - - if (!modelFailed) break - } - - if (modelNotFound.size === modelsToTry.length) { - throw new Error( - `All Gemini models unavailable (${[...modelNotFound].join(", ")}). ` + - `Update GEMINI_MODELS in gemini-translate.ts or set GEMINI_MODEL env var.` - ) - } - - throw lastError || new Error("Translation failed") -} - -/** - * Gemini sometimes wraps output in ```markdown or ```json blocks. - * Strip that wrapping to get raw content. 
- */ -function stripCodeBlockWrapping( - text: string, - fileType: "markdown" | "json" -): string { - // Match ```markdown\n...\n``` or ```json\n...\n``` or just ```\n...\n``` - const patterns = [ - new RegExp( - `^\`\`\`(?:${fileType}|md|mdx)?\\s*\\n([\\s\\S]*?)\\n\`\`\`\\s*$` - ), - /^```\s*\n([\s\S]*?)\n```\s*$/, - ] - - for (const re of patterns) { - const match = text.match(re) - if (match) return match[1] - } - - return text -} diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts deleted file mode 100644 index 29fc3022b83..00000000000 --- a/src/scripts/i18n/lib/ai/gemini.ts +++ /dev/null @@ -1,243 +0,0 @@ -/** - * Gemini AI translation wrapper for JSX attribute translation - */ - -import { GoogleGenAI } from "@google/genai" - -import i18nConfig from "../../../../../i18n.config.json" -import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes" -import { delay } from "../workflows/utils" - -/** Gemini API configuration */ -const GEMINI_MODEL = "gemini-2.5-pro" - -/** Language names parsed from i18n.config.json */ -const LANGUAGE_NAMES: Record = Object.fromEntries( - i18nConfig.map(({ code, name }) => [code, name]) -) - -/** - * Check if Gemini API is available (API key present) - */ -export function isGeminiAvailable(): boolean { - return Boolean(process.env.GEMINI_API_KEY) -} - -/** - * Get the Gemini API client - */ -function getGeminiClient(): GoogleGenAI { - const apiKey = process.env.GEMINI_API_KEY - if (!apiKey) { - throw new Error("GEMINI_API_KEY environment variable is not set") - } - return new GoogleGenAI({ apiKey }) -} - -/** - * Get human-readable language name from code - */ -function getLanguageName(code: string): string { - return LANGUAGE_NAMES[code] || code.toUpperCase() -} - -/** - * Build translation prompt for a batch of attributes - */ -function buildTranslationPrompt( - attributes: ExtractedAttribute[], - targetLanguage: string, - glossaryTerms?: Map -): string { - const langName = 
getLanguageName(targetLanguage) - - const attributeList = attributes - .map( - (attr, i) => - `${i + 1}. [${attr.componentName}.${attr.attributeName}] "${attr.originalValue}" - Context: ${attr.context}` - ) - .join("\n\n") - - // Build glossary section if terms provided - let glossarySection = "" - if (glossaryTerms && glossaryTerms.size > 0) { - const termsList = Array.from(glossaryTerms.entries()) - .map(([term, translation]) => `- "${term}" → "${translation}"`) - .join("\n") - glossarySection = ` - -REQUIRED TERMINOLOGY (use these exact translations): -${termsList} -` - } - - return `You are translating UI component attributes for the Ethereum.org website into ${langName}. - -These are JSX component attributes that contain human-readable text. Translate each value naturally and accurately while: -- Preserving technical Ethereum terminology appropriately for ${langName} -- Keeping the translation concise (similar length to original) -- Maintaining any placeholders like {variable} or {{variable}} unchanged -- Using region-neutral ${langName} that most speakers would understand -- Using informal, friendly register${glossarySection} - -Attributes to translate: - -${attributeList} - -Respond with ONLY a JSON array of translated strings in the same order, like: -["translated text 1", "translated text 2", ...] 
- -Do not include any explanation, just the JSON array.` -} - -/** - * Parse Gemini response to extract translated strings - */ -function parseTranslationResponse(response: string): string[] { - // Clean up response - remove markdown code blocks if present - let cleaned = response.trim() - if (cleaned.startsWith("```json")) { - cleaned = cleaned.slice(7) - } else if (cleaned.startsWith("```")) { - cleaned = cleaned.slice(3) - } - if (cleaned.endsWith("```")) { - cleaned = cleaned.slice(0, -3) - } - cleaned = cleaned.trim() - - try { - const parsed = JSON.parse(cleaned) - if (!Array.isArray(parsed)) { - throw new Error("Response is not an array") - } - return parsed.map((item) => String(item)) - } catch (error) { - console.error("[GEMINI] Failed to parse response:", cleaned) - throw new Error(`Failed to parse Gemini response: ${error}`) - } -} - -/** - * Translate a batch of attributes for a single language. - * Returns translated attributes with their values filled in. - */ -export async function translateAttributes( - attributes: ExtractedAttribute[], - targetLanguage: string, - glossaryTerms?: Map -): Promise { - if (attributes.length === 0) { - return [] - } - - if (!isGeminiAvailable()) { - console.warn( - "[GEMINI] API key not available, skipping attribute translation" - ) - return [] - } - - const client = getGeminiClient() - - const prompt = buildTranslationPrompt( - attributes, - targetLanguage, - glossaryTerms - ) - - console.log( - `[GEMINI] Translating ${attributes.length} attributes to ${getLanguageName(targetLanguage)}` - ) - - try { - const result = await client.models.generateContent({ - model: GEMINI_MODEL, - contents: prompt, - }) - const translations = parseTranslationResponse(result.text ?? 
"") - - if (translations.length !== attributes.length) { - console.warn( - `[GEMINI] Translation count mismatch: expected ${attributes.length}, got ${translations.length}` - ) - } - - // Map translations back to attributes - return attributes.map((attr, i) => ({ - ...attr, - translatedValue: translations[i] || attr.originalValue, - })) - } catch (error) { - console.error("[GEMINI] Translation failed:", error) - throw error - } -} - -/** - * Translate attributes with retry logic - */ -export async function translateAttributesWithRetry( - attributes: ExtractedAttribute[], - targetLanguage: string, - glossaryTerms?: Map, - maxRetries = 3 -): Promise { - let lastError: Error | null = null - - for (let attempt = 1; attempt <= maxRetries; attempt++) { - try { - return await translateAttributes( - attributes, - targetLanguage, - glossaryTerms - ) - } catch (error) { - lastError = error instanceof Error ? error : new Error(String(error)) - console.warn( - `[GEMINI] Attempt ${attempt}/${maxRetries} failed: ${lastError.message}` - ) - - if (attempt < maxRetries) { - // Exponential backoff - const backoff = Math.min(1000 * Math.pow(2, attempt - 1), 10000) - await delay(backoff) - } - } - } - - throw lastError || new Error("Translation failed after retries") -} - -/** - * Translate attributes grouped by file, processing each file's batch sequentially - * to avoid rate limits while maximizing context per request. 
- */ -export async function translateAttributesByFile( - attributesByFile: Map, - targetLanguage: string, - glossaryTerms?: Map -): Promise> { - const results = new Map() - - for (const [filePath, attributes] of attributesByFile) { - try { - const translated = await translateAttributesWithRetry( - attributes, - targetLanguage, - glossaryTerms - ) - results.set(filePath, translated) - console.log( - `[GEMINI] ✓ Translated ${translated.length} attributes in ${filePath}` - ) - } catch (error) { - console.error(`[GEMINI] ✗ Failed to translate ${filePath}:`, error) - // Continue with other files even if one fails - results.set(filePath, []) - } - } - - return results -} diff --git a/src/scripts/i18n/lib/ai/index.ts b/src/scripts/i18n/lib/ai/index.ts deleted file mode 100644 index e2f75fd59d1..00000000000 --- a/src/scripts/i18n/lib/ai/index.ts +++ /dev/null @@ -1,10 +0,0 @@ -/** - * AI translation module - */ - -export { - isGeminiAvailable, - translateAttributes, - translateAttributesByFile, - translateAttributesWithRetry, -} from "./gemini" diff --git a/src/scripts/i18n/lib/ai/language-groups.ts b/src/scripts/i18n/lib/ai/language-groups.ts deleted file mode 100644 index 44a7edf5ac5..00000000000 --- a/src/scripts/i18n/lib/ai/language-groups.ts +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Language group definitions for translation. - * - * Different script families require different translation strategies. - * Gemini knows these rules natively -- we provide site-specific context, - * not linguistic micromanagement. 
- */ - -export type LanguageGroup = - | "indic" - | "cyrillic" - | "rtl" - | "cjk-phonetic" - | "cjk-semantic" - | "latin" - -const GROUP_MAP: Record = { - // Indic (Brahmic scripts -- transliterate) - hi: "indic", - mr: "indic", - bn: "indic", - ta: "indic", - te: "indic", - // Cyrillic (transliterate, high Latin tolerance) - ru: "cyrillic", - uk: "cyrillic", - // RTL (transliterate, BiDi considerations) - ar: "rtl", - ur: "rtl", - // CJK Phonetic (Katakana/Hangul -- transliterate even in tags) - ja: "cjk-phonetic", - ko: "cjk-phonetic", - // CJK Semantic (translate by meaning, not sound) - zh: "cjk-semantic", - "zh-tw": "cjk-semantic", -} - -export function getLanguageGroup(code: string): LanguageGroup { - return GROUP_MAP[code] || "latin" -} - -export function isRtl(code: string): boolean { - return code === "ar" || code === "ur" -} - -export function needsTransliteration(code: string): boolean { - const group = getLanguageGroup(code) - return group !== "latin" -} - -/** - * Site-specific translation notes per language group. - * These focus on ethereum.org conventions, not general linguistics - * (Gemini already knows how Arabic/Japanese/etc. work). - */ -export function getSiteSpecificNotes(group: LanguageGroup): string { - const common = ` -Site-specific rules for ethereum.org: -- Frontmatter tags array: brand-name tags (Solidity, MetaMask, ERC-20) stay in Latin script. Concept tags (smart contracts, testing) should be translated. -- Code blocks: never translate functional code. Code comments may be translated. -- Internal links (href starting with /) must stay exactly as in English. -- Ticker symbols (ETH, BTC, ERC, EIP, BLS) always stay in Latin script. -- URLs and domain names always stay in Latin script. -- Use community glossary terms exactly as provided (these are community-voted).` - - switch (group) { - case "rtl": - return `${common} -- Wrap bare numeric dates (YYYY-MM-DD, DD/MM/YYYY) in ... to prevent BiDi flipping. 
-- Wrap mathematical equations with operators in .... -- Use Western Arabic numerals (1, 2, 3) for Arabic. Urdu uses native numerals for prose but Western for technical identifiers. -- Never convert Gregorian dates to Hijri calendar. -- The word "state" in blockchain context means computational state, not political state.` - - case "cjk-phonetic": - return `${common} -- Brand-name tags CAN be transliterated into native script (Katakana/Hangul) -- unlike other groups. -- Global acronyms (DeFi, NFT, API) may stay in Latin.` - - case "cjk-semantic": - return `${common} -- Translate terms by meaning (calque), not by sound. Example: "Smart Contract" = 智能合约. -- Use officially established translations where they exist (Ethereum = 以太坊). -- If no official translation exists for a brand, keep it in Latin script.` - - case "indic": - case "cyrillic": - return `${common} -- Use Western Arabic numerals (1, 2, 3) -- not native numeral scripts.` - - case "latin": - return `${common} -- Brand names must stay in English (do not translate Solidity, MetaMask, etc.).` - } -} diff --git a/src/scripts/i18n/lib/ai/progress-tracker.ts b/src/scripts/i18n/lib/ai/progress-tracker.ts deleted file mode 100644 index 0737053b6d8..00000000000 --- a/src/scripts/i18n/lib/ai/progress-tracker.ts +++ /dev/null @@ -1,150 +0,0 @@ -/** - * Resumable progress tracking for translation runs. - * Stores state as a JSON manifest so interrupted runs can resume. 
- */ - -import * as fs from "node:fs" -import * as path from "node:path" - -const DEFAULT_PROGRESS_DIR = "/tmp" - -interface LanguageProgress { - status: "pending" | "in_progress" | "completed" | "failed" - filesCompleted: string[] - filesFailed: string[] - startedAt?: string - completedAt?: string -} - -export interface TranslationProgress { - runId: string - startedAt: string - languages: Record -} - -function getManifestPath(runId: string): string { - return path.join(DEFAULT_PROGRESS_DIR, `translation-progress-${runId}.json`) -} - -/** - * Create or load a progress manifest. - */ -export function initProgress( - runId: string, - languages: string[] -): TranslationProgress { - const manifestPath = getManifestPath(runId) - - // Resume existing - if (fs.existsSync(manifestPath)) { - const existing = JSON.parse( - fs.readFileSync(manifestPath, "utf8") - ) as TranslationProgress - console.log(`[progress] Resuming run ${runId} from manifest`) - return existing - } - - // Create new - const progress: TranslationProgress = { - runId, - startedAt: new Date().toISOString(), - languages: Object.fromEntries( - languages.map((lang) => [ - lang, - { - status: "pending" as const, - filesCompleted: [], - filesFailed: [], - }, - ]) - ), - } - - saveProgress(progress) - return progress -} - -/** - * Check if a file has already been translated for a language. - */ -export function isFileCompleted( - progress: TranslationProgress, - language: string, - filePath: string -): boolean { - return ( - progress.languages[language]?.filesCompleted.includes(filePath) ?? false - ) -} - -/** - * Mark a file as completed for a language. 
- */ -export function markFileCompleted( - progress: TranslationProgress, - language: string, - filePath: string -): void { - const lang = progress.languages[language] - if (!lang) return - if (!lang.filesCompleted.includes(filePath)) { - lang.filesCompleted.push(filePath) - } - lang.status = "in_progress" - saveProgress(progress) -} - -/** - * Mark a file as failed for a language. - */ -export function markFileFailed( - progress: TranslationProgress, - language: string, - filePath: string -): void { - const lang = progress.languages[language] - if (!lang) return - if (!lang.filesFailed.includes(filePath)) { - lang.filesFailed.push(filePath) - } - saveProgress(progress) -} - -/** - * Mark a language as completed. - */ -export function markLanguageCompleted( - progress: TranslationProgress, - language: string -): void { - const lang = progress.languages[language] - if (!lang) return - lang.status = "completed" - lang.completedAt = new Date().toISOString() - saveProgress(progress) -} - -/** - * Check if a language is already completed. - */ -export function isLanguageCompleted( - progress: TranslationProgress, - language: string -): boolean { - return progress.languages[language]?.status === "completed" -} - -/** - * Delete the manifest (cleanup after successful run). - */ -export function cleanupProgress(progress: TranslationProgress): void { - const manifestPath = getManifestPath(progress.runId) - if (fs.existsSync(manifestPath)) { - fs.unlinkSync(manifestPath) - } -} - -function saveProgress(progress: TranslationProgress): void { - const manifestPath = getManifestPath(progress.runId) - fs.writeFileSync(manifestPath, JSON.stringify(progress, null, 2), "utf8") -} diff --git a/src/scripts/i18n/lib/ai/prompt-builder.ts b/src/scripts/i18n/lib/ai/prompt-builder.ts deleted file mode 100644 index 7b0934566ac..00000000000 --- a/src/scripts/i18n/lib/ai/prompt-builder.ts +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Build translation prompts for Gemini translation. 
- * - * Philosophy: Gemini is the language expert. We provide: - * 1. Site-specific context (glossary, conventions) - * 2. Structural expectations (frontmatter, markdown) - * 3. Lightweight guardrails (what our sanitizer checks for) - * - * We do NOT micromanage linguistics -- Gemini knows RTL, BiDi, - * transliteration norms, etc. better than any regex. - */ - -import { getLanguageGroup, getSiteSpecificNotes } from "./language-groups" - -interface PromptOptions { - filePath: string - fileContent: string - fileType: "markdown" | "json" - targetLanguage: string - languageName: string - glossaryTerms: Map -} - -/** - * Build the complete translation prompt for a single file. - */ -export function buildTranslationPrompt(options: PromptOptions): string { - const { - filePath, - fileContent, - fileType, - targetLanguage, - languageName, - glossaryTerms, - } = options - - const group = getLanguageGroup(targetLanguage) - const siteNotes = getSiteSpecificNotes(group) - const glossarySection = formatGlossary(glossaryTerms) - const formatRules = getFormatRules(fileType) - const sanitizerHints = getSanitizerHints() - - return `Translate this ${fileType} file from English to ${languageName} (${targetLanguage}). - -File: ${filePath} - -${formatRules} - -${siteNotes} - -${glossarySection} - -${sanitizerHints} - -=== SOURCE FILE === -${fileContent} -=== END SOURCE FILE === - -Output ONLY the translated file content. No explanations, no markdown wrapping, no commentary.` -} - -function getFormatRules(fileType: "markdown" | "json"): string { - if (fileType === "json") { - return `Format rules: -- Output valid JSON with identical key structure. -- Translate only string values. Never translate keys. -- Preserve HTML tags within values exactly (, , etc.). -- Preserve interpolation variables exactly ({count}, {{name}}, etc.). -- Internal href paths (/developers/docs/...) must stay in English.` - } - - return `Format rules: -- Preserve all frontmatter fields and structure exactly. 
-- Preserve all markdown syntax (headings, lists, links, code blocks). -- Preserve all JSX/HTML components and their attributes exactly. -- Preserve heading anchor IDs exactly as in English ({#anchor-id}). -- Never translate content inside code fences (\`\`\` blocks). -- Code comments inside code fences may be translated. -- Internal links (href starting with /) must match English exactly. -- Image paths must match English exactly.` -} - -function formatGlossary(terms: Map): string { - if (terms.size === 0) return "" - - const lines = Array.from(terms.entries()) - .map(([en, translated]) => `- "${en}" -> "${translated}"`) - .join("\n") - - return `Community-voted glossary (use these exact translations): -${lines}` -} - -/** - * Hints about what our post-processing sanitizer checks for. - * This helps Gemini avoid patterns we'd just fix afterward. - */ -function getSanitizerHints(): string { - return `Our automated sanitizer will check your output for: -- Brand names in frontmatter tags must stay Latin (Solidity, MetaMask, etc.) 
-- Ticker symbols (ETH, ERC-20) must stay Latin -- Internal hrefs must match English source exactly -- No translated heading anchor IDs -- No broken markdown link syntax -- Valid JSON structure (for JSON files) -Getting these right the first time avoids post-processing corrections.` -} diff --git a/src/scripts/i18n/lib/crowdin/build.ts b/src/scripts/i18n/lib/crowdin/build.ts deleted file mode 100644 index cdd49d35fff..00000000000 --- a/src/scripts/i18n/lib/crowdin/build.ts +++ /dev/null @@ -1,76 +0,0 @@ -// Crowdin build and download operations - -import { - config, - CROWDIN_API_BASE_URL, - crowdinBearerHeaders, -} from "../../config" -import type { BuildProjectFileTranslationResponse } from "../types" - -/** - * Build a project file translation for a specific language - * - * @param fileId - The Crowdin file ID - * @param targetLanguageId - The target language ID - * @param projectId - The Crowdin project ID (defaults to config) - * @returns Build response with download URL - */ -export const postBuildProjectFileTranslation = async ( - fileId: number, - targetLanguageId: string, - projectId = config.projectId -): Promise => { - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${projectId}/translations/builds/files/${fileId}` - ) - - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify({ targetLanguageId }), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error( - `Crowdin postBuildProjectFileTranslation failed (${res.status}): ${body}` - ) - } - - type JsonResponse = { data: BuildProjectFileTranslationResponse } - const json: JsonResponse = await res.json() - console.log("Built file:", json.data) - return json.data -} - -/** - * Download a built file from Crowdin - * - * @param downloadUrl - The download URL from the build response - * @returns Buffer 
containing the file contents - */ -export const getBuiltFile = async ( - downloadUrl: string -): Promise<{ buffer: Buffer }> => { - try { - const res = await fetch(downloadUrl) - - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error(`Failed to download built file (${res.status}): ${body}`) - } - - const arrayBuffer = await res.arrayBuffer() - const buffer = Buffer.from(arrayBuffer) - - return { buffer } - } catch (error) { - console.error("getBuiltFile error:", error) - throw error - } -} diff --git a/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts b/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts deleted file mode 100644 index 985979bca9c..00000000000 --- a/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts +++ /dev/null @@ -1,151 +0,0 @@ -/** - * Ephemeral Prompts - * - * Manages Crowdin AI prompts that are created per-job and cleaned up after use. - * Each prompt is uniquely named with language, key, and timestamp to avoid conflicts. - * - * Naming convention: eth-org-{lang}-{key}-{timestamp} - * Example: eth-org-es-glossary-1702987200 - */ - -import { crowdinBearerHeaders } from "../../config" - -import type { PromptResource } from "./prompt" - -/** Parameters for creating an ephemeral prompt */ -export interface CreateEphemeralPromptParams { - /** Crowdin user ID (owner of the prompt) */ - userId: number - /** Language code (e.g., "es", "fr", "de") */ - languageCode: string - /** Prompt key (e.g., "glossary", "formal") */ - promptKey: string - /** The full prompt text */ - promptText: string - /** AI provider ID (optional, uses default if not specified) */ - aiProviderId?: number - /** AI model ID (optional, uses default if not specified) */ - aiModelId?: string -} - -/** Result of creating an ephemeral prompt */ -export interface EphemeralPromptResult { - /** The created prompt's ID */ - promptId: number - /** The prompt's unique name */ - promptName: string -} - -/** Crowdin API response for prompt creation */ -interface 
CrowdinCreatePromptResponse { - data: PromptResource -} - -/** Prefix for all ephemeral prompt names */ -const EPHEMERAL_PREFIX = "eth-org" - -/** Crowdin action type for pre-translation prompts */ -const PRE_TRANSLATE_ACTION = "pre_translate" - -/** - * Generate a unique name for an ephemeral prompt - */ -export function generateEphemeralPromptName( - languageCode: string, - promptKey: string -): string { - const timestamp = Math.floor(Date.now() / 1000) - return `${EPHEMERAL_PREFIX}-${languageCode}-${promptKey}-${timestamp}` -} - -/** - * Create an ephemeral AI prompt in Crowdin - * - * Uses Crowdin API v2: POST /users/{userId}/ai/prompts - */ -export async function createEphemeralPrompt( - params: CreateEphemeralPromptParams -): Promise { - const { - userId, - languageCode, - promptKey, - promptText, - aiProviderId, - aiModelId, - } = params - - const promptName = generateEphemeralPromptName(languageCode, promptKey) - console.log(`[EPHEMERAL-PROMPT] Creating prompt: ${promptName}`) - - const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts` - - const body: Record = { - name: promptName, - action: PRE_TRANSLATE_ACTION, - config: { - mode: "advanced", - prompt: promptText, - glossaryTerms: true, - tmSuggestions: true, - }, - } - - if (aiProviderId !== undefined) { - body.aiProviderId = aiProviderId - } - if (aiModelId !== undefined) { - body.aiModelId = aiModelId - } - - const response = await fetch(url, { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }) - - if (!response.ok) { - const text = await response.text().catch(() => "") - throw new Error( - `Failed to create ephemeral prompt "${promptName}" (${response.status}): ${text}` - ) - } - - const json = (await response.json()) as CrowdinCreatePromptResponse - const promptId = json.data.id - - console.log( - `[EPHEMERAL-PROMPT] Created prompt: ${promptName} (ID: ${promptId})` - ) - return { promptId, promptName 
} -} - -/** - * Delete an ephemeral AI prompt from Crowdin - */ -export async function deleteEphemeralPrompt( - userId: number, - promptId: number -): Promise { - console.log(`[EPHEMERAL-PROMPT] Deleting prompt ID: ${promptId}`) - - const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` - - const response = await fetch(url, { - method: "DELETE", - headers: crowdinBearerHeaders, - }) - - // 204 No Content is success, 404 is also acceptable (already deleted) - if (!response.ok && response.status !== 404) { - const text = await response.text().catch(() => "") - throw new Error( - `Failed to delete ephemeral prompt ${promptId} (${response.status}): ${text}` - ) - } - - console.log(`[EPHEMERAL-PROMPT] Deleted prompt ID: ${promptId}`) -} diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts deleted file mode 100644 index e0096d6c350..00000000000 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ /dev/null @@ -1,396 +0,0 @@ -// Crowdin file operations - -import { - config, - CROWDIN_API_BASE_URL, - crowdinBearerHeaders, -} from "../../config" -import type { - CrowdinAddFileResponse, - CrowdinFileData, - GitHubCrowdinFileMetadata, -} from "../types" -import { debugLog } from "../workflows/utils" - -/** - * JSX component attributes that should be translated in markdown files. - * These contain human-readable strings, as opposed to technical attributes - * like emoji, eventCategory, href, etc. - * - * Note: Crowdin's PATCH API only accepts a boolean flag (translateAttributes: true) - * to enable attribute translation. The actual whitelist may need to be configured - * separately via the Crowdin UI or a different API endpoint. - * - * See TRANSLATABLE_ATTRIBUTES in jsx-attributes/types.ts for the canonical list. 
- */ - -/** - * Get all files in the Crowdin project - */ -export const getCrowdinProjectFiles = async (): Promise => { - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` - ) - url.searchParams.set("limit", "500") - - debugLog(`Fetching Crowdin project files from: ${url.toString()}`) - - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error( - `Crowdin getCrowdinProjectFiles failed (${res.status}): ${body}` - ) - } - - type JsonResponse = { data: { data: CrowdinFileData }[] } - const json: JsonResponse = await res.json() - - const mappedData = json.data.map(({ data }) => data) - debugLog(`Successfully fetched ${mappedData.length} Crowdin files`) - return mappedData - } catch (error) { - console.error(`[ERROR] Failed to fetch Crowdin project files:`, error) - process.exit(1) - } -} - -/** - * Find a Crowdin file matching a GitHub file. - * Returns null if file not found (indicating it's new and needs to be uploaded). - */ -export const findCrowdinFile = ( - targetFile: GitHubCrowdinFileMetadata, - crowdinFiles: CrowdinFileData[] -): CrowdinFileData | null => { - debugLog(`Looking for Crowdin file matching: ${targetFile.filePath}`) - - const found = crowdinFiles.find(({ path }) => - path.endsWith(targetFile.filePath) - ) - - if (!found) { - // Not an error - file is new and will be uploaded - console.log( - `[INFO] File not in Crowdin (will upload): ${targetFile.filePath}` - ) - return null - } - - debugLog( - `Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` - ) - return found -} - -/** - * Unhides all hidden strings in a Crowdin file. - * Hidden strings (often marked as duplicates) cannot be translated. - * This function makes them visible so they can be processed by pre-translation. 
- */ -export const unhideStringsInFile = async (fileId: number): Promise => { - debugLog(`Checking for hidden strings in fileId=${fileId}`) - - // Get all strings from the file - const listUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings?fileId=${fileId}&limit=500` - - try { - const listRes = await fetch(listUrl, { headers: crowdinBearerHeaders }) - if (!listRes.ok) { - const text = await listRes.text().catch(() => "") - console.warn( - `[UNHIDE] Failed to list strings for fileId=${fileId}: ${text}` - ) - return 0 - } - - const listJson = await listRes.json() - const strings = listJson.data || [] - - let unhiddenCount = 0 - - for (const item of strings) { - const stringId = item.data.id - const isHidden = item.data.isHidden - - if (!isHidden) continue - - // Unhide the string using PATCH - const patchUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings/${stringId}` - - try { - const patchRes = await fetch(patchUrl, { - method: "PATCH", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify([ - { - op: "replace", - path: "/isHidden", - value: false, - }, - ]), - }) - - if (patchRes.ok) { - unhiddenCount++ - } else { - const text = await patchRes.text().catch(() => "") - console.warn(`[UNHIDE] Failed to unhide string ${stringId}: ${text}`) - } - } catch (err) { - console.warn(`[UNHIDE] Error unhiding string ${stringId}:`, err) - } - } - - if (unhiddenCount > 0) { - console.log( - `[UNHIDE] ✓ Unhidden ${unhiddenCount} strings in fileId=${fileId}` - ) - } - - return unhiddenCount - } catch (error) { - console.error(`[UNHIDE] Error processing fileId=${fileId}:`, error) - return 0 - } -} - -/** - * Lists all Crowdin directories in the project. 
- */ -export const getCrowdinProjectDirectories = async (): Promise< - { id: number; name: string; directoryId?: number }[] -> => { - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/directories` - ) - url.searchParams.set("limit", "500") - - debugLog(`Fetching Crowdin directories: ${url.toString()}`) - - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error( - `Crowdin getCrowdinProjectDirectories failed (${res.status}): ${body}` - ) - } - type DirJson = { - data: { data: { id: number; name: string; directoryId?: number } }[] - } - const json: DirJson = await res.json() - const dirs = json.data.map(({ data }) => data) - debugLog(`Loaded ${dirs.length} directories`) - return dirs - } catch (error) { - console.error("[ERROR] getCrowdinProjectDirectories:", error) - throw error - } -} - -/** - * Creates a single Crowdin directory (one segment). Parent may be undefined for root. - */ -export const postCrowdinDirectory = async ( - name: string, - parentDirectoryId?: number -): Promise => { - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/directories` - ) - - const body: Record = { name } - if (parentDirectoryId) body.directoryId = parentDirectoryId - - debugLog( - `Creating directory segment "${name}" parent=${parentDirectoryId ?? 
"ROOT"}` - ) - - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify(body), - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - // 409 = already exists race condition - throw new Error( - `Crowdin postCrowdinDirectory failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { data: { id: number } } - const json: JsonResponse = await res.json() - debugLog(`Created directory id=${json.data.id} name="${name}"`) - return json.data.id - } catch (error) { - console.error("[ERROR] postCrowdinDirectory:", error) - throw error - } -} - -/** - * Ensures a nested path of directories exists. - * Example path: "public/content/community/events/organizing" - * Returns the final (deepest) directory id. - * - * - Splits path on "/" ignoring empty segments. - * - Reuses existing segments (matched by name + parent). - * - Creates missing segments sequentially. 
- */ -export const createCrowdinDirectory = async ( - fullPath: string -): Promise => { - if (!fullPath || typeof fullPath !== "string") { - throw new Error("createCrowdinDirectory: path must be a non-empty string") - } - debugLog(`Ensuring Crowdin directory path: "${fullPath}"`) - - const segments = fullPath - .split("/") - .map((s) => s.trim()) - .filter(Boolean) - if (!segments.length) throw new Error("No valid path segments") - - const invalidChars = /[\\:*?"<>|]/ // Disallowed per Crowdin docs for directory name (exclude forward slash which is path separator) - for (const segment of segments) { - if (invalidChars.test(segment)) { - throw new Error( - `createCrowdinDirectory: segment "${segment}" contains invalid characters in path "${fullPath}"` - ) - } - } - - // Load existing directories once - const existing = await getCrowdinProjectDirectories() - - // Build quick lookup: parentId|name -> id (root parentId = 0 sentinel) - const key = (parentId: number | undefined, name: string) => - `${parentId || 0}|${name}` - - const directoryIndex = new Map() - for (const dir of existing) { - directoryIndex.set(key(dir.directoryId, dir.name), dir.id) - } - - let currentParentId: number | undefined - for (const segment of segments) { - const k = key(currentParentId, segment) - let dirId = directoryIndex.get(k) - if (dirId) { - debugLog( - `Reusing existing directory "${segment}" id=${dirId} parent=${currentParentId ?? 
"ROOT"}` - ) - currentParentId = dirId - continue - } - // Create - dirId = await postCrowdinDirectory(segment, currentParentId) - directoryIndex.set(k, dirId) - currentParentId = dirId - } - - if (!currentParentId) - throw new Error("Failed to resolve final directory id (unexpected)") - - debugLog(`Final directory id for path "${fullPath}" = ${currentParentId}`) - return currentParentId -} - -/** - * Upload a file to Crowdin storage - */ -export const postFileToStorage = async ( - fileBuffer: Buffer, - fileName: string -) => { - const url = new URL(`${CROWDIN_API_BASE_URL}/storages`) - - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - // Crowdin expects raw bytes for storages endpoint; use octet-stream. - "Content-Type": "application/octet-stream", - "Crowdin-API-FileName": fileName, - }, - body: fileBuffer, - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin postFileToStorage failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: { - id: number - fileName: string - } - } - const json: JsonResponse = await res.json() - return json.data - } catch (error) { - console.error("postFileToStorage error:", error) - throw error - } -} - -/** - * Add a file to Crowdin project - */ -export const postCrowdinFile = async ( - storageId: number, - name: string, - dir: string -): Promise => { - const directoryId = await createCrowdinDirectory(dir) - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` - ) - - const requestBody = { - storageId, - name, - directoryId, - } - - // Create the file (errors propagate to caller for graceful handling) - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify(requestBody), - }) - - if (!res.ok) { - const body = await res.text().catch(() => 
"") - throw new Error(`Crowdin postCrowdinFile failed (${res.status}): ${body}`) - } - - type JsonResponse = { data: CrowdinAddFileResponse } - const json: JsonResponse = await res.json() - console.log("Created file:", json.data) - - // Note: parser options are managed in Crowdin UI. No PATCH here. - - return json.data -} diff --git a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt deleted file mode 100644 index d35cf850695..00000000000 --- a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt +++ /dev/null @@ -1,71 +0,0 @@ -You are a professional translator with native-level fluency in both English and all the target languages %targetLanguages% in the project, and expertise in Ethereum, blockchain, cryptocurrency, and decentralized technologies. -You have deep familiarity with open-source communities and technical documentation, enabling you to handle domain-specific terminology accurately. Your task is to produce high-quality translations of ethereum.org content from English into the target language, following the guidelines below. -The source content is content from the ethereum.org website, segmented in the source files as %strings%. -Translate content from English into the target language %targetLanguage% specified for each individual pre-translation project run. -The target language is automatically defined in the project configuration and the pre-translation process — never guess or switch languages. -Always output translations in the target language only. - -CRITICAL DO-NOT-BREAK RULES (must follow exactly): -- JSON escaping: When translating JSON files, ALL double quotes (") inside string values MUST be escaped as \" to maintain valid JSON. Similarly, escape backslashes (\) as \\, newlines as \n, tabs as \t. The output MUST be parseable JSON—invalid JSON will break the build. Example: translate "Learn about "Ethereum"" as "Aprenda sobre \"Ethereum\"", NOT "Aprenda sobre "Ethereum"". 
-- Custom header IDs: If a Markdown heading includes a custom anchor like `{#custom-id}`, the ID MUST remain identical to the English source, ASCII-only (no accents or special characters). Do NOT alter, translate, add, or remove braces. Keep the exact ID string. -- HTML/MDX tag line placement: If an opening HTML tag appears on its own line, the matching closing tag MUST also be on its own line. Preserve line breaks around paired block-level tags. -- JSX/MDX attributes: Translate human-readable text found inside attribute values (e.g., `title="..."`, `aria-label="..."`, `alt="..."`) while preserving placeholders, variables, and code. Do NOT translate attribute names or change quoting/escaping. -- Protected names: Do NOT translate obvious proper names, brands, or team names. This includes programming languages (e.g., "Solidity", "Vyper"), company/product names (e.g., "Alchemy", "Infura", "MetaMask", "Consensys", "Chainlink", "Uniswap", "OpenSea", "OpenZeppelin"), protocol/network names (e.g., "Ethereum", "ETH"), and tools/platforms (e.g., "GitHub", "Crowdin", "ethereum.org"). Leave these as in the source unless a community-approved localized form exists. IMPORTANT: Even when a word has a common translation in the target language (e.g., "Alchemy" meaning the historical practice, or "Solidity" meaning firmness), keep the English term when it refers to a brand, product, or technology name. - - URL/path destinations MUST be preserved character-for-character: keep exact case, hyphens, slashes, fragments (`#...`), and query parameters (`?...`). Do NOT change, normalize, or localize any part of a link destination. This rule also applies to any links contained within JSON string values used in React/MDX pages. - -Maintain Clarity and Professionalism: Ensure the translated text is clear, accurate, and professional in tone, just like the source. 
Match the tone and register of the English content – if the source is explanatory and formal, the translation should mirror that style. Remember that Ethereum’s content serves both experts and complete beginners, so the translation should be accessible to technical and non-technical readers alike. -Consistency with Source Tone: Use a tone that is neither too casual nor overly stiff, unless the source text itself has a specific tone. For example, if the English text uses a friendly and encouraging tone, reflect that in the translation while maintaining professionalism. -Formal Address: In languages that have formal and informal address forms, use the formal form to address the reader. This ensures the content remains respectful and appropriate for all users, and often helps maintain gender-neutrality. Only use an informal tone if the English source explicitly does so. -Idioms and Cultural Nuances: If the source uses idiomatic expressions or culturally specific references, preserve their intent. Replace an idiom only with an equivalent well-understood expression; otherwise keep a direct translation that preserves meaning. - -Certain elements of the source text must be handled with special care during translation: - -Technical Terms: Do not translate highly specific blockchain terms such as "smart contract", "gas", "dapp", or other Ethereum jargon unless there is a widely accepted equivalent. When in doubt, leave the term in English. -Code, Commands, and Output: Retain code snippets, configuration commands, outputs, function names, and anything in backticks or code blocks exactly. Do not translate placeholders (e.g. {value}, %s, <0>...), variables, or braces. Translate English comments inside code (e.g., lines or blocks starting with //, #, or /* ... */) while leaving all code tokens unchanged. -URLs, File Paths, and Domain Names: Never translate or alter these. Preserve exactly, including case and slashes. 
-Markdown, HTML, and JSX/MDX Syntax: Preserve all formatting symbols, tags, and structure. Do not add/remove markers. Keep tag order identical. Translate only human-readable text outside tags. -Punctuation in Code/Text: Do not alter punctuation that is part of code/syntax (e.g., {}, <>, (), []). - -Match Source Capitalization: Preserve capitalization of terms, acronyms, proper nouns (e.g., "Ethereum", "Solidity", "NFT"). Maintain ALL CAPS where used. -Follow Target Language Conventions: Apply normal punctuation/grammar rules of the target language except where code syntax would break. -Sentence Structure: Reorder or split/join sentences only to achieve natural grammar; avoid ambiguity changes. -End Punctuation: Mirror source intent; headings without periods usually remain without periods. - -Use Consistent Terminology: Reuse prior translations for repeated terms unless context demands a change. -Ethereum Glossary and Termbase: If provided, follow those preferred translations strictly. -External Translation Memory (TM): Use exact matches from TM if context fits. -No Glossary or TM? Pick a clear translation and keep it consistent thereafter. - -(If translation memory/termbase resources are available to Crowdin AI, they should be applied to maintain consistency.) - -Preserve Tags and Placeholders: Keep tags/placeholders exactly ordered. Do not duplicate, omit, or reorder them. -Do Not Break Variables: Leave placeholders such as {userName} unchanged; adapt surrounding punctuation only if required. -Avoid Tag Duplication or Omission: Every opening tag must have its closing counterpart. Never remove tags. -Maintain Markdown Structure: Lists, tables, headings remain structurally identical. Custom IDs stay identical to English. -Line Breaks and Whitespace: Avoid introducing/removing line breaks. Keep opening/closing block HTML tags on their own lines when the source does. 
- -Consistency Reminders (non-strict, but preferred): -- Headings: Keep section and subsection heading choices consistent with the English source across the document. If the source uses a particular heading term (e.g., "Overview", "Examples", "Resources"), choose a single clear localized equivalent and reuse it throughout the page. -- Example Arrays and Lists: When the source contains example items (lists of technologies, wallets, tools, etc.), translate common nouns/adjectives to the target language. Retain English only for proper names and brands. Do not revert entire lists to English unless items are proper nouns. -- Stable Canonical Terms: Prefer previously used localized headings/labels for recurring sections when known (e.g., consistent translation for "Learn", "Developers"). If unsure, pick the most natural single term and stick to it within the page. - -Inclusive Language: Use gender-neutral constructions where possible. -Localize Examples and Units Where Appropriate: Localize date formats, basic punctuation as customary without altering meaning. Do not convert currencies. -Cultural References: Prefer clarity over forced local analogies. Keep original if unsure. -Avoid Slang and Colloquialisms: Maintain professional, accessible tone. - -Untranslatable Strings: Keep product names, trademarks, protocol names, abbreviations ("ETH", "NFT", "HTML", "PoW", "PoS", "EIP-1559") unless widely accepted localized form exists. -Placeholders and Dummy Text: Do not translate placeholder tokens or dummy values (e.g., "Lorem ipsum", "user@example.com"). -Flagging Issues: If a string is ambiguous and unsafe to translate confidently, produce a literal translation or leave it for review (do not guess). When consistency conflicts arise (e.g., competing heading variants), prefer the most widely used term in the target language or the termbase entry if available. 
-No Guessing for Missing Context: Choose neutral wording when context is unclear; retain English term where ambiguity could mislead. -´ -Preserve Intended Meaning: Prioritize accurate meaning over literal wording. -Literal vs. Free Translation: Avoid overly literal output if unnatural; adjust for clarity. -Clarity Over Literalness: Prefer clear, idiomatic phrasing that conveys the source meaning. -Avoid Adding Extra Information: Do not introduce new content or explanations. - -High-Quality Output: Output should need minimal post-editing: correct spelling, grammar, style. -Avoiding Errors: Do not omit content or invert meaning; retain numeric values precisely. -Consistency in Style: Maintain uniform formal, explanatory tone throughout. -Minimal Creativity: Rephrase only when necessary for clarity/grammar. -Post-Editing Ready: Deliver translation suitable for quick human approval. \ No newline at end of file diff --git a/src/scripts/i18n/lib/crowdin/pre-translate.ts b/src/scripts/i18n/lib/crowdin/pre-translate.ts deleted file mode 100644 index 69380ab9ca0..00000000000 --- a/src/scripts/i18n/lib/crowdin/pre-translate.ts +++ /dev/null @@ -1,173 +0,0 @@ -// Crowdin pre-translation operations - -import { - config, - CROWDIN_API_BASE_URL, - crowdinBearerHeaders, -} from "../../config" -import type { CrowdinPreTranslateResponse } from "../types" -import { delay } from "../workflows/utils" - -/** - * Apply pre-translation to files - */ -export const postApplyPreTranslation = async ( - fileIds: number[], - languageIds?: string[], - aiPromptIdOverride?: number -): Promise => { - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/pre-translations` - ) - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - languageIds: languageIds || config.allCrowdinCodes, - fileIds, - method: "ai", - aiPromptId: - typeof aiPromptIdOverride === 
"number" - ? aiPromptIdOverride - : config.preTranslatePromptId, - }), - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin postApplyPreTranslation failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: CrowdinPreTranslateResponse - } - const json: JsonResponse = await res.json() - - return json.data - } catch (error) { - console.error("postApplyPreTranslation error:", error) - throw error - } -} - -/** - * Get pre-translation status - */ -export const getPreTranslationStatus = async ( - preTranslationId: string -): Promise => { - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/pre-translations/${preTranslationId}` - ) - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin getPreTranslationStatus failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: CrowdinPreTranslateResponse - } - const json: JsonResponse = await res.json() - - return json.data - } catch (error) { - console.error("getPreTranslationStatus error:", error) - throw error - } -} - -/** - * Polls Crowdin for the status of a pre-translation job and resolves when it finishes. - * - * This function repeatedly calls `getPreTranslationStatus` for the given - * pre-translation ID until the job is no longer in progress. It uses adaptive - * polling intervals based on elapsed time and will abort with an error if the operation - * does not complete within the configured timeout. - * - * @param preTranslationId - The identifier of the Crowdin pre-translation job to monitor. - * @param opts - Optional configuration for timeout and base polling interval - * - * @returns A promise that resolves with the final CrowdinPreTranslateResponse when the - * job status becomes "finished". 
- * - * @throws {Error} If the wait times out - * @throws {Error} If the pre-translation completes with an unexpected status - * @throws {Error} If an error is thrown while fetching the pre-translation status - */ -export const awaitPreTranslationCompleted = async ( - preTranslationId: string, - opts?: { timeoutMs?: number; baseIntervalMs?: number } -): Promise => { - const timeoutMs = opts?.timeoutMs ?? config.pretranslateTimeoutMs - const baseInterval = opts?.baseIntervalMs ?? config.pretranslatePollBaseMs - const start = Date.now() - let attempt = 0 - - const computeInterval = (elapsedMs: number): number => { - const minutes = elapsedMs / 60000 - if (minutes < 10) return baseInterval - if (minutes < 30) return Math.max(baseInterval * 2, 60_000) - if (minutes < 60) return Math.max(baseInterval * 4, 180_000) - return Math.max(baseInterval * 10, 300_000) // cap at 5 min - } - - // Bounded loop: terminates once elapsed exceeds timeoutMs - while (Date.now() - start <= timeoutMs) { - const elapsed = Date.now() - start - attempt++ - let res: CrowdinPreTranslateResponse - try { - res = await getPreTranslationStatus(preTranslationId) - } catch (e) { - // transient fetch errors: log + continue within timeout window - const nextWait = computeInterval(elapsed) - console.warn( - `[PRE-TRANSLATE][POLL] Error on attempt ${attempt}: ${(e as Error).message}. 
Retrying in ${nextWait}ms.` - ) - await delay(nextWait) - continue - } - // "created" means job is queued (e.g., another large job is running) - // "in_progress" means job is actively translating - // Both are valid states to keep polling - if (res.status !== "in_progress" && res.status !== "created") { - if (res.status === "finished") { - console.log( - `[PRE-TRANSLATE][POLL] Completed after ${attempt} attempts; elapsed ${Math.round( - (Date.now() - start) / 60000 - )}m.` - ) - return res - } - throw new Error( - `Pre-translation ended with unexpected status: ${res.status}` - ) - } - const nextWait = computeInterval(elapsed) - const progressPct = res.progress ?? 0 - const statusNote = res.status === "created" ? " (queued)" : "" - console.log( - `[PRE-TRANSLATE][POLL] attempt=${attempt} status=${res.status}${statusNote} progress=${progressPct}% elapsed=${Math.round( - elapsed / 60000 - )}m nextWait=${nextWait}ms` - ) - await delay(nextWait) - } - const finalElapsed = Date.now() - start - throw new Error( - `Timed out waiting for pre-translation (elapsed ${Math.round( - finalElapsed / 60000 - )}m)` - ) -} diff --git a/src/scripts/i18n/lib/crowdin/prompt.ts b/src/scripts/i18n/lib/crowdin/prompt.ts deleted file mode 100644 index 7f0c43a923e..00000000000 --- a/src/scripts/i18n/lib/crowdin/prompt.ts +++ /dev/null @@ -1,72 +0,0 @@ -import * as fs from "fs" - -import { crowdinBearerHeaders } from "../../config" - -/** Crowdin AI prompt resource type */ -export type PromptResource = { - id: number - name: string - action: string - aiProviderId?: number | null - aiModelId?: string | null - model?: string | null - version?: string | null -} - -/** - * Get information about a Crowdin AI prompt including the model being used. 
- * Uses Crowdin API v2: GET /users/{userId}/ai/prompts/{promptId} - */ -export async function getPromptInfo( - userId: number, - promptId: number -): Promise { - const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` - const resp = await fetch(url, { - method: "GET", - headers: crowdinBearerHeaders, - }) - if (!resp.ok) { - const text = await resp.text().catch(() => "") - throw new Error(`Failed to get prompt info (${resp.status}): ${text}`) - } - const json = await resp.json() - return json.data as PromptResource -} - -/** - * Update a Crowdin AI prompt's content from a local file. - * Uses Crowdin API v2: PATCH /users/{userId}/ai/prompts/{promptId} - */ -export async function updatePromptFromFile( - userId: number, - promptId: number, - filePath: string -): Promise { - const content = await fs.promises.readFile(filePath, "utf8") - await updatePromptContent(userId, promptId, content) -} - -/** - * Update a Crowdin AI prompt with provided content. - * Uses Crowdin API v2: PATCH /users/{userId}/ai/prompts/{promptId} - */ -export async function updatePromptContent( - userId: number, - promptId: number, - content: string -): Promise { - const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` - const resp = await fetch(url, { - method: "PATCH", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ content }), - }) - if (!resp.ok) { - const text = await resp.text().catch(() => "") - throw new Error(`Failed to update prompt (${resp.status}): ${text}`) - } -} diff --git a/src/scripts/i18n/lib/crowdin/user.ts b/src/scripts/i18n/lib/crowdin/user.ts deleted file mode 100644 index 954630b6e40..00000000000 --- a/src/scripts/i18n/lib/crowdin/user.ts +++ /dev/null @@ -1,41 +0,0 @@ -import { crowdinBearerHeaders } from "../../config" - -interface CrowdinUser { - id: number - username: string - email: string - emailVerified: boolean - fullName: string - avatarUrl: string 
- createdAt: string - lastSeen: string - twoFactor: string - timezone: string -} - -interface CrowdinUserResponse { - data: CrowdinUser -} - -/** - * Get the authenticated Crowdin user's information - * @returns The authenticated user's data - */ -export async function getCurrentUser(): Promise { - const url = "https://api.crowdin.com/api/v2/user" - - const response = await fetch(url, { - method: "GET", - headers: crowdinBearerHeaders, - }) - - if (!response.ok) { - const text = await response.text().catch(() => "") - throw new Error( - `Failed to fetch current user (${response.status}): ${text}` - ) - } - - const json = (await response.json()) as CrowdinUserResponse - return json.data -} diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts deleted file mode 100644 index 091d38b9f47..00000000000 --- a/src/scripts/i18n/lib/github/files.ts +++ /dev/null @@ -1,297 +0,0 @@ -// GitHub file operations - -import { - config, - doNotTranslatePaths, - gitHubBearerHeaders, - validateTargetPath, -} from "../../config" -import type { - ContentType, - GitHubCrowdinFileMetadata, - GitHubQueryResponseItem, -} from "../types" -import { fetchWithRetry } from "../utils/fetch" -import { debugLog } from "../workflows/utils" - -/** - * Check if a path should be excluded - */ -function isPathExcluded(filePath: string, excludedPaths: string[]): boolean { - return excludedPaths.some((excluded) => filePath.includes(excluded)) -} - -/** - * Check if a path is a file (has .md or .json extension) or directory - */ -function isFilePath(targetPath: string): boolean { - return targetPath.endsWith(".md") || targetPath.endsWith(".json") -} - -/** - * Get English files with optional file/directory filtering and excluded paths. - * If targetPath is a file (ends with .md or .json), returns only that file. - * If targetPath is a directory, returns all files recursively within that directory. - * Otherwise, returns all English content files. 
- */ -export const getAllEnglishFiles = async (): Promise< - GitHubQueryResponseItem[] -> => { - const { targetPath, excludePath } = config - - // Add runtime exclusion if specified - const allExcludedPaths = excludePath - ? [...doNotTranslatePaths, excludePath] - : doNotTranslatePaths - - debugLog( - `Do-not-translate paths loaded: ${doNotTranslatePaths.length} entries` - ) - if (excludePath) { - debugLog(`Runtime path exclusions: ${excludePath}`) - } - - // Multi-file mode: comma-separated paths each fetched individually - if (config.targetPaths.length > 1) { - const allFiles: GitHubQueryResponseItem[] = [] - for (const singlePath of config.targetPaths) { - validateTargetPath(singlePath) - if (isPathExcluded(singlePath, allExcludedPaths)) { - console.log(`[INFO] Path ${singlePath} is in excluded paths, skipping`) - continue - } - if (isFilePath(singlePath)) { - console.log(`[INFO] Fetching file: ${singlePath}`) - try { - const files = await fetchSingleFile(singlePath) - allFiles.push(...files) - } catch (error) { - console.warn( - `[WARN] Could not fetch ${singlePath}, skipping: ${error instanceof Error ? 
error.message : String(error)}` - ) - } - } else { - console.log( - `[WARN] Multi-path mode only supports files, skipping directory: ${singlePath}` - ) - } - } - console.log( - `[INFO] Multi-file mode: ${allFiles.length} files from ${config.targetPaths.length} paths` - ) - return allFiles - } - - // Determine if targetPath is a file or directory - if (targetPath) { - validateTargetPath(targetPath) - if (isPathExcluded(targetPath, allExcludedPaths)) { - console.log(`[INFO] Path ${targetPath} is in excluded paths, skipping`) - return [] - } - - if (isFilePath(targetPath)) { - // Single file mode - console.log(`[INFO] Fetching single file: ${targetPath}`) - return await fetchSingleFile(targetPath) - } else { - // Directory mode - console.log(`[INFO] Fetching files from directory: ${targetPath}`) - } - } - - // Directory mode or full translation - const ghSearchEndpointBase = "https://api.github.com/search/code" - let query: string - - if (targetPath && !isFilePath(targetPath)) { - // Search within specific directory - query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${targetPath}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${targetPath}"` - } else { - // Search all content files - query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${config.mdRoot}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${config.jsonRoot}"` - if (!targetPath) { - console.log(`[INFO] Fetching all English content files`) - } - } - - debugLog(`GitHub search query: ${query}`) - - const perPage = 100 - const collected: GitHubQueryResponseItem[] = [] - - let page = 1 - let hasMorePages = true - while (hasMorePages) { - const url = new URL(ghSearchEndpointBase) - url.searchParams.set("q", query) - url.searchParams.set("per_page", perPage.toString()) - url.searchParams.set("page", page.toString()) - - debugLog(`Fetching search 
page ${page}...`) - - try { - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) - } - - type JsonResponse = { items: GitHubQueryResponseItem[] } - const json: JsonResponse = await res.json() - - if (!json.items.length) { - debugLog(`No more results at page ${page}`) - hasMorePages = false - break - } - - collected.push(...json.items) - debugLog(`Collected ${collected.length} items so far`) - - page += 1 - if (page > 10) { - console.warn(`[WARN] Reached pagination safety cap at page ${page - 1}`) - hasMorePages = false - break - } - } catch (error) { - console.error(`[ERROR] Failed to get English files from GitHub:`, error) - process.exit(1) - } - } - - // Filter out excluded paths (static + runtime) - const filtered = collected.filter( - (item) => !isPathExcluded(item.path, allExcludedPaths) - ) - - const excludedCount = collected.length - filtered.length - if (excludedCount > 0) { - console.log(`[INFO] Filtered out ${excludedCount} excluded files`) - } - - console.log(`[INFO] Total files to translate: ${filtered.length}`) - - return filtered -} - -/** - * Fetch a single file by path from GitHub - */ -async function fetchSingleFile( - filePath: string -): Promise { - const url = new URL( - `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${filePath}` - ) - url.searchParams.set("ref", config.baseBranch) - - try { - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - throw new Error(`Failed to fetch file ${filePath}: ${res.status}`) - } - - const data = await res.json() - - // Convert to GitHubQueryResponseItem format - return [ - { - name: data.name, - path: data.path, - sha: data.sha, - url: data.url, - git_url: data.git_url, - html_url: data.html_url, - repository: { - id: 0, - name: config.ghRepo, - 
full_name: `${config.ghOrganization}/${config.ghRepo}`, - owner: { - login: config.ghOrganization, - id: 0, - node_id: "", - avatar_url: "", - gravatar_id: "", - url: "", - html_url: "", - followers_url: "", - following_url: "", - gists_url: "", - starred_url: "", - subscriptions_url: "", - organizations_url: "", - repos_url: "", - events_url: "", - received_events_url: "", - type: "Organization", - user_view_type: "", - site_admin: false, - }, - } as GitHubQueryResponseItem["repository"], - score: 1, - }, - ] - } catch (error) { - console.error(`[ERROR] Failed to fetch single file ${filePath}:`, error) - throw error - } -} - -/** - * Convert GitHub items to Crowdin file metadata - */ -export const getFileMetadata = async ( - items: GitHubQueryResponseItem[] -): Promise => { - if (!items.length) return [] - - const owner = items[0].repository.owner.login - const repo = items[0].repository.name - - const englishFileMetadata = items.map((item) => { - // https://raw.githubusercontent.com/:owner/:repo/:ref/:path - const download_url = `https://raw.githubusercontent.com/${owner}/${repo}/${config.baseBranch}/${item.path}` - const filePath = item.path - const filePathSplit = filePath.split("/") - const fileName = filePathSplit[filePathSplit.length - 1] - const contentType: ContentType = fileName?.endsWith(".json") - ? 
"application/json" - : "text/markdown" - - return { - "Crowdin-API-FileName": fileName, - filePath: filePath, - download_url: download_url, - "Content-Type": contentType, - } - }) - return englishFileMetadata -} - -/** - * Download a file from GitHub - */ -export const downloadGitHubFile = async ( - download_url: string -): Promise => { - try { - const res = await fetch(download_url) - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error(`Failed to download from GitHub (${res.status}): ${body}`) - } - const arrayBuffer = await res.arrayBuffer() - return Buffer.from(arrayBuffer) - } catch (error) { - console.error("downloadGitHubFile error:", error) - throw error - } -} diff --git a/src/scripts/i18n/lib/jsx-attributes/extract.ts b/src/scripts/i18n/lib/jsx-attributes/extract.ts deleted file mode 100644 index f74c1b7a1b3..00000000000 --- a/src/scripts/i18n/lib/jsx-attributes/extract.ts +++ /dev/null @@ -1,169 +0,0 @@ -/** - * Extract translatable JSX attributes from markdown files - */ - -import type { - ExtractedAttribute, - FileExtractionResult, - TranslatableAttribute, -} from "./types" -import { - JSX_ATTRIBUTE_REGEX, - JSX_COMPONENT_REGEX, - TRANSLATABLE_ATTRIBUTES, -} from "./types" - -/** - * Check if a string appears to be English text (not a variable, URL, or code). - * Uses heuristics: contains spaces, common English words, or sentence structure. 
- */ -function isLikelyEnglishText(value: string): boolean { - // Skip empty or very short values - if (!value || value.length < 3) return false - - // Skip URLs - if (/^https?:\/\//.test(value)) return false - - // Skip paths - if (/^[/.]/.test(value) || /\.(png|jpg|svg|gif|json|md)$/i.test(value)) - return false - - // Skip variables/placeholders like {variable} or {{variable}} - if (/^\{.*\}$/.test(value)) return false - - // Skip CSS classes or technical identifiers (camelCase/kebab-case only) - if (/^[a-z][a-zA-Z0-9-]*$/.test(value) && !value.includes(" ")) return false - - // Skip emoji-only values - if (/^[\p{Emoji}\s]+$/u.test(value)) return false - - // Skip numbers-only - if (/^[\d.,\s%$€£]+$/.test(value)) return false - - // Likely English if it contains spaces (multi-word) or common English patterns - if (value.includes(" ")) return true - - // Single words that look like natural language (capitalized, common endings) - if (/^[A-Z][a-z]+(?:ing|ed|er|est|ly|tion|ness)?$/.test(value)) return true - - return false -} - -/** - * Extract surrounding context (lines before/after) for translation accuracy. - */ -function extractContext( - content: string, - lineNumber: number, - contextLines = 2 -): string { - const lines = content.split("\n") - const startLine = Math.max(0, lineNumber - 1 - contextLines) - const endLine = Math.min(lines.length, lineNumber + contextLines) - - return lines - .slice(startLine, endLine) - .map((line) => line.trim()) - .filter((line) => line.length > 0) - .join(" ") - .slice(0, 500) // Limit context length -} - -/** - * Extract translatable attributes from a single file's content. 
- */ -export function extractAttributesFromContent( - content: string, - filePath: string -): ExtractedAttribute[] { - const attributes: ExtractedAttribute[] = [] - const lines = content.split("\n") - - // Track line numbers for each match - let currentLine = 0 - let currentPos = 0 - - // Process each JSX component - let componentMatch: RegExpExecArray | null - JSX_COMPONENT_REGEX.lastIndex = 0 - - while ((componentMatch = JSX_COMPONENT_REGEX.exec(content)) !== null) { - const componentName = componentMatch[1] - const attributeString = componentMatch[2] - const componentStartPos = componentMatch.index - - // Calculate line number for this component - while (currentPos < componentStartPos && currentLine < lines.length) { - currentPos += lines[currentLine].length + 1 // +1 for newline - currentLine++ - } - const componentLine = currentLine + 1 // 1-indexed - - // Extract attributes from this component - let attrMatch: RegExpExecArray | null - JSX_ATTRIBUTE_REGEX.lastIndex = 0 - - while ((attrMatch = JSX_ATTRIBUTE_REGEX.exec(attributeString)) !== null) { - const attrName = attrMatch[1] - const attrValue = attrMatch[2] || attrMatch[3] // double or single quotes - - // Check if this is a translatable attribute - if ( - !TRANSLATABLE_ATTRIBUTES.includes(attrName as TranslatableAttribute) - ) { - continue - } - - // Check if the value looks like English text needing translation - if (!isLikelyEnglishText(attrValue)) { - continue - } - - attributes.push({ - filePath, - line: componentLine, - column: attrMatch.index, - attributeName: attrName as TranslatableAttribute, - componentName, - originalValue: attrValue, - context: extractContext(content, componentLine), - }) - } - } - - return attributes -} - -/** - * Extract translatable attributes from a file, returning the extraction result. 
- */ -export function extractAttributesFromFile( - content: string, - filePath: string -): FileExtractionResult { - const attributes = extractAttributesFromContent(content, filePath) - - return { - filePath, - attributes, - content, - } -} - -/** - * Extract attributes from multiple files. - */ -export function extractAttributesFromFiles( - files: { path: string; content: string }[] -): FileExtractionResult[] { - return files.map((file) => extractAttributesFromFile(file.content, file.path)) -} - -/** - * Count total attributes across multiple extraction results. - */ -export function countExtractedAttributes( - results: FileExtractionResult[] -): number { - return results.reduce((sum, result) => sum + result.attributes.length, 0) -} diff --git a/src/scripts/i18n/lib/jsx-attributes/index.ts b/src/scripts/i18n/lib/jsx-attributes/index.ts deleted file mode 100644 index 517481472c3..00000000000 --- a/src/scripts/i18n/lib/jsx-attributes/index.ts +++ /dev/null @@ -1,23 +0,0 @@ -/** - * JSX attribute extraction and translation module - */ - -export { - countExtractedAttributes, - extractAttributesFromContent, - extractAttributesFromFile, - extractAttributesFromFiles, -} from "./extract" -export { - reinsertTranslatedAttributes, - reinsertTranslationsForFiles, -} from "./reinsert" -export { - type ExtractedAttribute, - type FileExtractionResult, - type FileTranslationResult, - type JsxTranslationSummary, - TRANSLATABLE_ATTRIBUTES, - type TranslatableAttribute, - type TranslatedAttribute, -} from "./types" diff --git a/src/scripts/i18n/lib/jsx-attributes/reinsert.ts b/src/scripts/i18n/lib/jsx-attributes/reinsert.ts deleted file mode 100644 index d4a3db75290..00000000000 --- a/src/scripts/i18n/lib/jsx-attributes/reinsert.ts +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Re-insert translated attribute values into file content - */ - -import type { - FileExtractionResult, - FileTranslationResult, - TranslatedAttribute, -} from "./types" - -/** - * Escape special regex characters in a 
string - */ -function escapeRegex(str: string): string { - return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") -} - -/** - * Replace a single attribute value in content. - * Handles both double and single quoted attributes. - */ -function replaceAttributeValue( - content: string, - attr: TranslatedAttribute -): string { - // Build regex to find this specific attribute with its original value - // Match: attributeName="originalValue" or attributeName='originalValue' - const escapedOriginal = escapeRegex(attr.originalValue) - const pattern = new RegExp( - `(\\b${attr.attributeName}\\s*=\\s*)(?:"${escapedOriginal}"|'${escapedOriginal}')`, - "g" - ) - - // Replace with translated value, preserving quote style (default to double quotes) - return content.replace(pattern, `$1"${attr.translatedValue}"`) -} - -/** - * Re-insert all translated attributes into a file's content. - */ -export function reinsertTranslatedAttributes( - extraction: FileExtractionResult, - translatedAttributes: TranslatedAttribute[] -): FileTranslationResult { - let updatedContent = extraction.content - let successCount = 0 - - // Sort by position (reverse order) to avoid offset issues when replacing - const sortedAttrs = [...translatedAttributes].sort( - (a, b) => b.line - a.line || b.column - a.column - ) - - for (const attr of sortedAttrs) { - const beforeReplace = updatedContent - updatedContent = replaceAttributeValue(updatedContent, attr) - - if (updatedContent !== beforeReplace) { - successCount++ - } - } - - return { - filePath: extraction.filePath, - translatedAttributes, - updatedContent, - hasChanges: successCount > 0, - } -} - -/** - * Process multiple files with their translated attributes. 
- */ -export function reinsertTranslationsForFiles( - extractions: FileExtractionResult[], - translationsByFile: Map -): FileTranslationResult[] { - return extractions.map((extraction) => { - const translations = translationsByFile.get(extraction.filePath) || [] - return reinsertTranslatedAttributes(extraction, translations) - }) -} diff --git a/src/scripts/i18n/lib/jsx-attributes/types.ts b/src/scripts/i18n/lib/jsx-attributes/types.ts deleted file mode 100644 index 4d4823de6fc..00000000000 --- a/src/scripts/i18n/lib/jsx-attributes/types.ts +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Types for JSX attribute extraction and translation - */ - -/** Regex to match JSX/HTML-style attributes with quoted values */ -export const JSX_ATTRIBUTE_REGEX = - /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g - -/** Regex to identify JSX component opening tags */ -export const JSX_COMPONENT_REGEX = /<([A-Z][a-zA-Z0-9]*)\s+([^>]*?)(?:\/>|>)/g - -/** Attributes that contain human-readable text requiring translation */ -export const TRANSLATABLE_ATTRIBUTES = [ - "title", - "description", - "alt", - "label", - "aria-label", - "placeholder", - "buttonLabel", - "name", - "caption", - "contentPreview", - "location", -] as const - -export type TranslatableAttribute = (typeof TRANSLATABLE_ATTRIBUTES)[number] - -/** A single extracted attribute from a JSX component */ -export interface ExtractedAttribute { - /** File path the attribute was found in */ - filePath: string - /** Line number (1-indexed) where the attribute appears */ - line: number - /** Column position where the attribute value starts */ - column: number - /** The attribute name (e.g., "title", "description") */ - attributeName: TranslatableAttribute - /** The component name (e.g., "Card", "ExpandableCard") */ - componentName: string - /** The original English attribute value */ - originalValue: string - /** Surrounding context (1-2 sentences before/after) for translation accuracy */ - context: 
string -} - -/** Result of extracting attributes from a single file */ -export interface FileExtractionResult { - filePath: string - attributes: ExtractedAttribute[] - /** Original file content for re-insertion */ - content: string -} - -/** A translated attribute ready for re-insertion */ -export interface TranslatedAttribute extends ExtractedAttribute { - translatedValue: string -} - -/** Result of translating attributes for a file */ -export interface FileTranslationResult { - filePath: string - translatedAttributes: TranslatedAttribute[] - /** Updated file content with translations inserted */ - updatedContent: string - /** Whether any attributes were translated */ - hasChanges: boolean -} - -/** Summary of JSX attribute translation for a batch of files */ -export interface JsxTranslationSummary { - /** Total files processed */ - filesProcessed: number - /** Files that had attributes translated */ - filesWithChanges: number - /** Total attributes translated */ - attributesTranslated: number - /** Attributes that failed translation */ - attributesFailed: number - /** Whether Gemini API was available */ - geminiAvailable: boolean - /** Files with updated content */ - updatedFiles: FileTranslationResult[] -} diff --git a/src/scripts/i18n/lib/supabase/glossary.ts b/src/scripts/i18n/lib/supabase/glossary.ts deleted file mode 100644 index f48b93341c8..00000000000 --- a/src/scripts/i18n/lib/supabase/glossary.ts +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Glossary client for fetching community-approved translations - * - * Fetches from the /api/glossary endpoint, which serves data from - * the Supabase `top_translations` view via the data layer. 
- */ - -const GLOSSARY_URL = "https://ethereum.org/api/glossary" - -import type { GlossaryEntry } from "@/data-layer/fetchers/fetchTranslationGlossary" -export type { GlossaryEntry } - -/** Glossary grouped by language code */ -export type GlossaryByLanguage = Map> - -/** Tone for translation register */ -export type Tone = "informal" | "formal" - -/** - * Fetch all glossary entries from the glossary API - */ -export async function fetchGlossaryEntries(): Promise { - try { - const response = await fetch(GLOSSARY_URL) - - if (!response.ok) { - throw new Error(`Glossary API error (${response.status})`) - } - - const entries: GlossaryEntry[] = await response.json() - console.log(`[GLOSSARY] Fetched ${entries.length} glossary entries`) - return entries - } catch (error) { - console.warn("[GLOSSARY] Failed to fetch glossary:", error) - return [] - } -} - -/** - * Group glossary entries by language code for efficient lookup - * Returns Map> - */ -export function groupGlossaryByLanguage( - entries: GlossaryEntry[] -): GlossaryByLanguage { - const byLanguage: GlossaryByLanguage = new Map() - - for (const entry of entries) { - if (!byLanguage.has(entry.language_code)) { - byLanguage.set(entry.language_code, new Map()) - } - byLanguage - .get(entry.language_code)! - .set(entry.string_term, entry.translation_text) - } - - return byLanguage -} - -/** - * Get glossary terms for a specific language code - * Returns Map or empty map if not found - */ -export function getGlossaryForLanguage( - glossary: GlossaryByLanguage, - languageCode: string -): Map { - return glossary.get(languageCode) ?? new Map() -} - -/** - * Format glossary as string for inclusion in AI prompts - */ -export function formatGlossaryForPrompt( - glossaryTerms: Map, - tone: Tone = "informal" -): string { - if (glossaryTerms.size === 0) return "" - - const toneInstruction = - tone === "formal" - ? "Use formal register." - : "Use informal, friendly register." 
- - const terms = Array.from(glossaryTerms.entries()) - .map(([term, translation]) => `- "${term}" → "${translation}"`) - .join("\n") - - return `## REQUIRED TERMINOLOGY - -Use these exact translations. Do not substitute synonyms. -${toneInstruction} - -${terms}` -} diff --git a/src/scripts/i18n/lib/supabase/index.ts b/src/scripts/i18n/lib/supabase/index.ts deleted file mode 100644 index 1689b520c25..00000000000 --- a/src/scripts/i18n/lib/supabase/index.ts +++ /dev/null @@ -1,9 +0,0 @@ -// Supabase integration exports - -export type { GlossaryByLanguage, GlossaryEntry, Tone } from "./glossary" -export { - fetchGlossaryEntries, - formatGlossaryForPrompt, - getGlossaryForLanguage, - groupGlossaryByLanguage, -} from "./glossary" diff --git a/src/scripts/i18n/lib/types.ts b/src/scripts/i18n/lib/types.ts deleted file mode 100644 index 995f70dec36..00000000000 --- a/src/scripts/i18n/lib/types.ts +++ /dev/null @@ -1,242 +0,0 @@ -/** - * GET https://api.github.com/search/code - */ -export type GHOwner = { - login: string - id: number - node_id: string - avatar_url: string - gravatar_id: string - url: string - html_url: string - followers_url: string - following_url: string - gists_url: string - starred_url: string - subscriptions_url: string - organizations_url: string - repos_url: string - events_url: string - received_events_url: string - type: string - user_view_type: string - site_admin: boolean -} - -export type GHRepository = { - id: number - node_id: string - name: string - full_name: string - private: boolean - owner: GHOwner - html_url: string - description: string | null - fork: boolean - url: string - forks_url: string - keys_url: string - collaborators_url: string - teams_url: string - hooks_url: string - issue_events_url: string - events_url: string - assignees_url: string - branches_url: string - tags_url: string - blobs_url: string - git_tags_url: string - git_refs_url: string - trees_url: string - statuses_url: string - languages_url: string - 
stargazers_url: string - contributors_url: string - subscribers_url: string - subscription_url: string - commits_url: string - git_commits_url: string - comments_url: string - issue_comment_url: string - contents_url: string - compare_url: string - merges_url: string - archive_url: string - downloads_url: string - issues_url: string - pulls_url: string - milestones_url: string - notifications_url: string - labels_url: string - releases_url: string - deployments_url: string -} - -export type GitHubQueryResponseItem = { - name: string - path: string - sha: string - url: string - git_url: string - html_url: string - repository: GHRepository - score: number -} - -// Optional: the whole response is an array of items -export type GitHubQueryResponse = GitHubQueryResponseItem[] - -/** - * getFileMetadata - */ -export type ContentType = - | "application/json" - | "text/markdown" - | "application/octet-stream" - -export type GitHubCrowdinFileMetadata = { - "Crowdin-API-FileName": string - filePath: string // e.g., src/intl/en/page-layer-2-networks.json (no leading slash) - download_url: string - "Content-Type": ContentType -} - -/** - * GET https://api.crowdin.com/api/v2/projects/${env.projectID}/files - */ -export type CrowdinImportOptions = { - contentSegmentation: boolean - customSegmentation: boolean - excludeCodeBlocks: boolean - excludedFrontMatterElements: string[] - inlineTags: string[] -} - -export type CrowdinExportOptions = { - exportPattern: string | null - strongMarker: string - emphasisMarker: string - unorderedListBullet: string - tableColumnWidth: string - frontMatterQuotes: string -} - -export type CrowdinFileData = { - id: number // fileId - projectId: number - branchId: number | null - directoryId: number - name: string - title: string | null - context: string | null - type: "md" | "json" // string - path: string // e.g., /public/content/about/index.md (with leading slash) - status: string - revisionId: number - priority: string - importOptions: 
CrowdinImportOptions - exportOptions: CrowdinExportOptions - excludedTargetLanguages: string[] | null - parserVersion: number - createdAt: string - updatedAt: string -} - -/** - * PUT https://api.crowdin.com/api/v2/projects/${projectId}/files/${fileId} - * https://support.crowdin.com/developer/api/v2/#tag/Source-Files/operation/api.projects.files.put - */ -export type CrowdinFileInfoResponseModel = { - id: number - projectId: number - branchId: number | null - directoryId: number | null - name: string - title: string | null - context: string | null - type: string - path: string - status: string - revisionId: number - priority: string - importOptions: Record | null - exportOptions: Record | null - excludedTargetLanguages: string[] | null - parserVersion: number | null - createdAt: string | null - updatedAt: string | null -} - -export type CrowdinPreTranslateAttributes = { - languageIds: string[] - fileIds: number[] - method: string - autoApproveOption: string - duplicateTranslations: boolean - skipApprovedTranslations: boolean - labelIds: number[] - aiPromptId: number | null - excludeLabelIds: number[] - sourceLanguageId: string | null - fallbackLanguages: string[] | null - translateUntranslatedOnly: boolean - translateWithPerfectMatchOnly: boolean -} - -export type CrowdinPreTranslateResponse = { - identifier: string - status: "created" | "in_progress" | "canceled" | "failed" | "finished" - progress: number // In percentages - attributes: CrowdinPreTranslateAttributes - createdAt: string - updatedAt: string - startedAt: string | null - finishedAt: string | null - eta: string | null -} - -export type BuildProjectFileTranslationResponse = { - url: string - expireIn: string - etag: string -} - -export type BranchObject = { - sha: string - type: string // e.g. "commit" - url: string -} - -export type BranchDetailsResponse = { - ref: string // e.g. 
"refs/heads/dev" - node_id: string - url: string - object: BranchObject -} - -export type CrowdinAddFileResponse = { - id: number - projectId: number - branchId: number | null - directoryId: number | null - name: string - title: string | null - context: string | null - type: string - path: string - status: string - revisionId: number - priority: string - importOptions: Record | null - exportOptions: Record | null - excludedTargetLanguages: string[] | null - parserVersion: number | null - createdAt: string | null - updatedAt: string | null -} - -export type I18nConfigItem = { - code: string - crowdinCode: string - name: string -} diff --git a/src/scripts/i18n/lib/utils/mapping.ts b/src/scripts/i18n/lib/utils/mapping.ts deleted file mode 100644 index e4f543e942d..00000000000 --- a/src/scripts/i18n/lib/utils/mapping.ts +++ /dev/null @@ -1,18 +0,0 @@ -import { crowdinToInternalCodeMapping } from "../../config" - -/** - * Convert Crowdin language code to internal language code - */ -export function mapCrowdinCodeToInternal(crowdinCode: string): string { - return crowdinToInternalCodeMapping[crowdinCode] || crowdinCode -} - -/** - * Convert internal language code to Crowdin language code - */ -export function mapInternalCodeToCrowdin(internalCode: string): string { - const entry = Object.entries(crowdinToInternalCodeMapping).find( - ([, internal]) => internal === internalCode - ) - return entry ? 
entry[0] : internalCode -} diff --git a/src/scripts/i18n/lib/validation/syntax-tree.ts b/src/scripts/i18n/lib/validation/syntax-tree.ts deleted file mode 100644 index c1b7762ba5d..00000000000 --- a/src/scripts/i18n/lib/validation/syntax-tree.ts +++ /dev/null @@ -1,377 +0,0 @@ -// Syntax tree validation for JSON and Markdown files - -import type { TranslatableAttribute } from "../jsx-attributes/types" -import { - JSX_ATTRIBUTE_REGEX, - JSX_COMPONENT_REGEX, - TRANSLATABLE_ATTRIBUTES, -} from "../jsx-attributes/types" - -export interface JsonValidationResult { - isValid: boolean - expectedKeyCount: number - actualKeyCount: number - missingKeys: string[] - extraKeys: string[] - orderMatches: boolean -} - -export interface MarkdownValidationResult { - isValid: boolean - expectedHeadingCount: number - actualHeadingCount: number - mismatchedHeadings: Array<{ - level: number - expectedId: string - actualId: string | null - line: number - }> -} - -export interface JsxAttributeValidationResult { - isValid: boolean - untranslatedCount: number - totalCount: number - untranslatedPercentage: number - untranslatedAttributes: Array<{ - attributeName: string - componentName: string - englishValue: string - translatedValue: string - line: number - }> -} - -/** - * Extract JSON keys in order from a JSON string - */ -function extractJsonKeys(jsonContent: string): string[] { - try { - const obj = JSON.parse(jsonContent) - if (typeof obj !== "object" || obj === null || Array.isArray(obj)) { - return [] - } - return Object.keys(obj) - } catch { - return [] - } -} - -/** - * Validate JSON file structure against English source - */ -export function validateJsonStructure( - englishContent: string, - translatedContent: string -): JsonValidationResult { - const englishKeys = extractJsonKeys(englishContent) - const translatedKeys = extractJsonKeys(translatedContent) - - const englishKeySet = new Set(englishKeys) - const translatedKeySet = new Set(translatedKeys) - - const missingKeys = 
englishKeys.filter((key) => !translatedKeySet.has(key)) - const extraKeys = translatedKeys.filter((key) => !englishKeySet.has(key)) - - const orderMatches = - JSON.stringify(englishKeys) === JSON.stringify(translatedKeys) - - return { - isValid: missingKeys.length === 0 && extraKeys.length === 0, - expectedKeyCount: englishKeys.length, - actualKeyCount: translatedKeys.length, - missingKeys, - extraKeys, - orderMatches, - } -} - -/** - * Extract markdown headings with their custom IDs - */ -function extractMarkdownHeadings( - content: string -): Array<{ level: number; id: string | null; line: number }> { - const lines = content.split("\n") - const headings: Array<{ level: number; id: string | null; line: number }> = [] - - for (let i = 0; i < lines.length; i++) { - const line = lines[i] - const headingMatch = line.match(/^(#{1,6})\s+(.+)$/) - - if (headingMatch) { - const level = headingMatch[1].length - const headingText = headingMatch[2] - - // Extract custom ID if present (e.g., "Heading text {#custom-id}") - const idMatch = headingText.match(/\{#([^}]+)\}\s*$/) - const customId = idMatch ? 
idMatch[1] : null - - headings.push({ - level, - id: customId, - line: i + 1, - }) - } - } - - return headings -} - -/** - * Validate markdown heading structure against English source - */ -export function validateMarkdownStructure( - englishContent: string, - translatedContent: string -): MarkdownValidationResult { - const englishHeadings = extractMarkdownHeadings(englishContent) - const translatedHeadings = extractMarkdownHeadings(translatedContent) - - const mismatchedHeadings: Array<{ - level: number - expectedId: string - actualId: string | null - line: number - }> = [] - - // Check if heading counts match - if (englishHeadings.length !== translatedHeadings.length) { - return { - isValid: false, - expectedHeadingCount: englishHeadings.length, - actualHeadingCount: translatedHeadings.length, - mismatchedHeadings: [], - } - } - - // Compare each heading - for (let i = 0; i < englishHeadings.length; i++) { - const englishHeading = englishHeadings[i] - const translatedHeading = translatedHeadings[i] - - // Check if level matches - if (englishHeading.level !== translatedHeading.level) { - mismatchedHeadings.push({ - level: translatedHeading.level, - expectedId: englishHeading.id || "(no id)", - actualId: translatedHeading.id, - line: translatedHeading.line, - }) - continue - } - - // Check if custom IDs match (if present in English) - if (englishHeading.id && englishHeading.id !== translatedHeading.id) { - mismatchedHeadings.push({ - level: translatedHeading.level, - expectedId: englishHeading.id, - actualId: translatedHeading.id, - line: translatedHeading.line, - }) - } - } - - return { - isValid: mismatchedHeadings.length === 0, - expectedHeadingCount: englishHeadings.length, - actualHeadingCount: translatedHeadings.length, - mismatchedHeadings, - } -} - -// JSX_COMPONENT_REGEX and JSX_ATTRIBUTE_REGEX imported from jsx-attributes/types - -/** - * Extract JSX component attributes from content - * Returns a map of componentName.attrName -> value for matching - */ 
-function extractJsxAttributes( - content: string -): Map { - const attributes = new Map< - string, - { value: string; line: number; componentName: string } - >() - - const lines = content.split("\n") - let currentLine = 0 - let currentPos = 0 - - let componentMatch: RegExpExecArray | null - JSX_COMPONENT_REGEX.lastIndex = 0 - - while ((componentMatch = JSX_COMPONENT_REGEX.exec(content)) !== null) { - const componentName = componentMatch[1] - const attributeString = componentMatch[2] - const componentStartPos = componentMatch.index - - // Calculate line number - while (currentPos < componentStartPos && currentLine < lines.length) { - currentPos += lines[currentLine].length + 1 - currentLine++ - } - const componentLine = currentLine + 1 - - let attrMatch: RegExpExecArray | null - JSX_ATTRIBUTE_REGEX.lastIndex = 0 - - while ((attrMatch = JSX_ATTRIBUTE_REGEX.exec(attributeString)) !== null) { - const attrName = attrMatch[1] - const attrValue = attrMatch[2] || attrMatch[3] - - if (!TRANSLATABLE_ATTRIBUTES.includes(attrName as TranslatableAttribute)) - continue - - // Use component position + attribute name as key for matching - // This allows us to match attributes even if component names differ slightly - const key = `${componentLine}:${attrName}` - attributes.set(key, { - value: attrValue, - line: componentLine, - componentName, - }) - } - } - - return attributes -} - -/** - * Validate JSX attributes by comparing translated content against English source. - * An attribute is considered untranslated if its value is IDENTICAL to the English source. 
- */ -export function validateJsxAttributes( - englishContent: string, - translatedContent: string, - threshold = 5 -): JsxAttributeValidationResult { - const englishAttrs = extractJsxAttributes(englishContent) - const translatedAttrs = extractJsxAttributes(translatedContent) - - const untranslatedAttributes: JsxAttributeValidationResult["untranslatedAttributes"] = - [] - let totalCount = 0 - - // Compare each English attribute with its translated counterpart - for (const [key, englishAttr] of englishAttrs) { - const translatedAttr = translatedAttrs.get(key) - - // Skip if attribute doesn't exist in translation (structural difference) - if (!translatedAttr) continue - - totalCount++ - - // Check if the translated value is IDENTICAL to English (i.e., not translated) - if (translatedAttr.value === englishAttr.value) { - untranslatedAttributes.push({ - attributeName: key.split(":")[1], - componentName: translatedAttr.componentName, - englishValue: englishAttr.value, - translatedValue: translatedAttr.value, - line: translatedAttr.line, - }) - } - } - - const untranslatedPercentage = - totalCount > 0 ? 
(untranslatedAttributes.length / totalCount) * 100 : 0 - - return { - isValid: untranslatedPercentage <= threshold, - untranslatedCount: untranslatedAttributes.length, - totalCount, - untranslatedPercentage, - untranslatedAttributes, - } -} - -/** - * Format validation results into a markdown comment - */ -export function formatValidationComment( - validationResults: Array<{ - path: string - type: "json" | "markdown" | "jsx-attributes" - result: - | JsonValidationResult - | MarkdownValidationResult - | JsxAttributeValidationResult - }> -): string | null { - const issues = validationResults.filter((v) => !v.result.isValid) - - if (issues.length === 0) { - return null - } - - let comment = "## ⚠️ Syntax Tree Validation Issues\n\n" - comment += - "The following files have structural differences from their English source:\n\n" - - for (const issue of issues) { - comment += `### \`${issue.path}\`\n\n` - - if (issue.type === "json") { - const result = issue.result as JsonValidationResult - comment += `**JSON Structure Issues:**\n` - comment += `- Expected keys: ${result.expectedKeyCount}\n` - comment += `- Actual keys: ${result.actualKeyCount}\n` - - if (result.missingKeys.length > 0) { - comment += `- Missing keys: ${result.missingKeys.map((k) => `\`${k}\``).join(", ")}\n` - } - - if (result.extraKeys.length > 0) { - comment += `- Extra keys: ${result.extraKeys.map((k) => `\`${k}\``).join(", ")}\n` - } - - if ( - !result.orderMatches && - result.missingKeys.length === 0 && - result.extraKeys.length === 0 - ) { - comment += `- ⚠️ Key order differs from English version\n` - } - } else if (issue.type === "markdown") { - const result = issue.result as MarkdownValidationResult - comment += `**Markdown Structure Issues:**\n` - comment += `- Expected headings: ${result.expectedHeadingCount}\n` - comment += `- Actual headings: ${result.actualHeadingCount}\n` - - if (result.mismatchedHeadings.length > 0) { - comment += `\n**Mismatched Headings:**\n` - for (const mismatch of 
result.mismatchedHeadings) { - comment += `- Line ${mismatch.line}: Expected ID \`${mismatch.expectedId}\`, found \`${mismatch.actualId || "(none)"}\`\n` - } - } - } else if (issue.type === "jsx-attributes") { - const result = issue.result as JsxAttributeValidationResult - comment += `**Untranslated JSX Attributes (identical to English):**\n` - comment += `- Untranslated: ${result.untranslatedCount} / ${result.totalCount} (${result.untranslatedPercentage.toFixed(1)}%)\n` - - if (result.untranslatedAttributes.length > 0) { - comment += `\n**Attributes that need translation:**\n` - // Show up to 10 examples - const examples = result.untranslatedAttributes.slice(0, 10) - for (const attr of examples) { - const truncatedValue = - attr.englishValue.length > 50 - ? attr.englishValue.slice(0, 47) + "..." - : attr.englishValue - comment += `- Line ${attr.line}: \`<${attr.componentName} ${attr.attributeName}="${truncatedValue}">\`\n` - } - if (result.untranslatedAttributes.length > 10) { - comment += `- ... 
and ${result.untranslatedAttributes.length - 10} more\n` - } - } - } - - comment += `\n` - } - - comment += `\n---\n` - comment += `*This validation check ensures translated files maintain the same structure as the English source.*` - - return comment -} diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts deleted file mode 100644 index e8a3810fd61..00000000000 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ /dev/null @@ -1,204 +0,0 @@ -// File preparation workflow phase - -import { config, crowdinBearerHeaders } from "../../config" -import { - findCrowdinFile, - postCrowdinFile, - postFileToStorage, - unhideStringsInFile, -} from "../crowdin/files" -import { getCurrentUser } from "../crowdin/user" -import { - downloadGitHubFile, - getAllEnglishFiles, - getFileMetadata, -} from "../github/files" -import type { CrowdinFileData } from "../types" - -import type { FilePreparationResult, WorkflowContext } from "./types" -import { debugLog, delay, logSection } from "./utils" - -/** - * Update existing file in Crowdin with latest English content - */ -async function updateCrowdinFile( - file: { - filePath: string - download_url: string - "Crowdin-API-FileName": string - }, - foundFile: CrowdinFileData -): Promise<{ fileId: number; path: string; buffer: Buffer }> { - const fileBuffer = await downloadGitHubFile(file.download_url) - const storageInfo = await postFileToStorage( - fileBuffer, - file["Crowdin-API-FileName"] - ) - - // Update the file content using PUT - const updateUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` - const updateBody = { storageId: storageInfo.id } - - const updateResp = await fetch(updateUrl, { - method: "PUT", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(updateBody), - }) - - if (!updateResp.ok) { - const text = await updateResp.text().catch(() => "") - throw new 
Error( - `Failed to update Crowdin file ${foundFile.id} (${updateResp.status}): ${text}` - ) - } - - console.log( - `✓ Updated Crowdin file: ${file.filePath} (fileId: ${foundFile.id}, storageId: ${storageInfo.id})` - ) - - // Wait for file parsing after update - const delayMs = 10000 - debugLog(`Waiting ${delayMs / 1000}s for Crowdin to re-parse updated file...`) - await delay(delayMs) - - return { - fileId: foundFile.id, - path: foundFile.path, - buffer: fileBuffer, - } -} - -/** - * Create new file in Crowdin - */ -async function createCrowdinFile(file: { - filePath: string - download_url: string - "Crowdin-API-FileName": string -}): Promise<{ fileId: number; path: string; buffer: Buffer }> { - console.log(`Creating new file in Crowdin: ${file.filePath}`) - - const fileBuffer = await downloadGitHubFile(file.download_url) - const storageInfo = await postFileToStorage( - fileBuffer, - file["Crowdin-API-FileName"] - ) - - // Derive full parent directory path (exclude filename) - const parts = file.filePath.split("/").filter(Boolean) - parts.pop() // remove filename - const parentDirPath = parts.join("/") || "/" - - const crowdinFileResponse = await postCrowdinFile( - storageInfo.id, - file["Crowdin-API-FileName"], - parentDirPath - ) - - console.log(`✓ Created new Crowdin file (ID: ${crowdinFileResponse.id})`) - - // Wait for new file parsing - const delayMs = 10000 - debugLog(`Waiting ${delayMs / 1000}s for Crowdin to parse new file...`) - await delay(delayMs) - - return { - fileId: crowdinFileResponse.id, - path: crowdinFileResponse.path, - buffer: fileBuffer, - } -} - -/** - * Upload/update English files to Crowdin and prepare for translation - */ -export async function prepareEnglishFiles( - context: WorkflowContext -): Promise { - const { - crowdinProjectFiles, - fileIdsSet, - processedFileIdToPath, - englishBuffers, - } = context - - logSection("Preparing English Files") - - // Get current user ID for ephemeral prompt cleanup later - const currentUser = await 
getCurrentUser() - context.crowdinUserId = currentUser.id - - // Fetch English files - const allEnglishFiles = await getAllEnglishFiles() - - if (!allEnglishFiles.length) { - console.log("No files to translate, exiting") - process.exit(0) - } - - debugLog(`Found ${allEnglishFiles.length} English files`) - debugLog(`Found ${crowdinProjectFiles.length} files in Crowdin project`) - - const fileMetadata = await getFileMetadata(allEnglishFiles) - - // Track failed files for summary - const failedFiles: Array<{ path: string; error: string }> = [] - let successCount = 0 - - // Iterate through each file and upload/update - for (const file of fileMetadata) { - debugLog(`Processing file: ${file.filePath}`) - - try { - // findCrowdinFile returns null if file doesn't exist (will be created) - const foundFile = findCrowdinFile(file, crowdinProjectFiles) - - const result = foundFile - ? await updateCrowdinFile(file, foundFile) - : await createCrowdinFile(file) - - fileIdsSet.add(result.fileId) - if (result.path) { - processedFileIdToPath[result.fileId] = result.path - } - englishBuffers[result.fileId] = result.buffer - successCount++ - } catch (error) { - // Log and continue - don't let one file failure kill the entire job - const message = error instanceof Error ? error.message : String(error) - failedFiles.push({ path: file.filePath, error: message }) - console.warn(`[WARN] Skipping ${file.filePath}: ${message}`) - } - } - - // Log summary of failed files - if (failedFiles.length > 0) { - console.log(`\n[SUMMARY] ${failedFiles.length} files skipped:`) - failedFiles.forEach((f) => console.log(` - ${f.path}`)) - } - - // Exit 1 only if ALL files failed - if (successCount === 0 && failedFiles.length > 0) { - console.error("[ERROR] All files failed to process") - process.exit(1) - } - - console.log( - `\n[INFO] Processed ${successCount} files successfully${failedFiles.length > 0 ? 
`, ${failedFiles.length} skipped` : ""}` - ) - - // Unhide any hidden/duplicate strings before pre-translation - logSection(`Unhiding Strings in ${fileIdsSet.size} Files`) - for (const fileId of Array.from(fileIdsSet)) { - await unhideStringsInFile(fileId) - } - - return { - fileIdsSet, - processedFileIdToPath, - englishBuffers, - } -} diff --git a/src/scripts/i18n/lib/workflows/gemini-initialize.ts b/src/scripts/i18n/lib/workflows/gemini-initialize.ts deleted file mode 100644 index a9f51cd7049..00000000000 --- a/src/scripts/i18n/lib/workflows/gemini-initialize.ts +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Initialize the Gemini translation workflow. - * No Crowdin -- just fetch English files, glossary, and config. - */ - -import { config } from "../../config" -import { createRateLimiter } from "../ai/rate-limiter" -import { getAllEnglishFiles } from "../github/files" -import { - fetchGlossaryEntries, - type GlossaryByLanguage, - groupGlossaryByLanguage, -} from "../supabase/glossary" -import { fetchWithRetry } from "../utils/fetch" - -import { logSection } from "./utils" - -export interface GeminiWorkflowContext { - englishFiles: Array<{ - path: string - content: string - type: "markdown" | "json" - }> - glossary: GlossaryByLanguage - targetLanguages: string[] -} - -/** - * Download file content from GitHub. - */ -async function downloadFileContent(filePath: string): Promise { - const url = new URL( - `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${filePath}` - ) - url.searchParams.set("ref", config.baseBranch) - const res = await fetchWithRetry(url.toString(), { - headers: { - Authorization: `Bearer ${process.env.I18N_GITHUB_API_KEY}`, - Accept: "application/vnd.github.v3.raw", - }, - }) - - if (!res.ok) { - throw new Error(`Failed to download ${filePath}: ${res.status}`) - } - - return res.text() -} - -/** - * Initialize the workflow context. 
- */ -export async function geminiInitialize(): Promise { - logSection("Gemini Translation - Initialize") - - // Fetch English file list - console.log("[init] Fetching English source files...") - const fileList = await getAllEnglishFiles() - console.log(`[init] Found ${fileList.length} English files`) - - // Download file contents with bounded concurrency - console.log("[init] Downloading file contents...") - const englishFiles: GeminiWorkflowContext["englishFiles"] = [] - const downloadLimiter = createRateLimiter(10, 100) - - await Promise.all( - fileList.map(async (file) => { - await downloadLimiter.acquire() - try { - const content = await downloadFileContent(file.path) - const type = file.path.endsWith(".json") ? "json" : "markdown" - englishFiles.push({ path: file.path, content, type }) - } catch (error) { - console.warn( - `[init] Failed to download ${file.path}: ${error instanceof Error ? error.message : String(error)}` - ) - } finally { - downloadLimiter.release() - } - }) - ) - - // Sort by path for deterministic ordering (concurrent download completes out of order) - englishFiles.sort((a, b) => a.path.localeCompare(b.path)) - - console.log(`[init] Downloaded ${englishFiles.length} files`) - - // Fetch glossary - console.log("[init] Fetching glossary...") - let glossary: GlossaryByLanguage = new Map() - try { - const entries = await fetchGlossaryEntries() - glossary = groupGlossaryByLanguage(entries) - console.log( - `[init] Glossary loaded: ${entries.length} entries across ${glossary.size} languages` - ) - } catch (error) { - console.warn( - `[init] Glossary fetch failed, proceeding without: ${error instanceof Error ? 
error.message : String(error)}` - ) - } - - const targetLanguages = config.allInternalCodes - - console.log(`[init] Target languages: ${targetLanguages.join(", ")}`) - console.log(`[init] Files to translate: ${englishFiles.length}`) - - return { englishFiles, glossary, targetLanguages } -} diff --git a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts b/src/scripts/i18n/lib/workflows/gemini-translate-files.ts deleted file mode 100644 index 987e4405b31..00000000000 --- a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts +++ /dev/null @@ -1,218 +0,0 @@ -/** - * Orchestrate file translation per language. - * - * Each successful translation is committed immediately (amend pattern: - * one growing commit per language). Failed files are skipped, not fatal. - * The pipeline only throws if ALL files for ALL languages fail. - */ - -import { translateFile } from "../ai/gemini-translate" -import { - initProgress, - isFileCompleted, - isLanguageCompleted, - markFileCompleted, - markFileFailed, - markLanguageCompleted, - type TranslationProgress, -} from "../ai/progress-tracker" -import { createRateLimiter } from "../ai/rate-limiter" -import { getDestinationFromPath, IncrementalCommitter } from "../github/commits" -import { getGlossaryForLanguage } from "../supabase/glossary" - -import type { GeminiWorkflowContext } from "./gemini-initialize" -import { logSection } from "./utils" - -interface CommitFile { - path: string - content: string -} - -interface TranslationStats { - filesTranslated: number - filesSkipped: number - filesFailed: number - totalInputTokens: number - totalOutputTokens: number -} - -/** - * Translate all files for all target languages. - * Files are committed as they complete (no work lost on partial failure). - * Throws only if zero files were translated across all languages. 
- */ -export async function geminiTranslateFiles( - context: GeminiWorkflowContext, - branchName: string, - runId: string -): Promise<{ - branch: string - stats: Record - committedFiles: CommitFile[] - failedFiles: string[] -}> { - const { englishFiles, glossary, targetLanguages } = context - const concurrency = Number(process.env.GEMINI_CONCURRENCY) || 3 - const progress = initProgress(runId, targetLanguages) - const allStats: Record = {} - const allCommittedFiles: CommitFile[] = [] - const allFailedFiles: string[] = [] - - for (const language of targetLanguages) { - logSection(`Translating: ${language}`) - - if (isLanguageCompleted(progress, language)) { - console.log(`[translate] ${language} already completed, skipping`) - continue - } - - const glossaryTerms = getGlossaryForLanguage(glossary, language) - console.log( - `[translate] ${language}: ${englishFiles.length} files, ${glossaryTerms.size} glossary terms, concurrency ${concurrency}` - ) - - const { stats, files, failedFiles } = await translateLanguage( - englishFiles, - language, - glossaryTerms, - branchName, - concurrency, - progress - ) - - allStats[language] = stats - allCommittedFiles.push(...files) - allFailedFiles.push(...failedFiles.map((f) => `${language}:${f}`)) - - if (stats.filesTranslated > 0) { - markLanguageCompleted(progress, language) - } - - console.log( - `[translate] ${language} done: ${stats.filesTranslated} translated, ${stats.filesSkipped} skipped, ${stats.filesFailed} failed` - ) - console.log( - `[translate] ${language} tokens: ${stats.totalInputTokens} in, ${stats.totalOutputTokens} out` - ) - } - - // Fail if nothing was translated at all - const totalTranslated = Object.values(allStats).reduce( - (sum, s) => sum + s.filesTranslated, - 0 - ) - if (totalTranslated === 0 && allFailedFiles.length > 0) { - throw new Error( - `All translations failed. 
Failed files:\n${allFailedFiles.map((f) => ` - ${f}`).join("\n")}` - ) - } - - if (allFailedFiles.length > 0) { - console.warn( - `[translate] ${allFailedFiles.length} file(s) failed (${totalTranslated} succeeded):\n${allFailedFiles.map((f) => ` - ${f}`).join("\n")}` - ) - } - - return { - branch: branchName, - stats: allStats, - committedFiles: allCommittedFiles, - failedFiles: allFailedFiles, - } -} - -/** - * Translate all files for a single language. - * Each success is committed immediately via IncrementalCommitter. - * Failures are logged and skipped. - */ -async function translateLanguage( - englishFiles: GeminiWorkflowContext["englishFiles"], - language: string, - glossaryTerms: Map, - branchName: string, - concurrency: number, - progress: TranslationProgress -): Promise<{ - stats: TranslationStats - files: CommitFile[] - failedFiles: string[] -}> { - const limiter = createRateLimiter(concurrency) - const stats: TranslationStats = { - filesTranslated: 0, - filesSkipped: 0, - filesFailed: 0, - totalInputTokens: 0, - totalOutputTokens: 0, - } - - const translatedFiles: CommitFile[] = [] - const failedFiles: string[] = [] - - // Incremental committer: one amending commit per language - const committer = new IncrementalCommitter( - branchName, - `i18n(${language}): Gemini translation` - ) - await committer.init() - - // Process files with bounded concurrency - const tasks = englishFiles.map((file) => async () => { - // Skip already completed - if (isFileCompleted(progress, language, file.path)) { - stats.filesSkipped++ - return - } - - await limiter.acquire() - try { - const result = await translateFile({ - filePath: file.path, - fileContent: file.content, - fileType: file.type, - targetLanguage: language, - glossaryTerms, - }) - - const destPath = getDestinationFromPath(file.path, language) - - // Commit immediately -- serialized internally by the committer - await committer.commitFile(destPath, result.translatedContent) - - translatedFiles.push({ - path: 
destPath, - content: result.translatedContent, - }) - - stats.filesTranslated++ - stats.totalInputTokens += result.tokensUsed.input - stats.totalOutputTokens += result.tokensUsed.output - markFileCompleted(progress, language, file.path) - - console.log( - ` [${language}] ${file.path} -> ${destPath} (${result.tokensUsed.input + result.tokensUsed.output} tokens) [committed]` - ) - } catch (error) { - stats.filesFailed++ - failedFiles.push(file.path) - markFileFailed(progress, language, file.path) - console.error( - ` [${language}] FAILED ${file.path}: ${error instanceof Error ? error.message : String(error)}` - ) - } finally { - limiter.release() - } - }) - - // Execute all tasks (concurrency handled by limiter) - await Promise.all(tasks.map((task) => task())) - - if (committer.fileCount > 0) { - console.log( - `[translate] ${language}: ${committer.fileCount} files committed to branch` - ) - } - - return { stats, files: translatedFiles, failedFiles } -} diff --git a/src/scripts/i18n/lib/workflows/initialize.ts b/src/scripts/i18n/lib/workflows/initialize.ts deleted file mode 100644 index 0abd2f3d787..00000000000 --- a/src/scripts/i18n/lib/workflows/initialize.ts +++ /dev/null @@ -1,67 +0,0 @@ -// Workflow initialization phase - -import { config, validateTargetPath } from "../../config" -import { getCrowdinProjectFiles } from "../crowdin/files" -import { fetchGlossaryEntries, groupGlossaryByLanguage } from "../supabase" - -import type { WorkflowContext } from "./types" -import { logSection } from "./utils" - -/** - * Initialize workflow: validate config, log settings, fetch Crowdin state - */ -export async function initializeWorkflow(): Promise { - const { targetPath, targetPaths } = config - - logSection("Crowdin AI Translation Import") - console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) - - if (targetPaths.length > 1) { - console.log(`Mode: Multi-file (${targetPaths.length} files)`) - for (const p of targetPaths) { - console.log(` - ${p}`) - } - 
// Validate each path - try { - for (const p of targetPaths) { - validateTargetPath(p) - } - } catch (e) { - console.error(e instanceof Error ? e.message : String(e)) - process.exit(1) - } - } else if (targetPath) { - const isFile = targetPath.endsWith(".md") || targetPath.endsWith(".json") - console.log(`Mode: ${isFile ? "Single file" : "Directory"} (${targetPath})`) - - // Validate target path is in allowed location - try { - validateTargetPath(targetPath) - } catch (e) { - console.error(e instanceof Error ? e.message : String(e)) - process.exit(1) - } - } else { - console.log(`Mode: Full translation (all files)`) - } - - // Fetch Crowdin project state - const crowdinProjectFiles = await getCrowdinProjectFiles() - - // Fetch glossary from Supabase (graceful degradation if unavailable) - const glossaryEntries = await fetchGlossaryEntries() - const glossary = groupGlossaryByLanguage(glossaryEntries) - console.log( - `[INIT] Loaded glossary: ${glossaryEntries.length} terms across ${glossary.size} languages` - ) - - // Initialize shared state - return { - crowdinProjectFiles, - fileIdsSet: new Set(), - processedFileIdToPath: {}, - englishBuffers: {}, - glossary, - languageJobs: [], - } -} diff --git a/src/scripts/i18n/lib/workflows/jsx-translation.ts b/src/scripts/i18n/lib/workflows/jsx-translation.ts deleted file mode 100644 index c0bad47561a..00000000000 --- a/src/scripts/i18n/lib/workflows/jsx-translation.ts +++ /dev/null @@ -1,118 +0,0 @@ -// JSX attribute translation workflow phase - -import { config } from "../../config" -import { translateJsxAttributes } from "../../translate-jsx-attributes" -import { isGeminiAvailable } from "../ai" -import { batchCommitFiles, BatchFile } from "../github/commits" -import type { GlossaryByLanguage } from "../supabase" -import { getGlossaryForLanguage } from "../supabase" - -import type { CommittedFile, LanguagePair } from "./types" -import { debugLog, logSection } from "./utils" - -export interface JsxTranslationResult { - /** 
Whether Gemini was skipped due to missing API key */ - geminiSkipped: boolean - /** Total attributes translated across all files */ - totalAttributesTranslated: number - /** Total files updated */ - totalFilesUpdated: number -} - -/** - * Translate JSX attributes in markdown files via Gemini. - * Updates committedFiles in-place with translated content. - */ -export async function runJsxTranslation( - committedFiles: CommittedFile[], - languagePairs: LanguagePair[], - branch: string, - glossary: GlossaryByLanguage -): Promise { - logSection("JSX Attribute Translation") - - if (!isGeminiAvailable()) { - console.warn( - `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not set - JSX attributes may remain untranslated` - ) - return { - geminiSkipped: true, - totalAttributesTranslated: 0, - totalFilesUpdated: 0, - } - } - - let totalAttributesTranslated = 0 - let totalFilesUpdated = 0 - - // Process each language separately - for (const langPair of languagePairs) { - const langCode = langPair.internalLanguageCode - - // Filter files for this language (markdown only) - const langFiles = committedFiles - .filter((f) => f.path.includes(`/translations/${langCode}/`)) - .filter((f) => f.path.endsWith(".md") || f.path.endsWith(".mdx")) - .map((f) => ({ path: f.path, content: f.content })) - - if (langFiles.length === 0) { - console.log(`[JSX-TRANSLATE] No markdown files for ${langCode}`) - continue - } - - console.log( - `[JSX-TRANSLATE] Processing ${langFiles.length} files for ${langCode}` - ) - - const glossaryTerms = getGlossaryForLanguage(glossary, langCode) - const jsxResult = await translateJsxAttributes({ - targetLanguage: langCode, - files: langFiles, - glossaryTerms, - verbose: config.verbose, - }) - - // Batch commit updated files - if (jsxResult.updatedFiles.length > 0) { - const filesToCommit: BatchFile[] = [] - - for (const updated of jsxResult.updatedFiles) { - const buf = Buffer.from(updated.updatedContent, "utf8") - filesToCommit.push({ path: updated.filePath, content: buf 
}) - debugLog(`JSX-TRANSLATE: Will commit ${updated.filePath}`) - - // Update the committedFiles array with new content for sanitizer - const existingFile = committedFiles.find( - (f) => f.path === updated.filePath - ) - if (existingFile) { - existingFile.content = updated.updatedContent - } - } - - try { - await batchCommitFiles( - filesToCommit, - branch, - `i18n(${langCode}): JSX attribute translations` - ) - console.log( - `[JSX-TRANSLATE] ✓ Committed ${jsxResult.updatedFiles.length} files for ${langCode}` - ) - totalFilesUpdated += jsxResult.updatedFiles.length - totalAttributesTranslated += jsxResult.attributesTranslated - } catch (e) { - console.warn( - `[JSX-TRANSLATE] Failed to commit files for ${langCode}:`, - e - ) - } - } - } - - return { - geminiSkipped: false, - totalAttributesTranslated, - totalFilesUpdated, - } -} diff --git a/src/scripts/i18n/lib/workflows/pr-creation.ts b/src/scripts/i18n/lib/workflows/pr-creation.ts deleted file mode 100644 index 57ccbe822f4..00000000000 --- a/src/scripts/i18n/lib/workflows/pr-creation.ts +++ /dev/null @@ -1,207 +0,0 @@ -// PR creation workflow phase - -import { config } from "../../config" -import { getPromptInfo } from "../crowdin/prompt" -import { getCurrentUser } from "../crowdin/user" -import { postPullRequest } from "../github/pull-requests" - -import type { CommittedFile, LanguagePair, PullRequest } from "./types" -import { logSection } from "./utils" - -/** - * Generate dynamic PR title based on language count - */ -export function generatePRTitle( - langCodes: string[], - allPossibleLanguages: string[] -): string { - const isAllLanguages = langCodes.length === allPossibleLanguages.length - - const source = process.env.TRANSLATION_PIPELINE || "Crowdin" - let prTitle = `i18n: ${source} translations` - - if (langCodes.length <= 3) { - prTitle += ` (${langCodes.join(", ")})` - } else if (isAllLanguages) { - prTitle += ` (all languages)` - } else { - prTitle += ` (multiple languages)` - } - - return prTitle 
-} - -/** Options for PR body generation */ -export interface PRBodyOptions { - geminiSkipped?: boolean - workflowRunUrl?: string -} - -/** - * Generate PR body with organized file listings - */ -export function generatePRBody( - aiModelName: string, - langCodes: string[], - committedFiles: CommittedFile[], - sanitizedFiles: CommittedFile[], - options: PRBodyOptions = {} -): string { - // Include both sanitized files and original committed files - const allChangedPathsSet = new Set([ - ...sanitizedFiles.map(({ path }) => path), - ...committedFiles.map(({ path }) => path), - ]) - const allChangedPaths = Array.from(allChangedPathsSet) - - // Separate JSON and Markdown files - const jsonFiles = allChangedPaths.filter((path) => - path.toLowerCase().endsWith(".json") - ) - const markdownFiles = allChangedPaths.filter((path) => - path.toLowerCase().endsWith(".md") - ) - - // Dedupe paths after stripping locale prefix (same content path across languages) - const uniqueJsonPaths = [ - ...new Set( - jsonFiles.map((path) => path.replace(/^src\/intl\/[^/]+\//, "")) - ), - ].sort() - const uniqueMarkdownPaths = [ - ...new Set( - markdownFiles.map((path) => - path.replace(/^public\/content\/translations\/[^/]+\//, "") - ) - ), - ].sort() - - // Build PR body - let prBody = `## Description\n\n` - const pipeline = process.env.TRANSLATION_PIPELINE || "Crowdin" - prBody += `This PR contains automated ${aiModelName} translations via ${pipeline}.\n\n` - - if (options.workflowRunUrl) { - prBody += `[🔗 View workflow run](${options.workflowRunUrl})\n\n` - } - - // Language section - prBody += `### Languages translated\n\n` - prBody += `${langCodes.join(", ")}\n\n` - - // Files section - JSON - if (uniqueJsonPaths.length > 0) { - prBody += `### JSON changes (\`src/intl/{locale}/\`)\n\n` - for (const path of uniqueJsonPaths) { - prBody += `- ${path}\n` - } - prBody += `\n` - } - - // Files section - Markdown - if (uniqueMarkdownPaths.length > 0) { - prBody += `### Markdown changes 
(\`public/content/translations/{locale}/\`)\n\n` - for (const path of uniqueMarkdownPaths) { - prBody += `- ${path}\n` - } - prBody += `\n` - } - - // Add warning if Gemini was skipped - if (options.geminiSkipped) { - prBody += `---\n\n` - prBody += `> ⚠️ **Note:** GEMINI_API_KEY was not available during this run. ` - prBody += `JSX component attributes (e.g., \`title="..."\`, \`description="..."\`) ` - prBody += `may remain untranslated.\n\n` - } - - return prBody -} - -/** - * Fetch AI model name. Returns "Gemini" for the Gemini pipeline, - * or queries Crowdin for the model name in the Crowdin pipeline. - */ -async function fetchAIModelName(): Promise { - // Gemini pipeline -- no Crowdin API needed - if (process.env.TRANSLATION_PIPELINE === "Gemini") { - return process.env.GEMINI_MODEL || "Gemini" - } - - // Crowdin pipeline -- fetch from Crowdin API - try { - const currentUser = await getCurrentUser() - const promptInfo = await getPromptInfo( - currentUser.id, - config.preTranslatePromptId - ) - - if (promptInfo?.aiModelId) { - console.log(`✓ Fetched AI model: ${promptInfo.aiModelId}`) - return promptInfo.aiModelId - } else { - console.warn("Prompt info missing aiModelId, using default") - return "LLM" - } - } catch (e) { - console.warn("Could not fetch AI model name from Crowdin:", e) - return "LLM" - } -} - -/** - * Build workflow run URL from GitHub environment variables - */ -function getWorkflowRunUrl(): string | undefined { - const serverUrl = process.env.GITHUB_SERVER_URL - const repository = process.env.GITHUB_REPOSITORY - const runId = process.env.GITHUB_RUN_ID - - if (serverUrl && repository && runId) { - return `${serverUrl}/${repository}/actions/runs/${runId}` - } - return undefined -} - -/** - * Create pull request with formatted title and body - */ -export async function createTranslationPR( - branch: string, - committedFiles: CommittedFile[], - sanitizedFiles: CommittedFile[], - languagePairs: LanguagePair[], - options: PRBodyOptions = {} -): 
Promise { - logSection("Creating Pull Request") - - // Fetch AI model name dynamically - const aiModelName = await fetchAIModelName() - - // Extract language codes - const langCodes = languagePairs.map((p) => p.internalLanguageCode) - - // Add workflow metadata to options - const fullOptions: PRBodyOptions = { - ...options, - workflowRunUrl: getWorkflowRunUrl(), - } - - // Generate PR title and body - const prTitle = generatePRTitle(langCodes, config.allInternalCodes) - const prBody = generatePRBody( - aiModelName, - langCodes, - committedFiles, - sanitizedFiles, - fullOptions - ) - - // Create PR - const pr = await postPullRequest(branch, config.baseBranch, prTitle, prBody) - - console.log(`\n✓ Pull Request created: ${pr.html_url}`) - console.log(`PR Number: #${pr.number}`) - - return pr -} diff --git a/src/scripts/i18n/lib/workflows/pre-translation.ts b/src/scripts/i18n/lib/workflows/pre-translation.ts deleted file mode 100644 index 40971708cb2..00000000000 --- a/src/scripts/i18n/lib/workflows/pre-translation.ts +++ /dev/null @@ -1,317 +0,0 @@ -// Pre-translation workflow phase - -import * as fs from "fs" -import * as path from "path" - -import { config } from "../../config" -import { createEphemeralPrompt } from "../crowdin/ephemeral-prompts" -import { - awaitPreTranslationCompleted, - getPreTranslationStatus, - postApplyPreTranslation, -} from "../crowdin/pre-translate" -import { getPromptInfo } from "../crowdin/prompt" -import { formatGlossaryForPrompt, getGlossaryForLanguage } from "../supabase" -import type { CrowdinPreTranslateResponse } from "../types" - -import type { PreTranslationResult, WorkflowContext } from "./types" -import { debugLog, logSection } from "./utils" - -/** - * Resume existing pre-translation job - */ -async function resumePreTranslation( - preTranslationId: string -): Promise { - logSection(`Resuming Pre-Translation ${preTranslationId}`) - - const statusResp = await getPreTranslationStatus(preTranslationId) - - if (statusResp.status 
=== "in_progress" || statusResp.status === "created") { - const statusMsg = - statusResp.status === "created" - ? "Pre-translation queued (waiting for other jobs)" - : `Pre-translation in progress (${statusResp.progress}%)` - console.log(`${statusMsg}, waiting for completion...`) - return await awaitPreTranslationCompleted(preTranslationId) - } else if (statusResp.status === "finished") { - console.log(`Pre-translation already finished, proceeding to download...`) - return statusResp - } else { - throw new Error( - `Pre-translation ${preTranslationId} has unexpected status: ${statusResp.status}` - ) - } -} - -/** - * Create ephemeral prompt with language-specific glossary - */ -async function createLanguagePrompt( - userId: number, - internalCode: string, - glossary: WorkflowContext["glossary"], - basePrompt: string, - aiProviderId?: number, - aiModelId?: string -): Promise { - const glossaryTerms = getGlossaryForLanguage(glossary, internalCode) - const glossarySection = formatGlossaryForPrompt(glossaryTerms, "informal") - - const fullPrompt = glossarySection - ? 
`${basePrompt}\n\n---\n\n${glossarySection}` - : basePrompt - - if (glossaryTerms.size > 0) { - console.log( - `[GLOSSARY] Injecting ${glossaryTerms.size} terms for ${internalCode} into prompt` - ) - } - - const { promptId } = await createEphemeralPrompt({ - userId, - languageCode: internalCode, - promptKey: "glossary", - promptText: fullPrompt, - aiProviderId, - aiModelId, - }) - - return promptId -} - -/** - * Start pre-translation jobs for all target languages - * Creates one ephemeral prompt and one job per language - */ -async function startPerLanguagePreTranslation( - context: WorkflowContext -): Promise { - const { allCrowdinCodes, allInternalCodes } = config - const { fileIdsSet, crowdinUserId, glossary, languageJobs } = context - - if (!crowdinUserId) { - throw new Error("Missing crowdinUserId in context") - } - - logSection("Requesting AI Pre-Translation (Per-Language)") - console.log(`Files to translate: ${fileIdsSet.size}`) - console.log(`Target languages: ${allCrowdinCodes.join(", ")}`) - - // Load base prompt template - const promptPath = path.join( - process.cwd(), - "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" - ) - const basePrompt = fs.readFileSync(promptPath, "utf8") - - // Get AI provider/model settings from the static prompt - const staticPromptInfo = await getPromptInfo( - crowdinUserId, - config.preTranslatePromptId - ) - debugLog( - `Static prompt AI settings: provider=${staticPromptInfo.aiProviderId}, model=${staticPromptInfo.aiModelId}` - ) - - const fileIds = Array.from(fileIdsSet) - - // Process each language: create prompt, start job - for (let i = 0; i < allInternalCodes.length; i++) { - const internalCode = allInternalCodes[i] - const crowdinCode = allCrowdinCodes[i] - - console.log(`\n[${internalCode}] Creating ephemeral prompt...`) - - // Create language-specific prompt with glossary - const ephemeralPromptId = await createLanguagePrompt( - crowdinUserId, - internalCode, - glossary, - basePrompt, - 
staticPromptInfo.aiProviderId ?? undefined, - staticPromptInfo.aiModelId ?? undefined - ) - - console.log(`[${internalCode}] ✓ Created prompt (ID: ${ephemeralPromptId})`) - console.log(`[${internalCode}] Submitting pre-translation job...`) - - // Submit pre-translation for this single language - const response = await postApplyPreTranslation( - fileIds, - [crowdinCode], - ephemeralPromptId - ) - - console.log(`[${internalCode}] ✓ Job created (ID: ${response.identifier})`) - - // Track job info for polling and cleanup - languageJobs.push({ - internalCode, - crowdinCode, - ephemeralPromptId, - preTranslationId: response.identifier, - }) - } - - // Log all job IDs for potential manual resume (comma-separated for easy copy-paste) - const allJobIds = languageJobs.map((j) => j.preTranslationId).join(",") - logSection("Pre-Translation Jobs Summary") - console.log(`Created ${languageJobs.length} pre-translation jobs:`) - for (const job of languageJobs) { - console.log(` ${job.internalCode}: ${job.preTranslationId}`) - } - console.log(`\n📋 Copy for resume: ${allJobIds}`) - - // Exit early if skipAwait is set or if full translation mode (no targetPath) - if (config.skipAwait || !config.targetPath) { - const reason = config.skipAwait - ? 
"skip_await option enabled" - : "full translation job" - logSection(`Exiting for Manual Resume (${reason})`) - console.log(`\nTo resume, use PRETRANSLATION_ID:`) - console.log(` ${allJobIds}`) - console.log(`\nCheck progress: https://crowdin.com/project/ethereum-org`) - process.exit(0) - } - - // Wait for all jobs to complete in parallel with continue-on-error - logSection("Waiting for Pre-Translation Completion") - - const results = await Promise.all( - languageJobs.map(async (job) => { - console.log(`[${job.internalCode}] Waiting for completion...`) - try { - const completed = await awaitPreTranslationCompleted( - job.preTranslationId - ) - if (completed.status !== "finished") { - throw new Error(`Unexpected status: ${completed.status}`) - } - console.log(`[${job.internalCode}] ✓ Completed!`) - return { success: true as const, job, response: completed } - } catch (err) { - console.error( - `[${job.internalCode}] ✗ Failed:`, - err instanceof Error ? err.message : err - ) - return { success: false as const, job, error: err } - } - }) - ) - - const successes = results.filter((r) => r.success) - const failures = results.filter((r) => !r.success) - - if (failures.length > 0) { - console.warn( - `\n[WARN] ${failures.length}/${languageJobs.length} jobs failed:` - ) - for (const f of failures) { - console.warn(` - ${f.job.internalCode}: ${f.job.preTranslationId}`) - } - } - - if (successes.length === 0) { - throw new Error("All pre-translation jobs failed") - } - - console.log( - `\n✓ ${successes.length}/${languageJobs.length} pre-translation jobs completed!` - ) - return successes.map((s) => s.response) -} - -/** - * Resume multiple pre-translation jobs in parallel with continue-on-error - */ -async function resumeMultiplePreTranslations( - preTranslationIds: string[] -): Promise { - logSection(`Resuming ${preTranslationIds.length} Pre-Translation Jobs`) - console.log(`IDs: ${preTranslationIds.join(", ")}`) - - const results = await Promise.all( - 
preTranslationIds.map(async (id) => { - try { - const response = await resumePreTranslation(id) - return { success: true as const, id, response } - } catch (err) { - console.error( - `[ERROR] Job ${id} failed:`, - err instanceof Error ? err.message : err - ) - return { success: false as const, id, error: err } - } - }) - ) - - // Separate successes and failures - const successes = results.filter((r) => r.success) - const failures = results.filter((r) => !r.success) - - if (failures.length > 0) { - console.warn( - `\n[WARN] ${failures.length}/${preTranslationIds.length} jobs failed:` - ) - for (const f of failures) { - console.warn(` - ${f.id}`) - } - } - - if (successes.length === 0) { - throw new Error("All pre-translation jobs failed") - } - - console.log( - `\n✓ ${successes.length}/${preTranslationIds.length} jobs completed successfully` - ) - return successes.map((s) => s.response) -} - -/** - * Handle pre-translation: resume existing or start new per-language jobs - */ -export async function handlePreTranslation( - context: WorkflowContext -): Promise { - const { existingPreTranslationIds, verbose } = config - const { fileIdsSet, processedFileIdToPath, crowdinProjectFiles } = context - - // Resume existing jobs or start new per-language jobs - let responses: CrowdinPreTranslateResponse[] - let fileIds: number[] - - if (existingPreTranslationIds.length > 0) { - // Resume mode: one or more existing jobs - responses = await resumeMultiplePreTranslations(existingPreTranslationIds) - // Collect all fileIds from all responses - fileIds = [...new Set(responses.flatMap((r) => r.attributes.fileIds))] - } else { - // New mode: per-language jobs - responses = await startPerLanguagePreTranslation(context) - // All jobs translate the same files, so just use the first response's fileIds - fileIds = responses[0]?.attributes.fileIds ?? 
Array.from(fileIdsSet) - } - - // Build mapping for commit phase - const fileIdToPathMapping: Record = {} - - for (const fid of fileIds) { - if (processedFileIdToPath[fid]) { - fileIdToPathMapping[fid] = processedFileIdToPath[fid] - } else { - const existing = crowdinProjectFiles.find((f) => f.id === fid) - if (existing) fileIdToPathMapping[fid] = existing.path - } - if (!fileIdToPathMapping[fid] && verbose) { - console.warn(`[WARN] Missing path mapping for fileId=${fid}`) - } - } - - return { - responses, - fileIdToPathMapping, - fileIds, - } -} diff --git a/src/scripts/i18n/lib/workflows/translation-download.ts b/src/scripts/i18n/lib/workflows/translation-download.ts deleted file mode 100644 index db9f8047b7c..00000000000 --- a/src/scripts/i18n/lib/workflows/translation-download.ts +++ /dev/null @@ -1,133 +0,0 @@ -// Translation download workflow phase - -import { config } from "../../config" -import { getBuiltFile, postBuildProjectFileTranslation } from "../crowdin/build" -import { postCreateBranchFrom } from "../github/branches" -import { - batchCommitFiles, - BatchFile, - getDestinationFromPath, -} from "../github/commits" -import { mapCrowdinCodeToInternal } from "../utils/mapping" - -import type { - CommittedFile, - LanguagePair, - PreTranslationResult, - TranslationDownloadResult, - WorkflowContext, -} from "./types" -import { debugLog, logSection, logSubsection } from "./utils" - -/** - * Build language pair mappings from Crowdin IDs to internal codes - */ -export function buildLanguageMappings(languageIds: string[]): LanguagePair[] { - return languageIds.map((crowdinId) => ({ - crowdinId, - internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), - })) -} - -/** - * Download translations from Crowdin and commit to GitHub branch - */ -export async function downloadAndCommitTranslations( - preTranslateResult: PreTranslationResult, - context: WorkflowContext -): Promise { - const { englishBuffers } = context - const { responses, fileIdToPathMapping, 
fileIds } = preTranslateResult - - // Collect all language IDs from all responses (each response has one language) - const languageIds = responses.flatMap((r) => r.attributes.languageIds) - - // Build language pair mappings - const languagePairs = buildLanguageMappings(languageIds) - - logSection("Creating Translation PR") - - // Create GitHub branch (use language code as suffix for single-language PRs) - const branchSuffix = - languagePairs.length === 1 - ? languagePairs[0].internalLanguageCode - : "crowdin-translations" - const { branch } = await postCreateBranchFrom(config.baseBranch, branchSuffix) - console.log(`✓ Created branch: ${branch}`) - - // Track all committed files with their content for sanitizer/validation - const committedFiles: CommittedFile[] = [] - - // For each language, download and commit translations - for (const { crowdinId, internalLanguageCode } of languagePairs) { - logSubsection( - `Building translations for ${crowdinId} (${internalLanguageCode})` - ) - - // Collect files for batch commit - const filesToCommit: BatchFile[] = [] - - // Build and download each file - for (const fileId of fileIds) { - const crowdinPath = fileIdToPathMapping[fileId] - - debugLog(`Processing fileId: ${fileId} (${crowdinPath})`) - - // 1- Build translation - const { url: downloadUrl } = await postBuildProjectFileTranslation( - fileId, - crowdinId, - config.projectId - ) - - // 2- Download - const { buffer } = await getBuiltFile(downloadUrl) - debugLog(`Downloaded ${buffer.length} bytes`) - - // Check if translation differs from English - const originalEnglish = englishBuffers[fileId] - if (originalEnglish && originalEnglish.compare(buffer) === 0) { - debugLog( - `Skipping commit - content identical to English (no translation)` - ) - continue - } - - // 3- Get destination path and collect for batch commit - const destinationPath = getDestinationFromPath( - crowdinPath, - internalLanguageCode - ) - debugLog(`Will commit to: ${destinationPath}`) - - 
filesToCommit.push({ path: destinationPath, content: buffer }) - - // Track this file's path and content for sanitizer/validation - committedFiles.push({ - path: destinationPath, - content: buffer.toString("utf8"), - }) - } - - // Batch commit all files for this language - if (filesToCommit.length > 0) { - await batchCommitFiles( - filesToCommit, - branch, - `i18n(${internalLanguageCode}): Crowdin translations` - ) - console.log( - `✓ Committed ${filesToCommit.length} translations for ${internalLanguageCode}` - ) - } else { - console.log(`No new translations for ${internalLanguageCode}`) - } - } - - return { - branch, - committedFiles, - languagePairs, - fileIdToPathMapping, - } -} diff --git a/src/scripts/i18n/lib/workflows/types.ts b/src/scripts/i18n/lib/workflows/types.ts deleted file mode 100644 index 8620c268696..00000000000 --- a/src/scripts/i18n/lib/workflows/types.ts +++ /dev/null @@ -1,98 +0,0 @@ -// Types for i18n workflow phases - -import type { GlossaryByLanguage } from "../supabase" -import type { CrowdinFileData, CrowdinPreTranslateResponse } from "../types" - -/** - * Per-language job tracking data - */ -export interface LanguageJobInfo { - /** Internal language code (e.g., "es", "zh") */ - internalCode: string - /** Crowdin language code (e.g., "es-EM", "zh-CN") */ - crowdinCode: string - /** Ephemeral prompt ID created for this language */ - ephemeralPromptId: number - /** Pre-translation job ID */ - preTranslationId: string -} - -/** - * Shared context passed between workflow phases - */ -export interface WorkflowContext { - crowdinProjectFiles: CrowdinFileData[] - fileIdsSet: Set - processedFileIdToPath: Record - englishBuffers: Record - glossary: GlossaryByLanguage - /** Per-language job info (populated during pre-translation phase) */ - languageJobs: LanguageJobInfo[] - /** Crowdin user ID (needed for ephemeral prompt cleanup) */ - crowdinUserId?: number -} - -/** - * Result of file preparation phase - */ -export interface FilePreparationResult 
{ - fileIdsSet: Set - processedFileIdToPath: Record - englishBuffers: Record -} - -/** - * File committed to GitHub branch - */ -export interface CommittedFile { - path: string - content: string -} - -/** - * Language pair mapping - */ -export interface LanguagePair { - crowdinId: string - internalLanguageCode: string -} - -/** - * Result of translation download phase - */ -export interface TranslationDownloadResult { - branch: string - committedFiles: CommittedFile[] - languagePairs: LanguagePair[] - fileIdToPathMapping: Record -} - -/** - * Pull request data - */ -export interface PullRequest { - html_url: string - number: number -} - -/** - * Pre-translation job result (supports multiple per-language jobs) - */ -export interface PreTranslationResult { - /** All pre-translation responses (one per language) */ - responses: CrowdinPreTranslateResponse[] - /** File ID to path mapping */ - fileIdToPathMapping: Record - /** File IDs that were translated */ - fileIds: number[] -} - -/** - * Result of processing a single language in split-PR mode - */ -export interface SplitPRResult { - language: string - status: "success" | "failed" - prUrl?: string - error?: string -} diff --git a/src/scripts/i18n/lib/workflows/validation.ts b/src/scripts/i18n/lib/workflows/validation.ts deleted file mode 100644 index 17241baf6ab..00000000000 --- a/src/scripts/i18n/lib/workflows/validation.ts +++ /dev/null @@ -1,133 +0,0 @@ -// Syntax tree validation workflow phase - -import { postPullRequestComment } from "../github/pull-requests" -import { - formatValidationComment, - validateJsonStructure, - validateJsxAttributes, - validateMarkdownStructure, -} from "../validation/syntax-tree" - -import type { CommittedFile, PullRequest } from "./types" -import { debugLog, logSection } from "./utils" - -/** Default threshold for JSX attribute untranslated percentage */ -const DEFAULT_JSX_THRESHOLD = 5 - -/** - * Run syntax tree validation and post comment if issues found - */ -export async 
function runSyntaxValidation( - pr: PullRequest, - committedFiles: CommittedFile[], - englishBuffers: Record, - fileIdToPathMapping: Record -): Promise { - logSection("Running Syntax Tree Validation") - - const validationResults: Parameters[0] = [] - - for (const file of committedFiles) { - const isJson = file.path.toLowerCase().endsWith(".json") - const isMarkdown = file.path.toLowerCase().endsWith(".md") - - if (!isJson && !isMarkdown) continue - - // Find the corresponding English file - let englishContent: string | null = null - - // Determine the English source path - if (isJson) { - // Extract the file name from the destination path - const match = file.path.match(/src\/intl\/[^/]+\/(.+)$/) - if (match) { - const fileName = match[1] - // Find the English buffer from our tracked files - for (const [fileId, buffer] of Object.entries(englishBuffers)) { - const crowdinPath = fileIdToPathMapping[Number(fileId)] - if (crowdinPath && crowdinPath.includes(fileName)) { - englishContent = buffer.toString("utf8") - break - } - } - } - } else if (isMarkdown) { - // Extract the relative path from translations - const match = file.path.match( - /public\/content\/translations\/[^/]+\/(.+)$/ - ) - if (match) { - const relPath = match[1] - // Find the English buffer - for (const [fileId, buffer] of Object.entries(englishBuffers)) { - const crowdinPath = fileIdToPathMapping[Number(fileId)] - if (crowdinPath && crowdinPath.includes(relPath)) { - englishContent = buffer.toString("utf8") - break - } - } - } - } - - if (!englishContent) { - debugLog(`Could not find English source for ${file.path}`) - continue - } - - // Validate structure - if (isJson) { - const result = validateJsonStructure(englishContent, file.content) - validationResults.push({ - path: file.path, - type: "json", - result, - }) - if (!result.isValid) { - debugLog(`JSON validation failed for ${file.path}`) - } - } else if (isMarkdown) { - const result = validateMarkdownStructure(englishContent, file.content) - 
validationResults.push({ - path: file.path, - type: "markdown", - result, - }) - if (!result.isValid) { - debugLog(`Markdown validation failed for ${file.path}`) - } - - // Also validate JSX attributes for markdown files (compare against English) - const jsxThreshold = - Number(process.env.JSX_UNTRANSLATED_THRESHOLD) || DEFAULT_JSX_THRESHOLD - const jsxResult = validateJsxAttributes( - englishContent, - file.content, - jsxThreshold - ) - if (!jsxResult.isValid) { - validationResults.push({ - path: file.path, - type: "jsx-attributes", - result: jsxResult, - }) - debugLog( - `JSX attribute validation flagged ${file.path}: ${jsxResult.untranslatedPercentage.toFixed(1)}% untranslated` - ) - } - } - } - - // Post validation comment if there are issues - const validationComment = formatValidationComment(validationResults) - if (validationComment) { - console.log(`\n⚠️ Syntax validation issues found, posting comment...`) - try { - await postPullRequestComment(pr.number, validationComment) - console.log(`✓ Posted validation comment to PR`) - } catch (e) { - console.warn(`Failed to post validation comment:`, e) - } - } else { - console.log(`✓ All files passed syntax tree validation`) - } -} diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts deleted file mode 100644 index 10b496323de..00000000000 --- a/src/scripts/i18n/main-gemini.ts +++ /dev/null @@ -1,159 +0,0 @@ -/** - * Main entry point for Gemini translation pipeline. - * - * Replaces Crowdin as the translation intermediary. Sends whole files - * to Gemini with site-specific context, then runs sanitizer + - * transliteration as post-processing. 
- * - * Usage: - * npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main-gemini.ts - * - * Environment variables: - * GEMINI_API_KEY - Gemini API key (required) - * I18N_GITHUB_API_KEY - GitHub API key (required) - * TARGET_PATH - Comma-separated file paths or single directory - * TARGET_LANGUAGES - Comma-separated language codes (blank = all) - * GEMINI_CONCURRENCY - Max parallel Gemini requests per language (default: 3) - * RESUME_RUN_ID - Resume an interrupted run by ID - * BASE_BRANCH - GitHub base branch (default: dev) - * SKIP_PR_CREATION - Skip PR creation (default: false) - */ - -import { isGeminiAvailable } from "./lib/ai/gemini" -import { cleanupProgress } from "./lib/ai/progress-tracker" -import { - createBranchFromSha, - createBranchName, - getBranchObject, -} from "./lib/github/branches" -import { geminiInitialize } from "./lib/workflows/gemini-initialize" -import { geminiTranslateFiles } from "./lib/workflows/gemini-translate-files" -import { runJsxTranslation } from "./lib/workflows/jsx-translation" -import { createTranslationPR } from "./lib/workflows/pr-creation" -import { runPostImportSanitization } from "./lib/workflows/sanitization" -import { logSection } from "./lib/workflows/utils" -import { config } from "./config" - -async function main() { - logSection("Gemini Translation Pipeline") - - // Preflight checks - if (!isGeminiAvailable()) { - console.error("[ERROR] GEMINI_API_KEY is not set") - process.exit(1) - } - - if (!process.env.I18N_GITHUB_API_KEY) { - console.error("[ERROR] I18N_GITHUB_API_KEY is not set") - process.exit(1) - } - - // Phase 1: Initialize - const context = await geminiInitialize() - - if (context.englishFiles.length === 0) { - console.log("[main] No files to translate. Exiting.") - process.exit(0) - } - - // Generate run ID and branch name - const runId = process.env.RESUME_RUN_ID || `gemini-${Date.now().toString(36)}` - const branchSuffix = - context.targetLanguages.length === 1 - ? 
context.targetLanguages[0] - : "translations" - const branchName = createBranchName(branchSuffix) - - console.log(`[main] Run ID: ${runId}`) - console.log(`[main] Branch: ${branchName}`) - - // Create branch from base - const baseBranch = await getBranchObject(config.baseBranch) - await createBranchFromSha(branchName, baseBranch.sha) - - // Phase 2: Translate files (committed incrementally as they complete) - const { stats, committedFiles, failedFiles } = await geminiTranslateFiles( - context, - branchName, - runId - ) - - // Phase 3: Post-import sanitization - const sanitizerInput = committedFiles.map((f) => ({ - path: f.path, - content: f.content, - })) - const sanitizeResult = await runPostImportSanitization( - sanitizerInput, - branchName - ) - - // Phase 4: JSX attribute translation (reuse existing Gemini JSX flow) - if (isGeminiAvailable()) { - const languagePairsForJsx = context.targetLanguages.map((code) => ({ - crowdinId: code, - internalLanguageCode: code, - })) - try { - await runJsxTranslation( - sanitizerInput, - languagePairsForJsx, - branchName, - context.glossary - ) - } catch (error) { - console.warn( - `[main] JSX translation failed (non-fatal): ${error instanceof Error ? 
error.message : String(error)}` - ) - } - } - - // Phase 5: Create PR - const skipPr = ["1", "true", "yes", "on"].includes( - (process.env.SKIP_PR_CREATION || "").toLowerCase() - ) - - if (!skipPr) { - logSection("Creating Pull Request") - - const languagePairs = Object.keys(stats).map((code) => ({ - crowdinId: code, - internalLanguageCode: code, - })) - - await createTranslationPR( - branchName, - sanitizerInput, - sanitizeResult.changedFiles, - languagePairs - ) - } - - // Cleanup progress manifest on success - cleanupProgress({ runId, startedAt: "", languages: {} }) - - logSection("Complete") - console.log("[main] Gemini translation pipeline finished.") - - // Print summary - for (const [lang, s] of Object.entries(stats)) { - console.log( - ` ${lang}: ${s.filesTranslated} translated, ${s.filesFailed} failed, ${s.totalInputTokens + s.totalOutputTokens} tokens` - ) - } - - if (failedFiles.length > 0) { - console.warn( - `\n[main] ${failedFiles.length} file(s) could not be translated:` - ) - for (const f of failedFiles) { - console.warn(` - ${f}`) - } - } -} - -main().catch((error) => { - console.error("\n========== ERROR ==========") - console.error(error) - process.exit(1) -}) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts deleted file mode 100644 index 08e9eb3daac..00000000000 --- a/src/scripts/i18n/main.ts +++ /dev/null @@ -1,222 +0,0 @@ -import { deleteEphemeralPrompt } from "./lib/crowdin/ephemeral-prompts" -import { prepareEnglishFiles } from "./lib/workflows/file-preparation" -import { initializeWorkflow } from "./lib/workflows/initialize" -import { runJsxTranslation } from "./lib/workflows/jsx-translation" -import { createTranslationPR } from "./lib/workflows/pr-creation" -import { handlePreTranslation } from "./lib/workflows/pre-translation" -import { runPostImportSanitization } from "./lib/workflows/sanitization" -import { - buildLanguageMappings, - downloadAndCommitTranslations, -} from "./lib/workflows/translation-download" -import 
type { PreTranslationResult, SplitPRResult } from "./lib/workflows/types" -import { logSection } from "./lib/workflows/utils" -import { runSyntaxValidation } from "./lib/workflows/validation" -import { config } from "./config" - -/** - * Main orchestration function - */ -async function main() { - const { existingPreTranslationIds } = config - - // Phase 1: Initialize workflow - const context = await initializeWorkflow() - - // Phase 2: Prepare English files (skip if resuming existing jobs) - if (existingPreTranslationIds.length === 0) { - await prepareEnglishFiles(context) - } - - // Phase 3: Handle pre-translation (resume or start new) - const preTranslateResult = await handlePreTranslation(context) - - // Check if PR creation should be skipped - const skipPrCreation = ["1", "true", "yes", "on"].includes( - (process.env.SKIP_PR_CREATION || "").toLowerCase() - ) - - // Split PR mode: create one PR per language - if (config.splitPrs) { - const results: SplitPRResult[] = [] - - for (const response of preTranslateResult.responses) { - const langId = response.attributes.languageIds[0] - const langCode = buildLanguageMappings([langId])[0].internalLanguageCode - - logSection(`Processing Language: ${langCode}`) - - // Create single-response PreTranslationResult for this language - const singleLangResult: PreTranslationResult = { - responses: [response], - fileIdToPathMapping: preTranslateResult.fileIdToPathMapping, - fileIds: preTranslateResult.fileIds, - } - - try { - // Phase 4: Download and commit translations - const translationResult = await downloadAndCommitTranslations( - singleLangResult, - context - ) - - // Phase 5: Translate JSX attributes via Gemini - const jsxTranslationResult = await runJsxTranslation( - translationResult.committedFiles, - translationResult.languagePairs, - translationResult.branch, - context.glossary - ) - - // Phase 6: Run post-import sanitizer - const sanitizeResult = await runPostImportSanitization( - translationResult.committedFiles, - 
translationResult.branch - ) - - if (skipPrCreation) { - console.log( - `[${langCode}] Branch created: ${translationResult.branch}` - ) - results.push({ language: langCode, status: "success" }) - continue - } - - // Phase 7: Create PR - const pr = await createTranslationPR( - translationResult.branch, - translationResult.committedFiles, - sanitizeResult.changedFiles, - translationResult.languagePairs, - { geminiSkipped: jsxTranslationResult.geminiSkipped } - ) - - // Phase 8: Run syntax tree validation - await runSyntaxValidation( - pr, - translationResult.committedFiles, - context.englishBuffers, - translationResult.fileIdToPathMapping - ) - - console.log(`[${langCode}] ✓ PR created: ${pr.html_url}`) - results.push({ - language: langCode, - status: "success", - prUrl: pr.html_url, - }) - } catch (err) { - const errorMsg = err instanceof Error ? err.message : String(err) - console.error(`[${langCode}] ✗ Failed: ${errorMsg}`) - results.push({ language: langCode, status: "failed", error: errorMsg }) - } - } - - // Print summary - logSection("SPLIT PR SUMMARY") - const successes = results.filter((r) => r.status === "success") - const failures = results.filter((r) => r.status === "failed") - - console.log(`Created: ${successes.length}/${results.length}`) - if (successes.length > 0) { - console.log(`\nSuccessful:`) - for (const r of successes) { - console.log(` ${r.language}: ${r.prUrl ?? 
"(branch only)"}`) - } - } - if (failures.length > 0) { - console.log(`\nFailed:`) - for (const r of failures) { - console.log(` ${r.language}: ${r.error}`) - } - } - - if (successes.length === 0) { - throw new Error("All language PRs failed") - } - } else { - // Single PR mode (default): all languages in one PR - // Phase 4: Download and commit translations - const translationResult = await downloadAndCommitTranslations( - preTranslateResult, - context - ) - - // Phase 5: Translate JSX attributes via Gemini (before sanitizer) - const jsxTranslationResult = await runJsxTranslation( - translationResult.committedFiles, - translationResult.languagePairs, - translationResult.branch, - context.glossary - ) - - // Phase 6: Run post-import sanitizer - const sanitizeResult = await runPostImportSanitization( - translationResult.committedFiles, - translationResult.branch - ) - - if (skipPrCreation) { - logSection("Skipping PR Creation") - console.log( - `Files have been committed to branch: ${translationResult.branch}. No PR will be opened.` - ) - console.log( - `Set SKIP_PR_CREATION=false to enable automatic PR creation in the workflow.` - ) - return - } - - // Phase 7: Create PR - const pr = await createTranslationPR( - translationResult.branch, - translationResult.committedFiles, - sanitizeResult.changedFiles, - translationResult.languagePairs, - { geminiSkipped: jsxTranslationResult.geminiSkipped } - ) - - // Phase 8: Run syntax tree validation - await runSyntaxValidation( - pr, - translationResult.committedFiles, - context.englishBuffers, - translationResult.fileIdToPathMapping - ) - - // Success! 
- logSection("SUCCESS") - console.log(`Pull Request: ${pr.html_url}`) - console.log( - `Languages: ${translationResult.languagePairs.map((p) => p.internalLanguageCode).join(", ")}` - ) - console.log(`Files: ${preTranslateResult.fileIds.length}`) - } - - // Cleanup all ephemeral prompts (best effort - don't fail the workflow if cleanup fails) - if (context.languageJobs.length > 0 && context.crowdinUserId) { - logSection("Cleaning Up Ephemeral Prompts") - for (const job of context.languageJobs) { - try { - await deleteEphemeralPrompt( - context.crowdinUserId, - job.ephemeralPromptId - ) - console.log( - `✓ Deleted prompt for ${job.internalCode} (ID: ${job.ephemeralPromptId})` - ) - } catch (err) { - console.warn( - `[WARN] Failed to cleanup ephemeral prompt ${job.ephemeralPromptId} (${job.internalCode}):`, - err instanceof Error ? err.message : err - ) - } - } - } -} - -main().catch((err) => { - console.error("\n========== ERROR ==========") - console.error(err) - process.exit(1) -}) diff --git a/src/scripts/i18n/sanitize-pr.ts b/src/scripts/i18n/sanitize-pr.ts deleted file mode 100644 index 3892c5361c6..00000000000 --- a/src/scripts/i18n/sanitize-pr.ts +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Run the post-import sanitizer on ONLY the files changed in a specific PR. - * - * Usage: - * npx ts-node -O '{"module":"commonjs"}' src/scripts/i18n/sanitize-pr.ts - * - * Requires: `gh` CLI authenticated and available in PATH. - * - * Fetches the file list from the GitHub API (paginated), filters to - * translation files (.md and .json), and passes them to runSanitizer() - * with empty content so the sanitizer reads from disk and writes fixes back. 
- */ - -import { execSync } from "child_process" -import * as path from "path" - -import { runSanitizer } from "./post_import_sanitize" - -const ROOT = process.cwd() - -function getPRFiles(prNumber: string): string[] { - const cmd = `gh api repos/ethereum/ethereum-org-website/pulls/${prNumber}/files --paginate -q '.[].filename'` - const output = execSync(cmd, { - encoding: "utf8", - maxBuffer: 10 * 1024 * 1024, - }) - return output.trim().split("\n").filter(Boolean) -} - -async function main() { - const prNumber = process.argv[2] - if (!prNumber) { - console.error("Usage: sanitize-pr.ts ") - process.exit(1) - } - - console.log(`[sanitize-pr] Fetching file list for PR #${prNumber}...`) - const allFiles = getPRFiles(prNumber) - - // Filter to translation files only (md + json under translations/ or intl/) - const translationFiles = allFiles.filter( - (f) => - (f.includes("/translations/") || f.includes("/intl/")) && - (f.endsWith(".md") || f.endsWith(".json")) - ) - - if (translationFiles.length === 0) { - console.log("[sanitize-pr] No translation files found in PR diff.") - process.exit(0) - } - - console.log( - `[sanitize-pr] Found ${translationFiles.length} translation files in PR #${prNumber}` - ) - - // Convert to absolute paths with empty content (sanitizer reads from disk) - const filesWithContent = translationFiles.map((relPath) => ({ - path: path.join(ROOT, relPath), - content: "", - })) - - const result = await runSanitizer(filesWithContent) - - console.log(`\n[sanitize-pr] Done.`) - console.log( - ` Markdown: ${result.markdown.scanned} scanned, ${result.markdown.fixed} fixed` - ) - console.log( - ` JSON: ${result.json.scanned} scanned, ${result.json.fixed} fixed` - ) - - if (result.orphanWarnings && result.orphanWarnings.length > 0) { - console.log(` Orphan warnings: ${result.orphanWarnings.length}`) - } -} - -main().catch((err) => { - console.error(err) - process.exit(1) -}) diff --git a/src/scripts/i18n/translate-jsx-attributes.ts 
b/src/scripts/i18n/translate-jsx-attributes.ts deleted file mode 100644 index ae195b65fca..00000000000 --- a/src/scripts/i18n/translate-jsx-attributes.ts +++ /dev/null @@ -1,252 +0,0 @@ -/** - * Standalone JSX attribute translation module - * - * Can be called from: - * 1. Main i18n workflow (after Crowdin download, before sanitizer) - * 2. Dedicated GitHub Action (accepts branch/PR, runs in isolation) - * - * Usage: - * npx ts-node translate-jsx-attributes.ts --language es --files file1.md,file2.md - * npx ts-node translate-jsx-attributes.ts --language es --branch translations/es - */ - -import fs from "fs" -import path from "path" - -import { isGeminiAvailable, translateAttributesByFile } from "./lib/ai" -import type { - ExtractedAttribute, - FileExtractionResult, - FileTranslationResult, - JsxTranslationSummary, -} from "./lib/jsx-attributes" -import { - countExtractedAttributes, - extractAttributesFromFile, - reinsertTranslatedAttributes, -} from "./lib/jsx-attributes" - -/** - * Options for JSX attribute translation - */ -export interface TranslateJsxOptions { - /** Target language code (e.g., "es", "fr") */ - targetLanguage: string - /** Files to process (path and content) */ - files: { path: string; content: string }[] - /** Glossary terms for this language (English term -> translated term) */ - glossaryTerms?: Map - /** Whether to log verbose output */ - verbose?: boolean -} - -/** - * Translate JSX attributes in a batch of files for a single language. - * This is the main entry point for both workflow integration and standalone use. 
- */ -export async function translateJsxAttributes( - options: TranslateJsxOptions -): Promise { - const { targetLanguage, files, glossaryTerms, verbose = false } = options - - console.log(`\n[JSX-TRANSLATE] Starting JSX attribute translation`) - console.log(`[JSX-TRANSLATE] Target language: ${targetLanguage}`) - console.log(`[JSX-TRANSLATE] Files to process: ${files.length}`) - - // Check Gemini availability - const geminiAvailable = isGeminiAvailable() - if (!geminiAvailable) { - console.warn( - `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not available, skipping translation` - ) - return { - filesProcessed: files.length, - filesWithChanges: 0, - attributesTranslated: 0, - attributesFailed: 0, - geminiAvailable: false, - updatedFiles: [], - } - } - - // Extract attributes from all files - const extractions: FileExtractionResult[] = [] - const attributesByFile = new Map() - - for (const file of files) { - // Only process markdown files - if (!file.path.endsWith(".md") && !file.path.endsWith(".mdx")) { - continue - } - - const extraction = extractAttributesFromFile(file.content, file.path) - extractions.push(extraction) - - if (extraction.attributes.length > 0) { - attributesByFile.set(file.path, extraction.attributes) - if (verbose) { - console.log( - `[JSX-TRANSLATE] Found ${extraction.attributes.length} attributes in ${file.path}` - ) - } - } - } - - const totalAttributes = countExtractedAttributes(extractions) - console.log( - `[JSX-TRANSLATE] Found ${totalAttributes} translatable attributes in ${attributesByFile.size} files` - ) - - if (totalAttributes === 0) { - console.log(`[JSX-TRANSLATE] No attributes to translate`) - return { - filesProcessed: files.length, - filesWithChanges: 0, - attributesTranslated: 0, - attributesFailed: 0, - geminiAvailable: true, - updatedFiles: [], - } - } - - // Translate attributes via Gemini (one API call per file batch) - const translatedByFile = await translateAttributesByFile( - attributesByFile, - targetLanguage, - glossaryTerms - ) 
- - // Re-insert translated attributes into files - const updatedFiles: FileTranslationResult[] = [] - let attributesTranslated = 0 - let attributesFailed = 0 - - for (const extraction of extractions) { - const translated = translatedByFile.get(extraction.filePath) || [] - const result = reinsertTranslatedAttributes(extraction, translated) - - if (result.hasChanges) { - updatedFiles.push(result) - attributesTranslated += translated.length - } - - // Count failed as those we extracted but didn't get back - const originalCount = extraction.attributes.length - const translatedCount = translated.length - if (translatedCount < originalCount) { - attributesFailed += originalCount - translatedCount - } - } - - console.log(`[JSX-TRANSLATE] ✓ Translation complete`) - console.log(`[JSX-TRANSLATE] - Files with changes: ${updatedFiles.length}`) - console.log( - `[JSX-TRANSLATE] - Attributes translated: ${attributesTranslated}` - ) - if (attributesFailed > 0) { - console.log(`[JSX-TRANSLATE] - Attributes failed: ${attributesFailed}`) - } - - return { - filesProcessed: files.length, - filesWithChanges: updatedFiles.length, - attributesTranslated, - attributesFailed, - geminiAvailable: true, - updatedFiles, - } -} - -/** - * Read files from disk for standalone execution - */ -function readFilesFromDisk( - filePaths: string[] -): { path: string; content: string }[] { - return filePaths.map((filePath) => { - const absolutePath = path.isAbsolute(filePath) - ? filePath - : path.join(process.cwd(), filePath) - const content = fs.readFileSync(absolutePath, "utf-8") - return { path: filePath, content } - }) -} - -/** - * Write updated files back to disk - */ -function writeFilesToDisk(files: FileTranslationResult[]): void { - for (const file of files) { - const absolutePath = path.isAbsolute(file.filePath) - ? 
file.filePath - : path.join(process.cwd(), file.filePath) - fs.writeFileSync(absolutePath, file.updatedContent, "utf-8") - console.log(`[JSX-TRANSLATE] Wrote: ${file.filePath}`) - } -} - -/** - * Parse CLI arguments - */ -function parseArgs(): { language: string; files: string[] } | null { - const args = process.argv.slice(2) - let language = "" - let files: string[] = [] - - for (let i = 0; i < args.length; i++) { - if (args[i] === "--language" || args[i] === "-l") { - language = args[++i] - } else if (args[i] === "--files" || args[i] === "-f") { - files = args[++i].split(",").map((f) => f.trim()) - } - } - - if (!language || files.length === 0) { - return null - } - - return { language, files } -} - -/** - * CLI entry point for standalone execution - */ -async function main() { - const parsed = parseArgs() - - if (!parsed) { - console.log(` -Usage: npx ts-node translate-jsx-attributes.ts --language --files - -Options: - --language, -l Target language code (e.g., "es", "fr", "de") - --files, -f Comma-separated list of file paths to process - -Example: - npx ts-node translate-jsx-attributes.ts -l es -f public/content/roadmap/pbs/index.md -`) - process.exit(1) - } - - const fileContents = readFilesFromDisk(parsed.files) - const result = await translateJsxAttributes({ - targetLanguage: parsed.language, - files: fileContents, - verbose: true, - }) - - if (result.updatedFiles.length > 0) { - writeFilesToDisk(result.updatedFiles) - console.log(`\n✓ Updated ${result.updatedFiles.length} files`) - } else { - console.log(`\nNo files were modified`) - } -} - -// Run CLI if executed directly -if (require.main === module) { - main().catch((err) => { - console.error("Error:", err) - process.exit(1) - }) -} diff --git a/src/scripts/i18n/transliterate.ts b/src/scripts/i18n/transliterate.ts deleted file mode 100644 index bd9e291ceb6..00000000000 --- a/src/scripts/i18n/transliterate.ts +++ /dev/null @@ -1,568 +0,0 @@ -#!/usr/bin/env npx tsx -/** - * Transliterate English proper 
nouns in non-Latin-script translation files. - * - * Reads the lookup table from .claude/translation-review/transliterations/{lang}.json - * and replaces English brand names, person names, and technical terms with their - * native-script transliterations in body text. - * - * Usage: - * npx tsx src/scripts/i18n/transliterate.ts --lang=hi # dry-run (default) - * npx tsx src/scripts/i18n/transliterate.ts --lang=hi --apply # write changes - * npx tsx src/scripts/i18n/transliterate.ts --lang=hi --verbose # show every replacement - * - * Supported languages: hi, mr, bn, ta, te, ar, ur, ru, uk, ja, ko, zh, zh-tw - * - * Protected zones (no replacements): - * - Code fences (```...```) - * - Inline code (`...`) - * - URLs and href attributes - * - Frontmatter `tags` arrays - * - JSX/HTML component names and attributes - * - Import/export statements - * - * Transliterated zones: - * - Body text, headings, list items, table cells - * - Frontmatter `author` field - * - Frontmatter `title` and `description` fields - */ - -import * as fs from "node:fs" -import * as path from "node:path" - -// ===== CLI Flags ===== -const APPLY = process.argv.includes("--apply") -const VERBOSE = process.argv.includes("--verbose") -const DRY_RUN = !APPLY - -const LANG_ARG = process.argv.find((a) => a.startsWith("--lang=")) -const LANG = LANG_ARG?.split("=")[1] - -const SUPPORTED_LANGS = [ - "hi", - "mr", - "bn", - "ta", - "te", - "ar", - "ur", - "ru", - "uk", - "ja", - "ko", - "zh", - "zh-tw", -] - -if (!LANG || !SUPPORTED_LANGS.includes(LANG)) { - console.error( - `Usage: npx tsx src/scripts/i18n/transliterate.ts --lang= [--apply] [--verbose]\n` + - `Supported: ${SUPPORTED_LANGS.join(", ")}` - ) - process.exit(1) -} - -if (DRY_RUN) { - console.log(`[transliterate] DRY RUN mode -- pass --apply to write changes\n`) -} - -// ===== Configuration ===== - -const ROOT = process.cwd() -const TRANSLIT_JSON_PATH = path.join( - ROOT, - `.claude/translation-review/transliterations/${LANG}.json` -) - -// 
Terms too ambiguous for automated word-boundary replacement. -// These are common English words that happen to also be brand names. -// They require manual review or tighter context matching. -const AMBIGUOUS_TERMS = new Set([ - "Go", // common verb - "Base", // common noun - "Compound", // common adjective/noun - "Curve", // common noun - "Scroll", // common noun/verb - "Dart", // common noun - "Brownie", // food - "Lido", // place name - "zkEVM", // technical acronym/standard, keep Latin (per Gemini) -]) - -// ===== Load Transliterations ===== - -interface TranslitEntry { - text: string - source: string -} - -interface TranslitFile { - _meta: Record - transliterations: Record - _alternatives?: Record -} - -const langJson: TranslitFile = JSON.parse( - fs.readFileSync(TRANSLIT_JSON_PATH, "utf8") -) - -const translit: Map = new Map() -for (const [eng, entry] of Object.entries(langJson.transliterations)) { - translit.set(eng, entry.text) -} - -// Sort terms longest first to prevent partial matches -// e.g., "Ethereum Foundation" before "Ethereum", "Ethers.js" before "Ether" -const allTerms = [...translit.keys()].sort((a, b) => b.length - a.length) -const safeTerms = allTerms.filter((t) => !AMBIGUOUS_TERMS.has(t)) - -// ===== Regex Helpers ===== - -function escapeRegex(str: string): string { - return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") -} - -/** - * Build a regex for matching a term with appropriate boundaries. - * - * Uses \b for terms that start/end with word characters [a-zA-Z0-9_]. - * Uses lookaround for terms with special chars at boundaries (C++, .NET). - * - * Devanagari chars are outside \w, so \b correctly matches at the - * Latin/Devanagari boundary (e.g., "...देव Solidity कोड..." matches). 
- */ -function makePattern(term: string): RegExp { - const escaped = escapeRegex(term) - - const startsWithWord = /^\w/.test(term) - const endsWithWord = /\w$/.test(term) - - let pattern = escaped - - // Start boundary - if (startsWithWord) { - pattern = "\\b" + pattern - } else { - // For terms like ".NET" -- require whitespace/start/punctuation before - pattern = "(?<=\\s|^|[>\"'(\\[])" + pattern - } - - // End boundary - if (endsWithWord) { - pattern = pattern + "\\b" - } else { - // For terms like "C++" -- require whitespace/end/punctuation after - pattern = pattern + "(?=\\s|$|[<\"',;.!?)\\]])" - } - - return new RegExp(pattern, "g") -} - -// Pre-compile patterns for all safe terms -const termPatterns: Array<{ - term: string - pattern: RegExp - replacement: string -}> = safeTerms.map((term) => ({ - term, - pattern: makePattern(term), - replacement: translit.get(term)!, -})) - -// ===== Placeholder System ===== -// Protect zones by replacing them with null-byte-delimited placeholders, -// performing transliteration on the remaining text, then restoring. - -let phCounter = 0 -let phMap: Map - -function ph(content: string): string { - const key = `\x00PH${phCounter++}\x00` - phMap.set(key, content) - return key -} - -function restore(text: string): string { - // Restore in REVERSE order (newest placeholders first). - // This handles nesting: if PH2 wraps PH1 wraps PH0, - // restoring PH2 first reveals PH1, then PH1 reveals PH0. 
- const entries = [...phMap.entries()].reverse() - for (const [key, val] of entries) { - text = text.split(key).join(val) - } - // Second pass: handle any remaining nested placeholders - // that were revealed by the first pass - for (const [key, val] of entries) { - if (text.includes(key)) { - text = text.split(key).join(val) - } - } - return text -} - -// ===== Markdown Processing ===== - -interface FileResult { - file: string - changed: boolean - replacementCount: number - details: string[] -} - -function processMarkdownFile(filePath: string): FileResult { - const content = fs.readFileSync(filePath, "utf8") - const relPath = path.relative(ROOT, filePath) - - // Reset placeholder system - phCounter = 0 - phMap = new Map() - - const lines = content.split("\n") - const result: string[] = [] - const details: string[] = [] - let replacementCount = 0 - - // State tracking - let inFrontmatter = false - let frontmatterDashes = 0 - let inCodeFence = false - let codeFenceMarker = "" - let inTagsBlock = false - - for (let i = 0; i < lines.length; i++) { - const line = lines[i] - const lineNum = i + 1 - - // --- Frontmatter boundary --- - if (line.trim() === "---" && !inCodeFence) { - frontmatterDashes++ - if (frontmatterDashes === 1) inFrontmatter = true - if (frontmatterDashes === 2) { - inFrontmatter = false - inTagsBlock = false - } - result.push(line) - continue - } - - // --- Code fence boundary --- - const fenceMatch = line.match(/^(`{3,})/) - if (fenceMatch && !inFrontmatter) { - if (!inCodeFence) { - inCodeFence = true - codeFenceMarker = fenceMatch[1] - } else if (line.trim().startsWith(codeFenceMarker)) { - inCodeFence = false - codeFenceMarker = "" - } - result.push(line) - continue - } - - // --- Inside code fence: skip entirely --- - if (inCodeFence) { - result.push(line) - continue - } - - // --- Frontmatter handling --- - if (inFrontmatter) { - // Track tags array (could span multiple lines) - if (/^\s*tags:\s*\[/.test(line)) { - inTagsBlock = true - } - if 
(inTagsBlock) { - if (line.includes("]")) inTagsBlock = false - result.push(line) // Never touch tags - continue - } - - // Transliterate author, title, description fields - if (/^\s*(author|title|description):\s*/.test(line)) { - let modified = line - // Reset per-line placeholders - phCounter = 0 - phMap = new Map() - - // Protect domain names (ethereum.org, etc.) - modified = modified.replace( - /[a-zA-Z0-9][\w.-]*\.(org|com|io|net|dev|xyz|eth|fm|tv|co)\b/gi, - (m) => ph(m) - ) - - // Protect quoted strings that look like URLs - modified = modified.replace(/https?:\/\/[^\s"']+/g, (m) => ph(m)) - - // Apply transliterations - for (const { term, pattern, replacement } of termPatterns) { - const before = modified - modified = modified.replace(pattern, replacement) - if (modified !== before) { - replacementCount++ - details.push( - ` L${lineNum}: "${term}" -> "${replacement}" (frontmatter)` - ) - } - } - - modified = restore(modified) - result.push(modified) - continue - } - - // Other frontmatter fields: skip - result.push(line) - continue - } - - // --- Body text --- - let modified = line - - // Reset per-line placeholders - phCounter = 0 - phMap = new Map() - - // 0. Protect MDX heading IDs: {#some-anchor-id} - // These must stay ASCII or MDX/acorn parsing breaks - modified = modified.replace(/\{#[^}]+\}/g, (m) => ph(m)) - - // 0b. Protect domain names (e.g., "ethereum.org", "Etherscan.io") - // These must NEVER be transliterated -- they are functional URLs/brands - modified = modified.replace( - /[a-zA-Z0-9][\w.-]*\.(org|com|io|net|dev|xyz|eth|fm|tv|co)\b/gi, - (m) => ph(m) - ) - - // 1. Protect inline code - modified = modified.replace(/`[^`\n]+`/g, (m) => ph(m)) - - // 2. Protect markdown link URLs: [text](url) - modified = modified.replace(/\]\([^)]+\)/g, (m) => ph(m)) - - // 3. Protect autolinks and bare URLs - modified = modified.replace(/https?:\/\/[^\s)>"']+/g, (m) => ph(m)) - - // 4. Protect HTML/JSX attributes: href="...", src="...", id="...", etc. 
- modified = modified.replace(/\w+="[^"]*"/g, (m) => ph(m)) - modified = modified.replace(/\w+='[^']*'/g, (m) => ph(m)) - - // 5. Protect JSX component names: ph(m)) - - // 6. Protect import/export lines entirely - if (/^\s*(import|export)\s/.test(line)) { - result.push(line) - continue - } - - // 7. Protect HTML comments - modified = modified.replace(//g, (m) => ph(m)) - - // Apply transliterations - for (const { term, pattern, replacement } of termPatterns) { - const before = modified - modified = modified.replace(pattern, replacement) - if (modified !== before) { - // Count actual replacements (not just attempts) - const matches = before.match(pattern) - const count = matches ? matches.length : 1 - replacementCount += count - if (VERBOSE || details.length < 200) { - details.push(` L${lineNum}: "${term}" -> "${replacement}"`) - } - } - } - - // Restore protected zones - modified = restore(modified) - result.push(modified) - } - - const newContent = result.join("\n") - const changed = newContent !== content - - if (changed && APPLY) { - fs.writeFileSync(filePath, newContent, "utf8") - } - - return { file: relPath, changed, replacementCount, details } -} - -// ===== JSON Processing ===== - -function processJsonFile(filePath: string): FileResult { - const content = fs.readFileSync(filePath, "utf8") - const relPath = path.relative(ROOT, filePath) - const details: string[] = [] - let replacementCount = 0 - - let parsed: Record - try { - parsed = JSON.parse(content) - } catch { - return { - file: relPath, - changed: false, - replacementCount: 0, - details: [" SKIP: invalid JSON"], - } - } - - function walkAndReplace(obj: unknown, jsonPath: string): unknown { - if (typeof obj === "string") { - let modified = obj - - // Protect URLs and domains - phCounter = 0 - phMap = new Map() - modified = modified.replace( - /[a-zA-Z0-9][\w.-]*\.(org|com|io|net|dev|xyz|eth|fm|tv|co)\b/gi, - (m) => ph(m) - ) - modified = modified.replace(/https?:\/\/[^\s)>"']+/g, (m) => ph(m)) - 
modified = modified.replace(/<[^>]+>/g, (m) => ph(m)) // HTML tags - - for (const { term, pattern, replacement } of termPatterns) { - const before = modified - modified = modified.replace(pattern, replacement) - if (modified !== before) { - replacementCount++ - if (VERBOSE || details.length < 200) { - details.push(` ${jsonPath}: "${term}" -> "${replacement}"`) - } - } - } - - modified = restore(modified) - return modified - } - - if (Array.isArray(obj)) { - return obj.map((item, idx) => walkAndReplace(item, `${jsonPath}[${idx}]`)) - } - - if (obj && typeof obj === "object") { - const result: Record = {} - for (const [key, val] of Object.entries(obj)) { - // Don't modify keys, only values - result[key] = walkAndReplace(val, `${jsonPath}.${key}`) - } - return result - } - - return obj - } - - const modified = walkAndReplace(parsed, "$") as Record - const newContent = JSON.stringify(modified, null, 2) + "\n" - const changed = newContent !== content - - if (changed && APPLY) { - fs.writeFileSync(filePath, newContent, "utf8") - } - - return { file: relPath, changed, replacementCount, details } -} - -// ===== Main ===== - -function main() { - console.log( - `[transliterate] Language: ${LANG} (${langJson._meta?.language_name || LANG})` - ) - console.log( - `[transliterate] Loading ${translit.size} terms from ${LANG}.json` - ) - console.log( - `[transliterate] ${safeTerms.length} safe terms, ${AMBIGUOUS_TERMS.size} skipped as ambiguous\n` - ) - console.log(`Ambiguous (skipped): ${[...AMBIGUOUS_TERMS].join(", ")}\n`) - - // Find all translation files for the target language (Node 22+) - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const fsAny = fs as any - const mdFiles = ( - fsAny.globSync(`public/content/translations/${LANG}/**/*.md`, { - cwd: ROOT, - }) as string[] - ).map((f: string) => path.join(ROOT, f)) - const jsonFiles = ( - fsAny.globSync(`src/intl/${LANG}/**/*.json`, { cwd: ROOT }) as string[] - ).map((f: string) => path.join(ROOT, f)) - - 
console.log( - `[transliterate] Found ${mdFiles.length} .md files, ${jsonFiles.length} .json files\n` - ) - - // Process markdown files - const mdResults: FileResult[] = [] - for (const file of mdFiles) { - const result = processMarkdownFile(file) - mdResults.push(result) - } - - // Process JSON files - const jsonResults: FileResult[] = [] - for (const file of jsonFiles) { - const result = processJsonFile(file) - jsonResults.push(result) - } - - // ===== Report ===== - const allResults = [...mdResults, ...jsonResults] - const changedFiles = allResults.filter((r) => r.changed) - const totalReplacements = allResults.reduce( - (sum, r) => sum + r.replacementCount, - 0 - ) - - console.log("=".repeat(70)) - console.log( - `RESULTS: ${changedFiles.length} files changed, ${totalReplacements} replacements` - ) - console.log("=".repeat(70)) - - if (changedFiles.length > 0) { - console.log("\nChanged files:") - for (const r of changedFiles) { - console.log(`\n ${r.file} (${r.replacementCount} replacements)`) - if (VERBOSE) { - for (const d of r.details) { - console.log(d) - } - } - } - } - - if (DRY_RUN && changedFiles.length > 0) { - console.log( - `\n[transliterate] DRY RUN complete. 
Run with --apply to write ${changedFiles.length} files.` - ) - } else if (APPLY && changedFiles.length > 0) { - console.log( - `\n[transliterate] APPLIED: ${changedFiles.length} files written, ${totalReplacements} replacements.` - ) - } else { - console.log("\n[transliterate] No changes needed.") - } - - // Summary by term frequency - if (VERBOSE) { - console.log("\n\nTerm frequency in changes:") - const termCounts = new Map() - for (const r of allResults) { - for (const d of r.details) { - const match = d.match(/"([^"]+)" ->/) - if (match) { - termCounts.set(match[1], (termCounts.get(match[1]) || 0) + 1) - } - } - } - const sorted = [...termCounts.entries()].sort((a, b) => b[1] - a[1]) - for (const [term, count] of sorted.slice(0, 30)) { - console.log(` ${term}: ${count}`) - } - } -} - -main() diff --git a/src/scripts/intl-pipeline/FUTURE.md b/src/scripts/intl-pipeline/FUTURE.md new file mode 100644 index 00000000000..661c3966333 --- /dev/null +++ b/src/scripts/intl-pipeline/FUTURE.md @@ -0,0 +1,99 @@ +# Translation Pipeline -- Future Features + +> **Maintenance:** Remove or update entries here as they are implemented. Do not let this file accumulate stale items. + +--- + +## Pipeline Quality + +### 1. Fix Comment Restoration Concatenation Bug + +**Problem:** Translated code comments are concatenated with the original instead of replacing them. Example: `// **** REMOVE LIQUIDITY **** // **** ...Arabic... ****` + +**Root cause:** `restoreComments()` in `lib/llm/code-block-extractor.ts` appends the translated comment to the existing line content instead of replacing. `translateCodeComments()` should use `strippedCode` (comments removed) as the base for restoration, not the original `block.content`. + +**Complexity:** Low. ~5 line change. + +### 2. Stronger Glossary Enforcement + +**Problem:** High-frequency glossary terms like "mint" are translated inconsistently. The glossary is sent in the prompt but Gemini doesn't always adhere strictly. 
+ +**Proposed solution:** +- Post-translation pass that scans output for known English glossary terms that should have been translated, and flags or auto-corrects them +- Consider a validation step that compares glossary term frequency in source vs translation +- May overlap with existing sanitizer `fixKnownBrandGarbles` pattern -- extend to glossary terms + +### 3. Transliteration During Translation + +**Problem:** Gemini regresses on transliterations (author names, brand names like "Proto-danksharding") that the sanitizer then has to catch. + +**Proposed solution:** +- Include transliteration banks directly in the translation prompt for non-Latin locales +- Add language-group-specific transliteration rules to `lib/llm/prompt-builder.ts` +- Ensure the translation prompt and sanitizer are aligned on the same transliteration bank + +### 4. Deep JSON Validation + +**Problem:** Current validation only checks top-level JSON keys. Nested namespaces can have dropped or renamed keys at depth > 1 without detection. + +**Proposed solution:** Recursive key comparison that walks the full object tree, reporting missing/added/renamed keys at any depth. + +--- + +## Pipeline Features + +### 5. Split PRs (one PR per language) + +**Problem:** Large multi-language runs produce a single massive PR that's hard to review. + +**Proposed solution:** A workflow input `split_prs` (boolean, default false) that creates a separate branch and PR per language. + +--- + +## Automation + +### 6. Auto-trigger Translations on Content Merge + +**Problem:** Content changes merged to dev currently require manual triggering of the translation pipeline. + +**Proposed solution:** +- GitHub Action that watches for merges to dev affecting `public/content/` or `src/intl/en/` +- Automatically triggers the translation workflow for changed files +- Should respect a cooldown/batch window to avoid triggering on every small merge + +### 7. 
Full-language Retroactive Cleanup + +**Problem:** Many languages were translated before current pipeline improvements. Those translations have the same class of issues found in Arabic (brand garbles, wrong compounds, etc.). + +**Proposed solution:** After pending language reviews are complete: +- Run the full sanitizer against every translated language +- Apply transliteration banks and language-group-specific rules +- Re-translate files flagged as having too many issues + +--- + +## Image Translation + +### 8. Translate Text in Diagrams and Infographics + +**Problem:** Educational diagrams and infographics contain English text that remains untranslated, creating a jarring experience on otherwise fully translated pages. + +**Proposed approach:** +- Use Gemini's image generation capabilities to edit text within images +- Build a pipeline: identify images with translatable text, extract, translate, generate localized variants +- Track image freshness the same way as content files (by source SHA) + +**Key challenges:** Text detection, visual quality with RTL scripts, maintenance when source diagrams update. + +--- + +## Package Extraction + +### 9. Extract i18n Tooling into Standalone Packages + +**Problem:** Glossary, translation pipeline, and (future) image pipeline are embedded in the repo. Creates bloat and prevents reuse. + +**Proposed approach:** Phased extraction: +1. Extract glossary (data + lookup) into its own package +2. Extract pipeline core (prompt builder, normalizer, batcher, language groups) once stable +3. 
Repo-specific glue (Actions, sanitizer, manifests) stays in ethereum-org-website
diff --git a/src/scripts/intl-pipeline/config.ts b/src/scripts/intl-pipeline/config.ts
new file mode 100644
index 00000000000..cf11d22038f
--- /dev/null
+++ b/src/scripts/intl-pipeline/config.ts
@@ -0,0 +1,191 @@
+import * as dotenv from "dotenv"
+
+import i18nConfig from "../../../i18n.config.json"
+
+dotenv.config({ path: ".env.local" })
+
+/** Env values (compared case-insensitively) that turn a boolean flag on. */
+const TRUTHY_ENV_VALUES = ["1", "true", "yes", "on"]
+
+/** Read a boolean env flag; unset or unrecognized values are false. */
+function envFlag(name: string): boolean {
+  return TRUTHY_ENV_VALUES.includes((process.env[name] || "").toLowerCase())
+}
+
+/** Split a comma-separated value into trimmed, non-empty entries. */
+function splitList(raw: string): string[] {
+  return raw
+    .split(",")
+    .map((entry) => entry.trim())
+    .filter(Boolean)
+}
+
+/**
+ * Parse the MODE env value ("auto" when unset).
+ * Throws on anything other than "auto" or "full" so a typo cannot slip
+ * through as a value outside the declared union.
+ */
+function parseMode(raw: string | undefined): "auto" | "full" {
+  const value = (raw || "auto").trim().toLowerCase()
+  if (value === "auto" || value === "full") return value
+  throw new Error(`Invalid MODE "${raw}" (expected "auto" or "full")`)
+}
+
+/**
+ * Read a positive integer from the environment, using `fallback` when unset.
+ * Throws when the value does not parse to an integer >= 1 (parseInt alone
+ * would hand NaN, 0 or a negative number to the caller).
+ */
+function envPositiveInt(name: string, fallback: number): number {
+  const raw = process.env[name]
+  if (!raw) return fallback
+  const value = parseInt(raw, 10)
+  if (!Number.isInteger(value) || value < 1) {
+    throw new Error(`Invalid ${name} "${raw}" (expected a positive integer)`)
+  }
+  return value
+}
+
+// Gemini model configuration (single source of truth)
+// GEMINI_MODEL env var overrides; otherwise tries models in order
+export const GEMINI_MODELS: string[] = process.env.GEMINI_MODEL
+  ? [process.env.GEMINI_MODEL]
+  : ["gemini-3.1-pro-preview", "gemini-3.1-pro"]
+
+// Glossary API (ETHGlossary)
+export const GLOSSARY_API_URL =
+  process.env.GLOSSARY_API_URL ||
+  "https://ethglossary.visual-20-hoists.workers.dev/api/v1"
+
+// GitHub API configuration
+const githubApiToken = process.env.GITHUB_API_TOKEN || ""
+if (!githubApiToken) {
+  console.error("[ERROR] Missing GITHUB_API_TOKEN environment variable")
+  throw new Error("No GitHub API token found (GITHUB_API_TOKEN)")
+}
+
+export const gitHubBearerHeaders = {
+  Authorization: `Bearer ${githubApiToken}`,
+  Accept: "application/vnd.github.v3+json",
+}
+
+// Parse target languages from env (internal codes: "es", "ko", etc.)
+const targetLanguagesInput = splitList(process.env.TARGET_LANGUAGES || "")
+
+// If no target languages specified, use all from i18n.config.json except English
+const allInternalCodes: string[] =
+  targetLanguagesInput.length > 0
+    ? targetLanguagesInput
+    : i18nConfig.map(({ code }) => code).filter((code) => code !== "en")
+
+const baseBranch = process.env.BASE_BRANCH || "dev"
+// Default target branch derived from base: intl/pending-{base with / replaced by -}
+const defaultTargetBranch = `intl/pending-${baseBranch.replace(/\//g, "-")}`
+const targetBranch = process.env.TARGET_BRANCH || defaultTargetBranch
+
+const targetPathRaw = process.env.TARGET_PATH || ""
+const targetPaths = splitList(targetPathRaw)
+const excludePaths = splitList(process.env.EXCLUDE_PATH || "")
+
+// All boolean switches accept the same spellings (1/true/yes/on)
+const verbose = envFlag("VERBOSE")
+const skipPr = envFlag("SKIP_PR_CREATION")
+const stampOnly = envFlag("STAMP_ONLY")
+const dryRun = envFlag("DRY_RUN")
+const mode = parseMode(process.env.MODE)
+const concurrency = envPositiveInt("GEMINI_CONCURRENCY", 16)
+
+// Parse GitHub repository from env (format: "owner/repo")
+const githubRepo =
+  process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website"
+const [ghOrganization, ghRepo] = githubRepo.split("/")
+if (!ghOrganization || !ghRepo) {
+  throw new Error(
+    `Invalid GITHUB_REPOSITORY "${githubRepo}" (expected "owner/repo")`
+  )
+}
+
+if (verbose) {
+  console.log("[config] Configuration:")
+  console.log(`[config] Languages: ${allInternalCodes.join(", ") || "ALL"}`)
+  console.log(`[config] Base branch: ${baseBranch}`)
+  console.log(`[config] Target branch: ${targetBranch}`)
+  console.log(`[config] Target paths: ${targetPaths.join(", ") || "all"}`)
+  console.log(`[config] Exclude paths: ${excludePaths.join(", ") || "none"}`)
+  console.log(`[config] Mode: ${mode}`)
+  console.log(`[config] Stamp only: ${stampOnly}`)
+  console.log(`[config] Skip PR: ${skipPr}`)
+  console.log(`[config] Dry run: ${dryRun}`)
+  console.log(`[config] Concurrency: ${concurrency}`)
+  console.log(`[config] Repo: ${ghOrganization}/${ghRepo}`)
+}
+
+export const config = {
+  ghOrganization,
+  ghRepo,
+  jsonRoot: "src/intl/en",
+  mdRoot: "public/content",
+  allInternalCodes,
+  baseBranch,
+  targetBranch,
+  targetPath: targetPathRaw,
+  targetPaths,
+  excludePaths,
+  verbose,
+  skipPr,
+  stampOnly,
+  dryRun,
+  mode,
+  concurrency,
+}
+
+// Paths that should never be translated
+export const doNotTranslatePaths = [
+  "/cookie-policy/",
+  "/privacy-policy/",
+  "/terms-of-use/",
+  "/terms-and-conditions/",
+  "/style-guide/",
+]
+
+/**
+ * Validate a user-supplied target path.
+ *
+ * Accepts a single path or a comma-separated list (the raw TARGET_PATH
+ * form); every entry is checked. An empty value is allowed.
+ *
+ * @param targetPath - path or comma-separated paths to validate
+ * @throws Error when an entry targets translated content, a non-English
+ *   src/intl locale, or a path listed in doNotTranslatePaths
+ */
+export function validateTargetPath(targetPath: string): void {
+  for (const candidate of splitList(targetPath)) {
+    if (candidate.includes("public/content/translations")) {
+      throw new Error(
+        `Invalid target path: "${candidate}" -- cannot target translated content`
+      )
+    }
+
+    // Compare whole path segments: "src/intl/en-xx" is a different locale
+    if (
+      candidate.startsWith("src/intl/") &&
+      candidate !== "src/intl/en" &&
+      !candidate.startsWith("src/intl/en/")
+    ) {
+      throw new Error(
+        `Invalid target path: "${candidate}" -- only src/intl/en is allowed`
+      )
+    }
+
+    // Excluded entries are slash-delimited; append "/" so a directory given
+    // without a trailing slash still matches
+    const withTrailingSlash = `${candidate}/`
+    for (const excluded of doNotTranslatePaths) {
+      if (withTrailingSlash.includes(excluded)) {
+        throw new Error(
+          `Invalid target path: "${candidate}" -- in excluded list (${excluded})`
+        )
+      }
+    }
+  }
+}
diff --git a/src/scripts/intl-pipeline/constants.ts b/src/scripts/intl-pipeline/constants.ts
new file mode 100644
index 00000000000..5abd60927ef
--- /dev/null
+++ b/src/scripts/intl-pipeline/constants.ts
@@ -0,0 +1,9 @@
+/**
+ * Pipeline constants -- no side effects, safe to import from tests.
+ */ + +// Chunk size budget for LLM calls (bytes) +// 64KB ~= 16K tokens (English) or 32-64K tokens (CJK) +// Well within Gemini 3.1 Pro's 65K output token limit +// Conservative: prefer more calls over larger chunks +export const MAX_CHUNK_BYTES = 65_536 diff --git a/src/scripts/intl-pipeline/index.ts b/src/scripts/intl-pipeline/index.ts new file mode 100644 index 00000000000..7c00a2331c6 --- /dev/null +++ b/src/scripts/intl-pipeline/index.ts @@ -0,0 +1,2 @@ +export type { LlmTranslator } from "./pipeline" +export { pipeline, PIPELINE_CONFIG } from "./pipeline" diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/intl-pipeline/intl-sanitizer.ts similarity index 94% rename from src/scripts/i18n/post_import_sanitize.ts rename to src/scripts/intl-pipeline/intl-sanitizer.ts index 207f6eba2c6..ee7b2c273a5 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/intl-pipeline/intl-sanitizer.ts @@ -16,18 +16,14 @@ async function loadFranc(): Promise { } /** - * Post-import sanitizer for Crowdin translations. + * Translation output sanitizer. * * - Synchronize custom Markdown header IDs `{#...}` with English source (ASCII-only) * - Normalize block HTML tag line breaks (opening and closing tags on their own lines) * - Protect known brand/team names from inadvertent translation * - Validate JSON files; report issues * - * Usage: - * npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts - * - * Env: - * TARGET_LANGUAGES (comma-separated, e.g. "es-EM") optional; defaults to scanning all `translations/*` folders + * Wired into the pipeline via lib/workflows/sanitization.ts */ const ROOT = process.cwd() @@ -109,7 +105,7 @@ const CASE_SENSITIVE_SPELLING_MISTAKES = ["Metamask", "Github"] * Locales that use non-Latin scripts and require transliteration of brand names. * For these languages, brand names should be transliterated into the target script, * NOT kept in Latin. 
The sanitizer should NOT revert transliterated brands to English. - * Must stay in sync with src/scripts/i18n/transliterate.ts SUPPORTED_LANGS. + * Locales that use non-Latin scripts for brand name transliteration. */ const TRANSLITERATION_LOCALES = new Set([ "hi", @@ -1108,6 +1104,51 @@ function warnPunctuationOnlyHeadings(content: string): string[] { * Code inside fences should never be translated (variable names, keywords, etc.). * Catches issues like `or` → `または` inside code fences. */ +/** + * Strip comments from a code block body so we can compare functional code only. + * Handles JS-family (// and block comments), Python/Vyper (# and docstrings), and shell (#). + */ +function stripCodeComments(body: string, lang: string): string { + const l = lang.toLowerCase().split(/\s+/)[0] + + const isPython = ["python", "py", "vyper", "ruby", "rb"].includes(l) + const isShell = [ + "bash", + "sh", + "shell", + "zsh", + "fish", + "yaml", + "yml", + "toml", + ].includes(l) + const isJs = !isPython && !isShell // default to JS-family + + let result = body + + if (isPython) { + // Remove """ ... """ docstrings (multiline) + result = result.replace(/"""[\s\S]*?"""/g, '""""""') + // Remove # comments (preserve the line structure) + result = result.replace(/#[^\n]*/g, "#") + } else if (isShell) { + result = result.replace(/#[^\n]*/g, "#") + } else if (isJs) { + // Remove /* ... 
*/ block comments (preserve as marker) + result = result.replace(/\/\*[\s\S]*?\*\//g, "/**/") + // Remove // line comments + result = result.replace(/\/\/[^\n]*/g, "//") + } + + // Normalize whitespace for comparison: collapse blank lines, trim each line + return result + .split("\n") + .map((line) => line.trimEnd()) + .join("\n") + .replace(/\n{3,}/g, "\n\n") + .trim() +} + function warnCodeFenceContentDrift( translatedContent: string, englishContent: string @@ -1137,10 +1178,14 @@ function warnCodeFenceContentDrift( } for (let i = 0; i < engFences.length; i++) { - if (engFences[i].body !== transFences[i].body) { - const preview = transFences[i].body.substring(0, 60).replace(/\n/g, "\\n") + const lang = engFences[i].lang || transFences[i].lang + const engStripped = stripCodeComments(engFences[i].body, lang) + const transStripped = stripCodeComments(transFences[i].body, lang) + + if (engStripped !== transStripped) { + const preview = transStripped.substring(0, 60).replace(/\n/g, "\\n") warnings.push( - `Code fence #${i + 1} content differs from English: "${preview}..."` + `Code fence #${i + 1} functional code differs from English: "${preview}..."` ) } } @@ -1546,21 +1591,28 @@ function lineAt(file: string, index: number): string { interface HeaderInfo { level: number // Number of # symbols text: string // Header text (translated or English) - id: string // Custom ID from {#id} + id: string // Custom ID from {#id}, empty if none fullMatch: string // Full matched string for replacement } function extractHeaderStructure(md: string): HeaderInfo[] { const headers: HeaderInfo[] = [] - const headingRe = /^(#{1,6})\s+(.+?)\s*\{#([^}]+)\}[ \t]*$/gm - let m: RegExpExecArray | null - while ((m = headingRe.exec(md))) { - headers.push({ - level: m[1].length, - text: m[2].trim(), - id: m[3].trim(), - fullMatch: m[0], - }) + let inFence = false + for (const line of md.split("\n")) { + if (line.startsWith("```")) { + inFence = !inFence + continue + } + if (inFence) continue + 
const m = line.match(/^(#{1,6})\s+(.+?)(?:\s*\{#([^}]+)\})?[ \t]*$/) + if (m) { + headers.push({ + level: m[1].length, + text: m[2].trim(), + id: (m[3] || "").trim(), + fullMatch: m[0], + }) + } } return headers } @@ -2872,6 +2924,178 @@ function fixBareRtlEquations( return { content: frontmatter + body, fixCount } } +/** + * Fix misaligned closing code fences. + * + * When Gemini translates markdown with indented code fences (e.g., inside + * numbered lists), it sometimes strips the indentation from the closing + * fence while preserving it on the opening fence. This breaks syntax + * highlighting and confuses parsers. + * + * Example: + * Input: " ```sh\n cmd\n```" (opening indented, closing not) + * Output: " ```sh\n cmd\n ```" (closing indentation restored) + * + * Only fixes cases where the opening fence is indented and the closing + * fence is not. Does not touch correctly-aligned or non-indented fences. + */ +function fixMisalignedCodeFences(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + const lines = content.split("\n") + + let inFence = false + let openIndent = "" + let openFenceChar = "" + + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + + if (!inFence) { + // Check for indented opening fence + const openMatch = line.match(/^([ \t]+)(```|~~~)(.*)$/) + if (openMatch) { + inFence = true + openIndent = openMatch[1] + openFenceChar = openMatch[2] + } + continue + } + + // Inside a fence -- look for closing fence + const closeMatch = line.match(/^([ \t]*)(```|~~~)\s*$/) + if (closeMatch && closeMatch[2] === openFenceChar) { + const closeIndent = closeMatch[1] + if (closeIndent.length < openIndent.length) { + // Misaligned: restore the opening fence's indentation + lines[i] = openIndent + openFenceChar + fixCount++ + } + inFence = false + openIndent = "" + openFenceChar = "" + } + } + + return { content: lines.join("\n"), fixCount } +} + +/** + * Wrap bare LTR values in RTL files with to prevent + * BiDi 
rendering issues. Catches patterns Gemini may miss: + * + * - Numbers with Latin units: 32 ETH, 100 Gwei, 2 TB, 13s, 24h + * - Percentages: 12.5%, 51%, -12.5% + * - Currency: $100,000, $2,500 USD + * - Version/protocol IDs: v1.10.8, EIP-1559, ERC-721 + * - Large formatted numbers: 21,000, 100,000 + * - Decimal numbers with context: 0.000252 ETH + * - Multipliers: 2x, 100x + * + * Skips: code blocks, inline code, markdown link URLs, bare URLs, + * HTML attributes, already-wrapped spans, and frontmatter. + */ +function fixBareRtlValues( + content: string, + locale: string +): { content: string; fixCount: number } { + if (!RTL_LOCALES.has(locale)) return { content, fixCount: 0 } + let fixCount = 0 + + const fmRe = /^(---\n[\s\S]*?\n---\n)/ + const fmMatch = content.match(fmRe) + const frontmatter = fmMatch ? fmMatch[1] : "" + let body = fmMatch ? content.slice(frontmatter.length) : content + + const parts = body.split(RTL_SKIP_PATTERN) + + // Common Latin units that appear after numbers in ethereum.org content + const UNITS = + "ETH|BTC|Gwei|gwei|Wei|wei|USD|EUR|GBP|MB|GB|TB|KB|TH\\/s|MH\\/s|GH\\/s|APR|APY" + + // Order matters: currency first (captures $), then numUnit (skips $ prefix) + + // Pattern 1: Currency with $ symbol ($100,000, $2,500 USD) + const currencyRe = + /(?)(\$\d[\d,.]*(?:\s*(?:USD|EUR|GBP))?)(?!\s*<\/span>)/g + + // Pattern 2: Number + Latin unit (32 ETH, 100 Gwei, 2 TB, 13s, 24h) + // Negative lookbehind for $ prevents double-wrapping currency amounts + const numUnitRe = new RegExp( + `(?)(?)`, + "g" + ) + + // Pattern 3: Bare percentages (-12.5%, 51%) + const pctRe = /(?)(-?\d[\d,.]*\s*%)(?!\s*<\/span>)/g + + // Pattern 4: Version/protocol IDs (v1.10.8, EIP-1559, ERC-721) + const versionRe = + /(?)((?:v\d+\.\d+(?:\.\d+)*|(?:EIP|ERC|BLS)-\d+))(?!\s*<\/span>)/g + + // Pattern 5: Large formatted numbers standing alone (21,000 but not inside other patterns) + const largeNumRe = + /(?)(?))/g + + for (let i = 0; i < parts.length; i++) { + if (i % 2 === 
1) continue // Skip protected zones + + let part = parts[i] + part = part.replace(currencyRe, (match) => { + fixCount++ + return `${match}` + }) + part = part.replace(numUnitRe, (match) => { + fixCount++ + return `${match}` + }) + part = part.replace(pctRe, (match) => { + fixCount++ + return `${match}` + }) + part = part.replace(versionRe, (match) => { + fixCount++ + return `${match}` + }) + part = part.replace(largeNumRe, (match) => { + fixCount++ + return `${match}` + }) + parts[i] = part + } + + body = parts.join("") + return { content: frontmatter + body, fixCount } +} + +/** + * Fix units that ended up outside wrappers. + * Gemini sometimes produces: $100,000 USD + * Correct form: $100,000 USD + */ +function fixUnitOutsideSpan( + content: string, + locale: string +): { content: string; fixCount: number } { + if (!RTL_LOCALES.has(locale)) return { content, fixCount: 0 } + let fixCount = 0 + + const UNITS = "ETH|BTC|Gwei|gwei|Wei|wei|USD|EUR|GBP|MB|GB|TB|KB" + const re = new RegExp( + `()([^<]+)(<\\/span>)\\s*(${UNITS})`, + "g" + ) + + const fixed = content.replace(re, (_, open, inner, close, unit) => { + fixCount++ + return `${open}${inner} ${unit}${close}` + }) + + return { content: fixed, fixCount } +} + /** * Remove redundant wrappers around backtick-delimited inline * code. 
MDX cannot nest markdown syntax (backticks) inside JSX (), so @@ -4141,7 +4365,8 @@ function detectUntranslatedContent(content: string, locale: string): string[] { function processMarkdownFile( mdPath: string, - providedContent?: string + providedContent?: string, + englishContentMap?: Map ): { fixed: boolean issues: string[] @@ -4160,14 +4385,28 @@ function processMarkdownFile( if (idx === -1 || idx + 2 >= parts.length) { issues.push("No translations segment found; skipping formatting sync") } else { - // Use path.resolve to preserve absolute paths (path.join loses leading /) - const englishPath = path.resolve( - path.sep, - ...parts.slice(0, idx), - ...parts.slice(idx + 2) // drop translations/ - ) - if (fs.existsSync(englishPath)) { - englishMd = fs.readFileSync(englishPath, "utf8") + // Derive the relative English path (e.g. public/content/bridges/index.md) + const englishRelPath = [...parts.slice(0, idx), ...parts.slice(idx + 2)] + .join(path.sep) + // Strip leading absolute prefix to get repo-relative path + .replace(/^.*?public\/content\//, "public/content/") + + // Try in-memory map first (from GitHub API), then fall back to disk + if (englishContentMap?.has(englishRelPath)) { + englishMd = englishContentMap.get(englishRelPath)! 
+ } else { + // Absolute path for disk fallback (local/CLI usage) + const englishPath = path.resolve( + path.sep, + ...parts.slice(0, idx), + ...parts.slice(idx + 2) // drop translations/ + ) + if (fs.existsSync(englishPath)) { + englishMd = fs.readFileSync(englishPath, "utf8") + } + } + + if (englishMd) { // Fix detached heading anchors BEFORE syncing IDs { const snapshot = content @@ -4179,6 +4418,11 @@ function processMarkdownFile( } content = syncHeaderIdsWithEnglish(content, englishMd) } else { + const englishPath = path.resolve( + path.sep, + ...parts.slice(0, idx), + ...parts.slice(idx + 2) + ) issues.push(`English source missing: ${path.relative(ROOT, englishPath)}`) } } @@ -4310,6 +4554,10 @@ function processMarkdownFile( () => fixFrontmatterLang(content, locale), (n) => `Fixed ${n} frontmatter lang field to match locale "${locale}"` ) + applyFix( + () => fixFrontmatterLang(content, locale), + (n) => `Fixed ${n} frontmatter lang field to match locale "${locale}"` + ) applyFix( () => fixAsciiGuillemets(content), (n) => `Fixed ${n} ASCII guillemets (<< >>) to Unicode (« »)` @@ -4553,6 +4801,15 @@ function processMarkdownFile( const scriptWarnings = detectCrossScriptContamination(content, locale) issues.push(...scriptWarnings) + // Fix misaligned closing code fences (all locales) + const fenceResult = fixMisalignedCodeFences(content) + if (fenceResult.fixCount > 0) { + content = fenceResult.content + issues.push( + `Fixed ${fenceResult.fixCount} misaligned closing code fence(s)` + ) + } + // RTL-specific BiDi fixes: wrap bare dates and equations in const dateResult = fixBareRtlDates(content, locale) if (dateResult.fixCount > 0) { @@ -4568,9 +4825,23 @@ function processMarkdownFile( `Wrapped ${eqResult.fixCount} bare math equation(s) in for RTL` ) } + const valResult = fixBareRtlValues(content, locale) + if (valResult.fixCount > 0) { + content = valResult.content + issues.push( + `Wrapped ${valResult.fixCount} bare LTR value(s) in for RTL` + ) + } + const 
unitResult = fixUnitOutsideSpan(content, locale) + if (unitResult.fixCount > 0) { + content = unitResult.content + issues.push( + `Fixed ${unitResult.fixCount} unit(s) outside wrapper` + ) + } // Remove redundant around backtick inline code - // (cleans up after fixBareRtlDates/fixBareRtlEquations and Gemini) + // (cleans up after fixBareRtl*/fixBareRtlValues and Gemini) const spanBtResult = fixSpanWrappedBackticks(content) if (spanBtResult.fixCount > 0) { content = spanBtResult.content @@ -4807,7 +5078,8 @@ function languagesFromEnv(): string[] | undefined { export async function runSanitizer( filesWithContent?: Array<{ path: string; content: string }>, - langs?: string[] + langs?: string[], + englishContentMap?: Map ) { console.log("[SANITIZE] Starting post-import sanitizer") await loadFranc() @@ -4938,7 +5210,8 @@ export async function runSanitizer( } const { fixed, issues, content } = processMarkdownFile( fileInfo.path, - fileInfo.content + fileInfo.content, + englishContentMap ) if (fixed) { mdFixed++ @@ -5018,7 +5291,7 @@ export async function runSanitizer( } const changedFiles = [...mdChanged, ...jsonChanged].map((f) => ({ - path: f.path, + path: path.relative(ROOT, f.path), content: f.content, })) return { @@ -5264,11 +5537,15 @@ export const _testOnly = { warnExposedMdxTags, fixBareRtlDates, fixBareRtlEquations, + fixMisalignedCodeFences, + fixBareRtlValues, + fixUnitOutsideSpan, fixSpanWrappedBackticks, fixBoldWrappedOrderedListNumerals, warnTranslatedTechnicalNumerals, warnTranslatedInlineCode, warnCodeFenceContentDrift, + stripCodeComments, warnCatastrophicCodeFenceDrift, fixCrossScriptPunctuation, detectCrossScriptContamination, diff --git a/src/scripts/i18n/lib/github/branches.ts b/src/scripts/intl-pipeline/lib/github/branches.ts similarity index 58% rename from src/scripts/i18n/lib/github/branches.ts rename to src/scripts/intl-pipeline/lib/github/branches.ts index 084d2341458..add67815122 100644 --- a/src/scripts/i18n/lib/github/branches.ts +++ 
b/src/scripts/intl-pipeline/lib/github/branches.ts @@ -74,6 +74,93 @@ export const createBranchName = (suffix?: string) => { return `i18n/${label}-${ts}` } +/** + * Check if a branch exists on GitHub. + */ +export const branchExists = async (branchName: string): Promise => { + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/git/ref/heads/${branchName}` + const res = await fetchWithRetry(url, { + headers: gitHubBearerHeaders, + }) + return res.ok +} + +/** + * Merge a base branch into a head branch via the GitHub API. + * Used to keep the staging branch up-to-date with dev. + * Returns true if merge succeeded (or was already up-to-date). + */ +export const mergeBranchInto = async ( + base: string, + head: string +): Promise => { + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/merges` + debugLog(`Merging "${base}" into "${head}"`) + + const res = await fetchWithRetry(url, { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + base: head, + head: base, + commit_message: `i18n: merge ${base} into ${head}`, + }), + }) + + if (res.status === 204) { + debugLog("Already up-to-date") + return true + } + if (res.status === 201) { + debugLog("Merge commit created") + return true + } + if (res.status === 409) { + console.error( + `[ERROR] Merge conflict: ${base} into ${head}. Manual resolution needed.` + ) + return false + } + + const body = await res.text().catch(() => "") + console.error(`[ERROR] Merge failed (${res.status}): ${body}`) + return false +} + +/** + * Ensure a staging branch exists and is up-to-date with its base. + * Creates the branch if it doesn't exist; merges base into it if it does. + * Returns the branch name. 
+ */ +export const ensureStagingBranch = async ( + branchName: string, + baseBranch: string +): Promise => { + const exists = await branchExists(branchName) + + if (!exists) { + console.log(`[branch] Creating "${branchName}" from "${baseBranch}"`) + const baseObj = await getBranchObject(baseBranch) + await createBranchFromSha(branchName, baseObj.sha) + return branchName + } + + console.log( + `[branch] "${branchName}" exists, merging "${baseBranch}" into it` + ) + const merged = await mergeBranchInto(baseBranch, branchName) + if (!merged) { + throw new Error( + `Cannot merge ${baseBranch} into ${branchName}. Resolve conflicts or delete the branch and retry.` + ) + } + + return branchName +} + /** * Create a new branch from a base branch * @@ -115,7 +202,8 @@ export const postCreateBranchFrom = async ( return { branch, sha } } catch (error) { - console.error(error) - process.exit(1) + throw error instanceof Error + ? error + : new Error(`postCreateBranchFrom failed: ${String(error)}`) } } diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/intl-pipeline/lib/github/commits.ts similarity index 60% rename from src/scripts/i18n/lib/github/commits.ts rename to src/scripts/intl-pipeline/lib/github/commits.ts index 01b5cb6aa3b..ea74da71e4d 100644 --- a/src/scripts/i18n/lib/github/commits.ts +++ b/src/scripts/intl-pipeline/lib/github/commits.ts @@ -18,29 +18,28 @@ interface TreeItem { } /** - * Incrementally commits files to a branch, amending each time. + * Shared committer for parallel language translation. * - * Each call to commitFile() creates a blob, builds a tree from the - * base tree + ALL accumulated blobs, and creates a commit with the - * same parent (the branch state before any translations). This - * produces a single growing commit per language. + * Creates individual chained commits (each parented on the previous) + * so multiple languages can interleave safely. Each file appears on the + * branch immediately for crash safety. 
* - * Concurrent calls are serialized via an internal queue so the - * GitHub API calls don't race. + * After all translations complete, call squashByLanguage() to collapse + * the individual commits into one per language for a clean history. */ -export class IncrementalCommitter { - private baseCommitSha = "" - private baseTreeSha = "" - private accumulatedItems: TreeItem[] = [] +export class SharedCommitter { + private currentCommitSha = "" + private currentTreeSha = "" private queue: Promise = Promise.resolve() private baseUrl = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}` + /** Track blob SHAs per language for squashing */ + private blobsByLanguage = new Map() + /** SHA of the original base before any translations */ + private originalBaseSha = "" - constructor( - private branch: string, - private message: string - ) {} + constructor(private branch: string) {} - /** Snapshot the current branch state as the amend base. */ + /** Snapshot the current branch state. 
*/ async init(): Promise { const refRes = await fetchWithRetry( `${this.baseUrl}/git/ref/heads/${this.branch}`, @@ -48,34 +47,38 @@ export class IncrementalCommitter { ) if (!refRes.ok) { const body = await refRes.text().catch(() => "") - throw new Error( - `IncrementalCommitter init ref (${refRes.status}): ${body}` - ) + throw new Error(`SharedCommitter init ref (${refRes.status}): ${body}`) } const refData: { object: { sha: string } } = await refRes.json() - this.baseCommitSha = refData.object.sha + this.currentCommitSha = refData.object.sha + this.originalBaseSha = refData.object.sha const commitRes = await fetchWithRetry( - `${this.baseUrl}/git/commits/${this.baseCommitSha}`, + `${this.baseUrl}/git/commits/${this.currentCommitSha}`, { headers: gitHubBearerHeaders } ) if (!commitRes.ok) { const body = await commitRes.text().catch(() => "") throw new Error( - `IncrementalCommitter init commit (${commitRes.status}): ${body}` + `SharedCommitter init commit (${commitRes.status}): ${body}` ) } const commitData: { tree: { sha: string } } = await commitRes.json() - this.baseTreeSha = commitData.tree.sha + this.currentTreeSha = commitData.tree.sha } /** - * Queue a file to be committed. Serializes the GitHub API calls - * so concurrent translations don't race on the branch ref. + * Queue a file commit. Serialized so concurrent languages don't race. + * Each commit chains on the previous (not amending). 
*/ - commitFile(path: string, content: string): Promise { - const result = this.queue.then(() => this._doCommit(path, content)) - // Chain continues even if a commit fails + commitFile( + filePath: string, + content: string, + language: string + ): Promise { + const result = this.queue.then(() => + this._doCommit(filePath, content, language) + ) this.queue = result.then( () => {}, () => {} @@ -83,11 +86,11 @@ export class IncrementalCommitter { return result } - get fileCount(): number { - return this.accumulatedItems.length - } - - private async _doCommit(path: string, content: string): Promise { + private async _doCommit( + filePath: string, + content: string, + language: string + ): Promise { // 1. Create blob const blobRes = await fetchWithRetry(`${this.baseUrl}/git/blobs`, { method: "POST", @@ -100,25 +103,31 @@ export class IncrementalCommitter { if (!blobRes.ok) { const body = await blobRes.text().catch(() => "") throw new Error( - `Failed to create blob for ${path} (${blobRes.status}): ${body}` + `Failed to create blob for ${filePath} (${blobRes.status}): ${body}` ) } const blobData: { sha: string } = await blobRes.json() - this.accumulatedItems.push({ - path, + const item: TreeItem = { + path: filePath, mode: "100644", type: "blob", sha: blobData.sha, - }) + } + + // Track blob for squashing + if (!this.blobsByLanguage.has(language)) { + this.blobsByLanguage.set(language, []) + } + this.blobsByLanguage.get(language)!.push(item) - // 2. Create tree from base + ALL accumulated blobs + // 2. Create tree on top of current tree const treeRes = await fetchWithRetry(`${this.baseUrl}/git/trees`, { method: "POST", headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, body: JSON.stringify({ - base_tree: this.baseTreeSha, - tree: this.accumulatedItems, + base_tree: this.currentTreeSha, + tree: [item], }), }) if (!treeRes.ok) { @@ -127,14 +136,14 @@ export class IncrementalCommitter { } const treeData: { sha: string } = await treeRes.json() - // 3. 
Create commit with base parent (amend pattern) + // 3. Create commit parented on the current tip (chaining, not amending) const commitRes = await fetchWithRetry(`${this.baseUrl}/git/commits`, { method: "POST", headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, body: JSON.stringify({ - message: this.message, + message: `i18n(${language}): ${filePath.split("/").pop()}`, tree: treeData.sha, - parents: [this.baseCommitSha], + parents: [this.currentCommitSha], }), }) if (!commitRes.ok) { @@ -143,13 +152,13 @@ export class IncrementalCommitter { } const commitData: { sha: string } = await commitRes.json() - // 4. Force-update branch ref to the amended commit + // 4. Update branch ref (no force needed -- linear chain) const updateRes = await fetchWithRetry( `${this.baseUrl}/git/refs/heads/${this.branch}`, { method: "PATCH", headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, - body: JSON.stringify({ sha: commitData.sha, force: true }), + body: JSON.stringify({ sha: commitData.sha }), } ) if (!updateRes.ok) { @@ -157,10 +166,124 @@ export class IncrementalCommitter { throw new Error(`Failed to update ref (${updateRes.status}): ${body}`) } - debugLog( - `IncrementalCommitter: ${this.accumulatedItems.length} files committed (latest: ${path})` + // Advance internal state + this.currentCommitSha = commitData.sha + this.currentTreeSha = treeData.sha + + debugLog(`SharedCommitter [${language}]: committed ${filePath}`) + } + + /** + * Squash all individual commits into one per language. + * Builds a new commit chain from the original base: + * base -> lang1 (all files) -> lang2 (all files) -> ... + * Then force-updates the branch ref. 
+ */ + async squashByLanguage(): Promise { + const languages = Array.from(this.blobsByLanguage.keys()).sort() + if (languages.length === 0) return + + console.log( + `[SharedCommitter] Squashing ${languages.length} language(s): ${languages.join(", ")}` + ) + + let parentSha = this.originalBaseSha + // Get the original base tree + const baseCommitRes = await fetchWithRetry( + `${this.baseUrl}/git/commits/${this.originalBaseSha}`, + { headers: gitHubBearerHeaders } + ) + if (!baseCommitRes.ok) { + const body = await baseCommitRes.text().catch(() => "") + throw new Error( + `Failed to get base commit for squash (${baseCommitRes.status}): ${body}` + ) + } + const baseCommitData: { tree: { sha: string } } = await baseCommitRes.json() + let currentTree = baseCommitData.tree.sha + + for (const lang of languages) { + const blobs = this.blobsByLanguage.get(lang)! + + // Create tree with all blobs for this language on top of current tree + const treeRes = await fetchWithRetry(`${this.baseUrl}/git/trees`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + base_tree: currentTree, + tree: blobs, + }), + }) + if (!treeRes.ok) { + const body = await treeRes.text().catch(() => "") + throw new Error( + `Failed to create squash tree for ${lang} (${treeRes.status}): ${body}` + ) + } + const treeData: { sha: string } = await treeRes.json() + + // Create squashed commit + const commitRes = await fetchWithRetry(`${this.baseUrl}/git/commits`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + message: `i18n(${lang}): Gemini translation`, + tree: treeData.sha, + parents: [parentSha], + }), + }) + if (!commitRes.ok) { + const body = await commitRes.text().catch(() => "") + throw new Error( + `Failed to create squash commit for ${lang} (${commitRes.status}): ${body}` + ) + } + const commitData: { sha: string } = await commitRes.json() + + parentSha = 
commitData.sha + currentTree = treeData.sha + + console.log( + `[SharedCommitter] Squashed ${blobs.length} files for ${lang}` + ) + } + + // Force-update branch to squashed chain + const updateRes = await fetchWithRetry( + `${this.baseUrl}/git/refs/heads/${this.branch}`, + { + method: "PATCH", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ sha: parentSha, force: true }), + } + ) + if (!updateRes.ok) { + const body = await updateRes.text().catch(() => "") + throw new Error( + `Failed to update ref after squash (${updateRes.status}): ${body}` + ) + } + + // Update internal state + this.currentCommitSha = parentSha + this.currentTreeSha = currentTree + + console.log( + `[SharedCommitter] Squash complete: ${languages.length} commits` ) } + + get totalFiles(): number { + let count = 0 + for (const blobs of this.blobsByLanguage.values()) { + count += blobs.length + } + return count + } + + get languageCount(): number { + return this.blobsByLanguage.size + } } /** @@ -301,15 +424,15 @@ export async function batchCommitFiles( /** * Get the destination path for a translated file * - * @param crowdinFilePath - The Crowdin file path (e.g., src/intl/en/page-foo.json) + * @param filePath - The source file path (e.g., src/intl/en/page-foo.json) * @param internalLanguageCode - The internal language code * @returns The destination path in the repository */ export const getDestinationFromPath = ( - crowdinFilePath: string, + filePath: string, internalLanguageCode: string ) => { - const normalized = crowdinFilePath.replace(/^\//, "") + const normalized = filePath.replace(/^\//, "") const isJson = normalized.toLowerCase().endsWith(".json") const isMarkdown = normalized.toLowerCase().endsWith(".md") @@ -346,7 +469,7 @@ export const getDestinationFromPath = ( } debugLog( - `Destination mapping: ${crowdinFilePath} -> ${destinationPath} (lang=${internalLanguageCode})` + `Destination mapping: ${filePath} -> ${destinationPath} 
(lang=${internalLanguageCode})` ) return destinationPath } @@ -358,109 +481,3 @@ export const getDestinationFromPath = ( * @param branch - The branch name * @returns Object containing the file SHA */ -export const getPathSha = async (path: string, branch: string) => { - const url = new URL( - `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${path}?ref=${branch}` - ) - - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub getPathSha (${res.status}): ${body}`) - } - - type JsonResponse = { sha: string } - const { sha }: JsonResponse = await res.json() - - return { sha } -} - -/** - * Commit a file to a GitHub branch with retry logic for conflicts - * - * @param buffer - The file contents as a Buffer - * @param destinationPath - The path in the repository - * @param branch - The branch name - * @param sha - Optional SHA for updating existing files - * @param attempt - Current retry attempt number - */ -export const putCommitFile = async ( - buffer: Buffer, - destinationPath: string, - branch: string, - sha?: string, - attempt = 0 -): Promise => { - const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${destinationPath}` - - try { - // Use the buffer contents as base64-encoded content for the commit - const contentBase64 = buffer.toString("base64") - - const body = { - message: `update(i18n): ${destinationPath}`, - content: contentBase64, - branch, - } - - if (sha) body["sha"] = sha - - const res = await fetchWithRetry(url.toString(), { - method: "PUT", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }) - - if (res.status === 422) { - const { sha: fileSha } = await getPathSha(destinationPath, branch) - console.warn( - `[RETRY] 422 Unprocessable for ${destinationPath}. 
Retrying with existing SHA ${fileSha}` - ) - return await putCommitFile( - buffer, - destinationPath, - branch, - fileSha, - attempt - ) - } - - if (res.status === 409) { - if (attempt >= 5) { - const bodyText = await res.text().catch(() => "") - throw new Error( - `GitHub putCommitFile conflict persists after ${attempt} retries (${res.status}): ${bodyText}` - ) - } - const backoff = 500 * Math.pow(2, attempt) // 500ms, 1s, 2s, 4s, 8s - console.warn( - `[RETRY] 409 Conflict for ${destinationPath}. Attempt ${attempt + 1}. Waiting ${backoff}ms before retry.` - ) - await delay(backoff) - const { sha: latestSha } = await getPathSha(destinationPath, branch) - return await putCommitFile( - buffer, - destinationPath, - branch, - latestSha, - attempt + 1 - ) - } - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub putCommitFile (${res.status}): ${body}`) - } - } catch (error) { - console.error(error) - process.exit(1) - } -} diff --git a/src/scripts/i18n/lib/github/pull-requests.ts b/src/scripts/intl-pipeline/lib/github/pull-requests.ts similarity index 55% rename from src/scripts/i18n/lib/github/pull-requests.ts rename to src/scripts/intl-pipeline/lib/github/pull-requests.ts index 18be0aa47d1..8c9bb5bd682 100644 --- a/src/scripts/i18n/lib/github/pull-requests.ts +++ b/src/scripts/intl-pipeline/lib/github/pull-requests.ts @@ -26,7 +26,7 @@ export const postPullRequest = async ( title, head, base, - body: bodyText || "Automated Crowdin translation import", + body: bodyText || "Automated Gemini translation import", } const res = await fetchWithRetry(url.toString(), { @@ -41,13 +41,65 @@ export const postPullRequest = async ( if (!res.ok) { console.warn("Res not OK") const body = await res.text().catch(() => "") - throw new Error(`Crowdin postPullRequest failed (${res.status}): ${body}`) + throw new Error(`postPullRequest failed (${res.status}): ${body}`) } const json = await res.json() return json } +/** + 
* Find an open PR for a given head -> base branch pair. + * Returns the PR object if found, null otherwise. + */ +export const findOpenPR = async ( + head: string, + base: string = config.baseBranch +): Promise<{ number: number; html_url: string; body: string } | null> => { + const fullHead = `${config.ghOrganization}:${head}` + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/pulls?state=open&head=${encodeURIComponent(fullHead)}&base=${encodeURIComponent(base)}` + + const res = await fetchWithRetry(url, { + method: "GET", + headers: gitHubBearerHeaders, + }) + + if (!res.ok) return null + + const prs = (await res.json()) as Array<{ + number: number + html_url: string + body: string + }> + return prs.length > 0 ? prs[0] : null +} + +/** + * Update the body of an existing PR. + */ +export const updatePRBody = async ( + prNumber: number, + body: string +): Promise => { + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/pulls/${prNumber}` + + const res = await fetchWithRetry(url, { + method: "PATCH", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ body }), + }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error( + `Failed to update PR #${prNumber} body (${res.status}): ${text}` + ) + } +} + /** * Post a comment on a pull request * diff --git a/src/scripts/i18n/lib/ai/code-block-extractor.ts b/src/scripts/intl-pipeline/lib/llm/code-block-extractor.ts similarity index 68% rename from src/scripts/i18n/lib/ai/code-block-extractor.ts rename to src/scripts/intl-pipeline/lib/llm/code-block-extractor.ts index 715e7d64fef..5bd6f4d639a 100644 --- a/src/scripts/i18n/lib/ai/code-block-extractor.ts +++ b/src/scripts/intl-pipeline/lib/llm/code-block-extractor.ts @@ -7,11 +7,16 @@ * for separate translation. 
*/ +import { MAX_CHUNK_BYTES } from "../../constants" +import { FENCED_BLOCK_RE, FRONTMATTER_RE } from "../shared-patterns" + /** A single extracted code block */ export interface CodeBlock { index: number language: string content: string + /** Indentation prefix (spaces/tabs) of the original fence */ + indent: string } /** Result of extracting code blocks from markdown */ @@ -26,6 +31,8 @@ export interface ExtractionResult { export interface CodeComment { blockIndex: number line: number + /** Last line of a multi-line comment (for collapsing placeholder lines) */ + endLine?: number type: "single" | "multi" text: string } @@ -37,11 +44,6 @@ function makePlaceholder(index: number): string { return `${PLACEHOLDER_PREFIX}${index}${PLACEHOLDER_SUFFIX}` } -// Matches ``` or ~~~ fenced code blocks (with optional language tag) -// Matches fenced code blocks: non-empty content OR empty (just a newline between fences) -const FENCED_BLOCK_RE = - /^([ \t]*)(```|~~~)([^\n]*)\n([\s\S]*?)\n\1\2[ \t]*$|^([ \t]*)(```|~~~)([^\n]*)\n\5\6[ \t]*$/gm - /** * Extract all fenced code blocks from markdown, replacing each * with a numbered HTML comment placeholder. @@ -66,6 +68,7 @@ export function extractCodeBlocks(markdown: string): ExtractionResult { index, language: lang, content, + indent: ind, }) return `${ind}${makePlaceholder(index)}` } @@ -85,7 +88,9 @@ export function restoreCodeBlocks(prose: string, blocks: CodeBlock[]): string { const placeholder = makePlaceholder(block.index) const fence = "```" const langTag = block.language ? block.language : "" - const restored = `${fence}${langTag}\n${block.content}\n${fence}` + // Opening fence gets indent from the prose context (placeholder was indented). + // Closing fence needs explicit indent since it's on a new line in the replacement. 
+ const restored = `${fence}${langTag}\n${block.content}\n${block.indent}${fence}` result = result.replace(placeholder, restored) } @@ -100,7 +105,8 @@ type CommentSyntax = "js" | "python" | "shell" /** Map fence language tags to comment syntax family */ function getCommentSyntax(language: string): CommentSyntax { - const lang = language.toLowerCase() + // Strip metadata after the language name (e.g., "sh copy", "solidity showLineNumbers") + const lang = language.toLowerCase().split(/\s+/)[0] // JS/Solidity/TS family: // and /* */ if ( @@ -186,6 +192,7 @@ export function extractComments( comments.push({ blockIndex: -1, // filled in by caller line: multiLineStart, + endLine: i, type: "multi", text: multiLineBuffer.trim(), }) @@ -371,9 +378,24 @@ export function restoreComments( // Multi-line: wrap in block comment syntax const indent = existing.match(/^(\s*)/)?.[1] || "" if (syntax === "js") { - lines[comment.line] = `${indent}/* ${comment.text} */\n${existing}` + lines[comment.line] = `${indent}/* ${comment.text} */` } else { - lines[comment.line] = `${indent}# ${comment.text}\n${existing}` + lines[comment.line] = `${indent}# ${comment.text}` + } + // Collapse empty placeholder lines left by multi-line comment extraction. + // Preserve the endLine if it has code after the comment close (e.g., "*/ doSomething()"). 
+ if (comment.endLine != null && comment.endLine > comment.line) { + const endContent = lines[comment.endLine]?.trim() || "" + if (endContent) { + // Keep endLine (has code after */), remove only the middle lines + const removeCount = comment.endLine - comment.line - 1 + if (removeCount > 0) { + lines.splice(comment.line + 1, removeCount) + } + } else { + // endLine is empty too -- remove all placeholder lines + lines.splice(comment.line + 1, comment.endLine - comment.line) + } } } } @@ -407,7 +429,7 @@ export function chunkProse( // Extract frontmatter if present let frontmatter = "" let body = prose - const fmMatch = prose.match(/^(---\n[\s\S]*?\n---\n)/) + const fmMatch = prose.match(FRONTMATTER_RE) if (fmMatch) { frontmatter = fmMatch[1] body = prose.slice(fmMatch[1].length) @@ -492,5 +514,151 @@ function splitAtHeadings(text: string, headingPrefix: string): string[] { return parts } +// --------------------------------------------------------------------------- +// Byte-size-aware markdown chunking (CONCURRENCY-SPEC.md Part 2B) +// --------------------------------------------------------------------------- + +/** + * Chunk markdown prose by byte size. Uses heading boundaries first, + * then paragraph boundaries for oversized sections. Each chunk of a + * split section includes the heading for context. 
+ * + * Guarantees: + * - At least 1 paragraph per chunk (even if it exceeds budget) + * - Code fences are not split mid-fence + * - Heading context preserved in each chunk of a split section + */ +export function chunkMarkdownProse( + prose: string, + maxBytes: number = MAX_CHUNK_BYTES +): string[] { + if (Buffer.byteLength(prose, "utf-8") <= maxBytes) return [prose] + + // Extract frontmatter if present + let frontmatter = "" + let body = prose + const fmMatch = prose.match(FRONTMATTER_RE) + if (fmMatch) { + frontmatter = fmMatch[1] + body = prose.slice(fmMatch[1].length) + } + + const fmBytes = Buffer.byteLength(frontmatter, "utf-8") + const chunks = splitByByteSize(body, maxBytes - fmBytes, 2) + + // Prepend frontmatter to the first chunk only + if (frontmatter && chunks.length > 0) { + chunks[0] = frontmatter + chunks[0] + } + + // Filter out empty chunks + return chunks.filter((c) => c.trim().length > 0) +} + +/** + * Recursively split at heading boundaries, falling back to paragraphs. + * Uses byte size instead of character count. 
+ */ +function splitByByteSize( + text: string, + maxBytes: number, + headingLevel: number +): string[] { + if (Buffer.byteLength(text, "utf-8") <= maxBytes) return [text] + + // Try splitting at current heading level + if (headingLevel <= 6) { + const headingPrefix = "#".repeat(headingLevel) + " " + const sections = splitAtHeadings(text, headingPrefix) + + if (sections.length > 1) { + const result: string[] = [] + for (const section of sections) { + if (Buffer.byteLength(section, "utf-8") <= maxBytes) { + result.push(section) + } else { + result.push(...splitByByteSize(section, maxBytes, headingLevel + 1)) + } + } + return result + } + + return splitByByteSize(text, maxBytes, headingLevel + 1) + } + + // Exhausted heading levels -- split at paragraph boundaries + // Ensure code fences stay intact by treating fenced blocks as atomic units + const blocks = splitIntoBlocks(text) + if (blocks.length > 1) { + const result: string[] = [] + let current = "" + // Extract heading from the first line if present (for context in each chunk) + const headingMatch = text.match(/^(#{1,6}\s+[^\n]*\{#[^}]+\}[^\n]*)\n/) + const headingContext = headingMatch ? headingMatch[1] : "" + + for (const block of blocks) { + const candidateBytes = Buffer.byteLength( + current ? current + "\n\n" + block : block, + "utf-8" + ) + if (candidateBytes > maxBytes && current) { + result.push(current) + // Prepend heading context to continuation chunks + current = headingContext ? headingContext + "\n\n" + block : block + } else { + current = current ? current + "\n\n" + block : block + } + } + if (current) result.push(current) + return result + } + + // Last resort: return as single chunk (minimum guarantee) + return [text] +} + +/** + * Split text into blocks at blank lines, keeping code fences atomic. + * A code fence (```...```) is never split across blocks. 
+ */ +function splitIntoBlocks(text: string): string[] { + const lines = text.split("\n") + const blocks: string[] = [] + let current: string[] = [] + let inFence = false + + for (const line of lines) { + if (line.startsWith("```")) { + inFence = !inFence + current.push(line) + continue + } + + if (inFence) { + current.push(line) + continue + } + + if (line.trim() === "" && current.length > 0) { + // Blank line outside fence: potential split point + const lastNonEmpty = current[current.length - 1]?.trim() + if (lastNonEmpty === "") { + // Consecutive blank lines -- just add + current.push(line) + } else { + // End of block + blocks.push(current.join("\n")) + current = [] + } + } else { + current.push(line) + } + } + if (current.length > 0) { + blocks.push(current.join("\n")) + } + return blocks +} + // Re-export getCommentSyntax for restoreComments callers export { getCommentSyntax } diff --git a/src/scripts/intl-pipeline/lib/llm/content-normalizer.ts b/src/scripts/intl-pipeline/lib/llm/content-normalizer.ts new file mode 100644 index 00000000000..74ab5c4cd23 --- /dev/null +++ b/src/scripts/intl-pipeline/lib/llm/content-normalizer.ts @@ -0,0 +1,909 @@ +/** + * Content normalizer for the translation manifest. + * + * Parses markdown section content into a tree of translatable and + * non-translatable nodes. Non-translatable content is replaced with + * stable, content-addressed placeholders so that changes to URLs, + * code, whitespace, etc. do not alter the manifest hash or trigger + * retranslation. + * + * The normalized form is used for: + * 1. Hashing -- only translatable content affects the manifest hash + * 2. Translation -- Gemini receives normalized text with placeholders + * 3. 
Verification -- inert attributes (className, href) are verified + * against English originals post-translation + * + * Placeholder format: + * Block: + * Wrapper: text + * + * TYPE values: CODEBLOCK, CODE, COMPONENT, IMAGE, LINK + * HASH: 6-char hex SHA-256 digest of extracted content (content-addressed + * so insertions above don't renumber placeholders below). + * + * Tree structure (document is a tree, not flat text): + * + * Section (interior) + * +-- Prose (leaf, hashed) + * +-- Code fence (interior) + * | +-- Comment 0 (leaf, hashed -- translatable) + * | +-- Comment 1 (leaf, hashed -- translatable) + * | +-- Code body (inert -- not hashed) + * +-- (interior) + * | +-- title attr (leaf, hashed -- translatable) + * | +-- className (inert -- verified against English) + * | +-- children (leaf, hashed -- translatable) + * +-- Link (wrapper) + * +-- href (inert -- verified against English) + * +-- link text (leaf, hashed -- translatable) + */ + +import { createHash } from "crypto" + +import { + ATTRIBUTE_RE, + FENCED_BLOCK_RE, + isTranslatableValue, + TRANSLATABLE_ATTRIBUTES, + type TranslatableAttribute, +} from "../shared-patterns" + +import { extractComments } from "./code-block-extractor" + +// --------------------------------------------------------------------------- +// Hashing utilities +// --------------------------------------------------------------------------- + +/** 6-char hex digest for content-addressed placeholder IDs */ +function shortHash(content: string): string { + return createHash("sha256").update(content, "utf8").digest("hex").slice(0, 6) +} + +/** 12-char hex digest for trie leaf hashes (matches manifest-generator) */ +function leafHash(content: string): string { + return createHash("sha256").update(content, "utf8").digest("hex").slice(0, 12) +} + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +/** Shared fields for 
all content nodes */ +interface BaseNode { + /** The extracted content */ + content: string +} + +/** Translatable leaf -- prose remaining after all extractions */ +interface ProseNode extends BaseNode { + type: "prose" + translatable: true + hash: string +} + +/** Interior node wrapping a fenced code block */ +interface CodeFenceNode extends BaseNode { + type: "code-fence" + translatable: false + placeholder: string + children: ContentNode[] +} + +/** Translatable leaf -- a comment extracted from a code block */ +interface CodeCommentNode extends BaseNode { + type: "code-comment" + translatable: true + hash: string + meta: { language: string; line: string; commentType: string } +} + +/** Inert leaf -- the code body (no comments) */ +interface CodeBodyNode extends BaseNode { + type: "code-body" + translatable: false + meta: { language: string; indent: string } +} + +/** Interior node wrapping a JSX/HTML component */ +interface ComponentNode extends BaseNode { + type: "component" + translatable: false + placeholder: string + children: ContentNode[] + meta: { componentName: string; inertAttributes: Record } +} + +/** Translatable leaf -- a JSX attribute value */ +interface ComponentAttributeNode extends BaseNode { + type: "component-attribute" + translatable: true + hash: string + meta: { attributeName: string; componentName: string } +} + +/** Inert leaf -- inline code span */ +interface InlineCodeNode extends BaseNode { + type: "inline-code" + translatable: false + placeholder: string +} + +/** Interior node wrapping a markdown image */ +interface ImageNode extends BaseNode { + type: "image" + translatable: false + placeholder: string + children: ContentNode[] + meta: { path: string } +} + +/** Translatable leaf -- image alt text */ +interface ImageAltNode extends BaseNode { + type: "image-alt" + translatable: true + hash: string +} + +/** Interior node wrapping a markdown link */ +interface LinkNode extends BaseNode { + type: "link" + translatable: false + 
placeholder: string + children: ContentNode[] + meta: { url: string } +} + +/** Translatable leaf -- link display text */ +interface LinkTextNode extends BaseNode { + type: "link-text" + translatable: true + hash: string +} + +/** Interior node wrapping an embedded HTML element (a, div, span, etc.) */ +interface HtmlTagNode extends BaseNode { + type: "html-tag" + translatable: false + placeholder: string + children: ContentNode[] + meta: { tagName: string; inertAttributes: Record } +} + +/** Translatable leaf -- text content inside an HTML element */ +interface HtmlTagTextNode extends BaseNode { + type: "html-tag-text" + translatable: true + hash: string +} + +/** Discriminated union of all content node types */ +export type ContentNode = + | ProseNode + | CodeFenceNode + | CodeCommentNode + | CodeBodyNode + | ComponentNode + | ComponentAttributeNode + | InlineCodeNode + | ImageNode + | ImageAltNode + | LinkNode + | LinkTextNode + | HtmlTagNode + | HtmlTagTextNode + +/** Result of normalizing markdown content */ +export interface NormalizedContent { + /** Markdown with non-translatable parts replaced by placeholders */ + normalized: string + /** Content tree for trie construction */ + tree: ContentNode[] + /** Placeholder -> original content (for reconstruction) */ + extractions: Map +} + +// --------------------------------------------------------------------------- +// Placeholder tag generators +// --------------------------------------------------------------------------- + +function codeblockTag(hash: string): string { + return `` +} + +function codeTag(hash: string): string { + return `` +} + +function componentSelfClosingTag(hash: string): string { + return `` +} + +function componentOpenTag(hash: string): string { + return `` +} + +function componentCloseTag(hash: string): string { + return `` +} + +function imageTag(hash: string): string { + return `` +} + +function htmlTagOpenTag(hash: string, selfClosing = false): string { + return selfClosing + ? 
`` + : `` +} + +function htmlTagCloseTag(hash: string): string { + return `` +} + +function linkOpenTag(hash: string): string { + return `` +} + +function linkCloseTag(hash: string): string { + return `` +} + +// --------------------------------------------------------------------------- +// Pass 1: Fenced code blocks +// --------------------------------------------------------------------------- + +/** Language tags that indicate prose content, not executable code */ +const PROSE_FENCE_TAGS = new Set(["md", "markdown", "mdx", "text", "txt", ""]) + +function extractCodeFences( + markdown: string, + tree: ContentNode[], + extractions: Map +): string { + return markdown.replace( + FENCED_BLOCK_RE, + (fullMatch: string, ...groups: (string | undefined)[]) => { + const indent = groups[0] ?? groups[4] ?? "" + const langTag = (groups[2] ?? groups[6] ?? "").trim() + const codeContent = groups[3] ?? "" + + const hash = shortHash(fullMatch) + const language = langTag.toLowerCase().split(/\s+/)[0] || "" + + // Markdown/text fences contain prose, not code. + // Treat the body as translatable content (like component children). 
+ if (PROSE_FENCE_TAGS.has(language) && codeContent.trim()) { + const open = `` + const close = `` + + // Recursively normalize the prose content inside the fence + const childResult = normalizeContent(codeContent) + const children: ContentNode[] = [] + for (const childNode of childResult.tree) { + children.push(childNode) + } + childResult.extractions.forEach((v, k) => { + extractions.set(k, v) + }) + + tree.push({ + type: "code-fence", + translatable: false, + content: fullMatch, + placeholder: `${open}...${close}`, + children, + }) + + extractions.set(`CODEBLOCK:${hash}`, fullMatch) + return `${indent}${open}\n${childResult.normalized}\n${indent}${close}` + } + + // True code fences: extract comments as translatable leaves + const placeholder = codeblockTag(hash) + const { comments } = extractComments(codeContent, language) + + const children: ContentNode[] = [] + + for (const comment of comments) { + if (comment.text.trim()) { + children.push({ + type: "code-comment", + translatable: true, + content: comment.text, + hash: leafHash(comment.text), + meta: { + language, + line: String(comment.line), + commentType: comment.type, + }, + }) + } + } + + // Code body is inert + children.push({ + type: "code-body", + translatable: false, + content: codeContent, + meta: { language: langTag, indent }, + }) + + tree.push({ + type: "code-fence", + translatable: false, + content: fullMatch, + placeholder, + children, + }) + + extractions.set(placeholder, fullMatch) + return placeholder + } + ) +} + +// --------------------------------------------------------------------------- +// Pass 2: JSX/HTML components +// --------------------------------------------------------------------------- + +/** + * Self-closing JSX components: + * + * We match PascalCase names to avoid catching standard HTML tags. + * Uses a non-greedy match for the attribute body. + */ +const SELF_CLOSING_COMPONENT_RE = /<([A-Z][a-zA-Z0-9]*)(\s[^>]*)?\s*\/>/g + +/** + * JSX components with children: ... 
+ * + * Non-greedy match on children. Does NOT handle nested same-name + * components -- for those, the outer match wins (acceptable for + * ethereum.org content where deep nesting of same component is rare). + */ +const COMPONENT_WITH_CHILDREN_RE = + /<([A-Z][a-zA-Z0-9]*)(\s[^>]*)?\s*>([\s\S]*?)<\/\1>/g + +function parseAttributes( + attrString: string, + componentName: string +): { translatable: ContentNode[]; inert: Record } { + const translatable: ContentNode[] = [] + const inert: Record = {} + + let match: RegExpExecArray | null + ATTRIBUTE_RE.lastIndex = 0 + + while ((match = ATTRIBUTE_RE.exec(attrString)) !== null) { + const name = match[1] + const value = match[2] ?? match[3] + + if ( + TRANSLATABLE_ATTRIBUTES.includes(name as TranslatableAttribute) && + isTranslatableValue(value) + ) { + translatable.push({ + type: "component-attribute", + translatable: true, + content: value, + hash: leafHash(value), + meta: { attributeName: name, componentName }, + }) + } else { + inert[name] = value + } + } + + return { translatable, inert } +} + +function extractComponents( + markdown: string, + tree: ContentNode[], + extractions: Map +): string { + // Pass 2a: Components with children + let result = markdown.replace( + COMPONENT_WITH_CHILDREN_RE, + (fullMatch: string, name: string, attrStr: string, children: string) => { + const { translatable, inert } = parseAttributes(attrStr, name) + + // If no translatable attributes AND children are simple prose, + // don't replace -- let Gemini translate inline. Only extract if + // component has translatable attributes that need separate handling. + if (translatable.length === 0) { + return fullMatch + } + + const hash = shortHash(fullMatch) + + const childNodes: ContentNode[] = [...translatable] + + // Recursively normalize children to decompose nested HTML tags, + // inline code, links, etc. into their own sub-nodes. 
+ let normalizedChildren = "" + if (children.trim()) { + const childResult = normalizeContent(children) + normalizedChildren = childResult.normalized + for (const childNode of childResult.tree) { + childNodes.push(childNode) + } + childResult.extractions.forEach((v, k) => { + extractions.set(k, v) + }) + } + + // Wrapper style: children text stays visible to Gemini. + // Translatable attributes (title, etc.) are in the tree for + // the JSX attribute translation phase to handle separately. + const open = componentOpenTag(hash) + const close = componentCloseTag(hash) + const placeholder = `${open}...${close}` + + tree.push({ + type: "component", + translatable: false, + content: fullMatch, + placeholder, + children: childNodes, + meta: { componentName: name, inertAttributes: inert }, + }) + + extractions.set(`COMPONENT:${hash}`, fullMatch) + return `${open}${normalizedChildren}${close}` + } + ) + + // Pass 2b: Self-closing components + result = result.replace( + SELF_CLOSING_COMPONENT_RE, + (fullMatch: string, name: string, attrStr: string) => { + const { translatable, inert } = parseAttributes(attrStr, name) + + if (translatable.length === 0) { + return fullMatch + } + + const hash = shortHash(fullMatch) + const placeholder = componentSelfClosingTag(hash) + + tree.push({ + type: "component", + translatable: false, + content: fullMatch, + placeholder, + children: translatable, + meta: { componentName: name, inertAttributes: inert }, + }) + + extractions.set(placeholder, fullMatch) + return placeholder + } + ) + + return result +} + +// --------------------------------------------------------------------------- +// Pass 2c: Embedded HTML elements +// --------------------------------------------------------------------------- + +/** + * Common HTML tags found in ethereum.org markdown content. + * Lowercase only -- PascalCase is handled by component extraction. 
+ */ +const HTML_TAGS = + "a|div|span|p|table|tr|td|th|thead|tbody|img|br|hr|ul|ol|li|strong|em|b|i|code|pre|blockquote|iframe|video|source|details|summary" + +/** + * Matches opening+closing HTML tags with attributes and content. + * Requires at least one attribute (\s+) to avoid matching bare tags + * like
  • ) + // where the outer tag has no attributes and passes through, exposing the inner tag. + let result = markdown + let prevResult = "" + while (prevResult !== result) { + prevResult = result + result = result.replace( + HTML_TAG_WITH_CHILDREN_RE, + ( + fullMatch: string, + tagName: string, + attrStr: string, + children: string + ) => { + const hash = shortHash(fullMatch) + const open = htmlTagOpenTag(hash) + const close = htmlTagCloseTag(hash) + + // Parse all attributes as inert (HTML attrs are structural) + const inert: Record = {} + let match: RegExpExecArray | null + ATTRIBUTE_RE.lastIndex = 0 + while ((match = ATTRIBUTE_RE.exec(attrStr)) !== null) { + inert[match[1]] = match[2] ?? match[3] + } + + const childNodes: ContentNode[] = [] + if (children.trim()) { + childNodes.push({ + type: "html-tag-text", + translatable: true, + content: children, + hash: leafHash(children), + }) + } + + tree.push({ + type: "html-tag", + translatable: false, + content: fullMatch, + placeholder: `${open}...${close}`, + children: childNodes, + meta: { tagName, inertAttributes: inert }, + }) + + extractions.set(`HTMLTAG:${hash}`, fullMatch) + + // Wrapper style: text stays visible for Gemini, tags become placeholders + if (children.trim()) { + return `${open}${children}${close}` + } + return open + } + ) + } // end while loop for nested tags + + // Self-closing tags with attributes (e.g., ) + result = result.replace( + HTML_SELF_CLOSING_RE, + (fullMatch: string, tagName: string, attrStr: string) => { + // Skip if already wrapped by the children pass above + if (fullMatch.includes("HTML-PLACEHOLDER")) { + return fullMatch + } + + const hash = shortHash(fullMatch) + const placeholder = htmlTagOpenTag(hash, true) + + const inert: Record = {} + let match: RegExpExecArray | null + ATTRIBUTE_RE.lastIndex = 0 + while ((match = ATTRIBUTE_RE.exec(attrStr)) !== null) { + inert[match[1]] = match[2] ?? 
match[3] + } + + tree.push({ + type: "html-tag", + translatable: false, + content: fullMatch, + placeholder, + children: [], + meta: { tagName, inertAttributes: inert }, + }) + + extractions.set(placeholder, fullMatch) + return placeholder + } + ) + + return result +} + +// --------------------------------------------------------------------------- +// Pass 3: Inline code +// --------------------------------------------------------------------------- + +/** + * Matches inline code: `content` (single backtick, non-greedy). + * Does not match inside already-extracted placeholders. + */ +const INLINE_CODE_RE = /`([^`\n]+)`/g + +function extractInlineCode( + markdown: string, + tree: ContentNode[], + extractions: Map +): string { + return markdown.replace( + INLINE_CODE_RE, + (fullMatch: string, content: string) => { + const hash = shortHash(content) + const placeholder = codeTag(hash) + + tree.push({ + type: "inline-code", + translatable: false, + content, + placeholder, + }) + + extractions.set(placeholder, fullMatch) + return placeholder + } + ) +} + +// --------------------------------------------------------------------------- +// Pass 4: Images +// --------------------------------------------------------------------------- + +/** + * Matches markdown images: ![alt text](path "optional title") + * Alt text is translatable; path and title are inert. 
+ */ +const IMAGE_RE = /!\[([^\]]*)\]\(([^)]+)\)/g + +function extractImages( + markdown: string, + tree: ContentNode[], + extractions: Map +): string { + return markdown.replace( + IMAGE_RE, + (fullMatch: string, altText: string, pathAndTitle: string) => { + const hash = shortHash(fullMatch) + const placeholder = imageTag(hash) + + const children: ContentNode[] = [] + + // Alt text is a translatable leaf + if (altText.trim()) { + children.push({ + type: "image-alt", + translatable: true, + content: altText, + hash: leafHash(altText), + }) + } + + tree.push({ + type: "image", + translatable: false, + content: fullMatch, + placeholder, + children, + meta: { path: pathAndTitle }, + }) + + extractions.set(placeholder, fullMatch) + return placeholder + } + ) +} + +// --------------------------------------------------------------------------- +// Pass 5: Links (wrapper style) +// --------------------------------------------------------------------------- + +/** + * Matches markdown links: [text](url "optional title") + * Link text is translatable; URL is inert. + * + * Uses wrapper placeholders so Gemini can reorder links within + * translated sentences while preserving the URL association. 
+ */ +const LINK_RE = /\[([^\]]+)\]\(([^)]+)\)/g + +function extractLinks( + markdown: string, + tree: ContentNode[], + extractions: Map +): string { + return markdown.replace( + LINK_RE, + (fullMatch: string, linkText: string, url: string) => { + const hash = shortHash(fullMatch) + const open = linkOpenTag(hash) + const close = linkCloseTag(hash) + + const children: ContentNode[] = [ + { + type: "link-text", + translatable: true, + content: linkText, + hash: leafHash(linkText), + }, + ] + + tree.push({ + type: "link", + translatable: false, + content: fullMatch, + placeholder: `${open}...${close}`, + children, + meta: { url }, + }) + + // Store full original for reconstruction + extractions.set(`LINK:${hash}`, fullMatch) + + // Wrapper: Gemini sees the text and can move it + return `${open}${linkText}${close}` + } + ) +} + +// --------------------------------------------------------------------------- +// Pass 6: Heading anchor IDs +// --------------------------------------------------------------------------- + +/** + * Strips {#anchor-id} from heading lines. + * These are non-translatable metadata added by content authors. + */ +const HEADING_ID_RE = /^(#{1,6}\s+.+?)[ \t]*\{#[^}]+\}[ \t]*$/gm + +function stripHeadingIds(markdown: string): string { + return markdown.replace(HEADING_ID_RE, "$1") +} + +// --------------------------------------------------------------------------- +// Pass 7: Whitespace normalization +// --------------------------------------------------------------------------- + +/** + * Normalize whitespace to eliminate spurious hash changes from + * trailing spaces, inconsistent blank lines, etc. 
+ */ +function normalizeWhitespace(markdown: string): string { + return ( + markdown + // Trim trailing whitespace per line + .replace(/[ \t]+$/gm, "") + // Collapse 3+ consecutive blank lines to 2 + .replace(/\n{3,}/g, "\n\n") + // Trim leading/trailing whitespace from the whole string + .trim() + ) +} + +// --------------------------------------------------------------------------- +// Prose hashing helper +// --------------------------------------------------------------------------- + +/** + * Strip all HTML-PLACEHOLDER tags from text before hashing. + * + * Block placeholders (CODEBLOCK, CODE, COMPONENT, IMAGE) are removed + * entirely. Wrapper placeholders (LINK) have their tags removed but + * inner text preserved. + * + * This ensures the prose hash depends ONLY on translatable text, + * not on the content-addressed hashes embedded in placeholder tags. + */ +const BLOCK_PLACEHOLDER_RE = //g +const WRAPPER_TAG_RE = + /<\/?HTML-PLACEHOLDER-(?:CODEBLOCK|LINK|HTMLTAG|COMPONENT)-[a-f0-9]+(?:\s\/)?>/g + +function stripPlaceholderTags(text: string): string { + return text.replace(BLOCK_PLACEHOLDER_RE, "").replace(WRAPPER_TAG_RE, "") +} + +// --------------------------------------------------------------------------- +// Main API +// --------------------------------------------------------------------------- + +/** + * Normalize markdown content for hashing and translation. + * + * Extraction order (each pass operates on the output of the previous): + * 1. Fenced code blocks -> placeholders (comments extracted as leaves) + * 2. JSX components (PascalCase) -> placeholders (translatable attrs as leaves) + * 3. Embedded HTML tags (lowercase) -> wrapper placeholders (attrs as inert) + * 4. Inline code -> placeholders + * 5. Images -> placeholders (alt text extracted as leaf) + * 6. Links -> wrapper placeholders (text stays, URL in metadata) + * 7. Heading anchor IDs -> stripped + * 8. 
Whitespace -> normalized + * + * @param markdown - Raw markdown section content + * @returns Normalized content with tree and extraction map + */ +export function normalizeContent(markdown: string): NormalizedContent { + const tree: ContentNode[] = [] + const extractions = new Map() + + // Pre-validation: reject content containing reserved placeholder syntax. + // A contributor could innocently include these strings in docs about + // the normalizer itself, which would corrupt the translation pipeline. + const RESERVED_PLACEHOLDER = /<\/?HTML-PLACEHOLDER-[A-Z]+-[a-f0-9]+[\s/]*>/ + if (RESERVED_PLACEHOLDER.test(markdown)) { + throw new Error( + "Content contains reserved syntax. " + + "Use backtick-escaped inline code to reference placeholder tags." + ) + } + + // Extraction passes in dependency order + let normalized = markdown + + // 1. Code fences first (prevents # comments from being parsed as headings) + normalized = extractCodeFences(normalized, tree, extractions) + + // 2. JSX components (PascalCase, before inline code to avoid matching backticks in attrs) + normalized = extractComponents(normalized, tree, extractions) + + // 3. Embedded HTML tags (lowercase -- a, div, span, img, etc.) + normalized = extractHtmlTags(normalized, tree, extractions) + + // 4. Inline code + normalized = extractInlineCode(normalized, tree, extractions) + + // 5. Images (before links -- image syntax ![...] would otherwise match [...]) + normalized = extractImages(normalized, tree, extractions) + + // 6. Links (wrapper style -- text stays in normalized form) + normalized = extractLinks(normalized, tree, extractions) + + // 7. Strip heading anchor IDs + normalized = stripHeadingIds(normalized) + + // The remaining prose is translatable. + // For HASHING: strip placeholder tags and normalize whitespace so + // that blank-line changes don't trigger retranslation. 
+ // For GEMINI: return `normalized` with original whitespace intact + // so the translated output preserves the same paragraph structure. + if (normalized.trim()) { + const proseForHashing = normalizeWhitespace( + stripPlaceholderTags(normalized) + ) + tree.push({ + type: "prose", + translatable: true, + content: normalized, + hash: leafHash(proseForHashing), + }) + } + + return { normalized, tree, extractions } +} + +/** + * Collect all translatable leaf hashes from a content tree. + * + * Returns a flat map of "{nodeType}:{index}" -> hash, suitable + * for inclusion in the manifest trie as child nodes. + */ +export function collectLeafHashes(tree: ContentNode[]): Record { + const result: Record = {} + const counters: Record = {} + + function visit(node: ContentNode): void { + if (node.translatable) { + const prefix = node.type + const idx = counters[prefix] ?? 0 + counters[prefix] = idx + 1 + result[`${prefix}:${idx}`] = node.hash + } + if ("children" in node && node.children) { + for (const child of node.children) { + visit(child) + } + } + } + + for (const node of tree) { + visit(node) + } + + return result +} diff --git a/src/scripts/intl-pipeline/lib/llm/gemini.ts b/src/scripts/intl-pipeline/lib/llm/gemini.ts new file mode 100644 index 00000000000..8e6d3a86a8c --- /dev/null +++ b/src/scripts/intl-pipeline/lib/llm/gemini.ts @@ -0,0 +1,1220 @@ +/** + * Core file translation via Gemini (direct, no Crowdin). + * + * Sends whole files (no segmentation) with site-specific context. + * Gemini handles the linguistics; we handle the guardrails. 
 */

import { GoogleGenAI, HarmBlockThreshold, HarmCategory } from "@google/genai"

import i18nConfig from "../../../../../i18n.config.json"
import { GEMINI_MODELS } from "../../config"
import { delay } from "../workflows/utils"

import {
  chunkProse,
  type CodeBlock,
  type CodeComment,
  extractCodeBlocks,
  extractComments,
  getCommentSyntax,
  PROSE_SIZE_THRESHOLD,
  restoreCodeBlocks,
  restoreComments,
} from "./code-block-extractor"
import { type ContentNode, normalizeContent } from "./content-normalizer"
import {
  mergeJsonBatches,
  prepareJsonBatches,
  restoreJsonBatch,
} from "./json-batcher"
import {
  validateTranslatedJson,
  validateTranslatedMarkdown,
  type ValidationResult,
} from "./output-validation"
import { buildTranslationPrompt } from "./prompt-builder"

/**
 * Check if Gemini API is available (API key present).
 *
 * @returns true when the GEMINI_API_KEY environment variable is set to a
 *   non-empty value; the key itself is not validated.
 */
export function isGeminiAvailable(): boolean {
  return Boolean(process.env.GEMINI_API_KEY)
}

// GEMINI_MODELS imported from ../../config

// Maximum attempts per retry loop (validation retries in callGemini and
// per-model request retries in callGeminiRaw).
const MAX_RETRIES = 3
// Base back-off in milliseconds; callGemini waits RETRY_DELAY_MS * attempt
// between validation retries.
const RETRY_DELAY_MS = 5000

/**
 * Disable safety filters for all categories. Translation content (educational
 * blockchain docs) should never be blocked. Without this, Gemini silently
 * returns empty candidates for content that triggers false positives (e.g.,
 * mining/attack descriptions in certain non-Latin languages).
+ */ +const SAFETY_SETTINGS = [ + { + category: HarmCategory.HARM_CATEGORY_HARASSMENT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, +] + +const LANGUAGE_NAMES: Record = Object.fromEntries( + i18nConfig.map(({ code, name }: { code: string; name: string }) => [ + code, + name, + ]) +) + +function getGeminiClient(): GoogleGenAI { + const apiKey = process.env.GEMINI_API_KEY + if (!apiKey) { + throw new Error("GEMINI_API_KEY environment variable is not set") + } + return new GoogleGenAI({ apiKey }) +} + +export interface TranslateFileOptions { + filePath: string + fileContent: string + fileType: "markdown" | "json" + targetLanguage: string + glossaryTerms: Map + /** Set by JSON batching when HTML tags have been extracted to placeholders */ + htmlExtracted?: boolean + /** Use the content normalizer for placeholder-based translation */ + useNormalizer?: boolean + /** Content has been normalized with HTML-PLACEHOLDER tags (set internally) */ + normalized?: boolean +} + +/** Inert value associated with a placeholder (URL, path, className, etc.) 
*/ +export interface PlaceholderInert { + type: string + values: Record +} + +export interface TranslateFileResult { + translatedContent: string + tokensUsed: { input: number; output: number } + /** Placeholder IDs in the order they appear in the translated output */ + placeholderOrder?: string[] + /** Map of placeholder ID -> inert values (URLs, paths, classNames) */ + placeholderMap?: Record +} + +/** Optional metadata for richer Gemini API call logging */ +interface GeminiCallMetadata { + filePath?: string + targetLanguage?: string + chunkIndex?: number + totalChunks?: number + label?: string +} + +/** + * Translate a single file via Gemini. + * + * For markdown files: + * 1. Extract fenced code blocks -> placeholders (reduces payload) + * 2. If prose still too large, chunk by headings recursively + * 3. Translate prose (single call or per-chunk) + * 4. Restore code blocks + * 5. Extract and translate code comments separately + * 6. Restore translated comments into code blocks + * + * For JSON files: translate directly (no code blocks). 
+ */ +export async function translateFile( + options: TranslateFileOptions +): Promise { + const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } = + options + + // JSON files: batch large files, extract HTML from values + if (fileType === "json") { + return translateJsonFile(options) + } + + // Markdown with normalizer: placeholder-based translation + if (options.useNormalizer) { + return translateNormalizedMarkdown(options) + } + + // Markdown legacy path: extract code blocks first + const { prose, blocks } = extractCodeBlocks(fileContent) + + if (blocks.length > 0) { + console.log( + ` [extract] ${filePath}: ${blocks.length} code blocks removed (${fileContent.length} -> ${prose.length} chars)` + ) + } + + // Check if prose needs chunking + const chunks = chunkProse(prose, PROSE_SIZE_THRESHOLD) + let translatedProse: string + let totalTokens = { input: 0, output: 0 } + + if (chunks.length === 1) { + // Single chunk: translate normally + const result = await callGemini( + { ...options, fileContent: prose }, + { filePath, targetLanguage } + ) + translatedProse = result.translatedContent + totalTokens = result.tokensUsed + } else { + // Multiple chunks: translate each, reassemble + console.log(` [chunk] ${filePath}: split into ${chunks.length} chunks`) + const translatedChunks: string[] = [] + for (let i = 0; i < chunks.length; i++) { + const result = await callGemini( + { ...options, fileContent: chunks[i] }, + { filePath, targetLanguage, chunkIndex: i, totalChunks: chunks.length } + ) + translatedChunks.push(result.translatedContent) + totalTokens.input += result.tokensUsed.input + totalTokens.output += result.tokensUsed.output + } + translatedProse = translatedChunks.join("\n\n") + } + + // Restore code blocks + let finalContent = restoreCodeBlocks(translatedProse, blocks) + + // Translate code comments (best-effort, non-fatal) + if (blocks.length > 0) { + try { + finalContent = await translateCodeComments( + finalContent, + blocks, + 
targetLanguage, + glossaryTerms, + filePath + ) + } catch (error) { + console.warn( + ` [comments] ${filePath}: comment translation failed (non-fatal): ${error instanceof Error ? error.message : String(error)}` + ) + } + } + + return { + translatedContent: finalContent, + tokensUsed: totalTokens, + } +} + +/** + * Translate markdown using the content normalizer. + * + * Flow: + * 1. Normalize: replace non-translatable content with placeholders + * 2. Chunk if needed (same heading-based chunking as legacy path) + * 3. Send normalized prose to Gemini (with placeholder rules in prompt) + * 4. Verify all placeholders survived in response + * 5. Reconstruct: replace block placeholders with originals, + * rebuild wrapper placeholders with translated text + * 6. Translate code comments separately (from normalizer tree) + */ +async function translateNormalizedMarkdown( + options: TranslateFileOptions +): Promise { + const { filePath, fileContent, targetLanguage, glossaryTerms } = options + + // Step 1: Normalize + const { normalized, tree, extractions } = normalizeContent(fileContent) + + const blockCount = Array.from(extractions.keys()).filter( + (k) => !k.startsWith("LINK:") && !k.startsWith("HTMLTAG:") + ).length + const wrapperCount = extractions.size - blockCount + + console.log( + ` [normalize] ${filePath}: ${blockCount} block + ${wrapperCount} wrapper placeholders ` + + `(${fileContent.length} -> ${normalized.length} chars)` + ) + + // Step 2: Chunk if needed + const chunks = chunkProse(normalized, PROSE_SIZE_THRESHOLD) + let translatedProse: string + let totalTokens = { input: 0, output: 0 } + + if (chunks.length === 1) { + const result = await callGemini( + { ...options, fileContent: normalized, normalized: true }, + { filePath, targetLanguage } + ) + translatedProse = result.translatedContent + totalTokens = result.tokensUsed + } else { + console.log(` [chunk] ${filePath}: split into ${chunks.length} chunks`) + const translatedChunks: string[] = [] + for (let i = 
0; i < chunks.length; i++) { + const result = await callGemini( + { ...options, fileContent: chunks[i], normalized: true }, + { filePath, targetLanguage, chunkIndex: i, totalChunks: chunks.length } + ) + translatedChunks.push(result.translatedContent) + totalTokens.input += result.tokensUsed.input + totalTokens.output += result.tokensUsed.output + } + translatedProse = translatedChunks.join("\n\n") + } + + // Step 4: Verify placeholders survived + const missingPlaceholders = verifyPlaceholders(normalized, translatedProse) + if (missingPlaceholders.length > 0) { + console.warn( + ` [normalize] ${filePath}: ${missingPlaceholders.length} placeholder(s) missing in translation:\n` + + missingPlaceholders.map((p) => ` - ${p}`).join("\n") + ) + } + + // Step 5: Capture placeholderOrder from Gemini's response (before reconstruction) + const placeholderOrder = extractPlaceholderOrder(translatedProse) + + // Step 6: Build placeholderMap from the normalizer tree + const placeholderMap = buildPlaceholderMap(tree) + + // Step 7: Reconstruct + let finalContent = reconstructFromPlaceholders(translatedProse, extractions) + + // Step 8: Restore heading anchor IDs from English source. + finalContent = restoreHeadingIds(finalContent, fileContent) + + // Step 9: Translate code comments (from normalizer tree) + const commentNodes = collectCommentNodes(tree) + if (commentNodes.length > 0) { + try { + finalContent = await translateNormalizedComments( + finalContent, + commentNodes, + targetLanguage, + glossaryTerms, + filePath + ) + } catch (error) { + console.warn( + ` [comments] ${filePath}: comment translation failed (non-fatal): ${error instanceof Error ? error.message : String(error)}` + ) + } + } + + return { + translatedContent: finalContent, + tokensUsed: totalTokens, + placeholderOrder, + placeholderMap, + } +} + +/** + * Extract placeholder IDs from translated text in the order they appear. + * Called BEFORE reconstruction so the tags are still present. 
+ */ +function extractPlaceholderOrder(translated: string): string[] { + const order: string[] = [] + // Match all placeholder tags (self-closing and wrapper open tags, not close tags) + const re = //g + let match + while ((match = re.exec(translated)) !== null) { + order.push(match[1]) + } + return order +} + +/** + * Build a map from placeholder ID to its inert values. + * + * Sources data from the normalizer tree nodes, keyed by the same + * placeholder IDs that appear in the normalized output. + */ +function buildPlaceholderMap( + tree: ContentNode[] +): Record { + const map: Record = {} + + function visit(node: ContentNode): void { + if (node.type === "link" && "meta" in node) { + // Extract hash from placeholder: "..." + const hashMatch = node.placeholder.match(/LINK-([a-f0-9]+)/) + if (hashMatch) { + map[`LINK-${hashMatch[1]}`] = { + type: "link", + values: { url: node.meta.url }, + } + } + } + + if (node.type === "image" && "meta" in node) { + const hashMatch = node.placeholder.match(/IMAGE-([a-f0-9]+)/) + if (hashMatch) { + map[`IMAGE-${hashMatch[1]}`] = { + type: "image", + values: { path: node.meta.path }, + } + } + } + + if (node.type === "html-tag" && "meta" in node) { + const hashMatch = node.placeholder.match(/HTMLTAG-([a-f0-9]+)/) + if (hashMatch) { + map[`HTMLTAG-${hashMatch[1]}`] = { + type: "html-tag", + values: { tagName: node.meta.tagName, ...node.meta.inertAttributes }, + } + } + } + + if (node.type === "component" && "meta" in node) { + const hashMatch = node.placeholder.match(/COMPONENT-([a-f0-9]+)/) + if (hashMatch) { + map[`COMPONENT-${hashMatch[1]}`] = { + type: "component", + values: { + componentName: node.meta.componentName, + ...node.meta.inertAttributes, + }, + } + } + } + + if (node.type === "code-fence") { + const hashMatch = node.placeholder.match(/CODEBLOCK-([a-f0-9]+)/) + if (hashMatch) { + map[`CODEBLOCK-${hashMatch[1]}`] = { + type: "code-fence", + values: {}, + } + } + } + + if (node.type === "inline-code") { + const 
hashMatch = node.placeholder.match(/CODE-([a-f0-9]+)/) + if (hashMatch) { + map[`CODE-${hashMatch[1]}`] = { + type: "inline-code", + values: { content: node.content }, + } + } + } + + if ("children" in node && node.children) { + for (const child of node.children) { + visit(child) + } + } + } + + for (const node of tree) { + visit(node) + } + + return map +} + +/** + * Restore heading anchor IDs from the English source onto translated headings. + * + * Matches by heading level and sequential position. The normalizer strips + * {#anchor-id} before sending to Gemini; this copies them back from English. + */ +function restoreHeadingIds(translated: string, english: string): string { + const HEADING_RE = /^(#{1,6})\s+/gm + const HEADING_ID_RE = /^(#{1,6}\s+.+?)[ \t]*(\{#[^}]+\})[ \t]*$/gm + + // Extract IDs from English in order + const englishIds: string[] = [] + let match + while ((match = HEADING_ID_RE.exec(english)) !== null) { + englishIds.push(match[2]) + } + + if (englishIds.length === 0) return translated + + // Find headings in translated text (without IDs) and append the English ID. + // Skip lines inside code fences to avoid matching # comments as headings. + let idIndex = 0 + let inFence = false + const lines = translated.split("\n") + for (let i = 0; i < lines.length; i++) { + if (idIndex >= englishIds.length) break + if (lines[i].startsWith("```")) inFence = !inFence + if (inFence) continue + HEADING_RE.lastIndex = 0 + if (HEADING_RE.test(lines[i]) && !lines[i].includes("{#")) { + lines[i] = `${lines[i].trimEnd()} ${englishIds[idIndex]}` + idIndex++ + } + } + + if (idIndex < englishIds.length) { + console.warn( + ` [heading-ids] Restored ${idIndex}/${englishIds.length} heading IDs (translated file has fewer headings)` + ) + } + + return lines.join("\n") +} + +/** + * Verify all placeholder tags from the normalized input survive in the translation. + * Returns a list of missing placeholders. 
+ */ +function verifyPlaceholders(normalized: string, translated: string): string[] { + const missing: string[] = [] + + // Block placeholders (self-closing) + const blockRe = //g + let match + while ((match = blockRe.exec(normalized)) !== null) { + if (!translated.includes(match[0])) { + missing.push(match[0]) + } + } + + // Self-closing CODEBLOCK (true code) and COMPONENT (childless) + const selfClosingRe = + //g + while ((match = selfClosingRe.exec(normalized)) !== null) { + if (!translated.includes(match[0])) { + missing.push(match[0]) + } + } + + // Wrapper open tags + const wrapperOpenRe = + //g + while ((match = wrapperOpenRe.exec(normalized)) !== null) { + if (!translated.includes(match[0])) { + missing.push(match[0]) + } + } + + // Wrapper close tags + const wrapperCloseRe = + /<\/HTML-PLACEHOLDER-(?:CODEBLOCK|LINK|HTMLTAG|COMPONENT)-[a-f0-9]+>/g + while ((match = wrapperCloseRe.exec(normalized)) !== null) { + if (!translated.includes(match[0])) { + missing.push(match[0]) + } + } + + return missing +} + +/** + * Reconstruct final markdown from Gemini's translated output and the extraction map. + * + * Block placeholders are replaced with their originals. + * Wrapper placeholders are rebuilt with the (potentially translated) text + * that Gemini placed between the tags. + */ +function reconstructFromPlaceholders( + translated: string, + extractions: Map +): string { + let result = translated + + // Block placeholders: replace ALL occurrences with originals. + // Content-addressed placeholders can appear multiple times when the + // same inline code/image appears repeatedly (e.g., `base fee` x5). 
+ extractions.forEach((original, placeholder) => { + if ( + placeholder.startsWith("LINK:") || + placeholder.startsWith("HTMLTAG:") || + placeholder.startsWith("COMPONENT:") || + placeholder.startsWith("CODEBLOCK:") + ) { + return + } + result = result.split(placeholder).join(original) + }) + + // Wrapper placeholders: LINK + // Loop to handle duplicate content-addressed placeholders (same link text+URL) + extractions.forEach((original, key) => { + if (!key.startsWith("LINK:")) return + const hash = key.slice(5) + const openTag = `` + const closeTag = `` + const urlMatch = original.match(/\]\(([^)]+)\)/) + if (!urlMatch) return + + let openIdx = result.indexOf(openTag) + while (openIdx >= 0) { + const closeIdx = result.indexOf(closeTag, openIdx) + if (closeIdx < 0) break + const translatedText = result.slice(openIdx + openTag.length, closeIdx) + const rebuilt = `[${translatedText}](${urlMatch[1]})` + result = + result.slice(0, openIdx) + + rebuilt + + result.slice(closeIdx + closeTag.length) + openIdx = result.indexOf(openTag) + } + }) + + // Wrapper placeholders: HTMLTAG + extractions.forEach((original, key) => { + if (!key.startsWith("HTMLTAG:")) return + const hash = key.slice(8) + const openTag = `` + const closeTag = `` + const tagMatch = original.match(/<(\w+)(\s[^>]*)?>/) + const closingMatch = original.match(/<\/(\w+)>/) + if (!tagMatch || !closingMatch) return + + let openIdx = result.indexOf(openTag) + while (openIdx >= 0) { + const closeIdx = result.indexOf(closeTag, openIdx) + if (closeIdx < 0) break + const translatedText = result.slice(openIdx + openTag.length, closeIdx) + const rebuilt = `<${tagMatch[1]}${tagMatch[2] || ""}>${translatedText}` + result = + result.slice(0, openIdx) + + rebuilt + + result.slice(closeIdx + closeTag.length) + openIdx = result.indexOf(openTag) + } + }) + + // Wrapper placeholders: COMPONENT (components with children) + extractions.forEach((original, key) => { + if (!key.startsWith("COMPONENT:")) return + const hash = 
key.slice(10) + const openTag = `` + const closeTag = `` + const openingTagMatch = original.match(/<([A-Z][a-zA-Z0-9]*)(\s[^>]*)?>/) + const closingTagMatch = original.match(/<\/([A-Z][a-zA-Z0-9]*)>/) + if (!openingTagMatch || !closingTagMatch) return + + let openIdx = result.indexOf(openTag) + while (openIdx >= 0) { + const closeIdx = result.indexOf(closeTag, openIdx) + if (closeIdx < 0) break + const translatedChildren = result.slice( + openIdx + openTag.length, + closeIdx + ) + const rebuilt = `<${openingTagMatch[1]}${openingTagMatch[2] || ""}>${translatedChildren}` + result = + result.slice(0, openIdx) + + rebuilt + + result.slice(closeIdx + closeTag.length) + openIdx = result.indexOf(openTag) + } + }) + + // Wrapper placeholders: CODEBLOCK (prose/markdown fences) + extractions.forEach((original, key) => { + if (!key.startsWith("CODEBLOCK:")) return + const hash = key.slice(10) + const openTag = `` + const closeTag = `` + const fenceMatch = original.match(/^([ \t]*)(```|~~~)([^\n]*)/) + if (!fenceMatch) return + + let openIdx = result.indexOf(openTag) + while (openIdx >= 0) { + const closeIdx = result.indexOf(closeTag, openIdx) + if (closeIdx < 0) break + const translatedContent = result.slice(openIdx + openTag.length, closeIdx) + const rebuilt = `${fenceMatch[1]}${fenceMatch[2]}${fenceMatch[3]}\n${translatedContent.trim()}\n${fenceMatch[1]}${fenceMatch[2]}` + result = + result.slice(0, openIdx) + + rebuilt + + result.slice(closeIdx + closeTag.length) + openIdx = result.indexOf(openTag) + } + }) + + return result +} + +/** + * Collect code comment nodes from the normalizer tree. 
+ */ +function collectCommentNodes(tree: ContentNode[]): Array<{ + text: string + language: string + line: string + commentType: string +}> { + const comments: Array<{ + text: string + language: string + line: string + commentType: string + }> = [] + + function visit(node: ContentNode): void { + if (node.type === "code-comment") { + comments.push({ + text: node.content, + language: node.meta.language, + line: node.meta.line, + commentType: node.meta.commentType, + }) + } + if ("children" in node && node.children) { + for (const child of node.children) { + visit(child) + } + } + } + + for (const node of tree) { + visit(node) + } + return comments +} + +/** + * Translate code comments using data from the normalizer tree. + * Same approach as translateCodeComments but sources from the tree. + */ +async function translateNormalizedComments( + content: string, + commentNodes: Array<{ + text: string + language: string + line: string + commentType: string + }>, + targetLanguage: string, + glossaryTerms: Map, + filePath: string +): Promise { + if (commentNodes.length === 0) return content + + const commentPayload: Record = {} + for (let i = 0; i < commentNodes.length; i++) { + commentPayload[`c${i}`] = commentNodes[i].text + } + + const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage + const glossaryLines: string[] = [] + glossaryTerms.forEach((loc, en) => glossaryLines.push(` ${en} = ${loc}`)) + const glossaryHint = + glossaryLines.length > 0 + ? `\nUse these exact translations for glossary terms:\n${glossaryLines.slice(0, 30).join("\n")}` + : "" + + const commentPrompt = `Translate these code comments to ${languageName}. Return ONLY a JSON object with the same keys and translated values. 
Do not add explanations.${glossaryHint} + +${JSON.stringify(commentPayload, null, 2)}` + + const result = await callGeminiRaw(commentPrompt, { + filePath, + targetLanguage, + label: "code-comments", + }) + + let translatedMap: Record + try { + const cleaned = stripCodeBlockWrapping(result.text, "json") + translatedMap = JSON.parse(cleaned) + } catch { + console.warn(" [comments] Could not parse comment translation response") + return content + } + + // Replace English comments with translated ones in the final content + for (let i = 0; i < commentNodes.length; i++) { + const original = commentNodes[i].text + const translated = translatedMap[`c${i}`] + if (translated && translated !== original) { + // Replace first occurrence (comments may repeat, handle one at a time) + content = content.replace(original, translated) + } + } + + return content +} + +/** + * Translate a JSON file with batching and HTML placeholder extraction. + * + * 1. Parse and split into ~100-key batches (if large) + * 2. Extract HTML tags from values into numbered placeholders + * 3. Translate each batch via Gemini + * 4. Restore HTML tags from placeholders + * 5. 
Merge batches and validate against full English source + */ +async function translateJsonFile( + options: TranslateFileOptions +): Promise { + const { filePath, fileContent, targetLanguage } = options + + const prepared = prepareJsonBatches(fileContent) + const totalTokens = { input: 0, output: 0 } + + if (prepared.batchContents.length > 1) { + console.log( + ` [json-batch] ${filePath}: ${prepared.totalKeys} keys -> ${prepared.batchContents.length} batches (${prepared.batchSizes.join(", ")})` + ) + } + if (prepared.htmlExtracted) { + console.log( + ` [html-extract] ${filePath}: HTML tags replaced with placeholders` + ) + } + + const translatedBatches: string[] = [] + + for (let i = 0; i < prepared.batchContents.length; i++) { + const batchContent = prepared.batchContents[i] + const isMultiBatch = prepared.batchContents.length > 1 + + // Translate this batch (callGemini handles retries and validation) + const result = await callGemini( + { + ...options, + fileContent: batchContent, + htmlExtracted: prepared.htmlExtracted, + }, + { + filePath, + targetLanguage, + chunkIndex: isMultiBatch ? i : undefined, + totalChunks: isMultiBatch ? prepared.batchContents.length : undefined, + label: isMultiBatch ? "json-batch" : undefined, + } + ) + + totalTokens.input += result.tokensUsed.input + totalTokens.output += result.tokensUsed.output + + // Restore HTML placeholders in translated output + const placeholderMap = prepared.placeholderMaps[i] + if (placeholderMap.size > 0) { + const { content, failures } = restoreJsonBatch( + result.translatedContent, + placeholderMap + ) + if (failures.length > 0) { + console.warn( + ` [html-restore] ${filePath}${isMultiBatch ? 
` batch ${i + 1}` : ""}: ${failures.length} placeholder(s) missing:\n` + + failures.map((f) => ` - ${f}`).join("\n") + ) + } + translatedBatches.push(content) + } else { + translatedBatches.push(result.translatedContent) + } + } + + // Merge batches into final JSON + const finalContent = mergeJsonBatches(translatedBatches) + + // Final validation: merged result against original English + if (prepared.batchContents.length > 1) { + const validation = validateTranslatedJson(finalContent, fileContent) + if (!validation.valid) { + console.warn( + ` [json-batch] ${filePath}: merged validation warning: ${validation.error}` + ) + } + } + + return { + translatedContent: finalContent, + tokensUsed: totalTokens, + } +} + +/** + * Extract comments from all code blocks, translate them in a single + * Gemini call, and restore them into the final content. + */ +async function translateCodeComments( + content: string, + blocks: CodeBlock[], + targetLanguage: string, + glossaryTerms: Map, + filePath: string +): Promise { + // Extract comments from all blocks + const allComments: CodeComment[] = [] + const blockData: Array<{ + block: CodeBlock + strippedCode: string + comments: CodeComment[] + }> = [] + + for (const block of blocks) { + if (!block.language || !block.content.trim()) continue + const { strippedCode, comments } = extractComments( + block.content, + block.language + ) + // Tag comments with their block index + const tagged = comments.map((c) => ({ ...c, blockIndex: block.index })) + allComments.push(...tagged) + blockData.push({ block, strippedCode, comments: tagged }) + } + + if (allComments.length === 0) return content + + // Build a compact payload for comment translation + const commentPayload: Record = {} + for (let i = 0; i < allComments.length; i++) { + commentPayload[`c${i}`] = allComments[i].text + } + + const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage + const glossaryLines: string[] = [] + glossaryTerms.forEach((loc, en) => 
glossaryLines.push(` ${en} = ${loc}`)) + const glossaryHint = + glossaryLines.length > 0 + ? `\nUse these exact translations for glossary terms:\n${glossaryLines.slice(0, 30).join("\n")}` + : "" + + const commentPrompt = `Translate these code comments to ${languageName}. Return ONLY a JSON object with the same keys and translated values. Do not add explanations.${glossaryHint} + +${JSON.stringify(commentPayload, null, 2)}` + + const result = await callGeminiRaw(commentPrompt, { + filePath, + targetLanguage, + label: "code-comments", + }) + let translatedMap: Record + + try { + const cleaned = stripCodeBlockWrapping(result.text, "json") + translatedMap = JSON.parse(cleaned) + } catch { + console.warn(" [comments] Could not parse comment translation response") + return content + } + + // Restore translated comments into the code blocks within content + for (const { block, strippedCode, comments } of blockData) { + if (comments.length === 0) continue + + const syntax = getCommentSyntax(block.language) + + // Map translated text back onto comment objects + const translatedComments = comments.map((c) => { + const key = `c${allComments.indexOf(c)}` + return { ...c, text: translatedMap[key] || c.text } + }) + + // Find and replace the code block in content + // Use strippedCode (English comments removed) instead of block.content + // to avoid duplicating English comments alongside translated ones + const fence = "```" + const ind = block.indent || "" + const originalBlock = `${fence}${block.language}\n${block.content}\n${ind}${fence}` + const restoredCode = restoreComments( + strippedCode, + translatedComments, + syntax + ) + const newBlock = `${fence}${block.language}\n${restoredCode}\n${ind}${fence}` + content = content.replace(originalBlock, newBlock) + } + + return content +} + +/** + * Core Gemini API call with retries and model fallback. + * Used by both prose translation and comment translation. 
+ */ +async function callGemini( + options: TranslateFileOptions, + metadata?: GeminiCallMetadata +): Promise { + const { + filePath, + fileContent, + fileType, + targetLanguage, + glossaryTerms, + htmlExtracted, + normalized, + } = options + + const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage + const prompt = buildTranslationPrompt({ + filePath, + fileContent, + fileType, + targetLanguage, + languageName, + glossaryTerms, + htmlExtracted, + normalized, + }) + + // Retry loop for validation failures (API call retries are in callGeminiRaw) + for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { + const result = await callGeminiRaw(prompt, metadata) + + let text = result.text + text = stripCodeBlockWrapping(text, fileType) + + const validation: ValidationResult = + fileType === "json" + ? validateTranslatedJson(text, fileContent) + : validateTranslatedMarkdown(text, fileContent) + + if (validation.valid) { + return { + translatedContent: text, + tokensUsed: result.tokensUsed, + } + } + + if (attempt < MAX_RETRIES) { + console.warn( + `[WARN] ${filePath} validation attempt ${attempt}: ${validation.error}. Retrying...` + ) + await delay(RETRY_DELAY_MS * attempt) + continue + } + + throw new Error( + `Output validation failed after ${MAX_RETRIES} attempts: ${validation.error}` + ) + } + + throw new Error(`Translation failed for ${filePath}`) +} + +/** + * Raw Gemini API call with retries, model fallback, and verbose logging. + * + * Logging behavior: + * - Always: timestamped REQUEST/RESPONSE lines with model, duration, tokens + * - Verbose: full prompt content between === PROMPT START/END === markers + * + * Returns the raw text response and token usage. 
/**
 * Raw Gemini API call with retries, model fallback, and verbose logging.
 *
 * Models from GEMINI_MODELS are tried in order. A model is skipped (and the
 * next one tried) only when the error message indicates it is unavailable
 * ("404", "not found", "deprecated"). Any other error is retried on the same
 * model up to MAX_RETRIES times: rate-limit errors ("429",
 * "RESOURCE_EXHAUSTED") back off exponentially, everything else linearly.
 *
 * Logging behavior:
 * - Always: timestamped REQUEST/RESPONSE lines with model, duration, tokens
 * - Verbose (VERBOSE=true): full prompt content in collapsible ::group:: blocks
 *
 * @param prompt - Complete prompt text sent as the request contents
 * @param metadata - Optional file/language/chunk context used only in log lines
 * @returns The raw text response and token usage
 * @throws Error when every model is unavailable, or the last error seen once
 *   retries are exhausted
 */
export async function callGeminiRaw(
  prompt: string,
  metadata?: GeminiCallMetadata
): Promise<{ text: string; tokensUsed: { input: number; output: number } }> {
  const GEMINI_TIMEOUT_MS = 5 * 60 * 1000 // 5 minutes

  const client = getGeminiClient()
  const verbose = process.env.VERBOSE === "true"
  const ts = () => new Date().toISOString()

  const modelsToTry = GEMINI_MODELS

  // Build context string for log lines
  const ctx = [
    metadata?.filePath && `file=${metadata.filePath}`,
    metadata?.targetLanguage && `lang=${metadata.targetLanguage}`,
    metadata?.chunkIndex != null &&
      `chunk=${(metadata.chunkIndex ?? 0) + 1}/${metadata.totalChunks}`,
    metadata?.label,
  ]
    .filter(Boolean)
    .join(" ")

  // The prompt never changes between attempts, so split it for verbose
  // logging once instead of re-running the regex on every retry.
  const sourceMatch = verbose
    ? prompt.match(
        /([\s\S]*?)(=== SOURCE FILE ===[\s\S]*?=== END SOURCE FILE ===)([\s\S]*)/
      )
    : null

  let lastError: Error | null = null
  const modelNotFound = new Set<string>()

  for (const modelId of modelsToTry) {
    let modelFailed = false

    for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
      const startTime = Date.now()

      console.log(
        `[${ts()}] [gemini] REQUEST model=${modelId} ${ctx}${attempt > 1 ? ` attempt=${attempt}` : ""}`
      )

      if (verbose) {
        // Split prompt into sections for collapsible groups
        if (sourceMatch) {
          const [, preamble, sourceFile] = sourceMatch
          console.log(
            `::group::Prompt preamble: ${ctx} (rules, glossary, hints)`
          )
          console.log(preamble.trim())
          console.log("::endgroup::")
          console.log(`::group::Source file: ${ctx} (${prompt.length} chars)`)
          console.log(sourceFile)
          console.log("::endgroup::")
        } else {
          console.log(`::group::Prompt: ${ctx} (${prompt.length} chars)`)
          console.log(prompt)
          console.log("::endgroup::")
        }
      }

      try {
        const controller = new AbortController()
        const timeout = setTimeout(() => controller.abort(), GEMINI_TIMEOUT_MS)
        const response = await client.models
          .generateContent({
            model: modelId,
            contents: prompt,
            config: {
              temperature: 0,
              safetySettings: SAFETY_SETTINGS,
              // Without the signal the timer above fires into the void and
              // a hung request is never cancelled.
              abortSignal: controller.signal,
            },
          })
          .finally(() => clearTimeout(timeout))
        const usage = response.usageMetadata
        const duration = ((Date.now() - startTime) / 1000).toFixed(1)

        // Inspect response for non-obvious failure modes before accessing .text
        const candidate = (
          response as unknown as {
            candidates?: Array<{
              finishReason?: string
              safetyRatings?: Array<{ category?: string; probability?: string }>
            }>
          }
        ).candidates?.[0]
        const finishReason: string | undefined = candidate?.finishReason

        // Log non-STOP finish reasons (these explain silent failures)
        if (finishReason && finishReason !== "STOP") {
          const safetyInfo = candidate?.safetyRatings
            ?.map(
              (r: { category?: string; probability?: string }) =>
                `${r.category}=${r.probability}`
            )
            .join(", ")
          console.warn(
            `[${ts()}] [gemini] FINISH_REASON model=${modelId} ${ctx} ` +
              `duration=${duration}s reason=${finishReason}` +
              (safetyInfo ? ` safety=[${safetyInfo}]` : "")
          )
        }

        // Check prompt-level blocking
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const blockReason = (response as any).promptFeedback?.blockReason
        if (blockReason) {
          console.warn(
            `[${ts()}] [gemini] PROMPT_BLOCKED model=${modelId} ${ctx} ` +
              `duration=${duration}s reason=${blockReason}`
          )
        }

        console.log(
          `[${ts()}] [gemini] RESPONSE model=${modelId} ${ctx} ` +
            `duration=${duration}s ` +
            `tokens_in=${usage?.promptTokenCount || 0} ` +
            `tokens_out=${usage?.candidatesTokenCount || 0}` +
            (finishReason && finishReason !== "STOP"
              ? ` finishReason=${finishReason}`
              : "")
        )

        // Access .text -- may be empty/undefined if blocked
        const text = response.text ?? ""
        if (!text && finishReason && finishReason !== "STOP") {
          throw new Error(
            `Gemini returned no content (finishReason=${finishReason}). ` +
              `This file/language combination may be triggering content filters.`
          )
        }

        return {
          text,
          tokensUsed: {
            input: usage?.promptTokenCount || 0,
            output: usage?.candidatesTokenCount || 0,
          },
        }
      } catch (error) {
        const duration = ((Date.now() - startTime) / 1000).toFixed(1)
        lastError = error instanceof Error ? error : new Error(String(error))
        const message = lastError.message

        // Unavailable model: stop retrying it and fall back to the next one
        if (
          message.includes("404") ||
          message.includes("not found") ||
          message.includes("deprecated")
        ) {
          console.warn(
            `[${ts()}] [gemini] MODEL_UNAVAILABLE model=${modelId} duration=${duration}s error="${lastError.message}"`
          )
          modelNotFound.add(modelId)
          modelFailed = true
          break
        }

        const hasAttemptsLeft = attempt < MAX_RETRIES
        const rateLimited =
          message.includes("429") || message.includes("RESOURCE_EXHAUSTED")

        // Back off only when another attempt will follow; sleeping after the
        // final attempt just delays the inevitable failure.
        if (rateLimited && hasAttemptsLeft) {
          const backoff = RETRY_DELAY_MS * Math.pow(2, attempt)
          console.warn(
            `[${ts()}] [gemini] RATE_LIMITED model=${modelId} ${ctx} duration=${duration}s backoff=${backoff / 1000}s`
          )
          await delay(backoff)
          continue
        }

        if (hasAttemptsLeft) {
          console.warn(
            `[${ts()}] [gemini] ERROR model=${modelId} ${ctx} attempt=${attempt} duration=${duration}s error="${lastError.message.slice(0, 200)}"`
          )
          await delay(RETRY_DELAY_MS * attempt)
          continue
        }

        console.error(
          `[${ts()}] [gemini] FAILED model=${modelId} ${ctx} duration=${duration}s error="${lastError.message.slice(0, 200)}"`
        )
      }
    }

    // Retries exhausted on an available model: do not fall through to others
    if (!modelFailed) break
  }

  if (modelNotFound.size === modelsToTry.length) {
    throw new Error(
      `All Gemini models unavailable (${[...modelNotFound].join(", ")}). ` +
        `Update GEMINI_MODELS in config.ts or set GEMINI_MODEL env var.`
    )
  }

  throw lastError || new Error("Translation failed")
}
/**
 * Gemini sometimes wraps output in ```markdown or ```json blocks.
 * Strip that wrapping to get raw content.
 *
 * The whole response must be a single fenced block: an opening fence with an
 * optional language tag (the file type, `md` or `mdx`, matched
 * case-insensitively), the content, and a closing fence. Whitespace before
 * the opening fence and after the closing fence is tolerated.
 *
 * @param text - Raw model response
 * @param fileType - Expected content type; selects the accepted fence tag
 * @returns The content inside the fence, or `text` unchanged when it is not
 *   wrapped
 */
function stripCodeBlockWrapping(
  text: string,
  fileType: "markdown" | "json"
): string {
  // The language tag is optional, so this one pattern also covers bare
  // ```\n...\n``` fences.
  const fenced = new RegExp(
    `^\\s*\`\`\`(?:${fileType}|md|mdx)?\\s*\\n([\\s\\S]*?)\\n\`\`\`\\s*$`,
    "i"
  )
  const match = text.match(fenced)
  return match ? match[1] : text
}

// diff --git a/src/scripts/intl-pipeline/lib/llm/incremental-translate.ts b/src/scripts/intl-pipeline/lib/llm/incremental-translate.ts
// new file mode 100644
// index 00000000000..c0440605773
// --- /dev/null
// +++ b/src/scripts/intl-pipeline/lib/llm/incremental-translate.ts
// @@ -0,0 +1,610 @@
/**
 * Incremental section-level translation via Gemini.
 *
 * When drift detection identifies prose changes in specific sections,
 * this module sends ONE batched Gemini call per file with all changed
 * sections marked for translation and unchanged sections as context.
 *
 * Gemini returns translated sections as JSON keyed by section ID,
 * which are surgically placed back into the locale file.
 */
import { getLanguageGroup, getSiteSpecificNotes } from "./language-groups"

// Types for section-level translation

/** One section of a file as it is presented to the model. */
interface SectionForPrompt {
  /** Section heading ID */
  id: string
  /** "TRANSLATE" for changed sections, "CONTEXT" for surrounding context */
  action: "TRANSLATE" | "CONTEXT"
  /** The section content (normalized English for TRANSLATE, existing locale for CONTEXT) */
  content: string
  /** Heading text (for reference) */
  headingText?: string
  /** Heading level */
  level?: number
}

/** Everything needed to build one incremental translation prompt. */
interface IncrementalTranslateOptions {
  /** English file path */
  filePath: string
  /** Target language code */
  targetLanguage: string
  /** Language display name */
  languageName: string
  /** Sections in document order with action tags */
  sections: SectionForPrompt[]
  /** Filtered glossary terms for this file (English term -> translation) */
  glossaryTerms: Map<string, string>
}

/** Parsed result of one incremental translation call. */
interface IncrementalTranslateResult {
  /** Map of section ID -> translated content */
  translations: Record<string, string>
  /** Token usage */
  tokensUsed: { input: number; output: number }
}

// ---------------------------------------------------------------------------
// Prompt builder
// ---------------------------------------------------------------------------

/**
 * Build a batched section-level translation prompt.
 *
 * All sections are included in document order. Changed sections are
 * tagged TRANSLATE; unchanged sections are tagged CONTEXT (existing
 * locale translation for voice/tone consistency).
 *
 * Gemini returns only the TRANSLATE sections as a JSON object
 * keyed by section ID.
 */
+ */ +export function buildIncrementalPrompt( + options: IncrementalTranslateOptions +): string { + const { filePath, targetLanguage, languageName, sections, glossaryTerms } = + options + + const group = getLanguageGroup(targetLanguage) + const siteNotes = getSiteSpecificNotes(group) + const glossarySection = formatGlossary(glossaryTerms) + + const translateIds = sections + .filter((s) => s.action === "TRANSLATE") + .map((s) => s.id) + + const sectionBlocks = sections + .map((s) => { + const headingAttr = s.headingText + ? ` heading="${(s.level ? "#".repeat(s.level) + " " : "") + s.headingText.replace(/"/g, """)}"` + : "" + return `
    \n${s.content}\n
    ` + }) + .join("\n\n") + + return `You are updating an existing translation. Some sections of this file changed in English and need retranslation. Other sections are provided as CONTEXT ONLY -- they show the existing translation voice and terminology. + +File: ${filePath} +Target language: ${languageName} (${targetLanguage}) +Sections to translate: ${translateIds.join(", ")} + +${siteNotes} + +${glossarySection} + +RULES: +- Translate ONLY the content within
    tags. +- Use
    sections for tone and terminology reference. Do NOT retranslate them. +- Preserve all markdown syntax, heading anchors {#id}, and placeholder tags exactly. +- Preserve heading anchor IDs exactly as in English ({#anchor-id}). + +PLACEHOLDER RULES: +Self-closing placeholders (preserve exactly): , , +Wrapper placeholders (translate text between tags): text, text, text +You MAY reorder wrapper placeholders to match natural ${targetLanguage} word order. + +OUTPUT FORMAT: +Return a JSON object where each key is a section ID from the TRANSLATE list, and the value is the translated content for that section. Example: +{ + "section-id-1": "translated content...", + "section-id-2": "translated content..." +} + +Output ONLY the JSON object. No markdown wrapping, no explanations, no XML tags. + +=== SECTIONS === +${sectionBlocks} +=== END SECTIONS ===` +} + +// --------------------------------------------------------------------------- +// Response parser +// --------------------------------------------------------------------------- + +/** + * Parse Gemini's JSON response into a section ID -> translation map. + */ +export function parseIncrementalResponse( + responseText: string +): Record { + // Strip markdown code block wrapping if present + let cleaned = responseText.trim() + if (cleaned.startsWith("```")) { + cleaned = cleaned.replace(/^```(?:json)?\s*\n/, "").replace(/\n```\s*$/, "") + } + + try { + const parsed = JSON.parse(cleaned) + if ( + typeof parsed !== "object" || + parsed === null || + Array.isArray(parsed) + ) { + throw new Error("Expected a JSON object with section IDs as keys") + } + // Validate all values are strings + for (const [key, value] of Object.entries(parsed)) { + if (typeof value !== "string") { + throw new Error(`Section "${key}" value is not a string`) + } + } + return parsed as Record + } catch (error) { + throw new Error( + `Failed to parse incremental translation response: ${error instanceof Error ? 
error.message : String(error)}\nRaw response (first 500 chars): ${responseText.slice(0, 500)}` + ) + } +} + +// --------------------------------------------------------------------------- +// Section replacement +// --------------------------------------------------------------------------- + +/** + * Replace specific sections in a locale file with fresh translations. + * + * Finds each section by its heading ID and replaces the body content + * between the heading line and the next heading of equal or higher level. + * Preserves the heading line with its {#id} anchor. + */ +export function replaceSections( + localeContent: string, + translations: Record +): string { + const lines = localeContent.split("\n") + const result: string[] = [] + + // Build a map of heading ID -> { lineIndex, level } for the locale file + const headingPattern = /^(#{1,6})\s+(.+?)(?:\s*\{#([^}]+)\})?\s*$/ + const sectionRanges: Array<{ + id: string + level: number + startLine: number + endLine: number + }> = [] + + for (let i = 0; i < lines.length; i++) { + const match = lines[i].match(headingPattern) + if (match) { + const level = match[1].length + const customId = match[3] + if (customId) { + sectionRanges.push({ id: customId, level, startLine: i, endLine: -1 }) + } + } + } + + // Calculate end lines (next heading of ANY level, or EOF). + // Each section only covers its own direct body, not nested subsections. + // This prevents duplicate content when both parent and child are replaced. + for (let i = 0; i < sectionRanges.length; i++) { + sectionRanges[i].endLine = + i + 1 < sectionRanges.length + ? sectionRanges[i + 1].startLine + : lines.length + } + + // Handle _preamble replacement (content between frontmatter and first heading) + let lineIdx = 0 + if (translations["_preamble"] !== undefined) { + const fmEnd = findFrontmatterEnd(lines) + const firstHeading = + sectionRanges.length > 0 ? 
sectionRanges[0].startLine : lines.length + // Copy frontmatter + for (let i = 0; i < fmEnd; i++) { + result.push(lines[i]) + } + // Insert translated preamble + result.push("") + result.push(translations["_preamble"].trim()) + result.push("") + // Skip old preamble, start processing from first heading (or EOF) + lineIdx = firstHeading + } + + // Build the output, replacing heading sections that have translations + for (const section of sectionRanges) { + // Copy lines before this section (or between sections) + while (lineIdx < section.startLine) { + result.push(lines[lineIdx]) + lineIdx++ + } + + if (translations[section.id] !== undefined) { + // Keep the heading line (with {#id}) + result.push(lines[section.startLine]) + // Insert translated content + const translated = translations[section.id].trim() + result.push("") + result.push(translated) + result.push("") + // Skip the old section body + lineIdx = section.endLine + } else { + // Keep existing content + while (lineIdx < section.endLine) { + result.push(lines[lineIdx]) + lineIdx++ + } + } + } + + // Copy any remaining lines after the last section + while (lineIdx < lines.length) { + result.push(lines[lineIdx]) + lineIdx++ + } + + return result.join("\n") +} + +// --------------------------------------------------------------------------- +// Section extraction +// --------------------------------------------------------------------------- + +interface ExtractedSection { + id: string + level: number + headingText: string + /** Body content below the heading (excludes heading line) */ + body: string +} + +/** + * Extract sections from a markdown file, keyed by heading {#id}. + * Returns sections in document order. 
/**
 * Extract sections from a markdown file, keyed by heading {#id}.
 * Returns sections in document order.
 *
 * Only headings that carry an explicit `{#id}` anchor start a section; a
 * heading without one stays part of the preceding section's body. Non-empty
 * content between the frontmatter and the first anchored heading is
 * returned first, under the reserved id `_preamble` with level 0.
 *
 * @param content - Full markdown file content
 * @returns One entry per anchored heading (plus the optional preamble), each
 *   with the trimmed body that follows its heading line
 */
export function extractSections(content: string): ExtractedSection[] {
  const lines = content.split("\n")
  const headingPattern = /^(#{1,6})\s+(.+?)(?:\s*\{#([^}]+)\})?\s*$/

  // Locate every heading that carries a custom {#id} anchor
  const headings = lines.flatMap((line, index) => {
    const found = line.match(headingPattern)
    if (!found || !found[3]) return []
    return [
      {
        line: index,
        level: found[1].length,
        id: found[3],
        text: found[2].replace(/\s*\{#[^}]+\}/, "").trim(),
      },
    ]
  })

  const sections: ExtractedSection[] = []

  // Preamble: many pages use the frontmatter title as h1, so the prose
  // between the frontmatter and the first heading is the intro paragraph.
  const frontmatterEnd = findFrontmatterEnd(lines)
  const firstHeadingLine = headings.length > 0 ? headings[0].line : lines.length
  if (frontmatterEnd < firstHeadingLine) {
    const preamble = lines
      .slice(frontmatterEnd, firstHeadingLine)
      .join("\n")
      .trim()
    if (preamble.length > 0) {
      sections.push({
        id: "_preamble",
        level: 0,
        headingText: "",
        body: preamble,
      })
    }
  }

  // Each section covers only its own direct body: it ends at the next
  // anchored heading of ANY level (or EOF), not at the end of its subtree.
  headings.forEach((heading, position) => {
    const following = headings[position + 1]
    const bodyEnd = following ? following.line : lines.length
    sections.push({
      id: heading.id,
      level: heading.level,
      headingText: heading.text,
      body: lines
        .slice(heading.line + 1, bodyEnd)
        .join("\n")
        .trim(),
    })
  })

  return sections
}
/**
 * Build the TRANSLATE/CONTEXT section list for an incremental prompt.
 *
 * Walks the English sections in document order. A section whose id is in
 * `translateIds` is emitted as TRANSLATE with its English body. Any other
 * section is emitted as CONTEXT with the existing locale body and heading
 * text, and is left out entirely when the locale file has no section with
 * that id.
 *
 * @param englishSections - Sections from current English file
 * @param localeSections - Sections from existing locale file
 * @param translateIds - Section IDs that need translation (from drift detection)
 * @returns Sections for the prompt, in English document order
 */
export function buildSectionList(
  englishSections: ExtractedSection[],
  localeSections: ExtractedSection[],
  translateIds: string[]
): SectionForPrompt[] {
  // Later duplicates win, matching a plain sequence of Map.set calls
  const localeById = new Map<string, ExtractedSection>(
    localeSections.map((s): [string, ExtractedSection] => [s.id, s])
  )
  const translateSet = new Set<string>(translateIds)
  const result: SectionForPrompt[] = []

  for (const section of englishSections) {
    if (translateSet.has(section.id)) {
      // Changed section: send English content for translation
      result.push({
        id: section.id,
        action: "TRANSLATE",
        content: section.body,
        headingText: section.headingText,
        level: section.level,
      })
      continue
    }

    // Unchanged section: send existing locale translation as context
    const localeSection = localeById.get(section.id)
    if (localeSection) {
      result.push({
        id: section.id,
        action: "CONTEXT",
        content: localeSection.body,
        headingText: localeSection.headingText,
        level: section.level,
      })
    }
  }

  return result
}
// ---------------------------------------------------------------------------
// Section removal
// ---------------------------------------------------------------------------

/**
 * Remove a markdown section by its heading {#id} anchor.
 * Removes the heading line and all body content up to the next heading
 * of the same or higher level (or EOF). For removal, we DO want to
 * remove nested subsections (unlike replaceSections which is leaf-only).
 *
 * Heading-like lines inside fenced code blocks (``` or ~~~) are ignored, so
 * a shell comment such as `# install deps` does not end the section early.
 *
 * @param content - Full markdown file content
 * @param sectionId - The `{#id}` anchor of the heading to remove
 * @returns The content without the section, or `content` unchanged when no
 *   heading carries that id
 */
export function removeMarkdownSection(
  content: string,
  sectionId: string
): string {
  const lines = content.split("\n")
  const headingPattern = /^(#{1,6})\s+(.+?)(?:\s*\{#([^}]+)\})?\s*$/
  const fencePattern = /^\s*(`{3,}|~{3,})/

  let openFence: string | null = null // marker of the fence we are inside
  let startLine = -1
  let startLevel = 0
  let endLine = lines.length

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]

    const fence = line.match(fencePattern)
    if (fence) {
      const marker = fence[1]
      if (openFence === null) {
        openFence = marker
      } else if (
        // A closing fence uses the same character, is at least as long as
        // the opener, and has nothing else on the line.
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        line.trim() === marker
      ) {
        openFence = null
      }
      continue
    }
    if (openFence !== null) continue

    const heading = line.match(headingPattern)
    if (!heading) continue

    if (startLine === -1) {
      // Still looking for the heading with this ID
      if (heading[3] === sectionId) {
        startLine = i
        startLevel = heading[1].length
      }
    } else if (heading[1].length <= startLevel) {
      // Next heading of same or higher level: the subtree ends here
      endLine = i
      break
    }
  }

  if (startLine === -1) return content

  // Drop the heading and its whole subtree; surrounding blank lines are kept
  return [...lines.slice(0, startLine), ...lines.slice(endLine)].join("\n")
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Find the line after frontmatter ends (after closing ---).
 * Returns 0 if there is no frontmatter or it is never closed.
 */
function findFrontmatterEnd(lines: string[]): number {
  if (lines.length === 0 || lines[0].trim() !== "---") return 0
  const closing = lines.findIndex(
    (line, index) => index > 0 && line.trim() === "---"
  )
  return closing === -1 ? 0 : closing + 1
}

/**
 * Render glossary terms as a prompt section, one `- "en" -> "translated"`
 * line per term. Returns an empty string when there are no terms.
 */
function formatGlossary(terms: Map<string, string>): string {
  if (terms.size === 0) return ""

  const entries = Array.from(
    terms,
    ([en, translated]) => `- "${en}" -> "${translated}"`
  )

  return `Community-voted glossary (use these exact translations):
${entries.join("\n")}`
}

// ---------------------------------------------------------------------------
// JSON section extraction / replacement
// ---------------------------------------------------------------------------

/**
 * Extract "sections" from a JSON file. Each string value becomes a section
 * whose id is its key path and whose body is the string. Nested objects are
 * flattened with "/" separators; arrays nested inside the document and
 * non-string scalars are skipped.
 *
 * @param content - JSON text
 * @returns Sections in key order; level is 1 for top-level keys and 2 for
 *   any nested key. Empty when the root is `null` or not an object.
 * @throws SyntaxError when `content` is not valid JSON
 */
export function extractJsonSections(content: string): ExtractedSection[] {
  const root: unknown = JSON.parse(content)
  const sections: ExtractedSection[] = []

  // A null root used to crash in Object.entries and a string root yielded
  // one bogus section per character; neither has any keys to extract.
  if (typeof root !== "object" || root === null) return sections

  function walk(node: Record<string, unknown>, prefix: string) {
    for (const [key, value] of Object.entries(node)) {
      const id = prefix ? `${prefix}/${key}` : key
      if (typeof value === "string") {
        sections.push({
          id,
          level: prefix ? 2 : 1,
          headingText: id,
          body: value,
        })
      } else if (
        typeof value === "object" &&
        value !== null &&
        !Array.isArray(value)
      ) {
        walk(value as Record<string, unknown>, id)
      }
    }
  }

  walk(root as Record<string, unknown>, "")
  return sections
}
/**
 * Replace specific key values in a JSON string with fresh translations.
 * Handles nested keys using "/" separator (e.g., "test-nested/section-title").
 *
 * Works on the parsed object and re-serializes it with 2-space indentation
 * and a trailing newline, so key order is kept but the original whitespace
 * is not. A key path whose parent objects do not all exist is skipped.
 *
 * Key paths come from model output, so traversal only follows own
 * properties and any path containing a `__proto__` segment is ignored;
 * otherwise a key like "__proto__/x" would write onto Object.prototype.
 *
 * @param localeContent - Existing locale JSON text
 * @param translations - Key path -> translated string
 * @returns The updated JSON text
 * @throws SyntaxError when `localeContent` is not valid JSON
 */
export function replaceJsonValues(
  localeContent: string,
  translations: Record<string, string>
): string {
  const obj = JSON.parse(localeContent)
  const hasOwn = (holder: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(holder, key)

  for (const [keyPath, translated] of Object.entries(translations)) {
    const parts = keyPath.split("/")
    if (parts.includes("__proto__")) continue

    const leaf = parts[parts.length - 1]
    let target = obj
    for (const part of parts.slice(0, -1)) {
      const next =
        target && typeof target === "object" && hasOwn(target, part)
          ? target[part]
          : undefined
      // Parent missing or not an object/array: nothing to replace
      target = next && typeof next === "object" ? next : null
      if (!target) break
    }

    if (target) {
      target[leaf] = translated
    }
  }

  return JSON.stringify(obj, null, 2) + "\n"
}

// Re-export types for consumers
export type {
  IncrementalTranslateOptions,
  IncrementalTranslateResult,
  SectionForPrompt,
}

// ---------------------------------------------------------------------------
// Byte-size-aware section batching (CONCURRENCY-SPEC.md Part 2C)
// ---------------------------------------------------------------------------

import { MAX_CHUNK_BYTES } from "../../constants"
/**
 * Split sections into batches that fit within the byte budget.
 * CONTEXT sections are replicated into every batch for quality.
 * Only TRANSLATE sections count toward splitting decisions.
 *
 * The budget for TRANSLATE content is `maxBytes` minus the UTF-8 size of all
 * CONTEXT content (never less than 1). A batch always holds at least one
 * TRANSLATE section, even when that section alone exceeds the budget. Within
 * a batch the CONTEXT sections come first, followed by that batch's
 * TRANSLATE sections in input order.
 *
 * The function is generic so any extra fields on the caller's section
 * objects (e.g. heading text, level) are preserved in the returned type.
 *
 * @param sections - All sections of the file, tagged TRANSLATE or CONTEXT
 * @param maxBytes - Byte budget per batch (defaults to MAX_CHUNK_BYTES)
 * @returns The batches; an empty array if there are no TRANSLATE sections
 */
export function batchSections<
  T extends { id: string; content: string; action: "TRANSLATE" | "CONTEXT" },
>(sections: T[], maxBytes: number = MAX_CHUNK_BYTES): T[][] {
  const contextSections = sections.filter((s) => s.action === "CONTEXT")
  const translateSections = sections.filter((s) => s.action === "TRANSLATE")

  if (translateSections.length === 0) return []

  const contextBytes = contextSections.reduce(
    (sum, s) => sum + Buffer.byteLength(s.content, "utf-8"),
    0
  )

  // Budget for TRANSLATE content per batch = total budget - context overhead
  const translateBudget = Math.max(maxBytes - contextBytes, 1)

  const batches: T[][] = []
  let currentTranslate: T[] = []
  let currentBytes = 0

  for (const section of translateSections) {
    const sectionBytes = Buffer.byteLength(section.content, "utf-8")

    // If adding this section exceeds budget AND we already have sections, start new batch
    if (
      currentTranslate.length > 0 &&
      currentBytes + sectionBytes > translateBudget
    ) {
      batches.push([...contextSections, ...currentTranslate])
      currentTranslate = []
      currentBytes = 0
    }

    currentTranslate.push(section)
    currentBytes += sectionBytes
  }

  // Push remaining
  if (currentTranslate.length > 0) {
    batches.push([...contextSections, ...currentTranslate])
  }

  return batches
}

// diff --git a/src/scripts/intl-pipeline/lib/llm/json-batcher.ts b/src/scripts/intl-pipeline/lib/llm/json-batcher.ts
// new file mode 100644
// index 00000000000..a71b10125e1
// --- /dev/null
// +++ b/src/scripts/intl-pipeline/lib/llm/json-batcher.ts
// @@ -0,0 +1,414 @@
/**
 * JSON batching and HTML placeholder extraction for large JSON translation.
 */
+ * + * For large JSON files (>120 top-level keys), splits into ~100-key batches + * to stay within Gemini's reliable output range. + * + * For JSON values containing embedded HTML with attributes (e.g., ), + * extracts to content-addressed wrapper placeholders matching the markdown + * normalizer's format: text + * + * Simple formatting tags (, ,
    ) pass through to Gemini + * since they have no inert attributes and Gemini handles them correctly. + */ + +import { createHash } from "crypto" + +type JsonValue = + | string + | number + | boolean + | null + | JsonValue[] + | { [key: string]: JsonValue } + +interface PlaceholderEntry { + placeholder: string + original: string +} + +/** Map of JSON path -> placeholder entries for that value */ +export type PlaceholderMap = Map + +export interface PreparedJsonBatches { + /** JSON strings with HTML extracted, ready for Gemini */ + batchContents: string[] + /** Per-batch placeholder maps for HTML restoration */ + placeholderMaps: PlaceholderMap[] + /** Whether any HTML was actually extracted */ + htmlExtracted: boolean + /** Total top-level key count */ + totalKeys: number + /** Key count per batch (for logging) */ + batchSizes: number[] +} + +/** Keys per Gemini request */ +const BATCH_SIZE = 100 +/** Avoid tiny final batches -- absorb up to this many extra keys */ +const BATCH_BUFFER = 20 + +/** 6-char hex digest for content-addressed placeholder IDs */ +function shortHash(content: string): string { + return createHash("sha256").update(content, "utf8").digest("hex").slice(0, 6) +} + +/** + * Matches HTML tags WITH attributes and their content: + *
    text + * Does NOT match simple tags like , ,
    . + */ +const HTML_TAG_WITH_ATTRS_RE = + /<([a-zA-Z][a-zA-Z0-9]*)(\s[^>]+)>([\s\S]*?)<\/\1>/g + +/** Self-closing HTML tags with attributes: ,
    */ +const HTML_SELF_CLOSING_WITH_ATTRS_RE = + /<([a-zA-Z][a-zA-Z0-9]*)(\s[^>]+)\s*\/?>/g + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/** + * Prepare a JSON file for batched translation. + * + * 1. Parses the JSON + * 2. Splits top-level keys into batches (~100 per batch) + * 3. Extracts HTML tags from string values, replacing with placeholders + * 4. Returns batch contents ready for Gemini + restoration maps + */ +export function prepareJsonBatches(jsonContent: string): PreparedJsonBatches { + const parsed = JSON.parse(jsonContent) as Record + const keys = Object.keys(parsed) + const keyBatches = splitIntoBatches(keys, BATCH_SIZE, BATCH_BUFFER) + + let htmlExtracted = false + const batchContents: string[] = [] + const placeholderMaps: PlaceholderMap[] = [] + const batchSizes: number[] = [] + + for (const batchKeys of keyBatches) { + // Build sub-object for this batch + const batchObj: Record = {} + for (const key of batchKeys) { + batchObj[key] = parsed[key] + } + + // Extract HTML from string values + const placeholderMap: PlaceholderMap = new Map() + const sanitized = extractHtmlFromObject(batchObj, placeholderMap) + + if (placeholderMap.size > 0) htmlExtracted = true + + batchContents.push(JSON.stringify(sanitized, null, 2)) + placeholderMaps.push(placeholderMap) + batchSizes.push(batchKeys.length) + } + + return { + batchContents, + placeholderMaps, + htmlExtracted, + totalKeys: keys.length, + batchSizes, + } +} + +/** + * Restore HTML tags in a translated JSON batch from its placeholder map. + * + * Returns the restored JSON string and a list of any placeholder failures + * (missing placeholders that could not be restored). 
/**
 * Restore HTML tags in a translated JSON batch from its placeholder map.
 *
 * With an empty placeholder map the input string is returned untouched
 * (it is not even parsed). Otherwise the JSON is parsed, restored, and
 * re-serialized with 2-space indentation.
 *
 * @param translatedJson - JSON text returned for one batch
 * @param placeholderMap - Placeholder entries recorded for that batch
 * @returns The restored JSON string and a list of any placeholder failures
 *   (missing placeholders that could not be restored)
 * @throws SyntaxError when the map is non-empty and `translatedJson` is not
 *   valid JSON
 */
export function restoreJsonBatch(
  translatedJson: string,
  placeholderMap: PlaceholderMap
): { content: string; failures: string[] } {
  if (placeholderMap.size === 0) {
    return { content: translatedJson, failures: [] }
  }

  const parsed = JSON.parse(translatedJson) as Record<string, JsonValue>
  const failures: string[] = []
  const restored = restoreHtmlInObject(parsed, placeholderMap, "", failures)
  return {
    content: JSON.stringify(restored, null, 2),
    failures,
  }
}

/**
 * Merge multiple translated JSON batch strings into a single JSON string.
 * Preserves key order from the original batches; when a key appears in more
 * than one batch the later value wins. A single batch is returned as-is.
 *
 * @param batchContents - JSON object strings, one per batch
 * @returns The merged object serialized with 2-space indentation
 */
export function mergeJsonBatches(batchContents: string[]): string {
  if (batchContents.length === 1) return batchContents[0]

  // Object.fromEntries defines plain own properties, so a key named
  // "__proto__" survives the merge; Object.assign would route it through
  // the prototype setter and lose it.
  const entries = batchContents.flatMap((content) =>
    Object.entries(JSON.parse(content) as Record<string, JsonValue>)
  )
  return JSON.stringify(Object.fromEntries(entries), null, 2)
}

/**
 * Check whether a JSON file needs batching (has more keys than the threshold).
 * The threshold is BATCH_SIZE + BATCH_BUFFER top-level keys.
 */
export function needsBatching(jsonContent: string): boolean {
  const parsed = JSON.parse(jsonContent) as Record<string, JsonValue>
  return Object.keys(parsed).length > BATCH_SIZE + BATCH_BUFFER
}

// ---------------------------------------------------------------------------
// Byte-size-aware chunking (CONCURRENCY-SPEC.md Part 2A)
// ---------------------------------------------------------------------------

import { MAX_CHUNK_BYTES } from "../../constants"

/**
 * Chunk a JSON string by byte size. Each chunk is a valid JSON object
 * containing a subset of top-level keys, with total byte size <= MAX_CHUNK_BYTES.
 */
+ * + * Guarantees: + * - At least 1 key per chunk (even if that key exceeds the budget) + * - Key order preserved across chunks + * - Nested objects measured as one unit (not split) + */ +export function chunkJson( + jsonContent: string, + maxBytes: number = MAX_CHUNK_BYTES +): string[] { + const parsed = JSON.parse(jsonContent) as Record + const keys = Object.keys(parsed) + + if (keys.length === 0) { + return [jsonContent] + } + + // If total size is under budget, return as-is + if (Buffer.byteLength(jsonContent, "utf-8") <= maxBytes) { + return [jsonContent] + } + + const chunks: string[][] = [] + let currentChunkKeys: string[] = [] + let currentBytes = 0 + // Overhead: opening { + closing } + newline formatting + const JSON_OVERHEAD = 4 + + for (const key of keys) { + const valueJson = JSON.stringify(parsed[key]) + // Byte cost: "key": value, + formatting (roughly: key + colon + space + value + comma + newline) + const entryBytes = Buffer.byteLength( + ` ${JSON.stringify(key)}: ${valueJson},\n`, + "utf-8" + ) + + // If adding this key exceeds budget AND we already have keys, start new chunk + if (currentChunkKeys.length > 0 && currentBytes + entryBytes > maxBytes) { + chunks.push(currentChunkKeys) + currentChunkKeys = [] + currentBytes = JSON_OVERHEAD + } + + currentChunkKeys.push(key) + currentBytes += entryBytes + } + + // Push remaining keys + if (currentChunkKeys.length > 0) { + chunks.push(currentChunkKeys) + } + + // Build JSON strings for each chunk + return chunks.map((chunkKeys) => { + const obj: Record = {} + for (const k of chunkKeys) { + obj[k] = parsed[k] + } + return JSON.stringify(obj, null, 2) + }) +} + +// --------------------------------------------------------------------------- +// Batching (legacy key-count approach) +// --------------------------------------------------------------------------- + +function splitIntoBatches( + keys: string[], + size: number, + buffer: number +): string[][] { + if (keys.length <= size + buffer) return [keys] 
+ + const batches: string[][] = [] + for (let i = 0; i < keys.length; i += size) { + const remaining = keys.length - i + // If remaining fits in one more batch (with buffer), take it all + if (remaining <= size + buffer) { + batches.push(keys.slice(i)) + break + } + batches.push(keys.slice(i, i + size)) + } + return batches +} + +// --------------------------------------------------------------------------- +// HTML extraction (pre-translation) +// --------------------------------------------------------------------------- + +function extractHtmlFromObject( + obj: Record, + map: PlaceholderMap, + prefix = "" +): Record { + const result: Record = {} + for (const [key, value] of Object.entries(obj)) { + const path = prefix ? `${prefix}.${key}` : key + result[key] = extractHtmlFromValue(value, map, path) + } + return result +} + +function extractHtmlFromValue( + value: JsonValue, + map: PlaceholderMap, + path: string +): JsonValue { + if (typeof value === "string") { + return extractHtmlFromString(value, map, path) + } + if (Array.isArray(value)) { + return value.map((item, i) => + extractHtmlFromValue(item, map, `${path}[${i}]`) + ) + } + if (value !== null && typeof value === "object") { + return extractHtmlFromObject(value as Record, map, path) + } + return value +} + +function extractHtmlFromString( + text: string, + map: PlaceholderMap, + path: string +): string { + const entries: PlaceholderEntry[] = [] + + // Tags with children and attributes: text + // Use wrapper placeholders so Gemini can reorder within the value + let result = text.replace(HTML_TAG_WITH_ATTRS_RE, (...args) => { + const [fullMatch, , , children] = args + const hash = shortHash(fullMatch) + const open = `` + const close = `` + entries.push({ placeholder: `HTMLTAG:${hash}`, original: fullMatch }) + return `${open}${children}${close}` + }) + + // Self-closing tags with attributes: + result = result.replace(HTML_SELF_CLOSING_WITH_ATTRS_RE, (fullMatch) => { + // Skip if already wrapped by the 
previous pass + if (fullMatch.includes("HTML-PLACEHOLDER")) return fullMatch + const hash = shortHash(fullMatch) + const placeholder = `` + entries.push({ placeholder, original: fullMatch }) + return placeholder + }) + + if (entries.length > 0) { + map.set(path, entries) + } + return result +} + +// --------------------------------------------------------------------------- +// HTML restoration (post-translation) +// --------------------------------------------------------------------------- + +function restoreHtmlInObject( + obj: Record, + map: PlaceholderMap, + prefix: string, + failures: string[] +): Record { + const result: Record = {} + for (const [key, value] of Object.entries(obj)) { + const path = prefix ? `${prefix}.${key}` : key + result[key] = restoreHtmlInValue(value, map, path, failures) + } + return result +} + +function restoreHtmlInValue( + value: JsonValue, + map: PlaceholderMap, + path: string, + failures: string[] +): JsonValue { + if (typeof value === "string") { + return restoreHtmlInString(value, map, path, failures) + } + if (Array.isArray(value)) { + return value.map((item, i) => + restoreHtmlInValue(item, map, `${path}[${i}]`, failures) + ) + } + if (value !== null && typeof value === "object") { + return restoreHtmlInObject( + value as Record, + map, + path, + failures + ) + } + return value +} + +function restoreHtmlInString( + text: string, + map: PlaceholderMap, + path: string, + failures: string[] +): string { + const entries = map.get(path) + if (!entries) return text + + let result = text + for (const { placeholder, original } of entries) { + if (placeholder.startsWith("HTMLTAG:")) { + // Wrapper placeholder: rebuild original tag around translated text + const hash = placeholder.slice(8) + const openTag = `` + const closeTag = `` + + const openIdx = result.indexOf(openTag) + const closeIdx = result.indexOf(closeTag) + if (openIdx >= 0 && closeIdx >= 0) { + const translatedText = result.slice(openIdx + openTag.length, closeIdx) + 
const tagMatch = original.match(/<(\w+)(\s[^>]*)?>/) + const closingMatch = original.match(/<\/(\w+)>/) + if (tagMatch && closingMatch) { + const rebuilt = `<${tagMatch[1]}${tagMatch[2] || ""}>${translatedText}` + result = + result.slice(0, openIdx) + + rebuilt + + result.slice(closeIdx + closeTag.length) + } + } else { + failures.push(`${path}: missing wrapper ${openTag} (was: ${original})`) + } + } else { + // Self-closing placeholder: direct replacement + if (!result.includes(placeholder)) { + failures.push(`${path}: missing ${placeholder} (was: ${original})`) + continue + } + result = result.replace(placeholder, original) + } + } + return result +} diff --git a/src/scripts/intl-pipeline/lib/llm/language-groups.ts b/src/scripts/intl-pipeline/lib/llm/language-groups.ts new file mode 100644 index 00000000000..8c9395ae6a4 --- /dev/null +++ b/src/scripts/intl-pipeline/lib/llm/language-groups.ts @@ -0,0 +1,107 @@ +/** + * Language group definitions for translation. + * + * Different script families require different translation strategies. + * Gemini knows these rules natively -- we provide site-specific context, + * not linguistic micromanagement. 
+ */ + +export type LanguageGroup = + | "indic" + | "cyrillic" + | "rtl" + | "cjk-phonetic" + | "cjk-semantic" + | "latin" + +const GROUP_MAP: Record = { + // Indic (Brahmic scripts -- transliterate) + hi: "indic", + mr: "indic", + bn: "indic", + ta: "indic", + te: "indic", + // Cyrillic (transliterate, high Latin tolerance) + ru: "cyrillic", + uk: "cyrillic", + // RTL (transliterate, BiDi considerations) + ar: "rtl", + ur: "rtl", + // CJK Phonetic (Katakana/Hangul -- transliterate even in tags) + ja: "cjk-phonetic", + ko: "cjk-phonetic", + // CJK Semantic (translate by meaning, not sound) + zh: "cjk-semantic", + "zh-tw": "cjk-semantic", +} + +export function getLanguageGroup(code: string): LanguageGroup { + return GROUP_MAP[code] || "latin" +} + +export function isRtl(code: string): boolean { + return code === "ar" || code === "ur" +} + +export function needsTransliteration(code: string): boolean { + const group = getLanguageGroup(code) + return group !== "latin" +} + +/** + * Site-specific translation notes per language group. + * These focus on ethereum.org conventions, not general linguistics + * (Gemini already knows how Arabic/Japanese/etc. work). + */ +export function getSiteSpecificNotes(group: LanguageGroup): string { + const common = ` +Site-specific rules for ethereum.org (group-specific overrides below take precedence): +- Frontmatter tags array: brand-name tags (Solidity, MetaMask, ERC-20) stay in Latin script. Concept tags (smart contracts, testing) should be translated. +- Code blocks: never translate functional code. Always translate code comments. +- Do not translate (keep in original Latin script): ticker symbols (ETH, BTC, ERC, EIP, BLS), URLs, domains, EVM opcodes (SSTORE, CALL, PUSH), hex values (0x...), cryptographic primitives (SHA-256, Keccak-256, ECDSA, zk-SNARKs), network names (Mainnet, Sepolia, Holesky, Goerli), license identifiers (MIT, Apache-2.0), mathematical notations and formulas. 
+- Treat client implementation names (Lighthouse, Prysm, Geth, Nethermind, Besu, Teku, Lodestar, Nimbus) as proper nouns; do not translate them. In non-Latin scripts, phonetic transliteration alongside the Latin name is acceptable. +- Use community glossary terms as provided. In languages with grammatical cases, decline glossary terms to fit the surrounding sentence naturally. If a term in the source text does not match the glossary's intended technical context, ignore the glossary entry and translate according to the general context of the sentence. +- First mention of acronym-style terms: when the glossary contains a term with an abbreviation in parentheses (e.g., "decentralized application (dapp)" -> "aplicacion descentralizada (dapp)"), expand it on first mention in each section. Use the full translated form with the abbreviation in parentheses. Subsequent mentions in the same section may use the short form alone. If the English source uses a shortened or variant form (e.g., "dapps" instead of "dapp"), still use the glossary's canonical translated form -- do not reproduce informal variants. +- Translate English idioms and colloquialisms by meaning, not literally. Use natural target-language expressions that convey the same meaning. Examples: "in the wild" = in actual use/in practice; "under the hood" = internally/how it works technically; "out of the box" = by default/without extra configuration; "a deep dive" = a detailed exploration; "the big picture" = the overall view. If no natural equivalent exists, rephrase plainly rather than producing an awkward literal translation. +- Do not translate technical concepts or loanwords unless an exact translation is provided in the glossary. If a term is not in the glossary, leave it in English.` + + switch (group) { + case "rtl": + return `${common} + +BiDi rules for RTL -- wrap LTR content in ... 
to prevent layout flipping: +- Wrap any technical string or number combined with Latin units, symbols, or operators: 32 ETH, 100 Gwei, 12.5%, $2,500 USD, v1.10.8, EIP-1559, > 0.01, 2x +- Wrap numbers containing separators (commas/periods): 21,000, 0.000252 +- Wrap dates/times ONLY if they use Latin month names: June 18, 2022, 12:00 UTC +- Keep units, currency symbols, and operators INSIDE: $100,000 USD (correct) +- Punctuation (periods, commas, etc.) belonging to the RTL sentence stays OUTSIDE +- Do NOT wrap content inside backticks or text transliterated into target script +- Use Western Arabic numerals (1, 2, 3) for Arabic. Urdu uses native numerals for prose but Western for technical identifiers. +- Never convert Gregorian dates to Hijri calendar. +- The word "state" in blockchain context means computational state, not political state. +- Ensure Markdown syntax (headers, links, bullet points, tables) and HTML tags remain in LTR formatting.` + + case "cjk-phonetic": + return `${common} +- Override: brand-name tags should be transliterated into native script (Katakana/Hangul) for this language group, unlike other groups where they stay Latin. +- Keep global acronyms (DeFi, NFT, API) in Latin.` + + case "cjk-semantic": + return `${common} +- Translate terms by meaning (calque), not by sound. Example: "Smart Contract" = 智能合约. +- Use officially established translations where they exist (Ethereum = 以太坊). +- If no official translation exists for a brand, keep it in Latin script.` + + case "indic": + return `${common} +- Use Western Arabic numerals (1, 2, 3) -- not native numeral scripts.` + + case "cyrillic": + return `${common} +- Use Western Arabic numerals (1, 2, 3) -- not native numeral scripts. 
+- Use correct grammatical plural categories (one/few/many/other) as appropriate for the target language.` + + case "latin": + return common + } +} diff --git a/src/scripts/intl-pipeline/lib/llm/manifest-adapter.ts b/src/scripts/intl-pipeline/lib/llm/manifest-adapter.ts new file mode 100644 index 00000000000..59d21c26525 --- /dev/null +++ b/src/scripts/intl-pipeline/lib/llm/manifest-adapter.ts @@ -0,0 +1,296 @@ +/** + * Adapter between intl-content-tree package and our pipeline. + * + * Maps the package's generic API to our Gemini translation pipeline's + * needs. Handles: + * - Ethereum.org-specific translatableAttributes config + * - Manifest content generation as strings (for GitHub API commits) + * - Per-locale translation manifest for Gemini output data + */ + +import { + type ContentTreeConfig, + deserialize, + diff, + type DiffEntry, + type DiffResult, + getInertValue, + hasChanges, + MANIFEST_VERSION, + parseJson, + parseMarkdown, + serialize, + type TreeManifest, + type TreeNode, + validate, + type ValidationResult, + walk, +} from "intl-content-tree" + +// --------------------------------------------------------------------------- +// Ethereum.org config +// --------------------------------------------------------------------------- + +/** Attributes whose values need translation on this site */ +const ETHEREUM_ORG_CONFIG: Partial = { + depth: "element", + translatableAttributes: [ + "title", + "description", + "alt", + "label", + "aria-label", + "placeholder", + "buttonLabel", + "name", + "caption", + "contentPreview", + "location", + ], +} + +/** + * Detect JSON string values that contain markdown syntax. + * When true, the value is parsed recursively using the markdown parser, + * decomposing links, images, and formatting into child nodes. + * Handles dapp descriptions and other rich-text JSON values. 
+ */ +const ETHEREUM_ORG_JSON_CONFIG = { + markdownValueDetector: (_, value: string): boolean => { + // Heuristic: requires structural markdown patterns (not just inline bold). + // Triggers on: paragraph-break + list items, numbered lists, or images. + // Does NOT trigger on standalone **bold** in normal prose. + return /\n\n[-*] |\n\n\d+\. |!\[.*\]\(/.test(value) + }, +} + +// --------------------------------------------------------------------------- +// Source manifest (English content tree) +// --------------------------------------------------------------------------- + +/** + * Parse English markdown and generate a manifest string for committing. + */ +export function buildMarkdownManifest( + englishContent: string, + sourceFile: string, + sourceCommitSha?: string +): string { + const tree = parseMarkdown(englishContent, ETHEREUM_ORG_CONFIG) + const manifest = serialize(tree, sourceFile) + const output = sourceCommitSha ? { ...manifest, sourceCommitSha } : manifest + return JSON.stringify(output, null, 2) + "\n" +} + +/** + * Parse English JSON and generate a manifest string for committing. + */ +export function buildJsonManifest( + englishContent: string, + sourceFile: string, + sourceCommitSha?: string +): string { + const tree = parseJson( + englishContent, + ETHEREUM_ORG_CONFIG, + ETHEREUM_ORG_JSON_CONFIG + ) + const manifest = serialize(tree, sourceFile) + const output = sourceCommitSha ? { ...manifest, sourceCommitSha } : manifest + return JSON.stringify(output, null, 2) + "\n" +} + +/** + * Parse English content into a tree (for use by the translation pipeline). + */ +export function parseEnglishMarkdown(content: string): TreeNode { + return parseMarkdown(content, ETHEREUM_ORG_CONFIG) +} + +/** + * Parse English JSON into a tree. + */ +export function parseEnglishJson(content: string): TreeNode { + return parseJson(content, ETHEREUM_ORG_CONFIG, ETHEREUM_ORG_JSON_CONFIG) +} + +/** + * Detect drift between current English and a stored manifest. 
+ */ +export function detectDrift( + currentEnglishContent: string, + storedManifestJson: string, + format: "markdown" | "json" +): DiffResult { + const storedManifest: TreeManifest = JSON.parse(storedManifestJson) + const oldTree = deserialize(storedManifest) + const newTree = + format === "markdown" + ? parseMarkdown(currentEnglishContent, ETHEREUM_ORG_CONFIG) + : parseJson( + currentEnglishContent, + ETHEREUM_ORG_CONFIG, + ETHEREUM_ORG_JSON_CONFIG + ) + return diff(oldTree, newTree) +} + +/** + * Quick check: has the English content changed since the manifest was stamped? + */ +export function hasEnglishChanged( + currentEnglishContent: string, + storedManifestJson: string, + format: "markdown" | "json" +): boolean { + const storedManifest: TreeManifest = JSON.parse(storedManifestJson) + const newTree = + format === "markdown" + ? parseMarkdown(currentEnglishContent, ETHEREUM_ORG_CONFIG) + : parseJson( + currentEnglishContent, + ETHEREUM_ORG_CONFIG, + ETHEREUM_ORG_JSON_CONFIG + ) + return hasChanges(newTree, storedManifest) +} + +/** + * Validate a markdown file's readiness for incremental tracking. + */ +export function validateMarkdown(content: string): ValidationResult { + const tree = parseMarkdown(content, ETHEREUM_ORG_CONFIG) + return validate(tree) +} + +/** + * Get an inert value by path from an English content tree. + */ +export function getEnglishInertValue( + content: string, + path: string, + format: "markdown" | "json" +): string | undefined { + const tree = + format === "markdown" + ? parseMarkdown(content, ETHEREUM_ORG_CONFIG) + : parseJson(content, ETHEREUM_ORG_CONFIG, ETHEREUM_ORG_JSON_CONFIG) + return getInertValue(tree, path) +} + +// --------------------------------------------------------------------------- +// Per-locale translation manifest (Gemini output data) +// --------------------------------------------------------------------------- + +/** + * Per-locale translation manifest, stored alongside the English source + * manifest. 
Records how the LLM ordered elements in the translated + * output, enabling deterministic inert propagation without retranslation. + */ +export interface LocaleTranslationManifest { + version: number + locale: string + translatedAt: string + /** Hash of the English source manifest this was translated against */ + englishManifestHash: string + /** Placeholder IDs in the order Gemini returned them */ + placeholderOrder: string[] + /** Placeholder ID -> inert values at translation time */ + placeholderMap: Record< + string, + { type: string; values: Record } + > + /** Per-section translation status */ + sections: Record< + string, + { + translatedAt: string + status: "success" | "failed" | "skipped" + glossaryVersion?: string + } + > +} + +/** + * Build a locale translation manifest string for committing. + */ +export function buildLocaleTranslationManifest(opts: { + locale: string + englishManifestHash: string + placeholderOrder: string[] + placeholderMap: Record< + string, + { type: string; values: Record } + > + sections: Record< + string, + { + translatedAt: string + status: "success" | "failed" | "skipped" + glossaryVersion?: string + } + > +}): string { + const manifest: LocaleTranslationManifest = { + version: MANIFEST_VERSION, + locale: opts.locale, + translatedAt: new Date().toISOString(), + englishManifestHash: opts.englishManifestHash, + placeholderOrder: opts.placeholderOrder, + placeholderMap: opts.placeholderMap, + sections: opts.sections, + } + return JSON.stringify(manifest, null, 2) + "\n" +} + +/** + * Extract placeholder data from a parsed tree for building translation + * manifests. Works for both markdown and JSON trees. Walks the tree + * collecting inert/mixed nodes with their meta values. 
+ */ +export function extractPlaceholderData(tree: TreeNode): { + placeholderOrder: string[] + placeholderMap: Record< + string, + { type: string; values: Record } + > +} { + const order: string[] = [] + const map: Record }> = + {} + + let counter = 0 + for (const node of walk(tree)) { + if ( + node.contentType === "inert" || + (node.contentType === "mixed" && + node.meta && + Object.keys(node.meta).length > 0) + ) { + const values: Record = {} + if (node.contentType === "inert" && node.value) { + values.value = node.value + } + if (node.meta) { + // Include ALL meta keys (including tagName) so the hash of + // these values matches the tree's anchorHash computation. + for (const [k, v] of Object.entries(node.meta)) { + if (k !== "language" && k !== "name") { + values[k] = String(v) + } + } + } + if (Object.keys(values).length > 0) { + const id = `${node.elementType.toUpperCase()}-${counter++}` + order.push(id) + map[id] = { type: node.elementType, values } + } + } + } + + return { placeholderOrder: order, placeholderMap: map } +} + +// Re-export types consumers need +export type { DiffEntry, DiffResult, TreeManifest, TreeNode, ValidationResult } +export { MANIFEST_VERSION } diff --git a/src/scripts/i18n/lib/ai/gemini-output-validation.ts b/src/scripts/intl-pipeline/lib/llm/output-validation.ts similarity index 56% rename from src/scripts/i18n/lib/ai/gemini-output-validation.ts rename to src/scripts/intl-pipeline/lib/llm/output-validation.ts index 1c5d9163efe..bc2daf0f74b 100644 --- a/src/scripts/i18n/lib/ai/gemini-output-validation.ts +++ b/src/scripts/intl-pipeline/lib/llm/output-validation.ts @@ -86,6 +86,37 @@ export function validateTranslatedMarkdown( } } + // Frontmatter title/description should be translated, not left in English + const untranslatedFm = checkFrontmatterTranslated(translated, english) + if (untranslatedFm) { + return { valid: false, error: untranslatedFm } + } + + // Code block placeholders must survive translation intact. 
+ // The pipeline extracts code blocks before sending to Gemini and restores + // them afterward. If Gemini drops or corrupts a placeholder, the code + // block is lost and Gemini may hallucinate replacement code. + const expectedPlaceholders = english.match(//g) || [] + for (const placeholder of expectedPlaceholders) { + if (!translated.includes(placeholder)) { + return { + valid: false, + error: `Missing code block placeholder: ${placeholder}`, + } + } + } + + // Gemini must not introduce code fences -- all code was extracted + if (expectedPlaceholders.length > 0) { + const fenceCount = (translated.match(/^```/gm) || []).length + if (fenceCount > 0) { + return { + valid: false, + error: `Output contains ${fenceCount} code fences but code blocks were extracted -- Gemini is hallucinating code`, + } + } + } + return { valid: true } } @@ -141,3 +172,48 @@ function validateCommon(translated: string): ValidationResult { return { valid: true } } + +/** + * Extract a frontmatter field value from raw markdown. + * Returns undefined if the field is not found. + */ +function extractFrontmatterField( + content: string, + field: string +): string | undefined { + const fmMatch = content.match(/^---\n([\s\S]*?)\n---/) + if (!fmMatch) return undefined + const re = new RegExp(`^${field}:\\s*"?(.+?)"?\\s*$`, "m") + const match = fmMatch[1].match(re) + return match?.[1] +} + +/** + * Check that key frontmatter fields (title, description) were actually + * translated and not left identical to the English source. + * + * Only fails if BOTH title and description are identical to English. + * Technical titles (e.g., "Ethash", "JSON-RPC API", "PeerDAS") are + * legitimately kept in English, so a matching title alone is not a + * failure -- as long as the description was translated. + * + * Returns an error string if untranslated, or undefined if OK. 
+ */ +function checkFrontmatterTranslated( + translated: string, + english: string +): string | undefined { + const enTitle = extractFrontmatterField(english, "title") + const trTitle = extractFrontmatterField(translated, "title") + const titleMatch = enTitle && trTitle && enTitle === trTitle + + const enDesc = extractFrontmatterField(english, "description") + const trDesc = extractFrontmatterField(translated, "description") + const descMatch = enDesc && trDesc && enDesc === trDesc + + if (titleMatch && descMatch) { + return `Frontmatter "title" and "description" were both not translated (identical to English)` + } + + return undefined +} diff --git a/src/scripts/intl-pipeline/lib/llm/prompt-builder.ts b/src/scripts/intl-pipeline/lib/llm/prompt-builder.ts new file mode 100644 index 00000000000..cf79d10d687 --- /dev/null +++ b/src/scripts/intl-pipeline/lib/llm/prompt-builder.ts @@ -0,0 +1,193 @@ +/** + * Build translation prompts for Gemini translation. + * + * Philosophy: Gemini is the language expert. We provide: + * 1. Site-specific context (glossary, conventions) + * 2. Structural expectations (frontmatter, markdown) + * 3. Lightweight guardrails (what our sanitizer checks for) + * + * We do NOT micromanage linguistics -- Gemini knows RTL, BiDi, + * transliteration norms, etc. better than any regex. + */ + +import { + getLanguageGroup, + getSiteSpecificNotes, + type LanguageGroup, +} from "./language-groups" + +interface PromptOptions { + filePath: string + fileContent: string + fileType: "markdown" | "json" + targetLanguage: string + languageName: string + glossaryTerms: Map + /** When true, HTML tags have been replaced with placeholders */ + htmlExtracted?: boolean + /** When true, content has been normalized with HTML-PLACEHOLDER tags */ + normalized?: boolean +} + +/** + * Build the complete translation prompt for a single file. 
+ */ +export function buildTranslationPrompt(options: PromptOptions): string { + const { + filePath, + fileContent, + fileType, + targetLanguage, + languageName, + glossaryTerms, + htmlExtracted, + normalized, + } = options + + const group = getLanguageGroup(targetLanguage) + const siteNotes = getSiteSpecificNotes(group) + const glossarySection = formatGlossary(glossaryTerms) + const formatRules = normalized + ? getNormalizedFormatRules(fileType, group, targetLanguage) + : getFormatRules(fileType, group, targetLanguage, htmlExtracted) + const sanitizerHints = getSanitizerHints() + + return `Translate this ${fileType} file from English to ${languageName} (${targetLanguage}). + +File: ${filePath} + +${formatRules} + +${siteNotes} + +${glossarySection} + +${sanitizerHints} + +=== SOURCE FILE === +${fileContent} +=== END SOURCE FILE === + +Output ONLY the translated file content. No explanations, no markdown wrapping, no commentary.` +} + +function getFormatRules( + fileType: "markdown" | "json", + group: LanguageGroup, + targetLanguage: string, + htmlExtracted?: boolean +): string { + if (fileType === "json") { + const htmlRule = htmlExtracted + ? `- Some HTML tags have been replaced with placeholder tags: + - Self-closing: \`\` -- preserve exactly, do not modify. + - Wrapper: \`text\` -- translate the text between tags, keep the tags themselves. You may reorder wrapper pairs to match natural ${targetLanguage} word order. + - Simple formatting tags (, , etc.) are NOT placeholders -- preserve them around your translated text. + - NEVER translate, modify, or remove placeholder tags. Every placeholder must appear exactly once in the output.` + : `- Preserve HTML tags within values exactly (, , etc.).` + + return `Format rules: +- Output valid JSON with identical key structure. +- Translate only string values. Never translate keys. +${htmlRule} +- Preserve interpolation variables exactly ({count}, {{name}}, etc.). +- Internal href paths (/developers/docs/...) 
must stay in English.` + } + + // Author handling differs by script family + const authorRule = + group === "latin" + ? "Keep the author field unchanged." + : "Transliterate the author field into the target script (phonetic, not semantic). Pseudonyms or GitHub handles (e.g., qbzzt, jdourlens) must stay in Latin." + + return `Format rules: +- Frontmatter: translate the values of title, description, and breadcrumb. Translate concept tags but keep brand-name tags in Latin (per site rules above). ${authorRule} Change the \`lang\` field to \`${targetLanguage}\`. Keep all other fields (skill, published, sidebarDepth) unchanged. Preserve YAML structure exactly. +- Preserve all markdown syntax (headings, lists, links) and their indentation exactly. +- Preserve all JSX/HTML components and their attributes exactly. +- Preserve heading anchor IDs exactly as in English ({#anchor-id}). +- HTML comment placeholders like \`\` are code block stand-ins managed by our pipeline. You MUST preserve them EXACTLY as-is -- same text, same position, same line. Do NOT remove, translate, modify, or replace them with code. They will be restored automatically after translation. +- If a true code fence (\`\`\` block) is encountered in the source, never translate the functional code inside it. Only code comments (// or /* */ or #) within fences may be translated. Never change the language identifier after the opening fence (e.g. \`\`\`python, \`\`\`solidity, \`\`\`bash must stay exactly as-is). +- Internal links (href starting with /) must match English exactly. +- Image paths must match English exactly.` +} + +/** + * Format rules for content that has been pre-processed by the normalizer. + * + * The normalizer has already: + * - Replaced code blocks with + * - Replaced inline code with + * - Replaced images with + * - Wrapped link text: text + * - Wrapped HTML tag text: text + * - Replaced JSX components with + * + * Gemini only needs to translate the visible prose and wrapper-tag contents. 
+ */ +function getNormalizedFormatRules( + fileType: "markdown" | "json", + group: LanguageGroup, + targetLanguage: string +): string { + if (fileType === "json") { + // JSON normalization is not yet implemented; fall back + return getFormatRules(fileType, group, targetLanguage, false) + } + + const authorRule = + group === "latin" + ? "Keep the author field unchanged." + : "Transliterate the author field into the target script (phonetic, not semantic). Pseudonyms or GitHub handles (e.g., qbzzt, jdourlens) must stay in Latin." + + return `Format rules: +- Frontmatter: translate the values of title, description, and breadcrumb. Translate concept tags but keep brand-name tags in Latin (per site rules above). ${authorRule} Change the \`lang\` field to \`${targetLanguage}\`. Keep all other fields (skill, published, sidebarDepth) unchanged. Preserve YAML structure exactly. +- Preserve heading anchor IDs exactly as in English ({#anchor-id}). +- Preserve all markdown formatting (headings, lists, bold, italic, blockquotes) and their indentation exactly. + +PLACEHOLDER RULES (critical): +This content has been pre-processed. Non-translatable elements have been replaced with placeholder tags. You MUST follow these rules: + +1. SELF-CLOSING placeholders -- preserve exactly, do not modify or translate: + = code block (will be restored) + = inline code (will be restored) + = image (will be restored) + = childless JSX component (will be restored) + +2. WRAPPER placeholders -- translate the text BETWEEN the tags, keep the tags themselves: + translate this text = markdown/text fence (translate the prose inside) + translate this text + translate this text + translate this text + You MAY reorder wrapper placeholders within a sentence to match natural ${targetLanguage} word order. + Do NOT split a wrapper pair or nest them differently. + +3. NEVER translate, modify, or remove any placeholder tag. Every placeholder in the source must appear exactly once in the output. +4. 
NEVER invent new placeholder tags. +5. The text between wrapper tags is the only translatable content associated with that placeholder.` +} + +function formatGlossary(terms: Map): string { + if (terms.size === 0) return "" + + const lines = Array.from(terms.entries()) + .map(([en, translated]) => `- "${en}" -> "${translated}"`) + .join("\n") + + return `Community-voted glossary (use these exact translations): +${lines}` +} + +/** + * Hints about what our post-processing sanitizer checks for. + * This helps Gemini avoid patterns we'd just fix afterward. + */ +function getSanitizerHints(): string { + return `Our automated sanitizer will check your output for: +- Brand names in frontmatter tags must stay Latin (Solidity, MetaMask, etc.) +- Ticker symbols (ETH, ERC-20) must stay Latin +- Internal hrefs must match English source exactly +- No translated heading anchor IDs +- No broken markdown link syntax +- Valid JSON structure (for JSON files) +Getting these right the first time avoids post-processing corrections.` +} diff --git a/src/scripts/intl-pipeline/lib/shared-patterns.ts b/src/scripts/intl-pipeline/lib/shared-patterns.ts new file mode 100644 index 00000000000..80b81dbf64d --- /dev/null +++ b/src/scripts/intl-pipeline/lib/shared-patterns.ts @@ -0,0 +1,105 @@ +/** + * Shared regex patterns and heuristics for the i18n pipeline. + * + * Single source of truth for patterns used across: + * - content-normalizer.ts + * - code-block-extractor.ts + * - manifest-generator.ts + * - jsx-attributes/ + * + * All regex constants use the _RE suffix by convention. + */ + +// --------------------------------------------------------------------------- +// Fenced code blocks +// --------------------------------------------------------------------------- + +/** + * Matches ``` or ~~~ fenced code blocks with optional language tag. + * Handles both non-empty content and empty blocks (just fences, no body). + * + * WARNING: This regex uses the /g flag. 
When used with String.replace() + * this is safe (replace resets lastIndex). When used with RegExp.exec() + * in a loop, reset lastIndex to 0 before each new input string. + * + * Capture groups (non-empty block): + * 1: indent, 2: fence chars, 3: language tag, 4: content + * Capture groups (empty block): + * 5: indent, 6: fence chars, 7: language tag + */ +export const FENCED_BLOCK_RE = + /^([ \t]*)(```|~~~)([^\n]*)\n([\s\S]*?)\n\1\2[ \t]*$|^([ \t]*)(```|~~~)([^\n]*)\n\5\6[ \t]*$/gm + +// --------------------------------------------------------------------------- +// JSX/HTML attributes +// --------------------------------------------------------------------------- + +/** + * Matches JSX/HTML-style attributes with quoted values. + * Handles both double and single quotes, with escape sequences. + * + * Capture groups: + * 1: attribute name, 2: double-quoted value, 3: single-quoted value + */ +export const ATTRIBUTE_RE = + /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g + +/** + * Attributes that contain human-readable text requiring translation. + * Used by both the JSX attribute translator and the content normalizer. + */ +export const TRANSLATABLE_ATTRIBUTES = [ + "title", + "description", + "alt", + "label", + "aria-label", + "placeholder", + "buttonLabel", + "name", + "caption", + "contentPreview", + "location", +] as const + +export type TranslatableAttribute = (typeof TRANSLATABLE_ATTRIBUTES)[number] + +// --------------------------------------------------------------------------- +// Translatable value heuristic +// --------------------------------------------------------------------------- + +/** + * Check if a string value looks like human-readable text that should + * be translated, as opposed to a URL, path, variable, or identifier. + * + * Used for filtering JSX attribute values before sending to Gemini. 
+ */ +export function isTranslatableValue(value: string): boolean { + if (!value || value.length < 3) return false + // Any URI scheme (http, https, ftp, mailto, data, javascript, etc.) + if (/^[a-zA-Z][\w+.-]*:/.test(value)) return false + // Absolute or relative paths + if (/^[/.]/.test(value)) return false + // Path-like values with slashes but no spaces (e.g., "images/foo") + if (/\//.test(value) && !/\s/.test(value)) return false + // File extensions + if (/\.(png|jpg|svg|gif|json|md|webp|css|js|ts)$/i.test(value)) return false + // JSX expressions / template variables + if (/^\{.*\}$/.test(value)) return false + // Single-word identifiers (camelCase, kebab-case, PascalCase) + if (/^[a-zA-Z][a-zA-Z0-9-]*$/.test(value) && !value.includes(" ")) + return false + // Numeric-only values + if (/^[\d.,\s%$]+$/.test(value)) return false + return true +} + +// --------------------------------------------------------------------------- +// Frontmatter +// --------------------------------------------------------------------------- + +/** + * Matches YAML frontmatter at the start of a markdown file. + * Capture group 1 is the entire frontmatter block including delimiters. 
+ */ +export const FRONTMATTER_RE = /^(---\n[\s\S]*?\n---\n)/ diff --git a/src/scripts/intl-pipeline/lib/types.ts b/src/scripts/intl-pipeline/lib/types.ts new file mode 100644 index 00000000000..bb005a13f9c --- /dev/null +++ b/src/scripts/intl-pipeline/lib/types.ts @@ -0,0 +1,114 @@ +// Types for intl-pipeline GitHub and file operations + +/** + * GET https://api.github.com/search/code + */ +export type GHOwner = { + login: string + id: number + node_id: string + avatar_url: string + gravatar_id: string + url: string + html_url: string + followers_url: string + following_url: string + gists_url: string + starred_url: string + subscriptions_url: string + organizations_url: string + repos_url: string + events_url: string + received_events_url: string + type: string + user_view_type: string + site_admin: boolean +} + +export type GHRepository = { + id: number + node_id: string + name: string + full_name: string + private: boolean + owner: GHOwner + html_url: string + description: string | null + fork: boolean + url: string + forks_url: string + keys_url: string + collaborators_url: string + teams_url: string + hooks_url: string + issue_events_url: string + events_url: string + assignees_url: string + branches_url: string + tags_url: string + blobs_url: string + git_tags_url: string + git_refs_url: string + trees_url: string + statuses_url: string + languages_url: string + stargazers_url: string + contributors_url: string + subscribers_url: string + subscription_url: string + commits_url: string + git_commits_url: string + comments_url: string + issue_comment_url: string + contents_url: string + compare_url: string + merges_url: string + archive_url: string + downloads_url: string + issues_url: string + pulls_url: string + milestones_url: string + notifications_url: string + labels_url: string + releases_url: string + deployments_url: string +} + +export type GitHubQueryResponseItem = { + name: string + path: string + sha: string + url: string + git_url: string + 
html_url: string + repository: GHRepository + score: number +} + +// Optional: the whole response is an array of items +export type GitHubQueryResponse = GitHubQueryResponseItem[] + +export type ContentType = + | "application/json" + | "text/markdown" + | "application/octet-stream" + +export type GitHubFileMetadata = { + fileName: string + filePath: string // e.g., src/intl/en/page-layer-2-networks.json (no leading slash) + download_url: string + "Content-Type": ContentType +} + +export type BranchObject = { + sha: string + type: string // e.g. "commit" + url: string +} + +export type BranchDetailsResponse = { + ref: string // e.g. "refs/heads/dev" + node_id: string + url: string + object: BranchObject +} diff --git a/src/scripts/intl-pipeline/lib/utils/branch-naming.ts b/src/scripts/intl-pipeline/lib/utils/branch-naming.ts new file mode 100644 index 00000000000..40c90708b4a --- /dev/null +++ b/src/scripts/intl-pipeline/lib/utils/branch-naming.ts @@ -0,0 +1,17 @@ +/** + * Temp branch naming for pipeline runs. + * See CONCURRENCY-SPEC.md Part 3. + */ + +/** + * Generate a temp branch name: tmp-intl/run-MMDD-HHMM + * Uses UTC time to avoid timezone ambiguity. 
+ */ +export function generateTempBranchName(): string { + const now = new Date() + const mm = String(now.getUTCMonth() + 1).padStart(2, "0") + const dd = String(now.getUTCDate()).padStart(2, "0") + const hh = String(now.getUTCHours()).padStart(2, "0") + const min = String(now.getUTCMinutes()).padStart(2, "0") + return `tmp-intl/run-${mm}${dd}-${hh}${min}` +} diff --git a/src/scripts/i18n/lib/utils/fetch.ts b/src/scripts/intl-pipeline/lib/utils/fetch.ts similarity index 100% rename from src/scripts/i18n/lib/utils/fetch.ts rename to src/scripts/intl-pipeline/lib/utils/fetch.ts diff --git a/src/scripts/i18n/lib/ai/rate-limiter.ts b/src/scripts/intl-pipeline/lib/utils/rate-limiter.ts similarity index 100% rename from src/scripts/i18n/lib/ai/rate-limiter.ts rename to src/scripts/intl-pipeline/lib/utils/rate-limiter.ts diff --git a/src/scripts/intl-pipeline/lib/utils/task-pool.ts b/src/scripts/intl-pipeline/lib/utils/task-pool.ts new file mode 100644 index 00000000000..041ec3e70bb --- /dev/null +++ b/src/scripts/intl-pipeline/lib/utils/task-pool.ts @@ -0,0 +1,125 @@ +/** + * Concurrency task pool with per-language tracking. + * + * Wraps the rate limiter semaphore with: + * - Language-aware task submission + * - Per-language completion callbacks (fires once when all tasks for a language finish) + * - Token usage accumulation per language + * + * See CONCURRENCY-SPEC.md Part 1. 
+ */ + +import { createRateLimiter } from "./rate-limiter" + +export interface TokenUsage { + input: number + output: number +} + +export interface TaskResult { + tokens?: TokenUsage +} + +export interface TaskPoolOptions { + concurrency: number + delayBetweenMs?: number + onLanguageComplete?: (language: string, stats: LanguageStats) => void +} + +export interface LanguageStats { + tasksCompleted: number + totalInputTokens: number + totalOutputTokens: number +} + +export interface TaskPool { + submit(language: string, task: () => Promise): void + drain(): Promise + getStats(): Record + hasErrors(): boolean + getErrors(): Array<{ language: string; error: Error }> +} + +export function createTaskPool(options: TaskPoolOptions): TaskPool +export function createTaskPool(concurrency: number): TaskPool +export function createTaskPool( + optionsOrConcurrency: TaskPoolOptions | number +): TaskPool { + const opts: TaskPoolOptions = + typeof optionsOrConcurrency === "number" + ? { concurrency: optionsOrConcurrency } + : optionsOrConcurrency + + const limiter = createRateLimiter(opts.concurrency, opts.delayBetweenMs ?? 
0) + + const stats: Record = {} + const pendingByLanguage: Record = {} + const completedByLanguage: Set = new Set() + const allTasks: Promise[] = [] + const errors: Array<{ language: string; error: Error }> = [] + + function ensureLanguage(lang: string) { + if (!stats[lang]) { + stats[lang] = { + tasksCompleted: 0, + totalInputTokens: 0, + totalOutputTokens: 0, + } + pendingByLanguage[lang] = 0 + } + } + + function submit( + language: string, + task: () => Promise + ): void { + ensureLanguage(language) + pendingByLanguage[language]++ + + const wrapped = (async () => { + await limiter.acquire() + try { + const result = await task() + if (result?.tokens) { + stats[language].totalInputTokens += result.tokens.input + stats[language].totalOutputTokens += result.tokens.output + } + stats[language].tasksCompleted++ + } catch (err) { + errors.push({ + language, + error: err instanceof Error ? err : new Error(String(err)), + }) + } finally { + limiter.release() + pendingByLanguage[language]-- + + if ( + pendingByLanguage[language] === 0 && + !completedByLanguage.has(language) + ) { + completedByLanguage.add(language) + opts.onLanguageComplete?.(language, { ...stats[language] }) + } + } + })() + + allTasks.push(wrapped) + } + + async function drain(): Promise { + await Promise.all(allTasks) + } + + function getStats(): Record { + return { ...stats } + } + + return { + submit, + drain, + getStats, + hasErrors: () => errors.length > 0, + getErrors: () => [...errors], + } +} diff --git a/src/scripts/intl-pipeline/lib/workflows/pr-creation.ts b/src/scripts/intl-pipeline/lib/workflows/pr-creation.ts new file mode 100644 index 00000000000..377ac6dd8e9 --- /dev/null +++ b/src/scripts/intl-pipeline/lib/workflows/pr-creation.ts @@ -0,0 +1,136 @@ +// PR creation and update workflow phase + +import { config } from "../../config" +import { + findOpenPR, + postPullRequest, + updatePRBody, +} from "../github/pull-requests" + +import type { CommittedFile, LanguagePair } from "./types" 
+import { logSection } from "./utils" + +/** + * Generate PR title based on language count + */ +export function generatePRTitle( + langCodes: string[], + allPossibleLanguages: string[] +): string { + let prTitle = "i18n: intl-pipeline translations" + + if (langCodes.length <= 3) { + prTitle += ` (${langCodes.join(", ")})` + } else if (langCodes.length === allPossibleLanguages.length) { + prTitle += " (all languages)" + } else { + prTitle += " (multiple languages)" + } + + return prTitle +} + +/** + * Generate the initial PR body (used only on first creation) + */ +function generateInitialPRBody(): string { + return [ + "## Automated Translations", + "", + "This PR contains translations managed by the intl pipeline.", + "Each run appends a summary below.", + "", + ].join("\n") +} + +/** + * Generate a run summary to append to the PR body + */ +export function generateRunSummary( + langCodes: string[], + committedFiles: CommittedFile[], + mode: string, + workflowRunUrl?: string +): string { + const now = new Date().toISOString().replace("T", " ").slice(0, 19) + " UTC" + + const jsonCount = committedFiles.filter((f) => + f.path.endsWith(".json") + ).length + const mdCount = committedFiles.filter((f) => f.path.endsWith(".md")).length + + const parts = [ + "---", + `### Run: ${now}`, + `- Languages: ${langCodes.join(", ")}`, + `- Files: ${committedFiles.length} (${mdCount} MD, ${jsonCount} JSON)`, + `- Mode: ${mode}`, + ] + + if (workflowRunUrl) { + parts.push(`- [View workflow run](${workflowRunUrl})`) + } + + parts.push("") + return parts.join("\n") +} + +/** + * Build workflow run URL from GitHub environment variables + */ +function getWorkflowRunUrl(): string | undefined { + const serverUrl = process.env.GITHUB_SERVER_URL + const repository = process.env.GITHUB_REPOSITORY + const runId = process.env.GITHUB_RUN_ID + + if (serverUrl && repository && runId) { + return `${serverUrl}/${repository}/actions/runs/${runId}` + } + return undefined +} + +/** + * Create or 
update a translation PR. + * + * - If no open PR exists for targetBranch -> baseBranch: creates one + * - If an open PR exists: appends a run summary to the existing body + */ +export async function createOrUpdateTranslationPR( + branch: string, + committedFiles: CommittedFile[], + languagePairs: LanguagePair[], + mode: string +): Promise<{ number: number; html_url: string }> { + logSection("Pull Request") + + const langCodes = languagePairs.map((p) => p.internalLanguageCode) + const workflowRunUrl = getWorkflowRunUrl() + const runSummary = generateRunSummary( + langCodes, + committedFiles, + mode, + workflowRunUrl + ) + + // Check for existing open PR + const existingPR = await findOpenPR(branch, config.baseBranch) + + if (existingPR) { + // Append run summary to existing PR body + const updatedBody = (existingPR.body || "") + "\n" + runSummary + await updatePRBody(existingPR.number, updatedBody) + console.log( + `[pr] Updated existing PR #${existingPR.number}: ${existingPR.html_url}` + ) + return existingPR + } + + // Create new PR + const prTitle = generatePRTitle(langCodes, config.allInternalCodes) + const prBody = generateInitialPRBody() + "\n" + runSummary + + const pr = await postPullRequest(branch, config.baseBranch, prTitle, prBody) + console.log(`[pr] Created PR #${pr.number}: ${pr.html_url}`) + + return pr +} diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts b/src/scripts/intl-pipeline/lib/workflows/sanitization.ts similarity index 54% rename from src/scripts/i18n/lib/workflows/sanitization.ts rename to src/scripts/intl-pipeline/lib/workflows/sanitization.ts index 548871af2dc..b232653830f 100644 --- a/src/scripts/i18n/lib/workflows/sanitization.ts +++ b/src/scripts/intl-pipeline/lib/workflows/sanitization.ts @@ -1,35 +1,39 @@ -// Post-import sanitization workflow phase +// Translation output sanitization workflow phase -import { runSanitizer } from "../../post_import_sanitize" +import { runSanitizer } from "../../intl-sanitizer" import { 
batchCommitFiles, BatchFile } from "../github/commits" import type { CommittedFile } from "./types" import { debugLog, logSection } from "./utils" export interface SanitizationResult { - /** Files that were modified by the sanitizer */ changedFiles: CommittedFile[] - /** Total files processed */ totalProcessed: number } /** - * Run post-import sanitizer on committed files. - * Updates committedFiles in-place with sanitized content. + * Sanitize translation output and commit fixes. + * Syncs heading IDs with English, normalizes formatting, + * protects brand names, validates structure. */ -export async function runPostImportSanitization( +export async function sanitizeTranslations( committedFiles: CommittedFile[], - branch: string + branch: string, + englishContentMap?: Map ): Promise { - logSection("Running Post-Import Sanitizer") + logSection("Sanitizing Translation Output") - console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`) + console.log(`[sanitize] Processing ${committedFiles.length} files`) - const sanitizeResult = await runSanitizer(committedFiles) + const sanitizeResult = await runSanitizer( + committedFiles, + undefined, + englishContentMap + ) const changedFiles = sanitizeResult.changedFiles || [] if (changedFiles.length) { - console.log(`Sanitizer modified ${changedFiles.length} files`) + console.log(`[sanitize] Modified ${changedFiles.length} files`) const filesToCommit: BatchFile[] = [] @@ -39,7 +43,6 @@ export async function runPostImportSanitization( filesToCommit.push({ path: relPath, content: buf }) debugLog(`Will commit sanitized file: ${relPath}`) - // Update committedFiles with sanitized content for validation const existingFile = committedFiles.find((f) => f.path === relPath) if (existingFile) { existingFile.content = file.content @@ -50,14 +53,14 @@ export async function runPostImportSanitization( await batchCommitFiles( filesToCommit, branch, - `i18n: post-import sanitization` + `i18n: sanitize translation output` ) 
- console.log(`✓ Committed ${changedFiles.length} sanitized files`) + console.log(`[sanitize] Committed ${changedFiles.length} sanitized files`) } catch (e) { - console.warn(`Failed to commit sanitized files:`, e) + console.warn(`[sanitize] Failed to commit:`, e) } } else { - console.log("No sanitization changes needed") + console.log("[sanitize] No changes needed") } return { diff --git a/src/scripts/intl-pipeline/lib/workflows/types.ts b/src/scripts/intl-pipeline/lib/workflows/types.ts new file mode 100644 index 00000000000..f0fd0aba1cc --- /dev/null +++ b/src/scripts/intl-pipeline/lib/workflows/types.ts @@ -0,0 +1,35 @@ +// Types for intl-pipeline workflow phases + +/** + * File committed to GitHub branch + */ +export interface CommittedFile { + path: string + content: string +} + +/** + * Language pair mapping + */ +export interface LanguagePair { + internalLanguageCode: string + languageName: string +} + +/** + * Pull request data + */ +export interface PullRequest { + html_url: string + number: number +} + +/** + * Result of processing a single language in split-PR mode + */ +export interface SplitPRResult { + language: string + status: "success" | "failed" + prUrl?: string + error?: string +} diff --git a/src/scripts/i18n/lib/workflows/utils.ts b/src/scripts/intl-pipeline/lib/workflows/utils.ts similarity index 100% rename from src/scripts/i18n/lib/workflows/utils.ts rename to src/scripts/intl-pipeline/lib/workflows/utils.ts diff --git a/src/scripts/intl-pipeline/main.ts b/src/scripts/intl-pipeline/main.ts new file mode 100644 index 00000000000..2edcc9f657a --- /dev/null +++ b/src/scripts/intl-pipeline/main.ts @@ -0,0 +1,790 @@ +/** + * Incremental Translation Pipeline -- Entry Point + * + * Modes: + * "full" -- Translate entire files from scratch via Gemini + * "auto" -- Detect drift since last run; propagate inert changes by script, + * send only changed prose to Gemini (default) + * + * Environment variables: see config.ts + */ + +import { execFileSync } 
from "child_process" +import * as fs from "fs" +import * as path from "path" + +import { + diff, + extractChanges, + parseJson, + parseMarkdown, +} from "intl-content-tree" + +import i18nConfig from "../../../i18n.config.json" + +import { + ensureStagingBranch, + getBranchObject, + mergeBranchInto, +} from "./lib/github/branches" +import { getDestinationFromPath, SharedCommitter } from "./lib/github/commits" +import { + callGeminiRaw, + isGeminiAvailable, + translateFile, +} from "./lib/llm/gemini" +import { + batchSections, + buildIncrementalPrompt, + buildSectionList, + extractJsonSections, + extractSections, + parseIncrementalResponse, +} from "./lib/llm/incremental-translate" +import { + buildJsonManifest, + buildLocaleTranslationManifest, + buildMarkdownManifest, + extractPlaceholderData, + hasEnglishChanged, + parseEnglishJson, +} from "./lib/llm/manifest-adapter" +import { generateTempBranchName } from "./lib/utils/branch-naming" +import type { TaskResult } from "./lib/utils/task-pool" +import { createTaskPool } from "./lib/utils/task-pool" +import { createOrUpdateTranslationPR } from "./lib/workflows/pr-creation" +import { sanitizeTranslations } from "./lib/workflows/sanitization" +import { logSection } from "./lib/workflows/utils" +import { + config, + GEMINI_MODELS, + GLOSSARY_API_URL, + validateTargetPath, +} from "./config" +import type { LlmTranslator } from "./pipeline" +import { pipeline, PIPELINE_CONFIG } from "./pipeline" + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface FileContext { + path: string + content: string + type: "markdown" | "json" +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function log(msg: string) { + console.log(`[pipeline] ${msg}`) +} + +function 
readSourceManifestPath( + destPath: string, + fileType: string, + locale: string +): string { + if (fileType === "markdown") { + return path.join( + process.cwd(), + path.dirname(destPath), + ".manifest-source.json" + ) + } + return path.join(process.cwd(), `src/intl/${locale}/.manifest-source.json`) +} + +/** + * Fetch glossary terms from ETHGlossary API, filtered to terms + * that appear in the source content for a given language. + * Returns Map for prompt injection. + * Includes term notes as parenthetical context when available. + */ +async function loadGlossary( + fileContent: string, + locale: string +): Promise> { + try { + const res = await fetch(`${GLOSSARY_API_URL}/filter`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ content: fileContent, language: locale }), + }) + if (!res.ok) { + console.warn( + `[glossary] API returned ${res.status} for ${locale}, continuing without glossary` + ) + return new Map() + } + const data = (await res.json()) as { + terms: Array<{ + english: string + translation: string + note?: string + }> + } + const map = new Map() + for (const term of data.terms) { + // Sanitize note to prevent prompt injection (strip control chars, limit length) + // eslint-disable-next-line no-control-regex + const controlCharRe = new RegExp("[\\u0000-\\u001f]", "g") + const safeNote = term.note + ? term.note.replace(controlCharRe, "").slice(0, 200) + : "" + const value = safeNote + ? `${term.translation} (${safeNote})` + : term.translation + map.set(term.english, value) + } + return map + } catch (err) { + console.warn( + `[glossary] Failed to fetch for ${locale}: ${err instanceof Error ? 
err.message : String(err)}` + ) + return new Map() + } +} + +function readLocalePath( + destPath: string, + fileType: string, + locale: string, + fileName: string +): string { + if (fileType === "markdown") { + return path.join(process.cwd(), destPath) + } + return path.join(process.cwd(), `src/intl/${locale}/${fileName}`) +} + +function printTokenSummary( + stats: Record< + string, + { + totalInputTokens: number + totalOutputTokens: number + tasksCompleted: number + } + >, + pipelineDurationMs: number +) { + logSection("Token Usage Summary") + + const fmt = (n: number) => n.toLocaleString("en-US") + const pad = (s: string, w: number) => s.padStart(w) + + console.log( + `${"Language".padEnd(10)}| ${"Calls".padStart(5)} | ${"Input".padStart(10)} | ${"Output".padStart(10)} | ${"Total".padStart(10)}` + ) + const sep = `${"-".repeat(10)}|${"-".repeat(7)}|${"-".repeat(12)}|${"-".repeat(12)}|${"-".repeat(12)}` + console.log(sep) + + let grandInput = 0 + let grandOutput = 0 + let grandCalls = 0 + + for (const [lang, s] of Object.entries(stats)) { + const total = s.totalInputTokens + s.totalOutputTokens + grandInput += s.totalInputTokens + grandOutput += s.totalOutputTokens + grandCalls += s.tasksCompleted + + console.log( + `${lang.padEnd(10)}| ${pad(String(s.tasksCompleted), 5)} | ${pad(fmt(s.totalInputTokens), 10)} | ${pad(fmt(s.totalOutputTokens), 10)} | ${pad(fmt(total), 10)}` + ) + } + + console.log(sep) + const grandTotal = grandInput + grandOutput + console.log( + `${"TOTAL".padEnd(10)}| ${pad(String(grandCalls), 5)} | ${pad(fmt(grandInput), 10)} | ${pad(fmt(grandOutput), 10)} | ${pad(fmt(grandTotal), 10)}` + ) + + // Approximate cost (Gemini 3.1 Pro standard tier, <=200k prompts) + // https://ai.google.dev/gemini-api/docs/pricing (as of 11-April-2026) + const INPUT_RATE = 2.0 + const OUTPUT_RATE = 12.0 + const estCost = + (grandInput / 1_000_000) * INPUT_RATE + + (grandOutput / 1_000_000) * OUTPUT_RATE + + const pipelineSecs = (pipelineDurationMs / 
1000).toFixed(1) + console.log( + `\n Estimated cost: ~$${estCost.toFixed(4)} (${GEMINI_MODELS[0]}: $${INPUT_RATE}/1M input, $${OUTPUT_RATE}/1M output)` + ) + console.log(` Wall time: ${pipelineSecs}s`) +} + +/** + * Build an LLM translator that batches section translations via Gemini. + * Uses batchSections for byte-size-aware splitting of large section lists. + */ +async function buildGeminiTranslator( + englishContent: string, + localeContent: string, + fileType: "markdown" | "json", + filePath: string, + locale: string, + sectionIds: string[] +): Promise<{ + translator: LlmTranslator + tokens: { input: number; output: number } +}> { + if (sectionIds.length === 0) { + return { + translator: (_, content) => content, + tokens: { input: 0, output: 0 }, + } + } + + const englishSections = + fileType === "json" + ? extractJsonSections(englishContent) + : extractSections(englishContent) + const localeSections = + fileType === "json" + ? extractJsonSections(localeContent) + : extractSections(localeContent) + + const sectionList = buildSectionList( + englishSections, + localeSections, + sectionIds + ) + const translateCount = sectionList.filter( + (s) => s.action === "TRANSLATE" + ).length + + if (translateCount === 0) { + log(` No sections matched for translation`) + return { + translator: (_, content) => content, + tokens: { input: 0, output: 0 }, + } + } + + const langEntry = i18nConfig.find((l: { code: string }) => l.code === locale) + const languageName = langEntry + ? 
(langEntry as { code: string; name: string }).name + : locale + + const glossaryTerms = await loadGlossary(englishContent, locale) + if (config.verbose && glossaryTerms.size > 0) { + log(` Glossary: ${glossaryTerms.size} terms for ${locale}`) + } + + // Split into batches if needed (byte-size-aware) + const batches = batchSections( + sectionList.map((s) => ({ + id: s.id, + content: s.content || "", + action: s.action, + })) + ) + + const allTranslations: Record = {} + let totalInput = 0 + let totalOutput = 0 + + for (const batch of batches) { + const batchSectionList = sectionList.filter((s) => + batch.some((b) => b.id === s.id) + ) + + const prompt = buildIncrementalPrompt({ + filePath, + targetLanguage: locale, + languageName, + sections: batchSectionList, + glossaryTerms, + }) + + log( + ` Calling Gemini: ${batchSectionList.filter((s) => s.action === "TRANSLATE").length} sections, ${prompt.length} chars` + ) + + const result = await callGeminiRaw(prompt, { + filePath, + targetLanguage: locale, + label: "incremental", + }) + + try { + const translations = parseIncrementalResponse(result.text) + Object.assign(allTranslations, translations) + } catch (err) { + console.warn( + `[pipeline] Failed to parse batch response for ${locale} (${err instanceof Error ? err.message : String(err)}). Continuing with partial translations.` + ) + } + totalInput += result.tokensUsed.input + totalOutput += result.tokensUsed.output + } + + const translatedIds = Object.keys(allTranslations) + log( + ` Gemini returned ${translatedIds.length} sections (${totalInput} in, ${totalOutput} out)` + ) + + for (const id of sectionIds) { + if (!allTranslations[id]) { + console.warn(` Section "${id}" not returned by Gemini`) + } + } + + return { + translator: (sectionId: string, englishFallback: string) => { + return allTranslations[sectionId] || englishFallback + }, + tokens: { input: totalInput, output: totalOutput }, + } +} + +/** + * Identify which sections need LLM translation. 
+ */ +function getLlmSectionIds( + englishA: string, + englishB: string, + fileType: "markdown" | "json" +): string[] { + const parse = fileType === "markdown" ? parseMarkdown : parseJson + const treeA = parse(englishA, PIPELINE_CONFIG) + const treeB = parse(englishB, PIPELINE_CONFIG) + const dr = diff(treeA, treeB) + const cs = extractChanges(treeA, treeB) + + const tdPaths = dr.translatableDrift.map((e: { path: string }) => e.path) + const leafTdPaths = tdPaths.filter( + (p: string) => + !tdPaths.some((o: string) => o !== p && o.startsWith(p + "/")) + ) + const leafTdIds = dr.translatableDrift + .filter((e: { path: string }) => leafTdPaths.includes(e.path)) + .map((e: { id: string }) => e.id) + .filter((id: string) => !id.startsWith("frontmatter:")) + + const renamedNewIds = new Set( + cs.sectionRenames.map((r: { newId: string }) => r.newId) + ) + const addedIds = dr.added + .filter((e: { id: string }) => !renamedNewIds.has(e.id)) + .map((e: { id: string }) => e.id) + + return [...leafTdIds, ...addedIds] +} + +// --------------------------------------------------------------------------- +// Full Translation +// --------------------------------------------------------------------------- + +async function runFullTranslation( + file: FileContext, + locale: string, + destPath: string, + committer: SharedCommitter, + baseBranchSha: string, + committedFiles: Array<{ path: string; content: string }> +): Promise { + log(`[${locale}] ${file.path}: full translation...`) + + const glossaryTerms = await loadGlossary(file.content, locale) + if (config.verbose && glossaryTerms.size > 0) { + log(`[${locale}] Glossary: ${glossaryTerms.size} terms`) + } + + const result = await translateFile({ + filePath: file.path, + fileContent: file.content, + fileType: file.type, + targetLanguage: locale, + glossaryTerms, + useNormalizer: file.type === "markdown", + }) + + log( + `[${locale}] ${file.path}: translated (${result.tokensUsed.input} in, ${result.tokensUsed.output} out)` + ) + + 
await committer.commitFile(destPath, result.translatedContent, locale) + committedFiles.push({ path: destPath, content: result.translatedContent }) + + // Build and commit source manifest + const sourceManifest = + file.type === "markdown" + ? buildMarkdownManifest(file.content, file.path, baseBranchSha) + : buildJsonManifest(file.content, file.path, baseBranchSha) + + if (file.type === "markdown") { + const manifestPath = destPath.replace(/index\.md$/, ".manifest-source.json") + await committer.commitFile(manifestPath, sourceManifest, locale) + + if (result.placeholderOrder && result.placeholderMap) { + const parsed = JSON.parse(sourceManifest) + const tm = buildLocaleTranslationManifest({ + locale, + englishManifestHash: parsed.rootHash, + placeholderOrder: result.placeholderOrder, + placeholderMap: result.placeholderMap, + sections: { + _all: { translatedAt: new Date().toISOString(), status: "success" }, + }, + }) + const tmPath = destPath.replace( + /index\.md$/, + ".manifest-translation.json" + ) + await committer.commitFile(tmPath, tm, locale) + } + } else { + const manifestPath = `src/intl/${locale}/.manifest-source.json` + await committer.commitFile(manifestPath, sourceManifest, locale) + + const placeholderData = + result.placeholderOrder && result.placeholderMap + ? 
/**
 * Incrementally update one locale file after an English change (A -> B).
 *
 * English-A is recovered with `git show <sourceCommitSha>:<path>`, where the
 * SHA comes from the stored source manifest. If the manifest cannot be parsed,
 * has no valid 40-hex SHA, or git cannot produce the old file, the task falls
 * back to `runFullTranslation`.
 *
 * @param file English source file (path, current content, type).
 * @param locale Internal language code being updated.
 * @param destPath Repository path of the locale file to write.
 * @param sourceManifestJson Raw JSON of the existing source manifest.
 * @param localeContent Current content of the locale file.
 * @param committer Committer used to write files to the working branch.
 * @param baseBranchSha SHA recorded in the refreshed source manifest.
 * @param committedFiles Shared accumulator; the updated locale file is appended.
 * @returns Gemini token usage for this task (zero when no LLM call was made).
 */
async function runIncremental(
  file: FileContext,
  locale: string,
  destPath: string,
  sourceManifestJson: string,
  localeContent: string,
  committer: SharedCommitter,
  baseBranchSha: string,
  committedFiles: Array<{ path: string; content: string }>
): Promise<{ tokens: { input: number; output: number } }> {
  let englishA: string

  try {
    // Parsed inside the try so a corrupt manifest degrades to a full
    // translation instead of failing the whole task.
    const manifest = JSON.parse(sourceManifestJson) as {
      sourceCommitSha?: unknown
    } | null
    const sha = manifest?.sourceCommitSha
    if (!sha) throw new Error("no sourceCommitSha in manifest")
    if (typeof sha !== "string" || !/^[0-9a-f]{40}$/i.test(sha))
      throw new Error(`invalid SHA: ${String(sha)}`)
    validateTargetPath(file.path)
    // execFileSync passes an argv array, so no shell is involved.
    englishA = execFileSync("git", ["show", `${sha}:${file.path}`], {
      encoding: "utf-8",
    })
  } catch (err) {
    log(
      `[${locale}] ${file.path}: cannot retrieve old English (${err instanceof Error ? err.message : String(err)}), falling back to full translation`
    )
    return runFullTranslation(
      file,
      locale,
      destPath,
      committer,
      baseBranchSha,
      committedFiles
    )
  }

  const englishB = file.content

  const llmSectionIds = getLlmSectionIds(englishA, englishB, file.type)
  log(
    `[${locale}] ${file.path}: ${llmSectionIds.length} section(s) need Gemini`
  )

  let translator: LlmTranslator | undefined
  let tokens = { input: 0, output: 0 }
  if (llmSectionIds.length > 0 && isGeminiAvailable()) {
    const geminiResult = await buildGeminiTranslator(
      englishB,
      localeContent,
      file.type,
      file.path,
      locale,
      llmSectionIds
    )
    translator = geminiResult.translator
    tokens = geminiResult.tokens
  } else if (llmSectionIds.length > 0) {
    // The pipeline still runs without a translator; make that visible in the
    // run log because the source manifest is stamped below either way.
    console.warn(
      `[${locale}] ${file.path}: Gemini unavailable; ${llmSectionIds.length} changed section(s) will not be re-translated`
    )
  }

  const result = pipeline(
    englishA,
    englishB,
    localeContent,
    file.type,
    translator
  )

  await committer.commitFile(destPath, result, locale)
  committedFiles.push({ path: destPath, content: result })

  // Refresh the source manifest so the next run diffs against English-B.
  const isMarkdown = file.type === "markdown"
  const sourceManifest = isMarkdown
    ? buildMarkdownManifest(englishB, file.path, baseBranchSha)
    : buildJsonManifest(englishB, file.path, baseBranchSha)
  const smPath = isMarkdown
    ? destPath.replace(/index\.md$/, ".manifest-source.json")
    : `src/intl/${locale}/.manifest-source.json`
  await committer.commitFile(smPath, sourceManifest, locale)

  log(`[${locale}] ${destPath}: committed (incremental)`)
  return { tokens }
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * Pipeline entry point.
 *
 * For every (English file x locale) pair this schedules a full translation,
 * an incremental update, or a manifest-only stamp on a task pool. All tasks
 * commit to a temporary branch, which is then squashed per language,
 * sanitized, merged into the target branch and (unless skipped) turned into
 * a PR.
 *
 * Stamp-only tasks never add to `committedFiles`, so they are counted
 * separately; otherwise their commits would stay on the temp branch and never
 * be squashed or merged.
 */
async function main(): Promise<void> {
  const startTime = Date.now()
  logSection("Incremental Translation Pipeline v5")

  if (!config.targetPaths.length) {
    console.error("[ERROR] TARGET_PATH is required")
    process.exit(1)
  }

  const targetLanguages = config.allInternalCodes
  const baseBranch = config.baseBranch
  const targetBranch = config.targetBranch

  log(`Target: ${targetBranch} (base: ${baseBranch})`)
  log(`Files: ${config.targetPaths.join(", ")}`)
  log(`Languages: ${targetLanguages.join(", ")}`)
  log(`Mode: ${config.mode}`)
  log(`Concurrency: ${config.concurrency}`)

  // Load English files from disk first, so an unreadable TARGET_PATH fails
  // before any branch has been created.
  const englishFiles: FileContext[] = config.targetPaths.map((fp) => ({
    path: fp,
    content: fs.readFileSync(path.resolve(fp), "utf-8"),
    type: fp.endsWith(".json") ? ("json" as const) : ("markdown" as const),
  }))

  // Create temp working branch for crash safety
  const tempBranch = generateTempBranchName()
  log(`Temp branch: ${tempBranch}`)
  await ensureStagingBranch(tempBranch, baseBranch)
  const baseBranchSha = (await getBranchObject(baseBranch)).sha
  const committer = new SharedCommitter(tempBranch)
  await committer.init()

  const committedFiles: Array<{ path: string; content: string }> = []
  // Manifests committed by stamp-only tasks (not tracked in committedFiles).
  let stampedManifests = 0

  // Build task pool with per-language completion logging
  const pool = createTaskPool({
    concurrency: config.concurrency,
    onLanguageComplete: (lang, stats) => {
      log(
        `[${lang}] Complete: ${stats.tasksCompleted} tasks, ${stats.totalInputTokens} input, ${stats.totalOutputTokens} output tokens`
      )
    },
  })

  // Submit all file x language tasks to the pool
  for (const file of englishFiles) {
    for (const locale of targetLanguages) {
      const destPath = getDestinationFromPath(file.path, locale)
      const smPath = readSourceManifestPath(destPath, file.type, locale)
      const localePath = readLocalePath(
        destPath,
        file.type,
        locale,
        path.basename(file.path)
      )

      const hasLocale = fs.existsSync(localePath)
      const hasManifest = fs.existsSync(smPath)

      if (config.mode === "full" || !hasLocale || !hasManifest) {
        const reason =
          config.mode === "full"
            ? "forced full"
            : !hasLocale
              ? "no locale file"
              : "no manifest"
        log(`[${locale}] ${file.path}: ${reason} -> full translation`)

        if (!isGeminiAvailable()) {
          console.warn(`[${locale}] Skipping: GEMINI_API_KEY not set`)
          continue
        }

        pool.submit(locale, () =>
          runFullTranslation(
            file,
            locale,
            destPath,
            committer,
            baseBranchSha,
            committedFiles
          )
        )
        continue
      }

      // Incremental: check if English changed
      const sourceManifestJson = fs.readFileSync(smPath, "utf-8")
      if (!hasEnglishChanged(file.content, sourceManifestJson, file.type)) {
        if (config.verbose) log(`[${locale}] ${file.path}: no changes`)
        continue
      }

      const localeContent = fs.readFileSync(localePath, "utf-8")

      if (config.stampOnly) {
        log(`[${locale}] ${file.path}: stamp only`)
        pool.submit(locale, async () => {
          const sourceManifest =
            file.type === "markdown"
              ? buildMarkdownManifest(file.content, file.path, baseBranchSha)
              : buildJsonManifest(file.content, file.path, baseBranchSha)
          const manifestDest =
            file.type === "markdown"
              ? destPath.replace(/index\.md$/, ".manifest-source.json")
              : `src/intl/${locale}/.manifest-source.json`
          await committer.commitFile(manifestDest, sourceManifest, locale)
          stampedManifests++
        })
        continue
      }

      pool.submit(locale, () =>
        runIncremental(
          file,
          locale,
          destPath,
          sourceManifestJson,
          localeContent,
          committer,
          baseBranchSha,
          committedFiles
        )
      )
    }
  }

  // Wait for all tasks to complete
  await pool.drain()

  // Check for task failures
  if (pool.hasErrors()) {
    const errors = pool.getErrors()
    console.error(`[pipeline] ${errors.length} task(s) failed:`)
    for (const { language, error } of errors) {
      console.error(` [${language}] ${error.message}`)
    }
    throw new Error(
      `Pipeline aborted: ${errors.length} translation task(s) failed. Temp branch ${tempBranch} preserved with partial progress.`
    )
  }

  // True when anything at all was committed to the temp branch.
  const hasCommits = committedFiles.length > 0 || stampedManifests > 0

  // Squash interleaved commits into one per language
  if (hasCommits) {
    await committer.squashByLanguage()
  }

  // Post-processing: sanitize Gemini output (translated files only)
  if (committedFiles.length > 0 && !config.stampOnly) {
    const englishContentMap = new Map(
      englishFiles.map((f) => [f.path, f.content])
    )
    try {
      await sanitizeTranslations(committedFiles, tempBranch, englishContentMap)
    } catch (error) {
      console.warn(
        `[pipeline] Sanitization failed (non-fatal): ${error instanceof Error ? error.message : String(error)}`
      )
    }
  }

  // Merge temp branch into target branch
  if (hasCommits) {
    log(`Merging ${tempBranch} -> ${targetBranch}`)
    await ensureStagingBranch(targetBranch, baseBranch)
    const merged = await mergeBranchInto(tempBranch, targetBranch)
    if (!merged) {
      throw new Error(
        `Failed to merge ${tempBranch} into ${targetBranch}. Temp branch preserved for manual resolution.`
      )
    }
    log(`Merged successfully`)
  } else {
    log(`No changes to merge`)
  }

  // Create or update PR unless skipped
  if (hasCommits && !config.skipPr) {
    const languagePairs = targetLanguages.map((code) => {
      const entry = i18nConfig.find((l: { code: string }) => l.code === code)
      return {
        internalLanguageCode: code,
        languageName: entry
          ? (entry as { code: string; name: string }).name
          : code,
      }
    })
    try {
      await createOrUpdateTranslationPR(
        targetBranch,
        committedFiles,
        languagePairs,
        config.mode
      )
    } catch (error) {
      console.warn(
        `[pipeline] PR creation failed (non-fatal): ${error instanceof Error ? error.message : String(error)}`
      )
    }
  }

  // Print token summary from pool stats
  const poolStats = pool.getStats()
  if (Object.keys(poolStats).length > 0) {
    printTokenSummary(poolStats, Date.now() - startTime)
  }

  logSection("Complete")
  log(`Finished in ${((Date.now() - startTime) / 1000).toFixed(1)}s`)
}

// Any unhandled failure is reported with its stack and a non-zero exit code.
main().catch((error) => {
  console.error("\n========== ERROR ==========")
  console.error(error instanceof Error ? error.message : String(error))
  if (error instanceof Error && error.stack) console.error(error.stack)
  process.exit(1)
})
// ---------------------------------------------------------------------------
// Text helpers (fence-aware)
// ---------------------------------------------------------------------------

/** Escape every RegExp metacharacter in `s` so it can be embedded literally in a pattern. */
export function escapeRegex(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
}

/**
 * Offset of the next Markdown heading line after the line containing
 * `startFrom`, ignoring heading-looking lines inside ``` fences.
 *
 * @returns The offset of the heading's first character, or `text.length`
 *   when no further heading exists.
 */
function findNextHeading(text: string, startFrom: number): number {
  const [firstLine = "", ...rest] = text.slice(startFrom).split("\n")
  // The first piece is the remainder of the current line; it is never a candidate.
  let pos = startFrom + firstLine.length + 1
  let inFence = false
  for (const line of rest) {
    if (line.startsWith("```")) inFence = !inFence
    if (!inFence && /^#{1,6}\s/.test(line)) return pos
    pos += line.length + 1
  }
  return text.length
}

/**
 * Locate the section whose heading carries the custom id `{#sectionId}`.
 *
 * The section runs from the start of its heading line up to the next heading
 * of any level (or the end of the text). Headings inside ``` fences are
 * skipped, matching `getSectionOrder` and `findNextHeading`; previously a
 * fenced example line such as `## Foo {#id}` could be returned as the section.
 *
 * @returns `start`/`end` offsets into `text` plus the heading line itself, or
 *   `null` when no such heading exists outside a code fence.
 */
export function findSection(
  text: string,
  sectionId: string
): { start: number; end: number; headingLine: string } | null {
  // Built once; applied per line, so no multiline flag is needed.
  const headingPattern = new RegExp(
    `^#{1,6}\\s+[^\\n]*\\{#${escapeRegex(sectionId)}\\}[^\\n]*`
  )
  let inFence = false
  let offset = 0
  for (const line of text.split("\n")) {
    if (line.startsWith("```")) {
      inFence = !inFence
    } else if (!inFence) {
      const match = line.match(headingPattern)
      if (match) {
        const afterHeading = offset + match[0].length
        return {
          start: offset,
          end: findNextHeading(text, afterHeading),
          headingLine: match[0],
        }
      }
    }
    offset += line.length + 1
  }
  return null
}

/**
 * Custom heading ids (`{#id}`) of all level 2-6 headings, in document order.
 * Level-1 headings and anything inside ``` fences are ignored.
 */
export function getSectionOrder(text: string): string[] {
  const ids: string[] = []
  let inFence = false
  for (const line of text.split("\n")) {
    if (line.startsWith("```")) inFence = !inFence
    if (inFence) continue
    const id = line.match(/^#{2,6}\s+[^\n]*\{#([^}]+)\}/)?.[1]
    if (id !== undefined) ids.push(id)
  }
  return ids
}

/**
 * Split a document into its leading `---` frontmatter and the remaining body.
 *
 * @returns `yaml` (text between the delimiters), `body` (everything after the
 *   closing `---`, including its leading newline) and the `start`/`end`
 *   offsets of the frontmatter block; `null` when the text does not start
 *   with `---` or has no closing delimiter.
 */
function parseFrontmatter(
  text: string
): { yaml: string; body: string; start: number; end: number } | null {
  if (!text.startsWith("---")) return null
  const endIdx = text.indexOf("\n---", 3)
  if (endIdx === -1) return null
  // "\n---" is four characters long.
  const yamlEnd = endIdx + 4
  return {
    // slice(4) skips the opening "---\n".
    yaml: text.slice(4, endIdx),
    body: text.slice(yamlEnd),
    start: 0,
    end: yamlEnd,
  }
}
// ---------------------------------------------------------------------------
// JSON Pipeline
// ---------------------------------------------------------------------------

/**
 * Produce the updated locale JSON for an English change A -> B.
 *
 * Walks the top-level keys of English-B and decides per key, based on the
 * content-tree diff:
 *  - added key: translate with `llm`, or copy the English value without one;
 *  - nested object with translatable drift: translate the whole object, or
 *    (without `llm`) patch non-translatable sub-values in the locale copy;
 *  - unchanged: keep the locale value;
 *  - translatable drift: translate, or keep the locale value without `llm`;
 *  - inert drift: apply ICU variable renames and href updates to the locale
 *    string deterministically.
 *
 * @param englishA Previous English JSON text.
 * @param englishB Current English JSON text.
 * @param localeA Current locale JSON text.
 * @param config Content-tree parser configuration.
 * @param llm Optional translator; when omitted no text is sent for translation.
 * @returns Pretty-printed JSON (2-space indent) with a trailing newline.
 */
function pipelineJson(
  englishA: string,
  englishB: string,
  localeA: string,
  config: Partial<ContentTreeConfig>,
  llm?: LlmTranslator
): string {
  const enB = JSON.parse(englishB) as Record<string, unknown>
  const locA = JSON.parse(localeA) as Record<string, unknown>

  const treeA = parseJson(englishA, config)
  const treeB = parseJson(englishB, config)
  const dr = diff(treeA, treeB)
  const cs = extractChanges(treeA, treeB)

  const unchangedIds = new Set(dr.unchanged.map((e) => e.id))
  const inertIds = new Set(dr.inertDrift.map((e) => e.id))
  const translatableIds = new Set(dr.translatableDrift.map((e) => e.id))
  const addedIds = new Set(dr.added.map((e) => e.id))
  const removedIds = new Set(dr.removed.map((e) => e.id))

  // String(obj) would hand the translator "[object Object]"; serialize
  // structured values as JSON and pass plain strings through untouched.
  const asLlmInput = (value: unknown): string =>
    typeof value === "string" ? value : JSON.stringify(value)

  const result: Record<string, unknown> = {}

  for (const key of Object.keys(enB)) {
    if (removedIds.has(key)) continue

    if (addedIds.has(key)) {
      result[key] = llm ? llm(key, asLlmInput(enB[key])) : enB[key]
      continue
    }

    // Nested objects
    if (
      typeof enB[key] === "object" &&
      enB[key] !== null &&
      !Array.isArray(enB[key])
    ) {
      if (translatableIds.has(key)) {
        if (llm) {
          result[key] = llm(key, asLlmInput(enB[key]))
        } else {
          const nested = { ...(locA[key] as Record<string, unknown>) }
          for (const change of cs.changes) {
            if (
              !change.path.startsWith(key + "/") ||
              change.action !== "update"
            )
              continue
            if (change.contentType === "translatable") continue
            const subKey = change.path.split("/").pop()
            const { oldValue, newValue } = change
            if (subKey && subKey in nested && oldValue && newValue) {
              // Function replacer: the new value is inserted literally, so
              // "$&" / "$1" inside it are not treated as replacement patterns.
              nested[subKey] = String(nested[subKey]).replace(
                oldValue,
                () => newValue
              )
            }
          }
          result[key] = nested
        }
        continue
      }
      result[key] = locA[key] ?? enB[key]
      continue
    }

    if (unchangedIds.has(key)) {
      result[key] = locA[key]
      continue
    }

    if (translatableIds.has(key)) {
      result[key] = llm ? llm(key, asLlmInput(enB[key])) : (locA[key] ?? enB[key])
      continue
    }

    if (inertIds.has(key)) {
      let value = String(locA[key] ?? enB[key])
      const keyChanges = cs.changes.filter(
        (c) => c.path.startsWith(key + "/") || c.path === key
      )
      for (const change of keyChanges) {
        const { oldValue, newValue } = change
        if (
          change.action !== "update" ||
          oldValue === undefined ||
          newValue === undefined
        )
          continue

        if (change.elementType === "icu-variable") {
          // Rename `{old}` / `{old, plural, ...}` to the new variable name.
          const oldVarName = oldValue.match(/^\{(\w+)/)?.[1]
          const newVarName = newValue.match(/^\{(\w+)/)?.[1]
          if (oldVarName && newVarName) {
            value = value.replace(
              new RegExp(`\\{${escapeRegex(oldVarName)}([,}])`, "g"),
              `{${newVarName}$1`
            )
          }
        } else if (change.key === "href") {
          // Literal insertion, see the note on function replacers above.
          value = value.replace(`"${oldValue}"`, () => `"${newValue}"`)
        }
      }
      result[key] = value
      continue
    }

    result[key] = locA[key] ?? enB[key]
  }

  return JSON.stringify(result, null, 2) + "\n"
}
Remove deleted components (e.g., ) + for (const removed of dr.removed) { + if (renamedOldIds.has(removed.id)) continue + if (removed.id.startsWith("component:")) { + result = result.replace(/\n*\s*\n*/g, "\n\n") + } + } + + // 3c. Apply inert value updates from extractChanges + for (const change of cs.changes) { + if (change.action !== "update") continue + if (change.oldValue === undefined || change.newValue === undefined) continue + + // Check if this change is inside an LLM section + let belongsToLlmSection = false + for (const id of llmSectionIds) { + if ( + change.path.includes("/" + id + "/") || + change.path.startsWith(id + "/") + ) { + belongsToLlmSection = true + break + } + } + for (const p of leafTdPaths) { + if (change.path.startsWith(p + "/") || change.path === p) { + belongsToLlmSection = true + break + } + } + + // Skip changes inside LLM sections when mock is provided + if (belongsToLlmSection && llm) continue + + // Skip translatable changes without LLM + if (change.contentType === "translatable") continue + + // Inert/mixed changes: apply to locale text + if (change.elementType === "frontmatter-field" && change.key) { + const fm = parseFrontmatter(result) + if (fm) { + const keyPattern = new RegExp( + `^(${escapeRegex(change.key)}:\\s*).*$`, + "m" + ) + const newYaml = fm.yaml.replace(keyPattern, `$1${change.newValue}`) + result = `---\n${newYaml}\n---${fm.body}` + } + continue + } + + if (change.elementType === "component-attribute" && change.key) { + const attrPattern = new RegExp( + `(${escapeRegex(change.key)}=")${escapeRegex(change.oldValue)}"`, + "g" + ) + if (attrPattern.test(result)) { + result = result.replace(attrPattern, `$1${change.newValue}"`) + } else { + const jsxPattern = new RegExp( + `(${escapeRegex(change.key)}=\\{)${escapeRegex(change.oldValue)}(\\})`, + "g" + ) + result = result.replace(jsxPattern, `$1${change.newValue}$2`) + } + continue + } + + if (change.elementType === "inline-code") { + result = result.replace( + new 
RegExp("`" + escapeRegex(change.oldValue) + "`", "g"), + "`" + change.newValue + "`" + ) + continue + } + + if (change.elementType === "link" && change.key === "href") { + result = result.replace( + new RegExp(`\\]\\(${escapeRegex(change.oldValue)}\\)`, "g"), + `](${change.newValue})` + ) + continue + } + + if (change.elementType === "html-tag" && change.key === "href") { + result = result.replace( + new RegExp(`href="${escapeRegex(change.oldValue)}"`, "g"), + `href="${change.newValue}"` + ) + continue + } + + if (change.elementType === "image" && change.key === "src") { + result = result.replace( + new RegExp(`\\]\\(${escapeRegex(change.oldValue)}\\)`, "g"), + `](${change.newValue})` + ) + continue + } + } + + // 3d. Apply structural additions + for (const change of cs.changes) { + if (change.action !== "add") continue + + const attrName = change.key || change.path.match(/attr:(\w+)$/)?.[1] + + if ( + change.elementType === "component-attribute" && + attrName && + change.newValue + ) { + for (const enBLine of englishB.split("\n")) { + if (!enBLine.includes(`${attrName}="${change.newValue}"`)) continue + const hrefMatch = enBLine.match(/href="([^"]+)"/) + if (hrefMatch) { + const href = hrefMatch[1] + const enAHasIt = englishA + .split("\n") + .some((l) => l.includes(`href="${href}"`) && l.includes(attrName)) + if (!enAHasIt) { + result = result.replace( + new RegExp(`(<\\w+\\s+href="${escapeRegex(href)}")`, "g"), + `$1 ${attrName}="${change.newValue}"` + ) + } + } + break + } + continue + } + + if (change.elementType === "code-body" && change.newValue) { + const pathParts = change.path.split("/") + const codeFenceIdx = pathParts.findIndex((p) => + p.startsWith("code-fence:") + ) + const sectionId = codeFenceIdx > 0 ? 
pathParts[codeFenceIdx - 1] : "" + + if (sectionId) { + const sec = findSection(result, sectionId) + if (sec) { + const enBSec = findSection(englishB, sectionId) + if (enBSec) { + const enBContent = englishB.slice(enBSec.start, enBSec.end) + const fencePattern = /```\w+\n[\s\S]*?```/g + const enASecObj = findSection(englishA, sectionId) + const enAContent = enASecObj + ? englishA.slice(enASecObj.start, enASecObj.end) + : "" + let fenceMatch: RegExpExecArray | null + while ((fenceMatch = fencePattern.exec(enBContent)) !== null) { + if (!enAContent.includes(fenceMatch[0])) { + const secContent = result.slice(sec.start, sec.end) + const trimmed = secContent.trimEnd() + const insertAt = sec.start + trimmed.length + result = + result.slice(0, insertAt) + + "\n\n" + + fenceMatch[0] + + "\n" + + result.slice(sec.end) + break + } + } + } + } + } + continue + } + } + + // 3e. Supplementary pass: propagate attribute changes extractChanges missed + const inertOrStructIds = new Set([ + ...dr.inertDrift.map((e) => e.id), + ...dr.structuralDrift.map((e) => e.id), + ]) + for (const sectionId of inertOrStructIds) { + if (sectionId.startsWith("frontmatter:") || sectionId.includes(":")) + continue + + const enASec = findSection(englishA, sectionId) + const enBSec = findSection(englishB, sectionId) + if (!enASec || !enBSec) continue + + const enAContent = englishA.slice(enASec.start, enASec.end) + const enBContent = englishB.slice(enBSec.start, enBSec.end) + + // Compare heading lines for attribute changes + const enAHeading = enAContent.split("\n")[0] + const enBHeading = enBContent.split("\n")[0] + if (enAHeading !== enBHeading) { + const headingAttrPattern = /(\w+)="([^"]*)"/g + const headingAAttrs: Array<[string, string]> = [] + const headingBAttrs: Array<[string, string]> = [] + let m: RegExpExecArray | null + while ((m = headingAttrPattern.exec(enAHeading)) !== null) { + headingAAttrs.push([m[1], m[2]]) + } + headingAttrPattern.lastIndex = 0 + while ((m = 
headingAttrPattern.exec(enBHeading)) !== null) { + headingBAttrs.push([m[1], m[2]]) + } + for (let i = 0; i < headingAAttrs.length; i++) { + if ( + headingBAttrs[i] && + headingAAttrs[i][0] === headingBAttrs[i][0] && + headingAAttrs[i][1] !== headingBAttrs[i][1] + ) { + const oldAttr = `${headingAAttrs[i][0]}="${headingAAttrs[i][1]}"` + const newAttr = `${headingBAttrs[i][0]}="${headingBAttrs[i][1]}"` + result = result.replace(oldAttr, newAttr) + } + } + } + + // Compare component attributes within the section + const enALines = enAContent.split("\n").slice(1) + const enBLines = enBContent.split("\n").slice(1) + for (let i = 0; i < Math.min(enALines.length, enBLines.length); i++) { + if (enALines[i] === enBLines[i]) continue + if (enALines[i].match(/^\s*() + const bAttrs = new Map() + let am: RegExpExecArray | null + while ((am = attrPat.exec(enALines[i])) !== null) + aAttrs.set(am[1], am[2]) + attrPat.lastIndex = 0 + while ((am = attrPat.exec(enBLines[i])) !== null) + bAttrs.set(am[1], am[2]) + for (const [key, oldVal] of aAttrs) { + const newVal = bAttrs.get(key) + if (newVal && newVal !== oldVal) { + const alreadyHandled = cs.changes.some( + (c) => c.oldValue === oldVal && c.newValue === newVal + ) + if (!alreadyHandled) { + result = result.replace( + `${key}="${oldVal}"`, + `${key}="${newVal}"` + ) + } + } + } + } + } + } + + // --- Phase 4 & 5: LLM Translation + Assembly --- + + for (const sectionId of llmSectionIds) { + const enBSec = findSection(englishB, sectionId) + if (!enBSec) continue + const englishContent = englishB.slice(enBSec.start, enBSec.end).trimEnd() + + if (!llm) { + // Without LLM, use English-B content as fallback + if (addedIds.has(sectionId)) { + const enBOrder = getSectionOrder(englishB) + const idx = enBOrder.indexOf(sectionId) + if (idx > 0) { + const prevId = enBOrder[idx - 1] + const prevSec = findSection(result, prevId) + if (prevSec) { + result = + result.slice(0, prevSec.end) + + "\n" + + englishContent + + "\n" + + 
result.slice(prevSec.end) + } + } + } else { + const localeSec = findSection(result, sectionId) + if (localeSec) { + result = + result.slice(0, localeSec.start) + + englishContent + + "\n" + + result.slice(localeSec.end) + } + } + continue + } + + const translated = llm(sectionId, englishContent) + + if (addedIds.has(sectionId)) { + const enBOrder = getSectionOrder(englishB) + const idx = enBOrder.indexOf(sectionId) + if (idx > 0) { + const prevId = enBOrder[idx - 1] + const prevSec = findSection(result, prevId) + if (prevSec) { + result = + result.slice(0, prevSec.end) + + "\n" + + translated + + "\n" + + result.slice(prevSec.end) + } + } + } else { + const localeSec = findSection(result, sectionId) + if (localeSec) { + result = + result.slice(0, localeSec.start) + + translated + + "\n" + + result.slice(localeSec.end) + } + } + } + + // --- Section Reordering --- + const enBH2Order = getSectionOrder(englishB).filter((id) => { + const match = englishB.match( + new RegExp(`^##\\s+[^\\n]*\\{#${escapeRegex(id)}\\}`, "m") + ) + return match !== null + }) + + const h2Sections: { id: string; content: string }[] = [] + let beforeFirstH2 = "" + + { + const lines = result.split("\n") + let inFence = false + const h2Starts: { id: string; lineIdx: number }[] = [] + for (let i = 0; i < lines.length; i++) { + if (lines[i].startsWith("```")) inFence = !inFence + if (inFence) continue + const m = lines[i].match(/^## [^\n]*\{#([^}]+)\}/) + if (m) h2Starts.push({ id: m[1], lineIdx: i }) + } + + if (h2Starts.length > 0) { + beforeFirstH2 = lines.slice(0, h2Starts[0].lineIdx).join("\n") + "\n" + for (let i = 0; i < h2Starts.length; i++) { + const startLine = h2Starts[i].lineIdx + const endLine = + i + 1 < h2Starts.length ? h2Starts[i + 1].lineIdx : lines.length + h2Sections.push({ + id: h2Starts[i].id, + content: + lines.slice(startLine, endLine).join("\n") + + (endLine < lines.length ? 
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Update a locale file for an English change A -> B.
 *
 * Dispatches on `format` to the JSON or Markdown pipeline and returns the new
 * locale file content.
 *
 * @param englishA Previous English content.
 * @param englishB Current English content.
 * @param localeA Current locale content.
 * @param format `"json"` selects the JSON pipeline; anything else is handled
 *   as Markdown.
 * @param llm Optional translator, forwarded to the selected pipeline.
 * @param config Content-tree configuration; defaults to `PIPELINE_CONFIG`.
 */
export function pipeline(
  englishA: string,
  englishB: string,
  localeA: string,
  format: "markdown" | "json",
  llm?: LlmTranslator,
  config: Partial<ContentTreeConfig> = PIPELINE_CONFIG
): string {
  const run = format === "json" ? pipelineJson : pipelineMarkdown
  return run(englishA, englishB, localeA, config, llm)
}
"g" -) -// This RegEx checks for invalid links in markdown content. -// The criteria for invalid links are: -// 1. Exclude images: The link shouldn't be preceded by an exclamation mark -// 2. Exclude internal links: The URL part of the link shouldn't start with a forward slash -// 3. Exclude fragment identifiers: The URL part of the link shouldn't start with a hash -// 4. Exclude typical external links: The URL part of the link shouldn't start with http or https -// 5. Exclude email links: The URL part of the link shouldn't start with mailto: -// 6. Exclude PDF links: The URL part of the link shouldn't end with .pdf -// 7. Exclude links wrapped in angled brackets: The URL part of the link shouldn't start with a < -const INVALID_LINK_REGEX = new RegExp( - "(? = ["", ""] -const SPELLING_MISTAKES: Array = [ - "Ethreum", - "Etherum", - "Etherium", - "Etheruem", - "Etereum", - "Eterium", - "Etherem", - "Etheerum", - "Ehtereum", - "Eferum", -] - -// ! Current usage of this const is commented out -// const CASE_SENSITIVE_SPELLING_MISTAKES = ["Thereum", "Metamask", "Github"] -// Ideas: -// Regex for explicit lang path (e.g., /en/) && for glossary links (trailing slash breaks links e.g., /glossary/#pos/ doesn't work) -// We should have case sensitive spelling mistakes && check they are not in links. 
- -interface Languages { - lang?: Array -} - -const langsArray = fs.readdirSync(PATH_TO_INTL_MARKDOWN) as Array -langsArray.push("en") - -function getAllMarkdownPaths( - dirPath: string, - arrayOfMarkdownPaths: Array = [] -): Array { - const files: Array = fs.readdirSync(dirPath) - - arrayOfMarkdownPaths = arrayOfMarkdownPaths || [] - - for (const file of files) { - if (fs.statSync(dirPath + "/" + file).isDirectory()) { - arrayOfMarkdownPaths = getAllMarkdownPaths( - dirPath + "/" + file, - arrayOfMarkdownPaths - ) - } else { - const filePath: string = path.join(dirPath, "/", file) - - if (filePath.includes(".md")) { - arrayOfMarkdownPaths.push(filePath) - } - } - } - - return arrayOfMarkdownPaths -} - -function sortMarkdownPathsIntoLanguages( - paths: Array, - excludeDefaultLang: boolean = false -): Languages { - const languages: Languages = langsArray.reduce((accumulator, value) => { - return { ...accumulator, [value]: [] } - }, {}) - - for (const path of paths) { - const translationDir = "/translations/" - const isTranslation = path.includes(translationDir) - const langIndex = path.indexOf(translationDir) + translationDir.length - - // RegEx to grab the root of the path (e.g., the lang code for translated files) - const regex = /^([^/]+)\// - const match = path.substring(langIndex).match(regex) - const lang = isTranslation && match && match.length > 1 ? 
match[1] : "en" - - if (LANG_ARG) { - if (LANG_ARG === lang && (lang !== "en" || !excludeDefaultLang)) { - languages[lang].push(path) - } - } else { - if (lang !== "en" || !excludeDefaultLang) { - languages[lang].push(path) - } - } - } - - return languages -} - -export async function getTranslatedMarkdownPaths() { - const markdownPaths: Array = getAllMarkdownPaths(PATH_TO_ALL_CONTENT) - const excludeDefaultLang = true - const languages = sortMarkdownPathsIntoLanguages( - markdownPaths, - excludeDefaultLang - ) - return languages -} - -function log( - message: string, - level: "warn" | "error" | "log", - summary: string[] -) { - summary.push(message) - console[level](message) -} - -function processFrontmatter( - path: string, - lang: string, - summary: string[] -): string[] { - const file = fs.readFileSync(path, "utf-8") - const frontmatter = matter(file).data - - if (!frontmatter.title) { - log(`Missing 'title' frontmatter at: ${path}`, "warn", summary) - } - // Description commented out as there are a lot of them missing :-)! - // if (!frontmatter.description) { - // summary.push(`Missing 'description' frontmatter at: ${path}`) - // } - if (!frontmatter.lang) { - log( - `Missing 'lang' frontmatter at: ${path}, Expected: ${lang}'`, - "error", - summary - ) - } else if (!(frontmatter.lang === lang)) { - log( - `Invalid 'lang' frontmatter at ${path}: Expected: ${lang}'. 
Received: ${frontmatter.lang}.`, - "error", - summary - ) - } - - if (frontmatter.emoji) { - if (!/^:\S+:$/.test(frontmatter.emoji)) { - log(`Frontmatter for 'emoji' is invalid at ${path}`, "error", summary) - } - } - - if (frontmatter.sidebar) { - log(`Unexpected 'sidebar' frontmatter at ${path}`, "error", summary) - } - - if (path.includes("/tutorials/")) { - if (!frontmatter.published) { - log(`Missing 'published' frontmatter at ${path}:`, "warn", summary) - } else { - try { - const stringDate = frontmatter.published.toISOString().slice(0, 10) - const dateIsFormattedCorrectly = TUTORIAL_DATE_REGEX.test(stringDate) - - if (!dateIsFormattedCorrectly) { - log( - `Invalid 'published' frontmatter at ${path}: Expected: 'YYYY-MM-DD' Received: ${frontmatter.published}`, - "warn", - summary - ) - } - } catch (e) { - log( - `Invalid 'published' frontmatter at ${path}: Expected: 'YYYY-MM-DD' Received: ${frontmatter.published}`, - "warn", - summary - ) - } - } - - if (!["beginner", "intermediate", "advanced"].includes(frontmatter.skill)) { - log( - `Skill frontmatter '${frontmatter.skill}' must be: beginner, intermediate, or advanced at: ${path}:`, - "log", - summary - ) - } - } - - return summary -} - -function processMarkdown(path: string, summary: string[]) { - const markdownFile: string = fs.readFileSync(path, "utf-8") - let brokenLinkMatch: RegExpExecArray | null - - while ((brokenLinkMatch = BROKEN_LINK_REGEX.exec(markdownFile))) { - const lineNumber = getLineNumber(markdownFile, brokenLinkMatch.index) - log(`Broken link found: ${path}:${lineNumber}`, "warn", summary) - - // if (!BROKEN_LINK_REGEX.global) break - } - - let invalidLinkMatch: RegExpExecArray | null - - // Check for invalid links - while ((invalidLinkMatch = INVALID_LINK_REGEX.exec(markdownFile))) { - const lineNumber = getLineNumber(markdownFile, invalidLinkMatch.index) - log(`Invalid link found: ${path}:${lineNumber}`, "warn", summary) - } - - let linkTextMissingMatch: RegExpExecArray | null - - // 
Check for links missing text - while ((linkTextMissingMatch = LINK_TEXT_MISSING_REGEX.exec(markdownFile))) { - const lineNumber = getLineNumber(markdownFile, linkTextMissingMatch.index) - log(`Link text missing: ${path}:${lineNumber}`, "warn", summary) - } - - let incorrectImagePathMatch: RegExpExecArray | null - - // Todo: refactor to simply check if the image exists relative to the path - if (path.includes("/translations/")) { - while ( - (incorrectImagePathMatch = - INCORRECT_PATH_IN_TRANSLATED_MARKDOWN.exec(markdownFile)) - ) { - const lineNumber = getLineNumber( - markdownFile, - incorrectImagePathMatch.index - ) - log(`Incorrect image path: ${path}:${lineNumber}`, "warn", summary) - } - } - - // TODO: refactor history pages to use a component for network upgrade summaries - // TODO: create .env commit warning component for tutorials - // Ignore tutorials with Javascript and ExpandableCards - if ( - !path.includes("/ethereum-forks/") && - !path.includes("/whitepaper/") && - !path.includes("/roadmap/") && - !path.includes("alchemy") && - !path.includes("nft") && - !path.includes("hello-world-smart-contract") && - !path.includes("opcodes") && - !path.includes("translation-program") && - !path.includes("/energy-consumption/") && - !markdownFile.includes("```javascript") && - !markdownFile.includes("ExpandableCard") - ) { - for (const tag of HTML_TAGS) { - const htmlTagRegex = new RegExp(tag, "g") - let htmlTagMatch: RegExpExecArray | null - - while ((htmlTagMatch = htmlTagRegex.exec(markdownFile))) { - const lineNumber = getLineNumber(markdownFile, htmlTagMatch.index) - log( - `Warning: ${tag} tag in markdown at ${path}:${lineNumber}`, - "warn", - summary - ) - - if (!htmlTagRegex.global) break - } - } - } - - // Commented out as 296 instances of whitespace in link texts - let whiteSpaceInLinkTextMatch: RegExpExecArray | null - - while ( - (whiteSpaceInLinkTextMatch = WHITE_SPACE_IN_LINK_TEXT.exec(markdownFile)) - ) { - const lineNumber = getLineNumber( - 
markdownFile, - whiteSpaceInLinkTextMatch.index - ) - log( - `Warning: White space in link found: ${path}:${lineNumber}`, - "warn", - summary - ) - } - - checkMarkdownSpellingMistakes(path, markdownFile, SPELLING_MISTAKES, summary) - // Turned this off for testing as there are lots of Github (instead of GitHub) and Metamask (instead of MetaMask). - // checkMarkdownSpellingMistakes(path, markdownFile, CASE_SENSITIVE_SPELLING_MISTAKES, true) -} - -function checkMarkdownSpellingMistakes( - path: string, - file: string, - spellingMistakes: Array, - summary: string[], - caseSensitive = false -): void { - for (const mistake of spellingMistakes) { - const mistakeRegex = caseSensitive - ? new RegExp(mistake, "g") - : new RegExp(mistake, "gi") - let spellingMistakeMatch: RegExpExecArray | null - - while ((spellingMistakeMatch = mistakeRegex.exec(file))) { - const lineNumber = getLineNumber(file, spellingMistakeMatch.index) - log( - `Spelling mistake "${mistake}" found at ${path}:${lineNumber}`, - "warn", - summary - ) - } - - if (!mistakeRegex.global) break - } -} - -function getLineNumber(file: string, index: number): string { - const fileSubstring = file.substring(0, index) - const lines = fileSubstring.split("\n") - const linePosition = lines.length - const charPosition = lines[lines.length - 1].length + 1 - const lineNumber = `${linePosition}:${charPosition}` - - return lineNumber -} - -const writeSummary = (summary: Summary, summaryWritePath: string) => { - fs.writeFileSync(summaryWritePath, JSON.stringify(summary, null, 2)) -} - -export function checkMarkdown(summaryWritePath?: string) { - console.log("Checking markdown for common issues...") - const summary = {} as Summary - const markdownPaths: Array = getAllMarkdownPaths(PATH_TO_ALL_CONTENT) - const markdownPathsByLang: Languages = - sortMarkdownPathsIntoLanguages(markdownPaths) - - for (const lang in markdownPathsByLang) { - summary[lang] = [] - - for (const path of markdownPathsByLang[lang]) { - 
processFrontmatter(path, lang, summary[lang]) - processMarkdown(path, summary[lang]) - } - - if (!summary[lang].length) delete summary[lang] - } - if (!summaryWritePath) return - - writeSummary(summary, summaryWritePath) - console.log("Writing markdown checker summary to:", summaryWritePath) -} - -checkMarkdown() diff --git a/tests/fixtures/incremental/english/fixture-a.json b/tests/fixtures/incremental/english/fixture-a.json new file mode 100644 index 00000000000..5f9fb203ea3 --- /dev/null +++ b/tests/fixtures/incremental/english/fixture-a.json @@ -0,0 +1,22 @@ +{ + "page-title": "Open Source Licensing", + "page-description": "Learn about free software licenses and how to choose the right one for your project.", + "hero-cta-primary": "Get started", + "hero-cta-secondary": "View license comparison", + "stat-label-projects": "Open source projects tracked", + "stat-label-contributors": "Active contributors", + "banner-text": "New to open source? Start with our contribution guide.", + "footer-note": "License data sourced from SPDX. Compliance tools provided by FOSSA.", + "filter-label": "Filter by license type", + "empty-results": "No licenses found matching your criteria. 
Try adjusting your filters.", + "welcome-user": "Welcome back, {username}!", + "project-count": "{count, plural, =0 {No projects found} one {# project found} other {# projects found}}", + "contribution-status": "{status, select, pending {Your contribution is pending review} approved {Your contribution has been approved} rejected {Your contribution needs revisions} other {Unknown status}}", + "nested": { + "section-title": "Additional Resources", + "section-description": "Explore more about open source licensing and compliance.", + "link-text": "View all resources" + }, + "multi-link": "Read the Open Source Definition, review the license chooser, and check SPDX identifiers.", + "markdown-description": "## Quick Reference\n\n- **MIT**: Most permissive, minimal restrictions\n- **Apache-2.0**: Permissive with patent grant\n- **GPL-3.0**: Strong copyleft, derivatives must be GPL\n\nFor more details, see the [full comparison](/open-source/#comparison-table)." +} diff --git a/tests/fixtures/incremental/english/fixture-a.md b/tests/fixtures/incremental/english/fixture-a.md new file mode 100644 index 00000000000..cdb286e8d97 --- /dev/null +++ b/tests/fixtures/incremental/english/fixture-a.md @@ -0,0 +1,219 @@ +--- +title: Understanding Open Source Licensing +description: A practical guide to free software licenses and collaborative development +image: /images/open-source/hero-licensing.png +alt: "Diagram showing different license types" +template: tutorial +lang: en +published: 2025-06-15 +tags: ["open-source", "licensing", "compliance"] +summaryPoints: + - Open source licenses define how code can be used, modified, and shared + - Copyleft licenses require derivative works to remain open source + - Permissive licenses allow proprietary use with minimal restrictions +--- + +# Understanding Open Source Licensing {#understanding-open-source-licensing} + +Open source software is built on the principle that code should be freely shared, studied, and improved. 
This guide covers the major license families, how to choose between them, and best practices for collaborative development. + +**Remember: licensing is a legal matter. This guide is educational, not legal advice. Consult a qualified attorney for your specific situation.** + + + +## What is open source? {#what-is-open-source} + +A software license determines how others can use, modify, and distribute your code. Open source licenses grant these rights explicitly, unlike proprietary licenses that restrict them. + +The [Open Source Initiative](https://opensource.org/osd) maintains the official Open Source Definition, which requires that licenses allow free redistribution, access to source code, and derived works. + +_Without clear licensing_, every project would need custom legal review. The `LICENSE` file in a repository signals exactly what permissions are granted, similar to how the `README.md` file explains the project's purpose. + +You can verify a project's license on Choose a License to understand what it permits. + +![License comparison chart](/images/open-source/license-comparison.png) + +### The four freedoms {#the-four-freedoms} + +The [Free Software Foundation](https://www.fsf.org/about/what-is-free-software) defines four essential freedoms: + +- **Freedom 0**: The freedom to run the program for any purpose +- **Freedom 1**: The freedom to study how the program works and adapt it +- **Freedom 2**: The freedom to redistribute copies +- **Freedom 3**: The freedom to improve the program and release improvements + +These freedoms form the foundation of all free and open source software (FOSS) licensing. + + + +The word "free" in free software refers to liberty, not cost. Proprietary software can be free of charge, and free software can be sold commercially. See the [GNU philosophy](https://www.gnu.org/philosophy/free-sw.html) for a detailed explanation. 
+ + + +## Choosing a license {#choosing-a-license} + +### Copyleft licenses {#copyleft-licenses} + +Copyleft licenses like `GPL-3.0` require that derivative works use the same license. This ensures the software and all modifications remain free. The `AGPL-3.0` extends this requirement to software accessed over a network. + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// This contract demonstrates a simple registry +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Name required"); + projects[msg.sender] = name; + } +} +``` + +The key advantage of copyleft is that improvements must be shared back with the community. If someone builds on your GPL-licensed library, their modifications are also GPL-licensed. + + + +Use copyleft when: + +1. You want to ensure all derivative works remain open source +2. You are building a library or framework that others will extend +3. You want to prevent proprietary forks of your work + +The trade-off is that some companies avoid copyleft-licensed dependencies because of the "viral" nature of the license. + +Check the GPL FAQ for common questions about copyleft compliance. + + + +### Permissive licenses {#permissive-licenses} + +Permissive licenses like `MIT` and `Apache-2.0` allow proprietary derivatives. The `BSD-2-Clause` is another popular permissive option with minimal restrictions. + +```python +# Example: reading a project's license file +def read_license(path: str) -> str: + """Read and return the contents of a LICENSE file.""" + with open(path, "r") as f: + return f.read() + +# Check if the license is permissive +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +The main advantage of permissive licenses is maximum adoption. 
Companies are more likely to use and contribute to permissively-licensed projects because there are no restrictions on how the code can be used. + +You can deploy projects using [GitHub](https://github.com/) on any hosting platform, and verify compliance with [SPDX](https://spdx.org/). + +Use [Remix](https://remix.ethereum.org/) on [Sepolia](https://sepolia.dev/) with a [block explorer](https://eth.blockscout.com/) to test [smart contracts](/glossary/#smart-contract) before deploying to production. + +### Comparison table {#comparison-table} + +| License | Type | Derivative works | Patent grant | +|---------|------|-----------------|-------------| +| GPL-3.0 | Copyleft | Must be GPL | Yes | +| AGPL-3.0 | Network copyleft | Must be AGPL | Yes | +| LGPL-3.0 | Weak copyleft | Library can be proprietary | Yes | +| MIT | Permissive | Any license | No | +| Apache-2.0 | Permissive | Any license | Yes | +| BSD-2-Clause | Permissive | Any license | No | +| MPL-2.0 | File-level copyleft | Modified files must be MPL | Yes | + +## Community collaboration {#community-collaboration} + +### Contributing to projects {#contributing-to-projects} + +Contributing to open source starts with understanding the project's workflow. Most projects use issue trackers to coordinate work and pull requests to propose changes. + +Start contributing today + + + How to contribute to this project + + + + Join our community + + +Before submitting a contribution, always check the project's `CONTRIBUTING.md` file for guidelines. Code style, test requirements, and review processes vary between projects. + +```md +## Pull Request Template + +**Description:** Brief summary of changes +**Related issue:** Link to the issue this addresses +**Testing:** How was this tested? +``` + +### Code review best practices {#code-review} + +Code review is essential for maintaining quality in collaborative projects. Reviewers should check for correctness, style consistency, and potential security issues. 
+ + + + + + Good code reviews focus on the logic and design, not just formatting. Use automated tools like linters for style enforcement, and reserve human review for architectural decisions and edge cases. + + + + + + +## Compliance and auditing {#compliance-and-auditing} + +Organizations using open source software must track their dependencies and ensure license compliance1. Tools like [FOSSA](https://fossa.com/) and [Snyk](https://snyk.io/) can automate this process. + + + + + + +### License scanning {#license-scanning} + +Automated license scanning should be part of every CI/CD pipeline. It catches incompatible licenses before they enter your dependency tree. + +```bash +# Run a license scan on your project +npx license-checker --production --json > licenses.json + +# Check for copyleft licenses in production dependencies +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +Consider using Dependabot to keep dependencies updated and monitor for license changes. + +### SBOM generation {#sbom-generation} + +A Software Bill of Materials (SBOM) lists all components in your software. Generating an SBOM is increasingly required for regulatory compliance, especially in security-sensitive industries. + + + +### Key terms {#key-terms} + + + + + +Understanding these terms is essential for making informed decisions about licensing. + + + +
    + +Review the full [glossary of terms](/glossary/) for additional definitions. This resource is maintained by the community and updated regularly. + +
    +
    + +## Further reading {#further-reading} + +_This guide is adapted from materials by the [Open Source Initiative](https://opensource.org/) and the [Free Software Foundation](https://www.fsf.org/)._ + +- [Choose a License guide](https://choosealicense.com/) - _Simple tool to help pick the right license for your project_ +- [SPDX License List](https://spdx.org/licenses/) - _Standardized identifiers for over 500 open source licenses_ +- [Open Source Guide](https://opensource.guide/) - _Community-maintained resources for running and contributing to projects_ +- Compliance tooling on [FOSSA](https://fossa.com/) - _Automated license scanning and dependency management_ diff --git a/tests/fixtures/incremental/english/fixture-b.json b/tests/fixtures/incremental/english/fixture-b.json new file mode 100644 index 00000000000..56f4c09a225 --- /dev/null +++ b/tests/fixtures/incremental/english/fixture-b.json @@ -0,0 +1,22 @@ +{ + "page-title": "Open Source Licensing", + "page-description": "Discover open source licenses, compliance requirements, and how to choose the right one.", + "hero-cta-primary": "Get started", + "hero-cta-secondary": "View license comparison", + "stat-label-projects": "Open source projects tracked", + "stat-label-contributors": "Active contributors worldwide", + "banner-text": "New to open source? Start with our contribution guide.", + "footer-note": "License data sourced from SPDX. 
Compliance tools provided by FOSSA.", + "filter-label": "Filter by license type", + "welcome-user": "Welcome back, {displayName}!", + "project-count": "{total, plural, =0 {No projects found} one {# project found} other {# projects found}}", + "contribution-status": "{status, select, pending {Your contribution is pending review} approved {Your contribution has been approved} rejected {Your contribution needs revisions} other {Unknown status}}", + "nested": { + "section-title": "Additional Resources", + "section-description": "Explore more about open source licensing and compliance.", + "link-text": "Browse all resources" + }, + "multi-link": "Read the Open Source Definition, review the license chooser, and check SPDX identifiers.", + "markdown-description": "## Quick Reference\n\n- **MIT**: Most permissive, minimal restrictions\n- **Apache-2.0**: Permissive with patent grant\n- **GPL-3.0**: Strong copyleft, derivatives must be GPL\n\nFor more details, see the [full comparison](/open-source/#comparison-table).", + "new-key": "This is a brand new translation string added in version B." 
+} diff --git a/tests/fixtures/incremental/english/fixture-b.md b/tests/fixtures/incremental/english/fixture-b.md new file mode 100644 index 00000000000..fcdc8b56797 --- /dev/null +++ b/tests/fixtures/incremental/english/fixture-b.md @@ -0,0 +1,231 @@ +--- +title: Understanding Open Source Licensing +description: A comprehensive guide to open source licenses, compliance, and collaboration +image: /images/open-source/hero-licensing-v2.png +alt: "Diagram showing different license types" +template: tutorial +lang: en +published: 2025-06-15 +tags: ["open-source", "licensing", "compliance"] +summaryPoints: + - Open source licenses define how code can be used, modified, and shared + - Copyleft licenses require derivative works to remain open source + - Permissive licenses allow proprietary use with minimal restrictions +--- + +# Understanding Open Source Licensing {#understanding-open-source-licensing} + +Open source software is built on the principle that code should be freely shared, studied, and improved. This guide covers the major license families, how to choose between them, and best practices for collaborative development. + +**Remember: licensing is a legal matter. This guide is educational, not legal advice. Consult a qualified attorney for your specific situation.** + + +## What is open source? {#what-is-open-source} + +A software license determines how others can use, modify, and distribute your code. Open source licenses grant these rights explicitly, unlike proprietary licenses that restrict them. + +The [Open Source Initiative](https://opensource.org/osd/annotated) maintains the official Open Source Definition, which requires that licenses allow free redistribution, access to source code, and derived works. + +_Without clear licensing_, every project would need custom legal review. The `LICENSE.md` file in a repository signals exactly what permissions are granted, similar to how the `README.md` file explains the project's purpose. 
+ +You can verify a project's license on Choose a License to understand what it permits. + +![License comparison chart](/images/open-source/license-comparison-v2.png) + +### The four freedoms {#the-four-freedoms} + +The [Free Software Foundation](https://www.fsf.org/about/what-is-free-software) defines four essential freedoms: + +- **Freedom 0**: The freedom to run the program for any purpose +- **Freedom 1**: The freedom to study how the program works and adapt it +- **Freedom 2**: The freedom to redistribute copies +- **Freedom 3**: The freedom to improve the program and release improvements + +These four freedoms are the philosophical foundation of the entire free and open source software (FOSS) movement. + + + +The word "free" in free software refers to liberty, not cost. Proprietary software can be free of charge, and free software can be sold commercially. See the [GNU philosophy](https://www.gnu.org/philosophy/free-sw.html) for a detailed explanation. + + + +## Choosing a license {#choosing-a-license} + +### Copyleft licenses {#copyleft-licenses} + +Copyleft licenses like `GPL-3.0` require that derivative works use the same license. This ensures the software and all modifications remain free. The `AGPL-3.0` extends this requirement to software accessed over a network. + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// This contract implements a basic project registry +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Project name cannot be empty"); + projects[msg.sender] = name; + } +} +``` + +The key advantage of copyleft is that improvements must be shared back with the community. If someone builds on your GPL-licensed library, their modifications are also GPL-licensed. + + + +Use copyleft when: + +1. You want to ensure all derivative works remain open source +2. 
You are building a library or framework that others will extend +3. You want to prevent proprietary forks of your work + +The trade-off is that some companies avoid copyleft-licensed dependencies because of the "viral" nature of the license. + +Check the GPL FAQ for common questions about copyleft compliance. + + + +### Permissive licenses {#permissive-licenses} + +Permissive licenses like `MIT` and `Apache-2.0` allow proprietary derivatives. The `BSD-2-Clause` is another popular permissive option with minimal restrictions. + +```python +# Example: loading a project's license from disk +def read_license(path: str) -> str: + """Read and return the contents of a LICENSE file.""" + with open(path, "r") as f: + return f.read() + +# Check if the license is permissive +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +The main advantage of permissive licenses is maximum adoption. Companies are more likely to use and contribute to permissively-licensed projects because there are no restrictions on how the code can be used. + +You can deploy projects using [GitHub Repositories](https://github.com/new) on any hosting platform, and verify compliance with [SPDX](https://spdx.org/). + +Use [Remix](https://remix.ethereum.org/) on [Holesky](https://holesky.dev/) with a [block explorer](https://eth.blockscout.com/) to test [smart contracts](/glossary/#smart-contract) before deploying to production. 
+ +### Comparison table {#comparison-table} + +| License | Type | Derivative works | Patent grant | +|---------|------|-----------------|-------------| +| GPL-3.0 | Copyleft | Must be GPL | Yes | +| AGPL-3.0 | Network copyleft | Must be AGPL | Yes | +| LGPL-3.0 | Weak copyleft | Library can be proprietary | Yes | +| MIT | Permissive | Any license | No | +| Apache-2.0 | Permissive | Any license | Yes | +| BSD-2-Clause | Permissive | Any license | No | +| MPL-2.0 | File-level copyleft | Modified files must be MPL | Yes | + +## Compliance and auditing {#compliance-and-auditing} + +Organizations using open source software must track their dependencies and ensure license compliance1. Tools like [FOSSA](https://fossa.com/) and [Snyk](https://snyk.io/) can automate this process. + + + + + + +### License scanning {#license-scanning} + +Automated license scanning should be part of every CI/CD pipeline. It catches incompatible licenses before they enter your dependency tree. + +```bash +# Run a license scan on your project +npx license-checker --production --json > licenses.json + +# Check for copyleft licenses in production dependencies +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +Consider using Dependabot to keep dependencies updated and monitor for license changes. + +```json +{ + "license-scan": { + "production": true, + "allowed": ["MIT", "Apache-2.0", "BSD-2-Clause"] + } +} +``` + +### SBOM generation {#sbom-generation} + +A Software Bill of Materials (SBOM) lists all components in your software. Generating an SBOM is increasingly required for regulatory compliance, especially in security-sensitive industries. + + + +### Key terms {#key-terms} + + + + + +Understanding these terms is essential for making informed decisions about licensing. + + + +
    + +Review the full [glossary of terms](/glossary/) for additional definitions. This resource is maintained by the community and updated regularly. + +
    +
    + +## Community collaboration {#community-collaboration} + +### Contributing to projects {#how-to-contribute} + +Contributing to open source starts with understanding the project's workflow. Most projects use issue trackers to coordinate work and pull requests to propose changes. + +Start contributing today + + + How to contribute to this project + + + + Join our community + + +Before submitting a contribution, always check the project's `CONTRIBUTING.md` file for guidelines. Code style, test requirements, and review processes vary between projects. + +```md +## Pull Request Template + +**Description:** Brief summary of changes +**Related issue:** Link to the issue this addresses +**Testing:** How was this tested? +``` + +### Code review best practices {#code-review} + +Code review is essential for maintaining quality in collaborative projects. Reviewers should check for correctness, style consistency, and potential security issues. + + + + + + Good code reviews focus on the logic and design, not just formatting. Use automated tools like linters for style enforcement, and reserve human review for architectural decisions and edge cases. + + + + + + +## Dual licensing {#dual-licensing} + +Some projects offer their code under two licenses simultaneously. This allows commercial users to purchase a proprietary license while keeping the open source version available under copyleft terms. Projects like [Qt](https://www.qt.io/licensing/) and [MySQL](https://www.mysql.com/about/legal/licensing/) use this model. 
+ +## Further reading {#further-reading} + +_This guide is adapted from materials by the [Open Source Initiative](https://opensource.org/) and the [Free Software Foundation](https://www.fsf.org/)._ + +- [Choose a License guide](https://choosealicense.com/) - _Simple tool to help pick the right license for your project_ +- [SPDX License List](https://spdx.org/licenses/) - _Standardized identifiers for over 500 open source licenses_ +- [Open Source Guide](https://opensource.guide/) - _Community-maintained resources for running and contributing to projects_ +- Compliance tooling on [FOSSA](https://fossa.com/) - _Automated license scanning and dependency management_ diff --git a/tests/fixtures/incremental/locale-a/es/fixture.json b/tests/fixtures/incremental/locale-a/es/fixture.json new file mode 100644 index 00000000000..02392d28915 --- /dev/null +++ b/tests/fixtures/incremental/locale-a/es/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "Licencias de código abierto", + "page-description": "Aprenda sobre las licencias de software libre y cómo elegir la adecuada para su proyecto.", + "hero-cta-primary": "Comenzar", + "hero-cta-secondary": "Ver comparación de licencias", + "stat-label-projects": "Proyectos de código abierto rastreados", + "stat-label-contributors": "Colaboradores activos", + "banner-text": "¿Es nuevo en el código abierto? Comience con nuestra guía de contribución.", + "footer-note": "Datos de licencias obtenidos de SPDX. Herramientas de cumplimiento proporcionadas por FOSSA.", + "filter-label": "Filtrar por tipo de licencia", + "empty-results": "No se encontraron licencias que coincidan con sus criterios. 
Intente ajustar sus filtros.", + "welcome-user": "¡Bienvenido de nuevo, {username}!", + "project-count": "{count, plural, =0 {No se encontraron proyectos} one {# proyecto encontrado} other {# proyectos encontrados}}", + "contribution-status": "{status, select, pending {Su contribución está pendiente de revisión} approved {Su contribución ha sido aprobada} rejected {Su contribución necesita revisiones} other {Estado desconocido}}", + "nested": { + "section-title": "Recursos adicionales", + "section-description": "Explore más sobre las licencias de código abierto y su cumplimiento.", + "link-text": "Ver todos los recursos" + }, + "multi-link": "Lea la Definición de código abierto, revise el selector de licencias y consulte los identificadores SPDX.", + "markdown-description": "## Referencia rápida\n\n- **MIT**: La más permisiva, restricciones mínimas\n- **Apache-2.0**: Permisiva con concesión de patentes\n- **GPL-3.0**: Copyleft fuerte, los derivados deben ser GPL\n\nPara obtener más detalles, consulte la [comparación completa](/open-source/#comparison-table)." 
+} \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-a/es/fixture.md b/tests/fixtures/incremental/locale-a/es/fixture.md new file mode 100644 index 00000000000..1a1394a4429 --- /dev/null +++ b/tests/fixtures/incremental/locale-a/es/fixture.md @@ -0,0 +1,219 @@ +--- +title: "Comprender las licencias de código abierto" +description: "Una guía práctica sobre licencias de software libre y desarrollo colaborativo" +image: /images/open-source/hero-licensing.png +alt: "Diagrama que muestra diferentes tipos de licencias" +template: tutorial +lang: es +published: 2025-06-15 +tags: ["código abierto", "licencias", "cumplimiento"] +summaryPoints: + - Las licencias de código abierto definen cómo se puede usar, modificar y compartir el código + - Las licencias copyleft requieren que las obras derivadas sigan siendo de código abierto + - Las licencias permisivas permiten el uso privativo con restricciones mínimas +--- + +# Comprender las licencias de código abierto {#understanding-open-source-licensing} + +El software de código abierto se basa en el principio de que el código debe compartirse, estudiarse y mejorarse libremente. Esta guía cubre las principales familias de licencias, cómo elegir entre ellas y las mejores prácticas para el desarrollo colaborativo. + +**Recuerde: la concesión de licencias es un asunto legal. Esta guía es educativa, no un asesoramiento legal. Consulte a un abogado calificado para su situación específica.** + + + +## ¿Qué es el código abierto? {#what-is-open-source} + +Una licencia de software determina cómo otros pueden usar, modificar y distribuir su código. Las licencias de código abierto otorgan estos derechos explícitamente, a diferencia de las licencias privativas que los restringen. + +La [Open Source Initiative](https://opensource.org/osd) mantiene la Definición de Código Abierto oficial, que requiere que las licencias permitan la libre redistribución, el acceso al código fuente y las obras derivadas. 
+ +_Sin licencias claras_, cada proyecto necesitaría una revisión legal personalizada. El archivo `LICENSE` en un repositorio indica exactamente qué permisos se otorgan, de manera similar a cómo el archivo `README.md` explica el propósito del proyecto. + +Puede verificar la licencia de un proyecto en Choose a License para comprender lo que permite. + +![License comparison chart](/images/open-source/license-comparison.png) + +### Las cuatro libertades {#the-four-freedoms} + +La [Free Software Foundation](https://www.fsf.org/about/what-is-free-software) define cuatro libertades esenciales: + +- **Libertad 0**: La libertad de ejecutar el programa para cualquier propósito +- **Libertad 1**: La libertad de estudiar cómo funciona el programa y adaptarlo +- **Libertad 2**: La libertad de redistribuir copias +- **Libertad 3**: La libertad de mejorar el programa y publicar las mejoras + +Estas libertades forman la base de todas las licencias de software libre y de código abierto (FOSS). + + + +La palabra "free" (libre) en el software libre se refiere a la libertad, no al costo. El software privativo puede ser gratuito y el software libre puede venderse comercialmente. Consulte la [filosofía de GNU](https://www.gnu.org/philosophy/free-sw.html) para obtener una explicación detallada. + + + +## Elegir una licencia {#choosing-a-license} + +### Licencias copyleft {#copyleft-licenses} + +Las licencias copyleft como `GPL-3.0` requieren que las obras derivadas usen la misma licencia. Esto asegura que el software y todas las modificaciones sigan siendo libres. La `AGPL-3.0` extiende este requisito al software al que se accede a través de una red. 
+ +```solidity +// SPDX-License-Identifier: GPL-3.0 +// Este contrato demuestra un registro simple +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Name required"); + projects[msg.sender] = name; + } +} +``` + +La ventaja clave del copyleft es que las mejoras deben compartirse con la comunidad. Si alguien desarrolla sobre su biblioteca con licencia GPL, sus modificaciones también tendrán licencia GPL. + + + +Use copyleft cuando: + +1. Quiera asegurarse de que todas las obras derivadas sigan siendo de código abierto +2. Esté construyendo una biblioteca o framework que otros ampliarán +3. Quiera evitar forks privativos de su trabajo + +La desventaja es que algunas empresas evitan las dependencias con licencia copyleft debido a la naturaleza "viral" de la licencia. + +Consulte las Preguntas frecuentes sobre GPL para conocer las preguntas comunes sobre el cumplimiento del copyleft. + + + +### Licencias permisivas {#permissive-licenses} + +Las licencias permisivas como `MIT` y `Apache-2.0` permiten derivados privativos. La `BSD-2-Clause` es otra opción permisiva popular con restricciones mínimas. + +```python +# Ejemplo: leer el archivo de licencia de un proyecto +def read_license(path: str) -> str: + """Leer y devolver el contenido de un archivo LICENSE.""" + with open(path, "r") as f: + return f.read() + +# Comprobar si la licencia es permisiva +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +La principal ventaja de las licencias permisivas es la máxima adopción. Es más probable que las empresas utilicen y contribuyan a proyectos con licencias permisivas porque no hay restricciones sobre cómo se puede usar el código. 
+ +Puede desplegar proyectos usando [GitHub](https://github.com/) en cualquier plataforma de alojamiento, y verificar el cumplimiento con [SPDX](https://spdx.org/). + +Use [Remix](https://remix.ethereum.org/) en [Sepolia](https://sepolia.dev/) con un [explorador de bloques](https://eth.blockscout.com/) para probar [contratos inteligentes](/glossary/#smart-contract) antes de desplegar en producción. + +### Tabla comparativa {#comparison-table} + +| Licencia | Tipo | Obras derivadas | Concesión de patentes | +|---------|------|-----------------|-------------| +| GPL-3.0 | Copyleft | Debe ser GPL | Sí | +| AGPL-3.0 | Copyleft de red | Debe ser AGPL | Sí | +| LGPL-3.0 | Copyleft débil | La biblioteca puede ser privativa | Sí | +| MIT | Permisiva | Cualquier licencia | No | +| Apache-2.0 | Permisiva | Cualquier licencia | Sí | +| BSD-2-Clause | Permisiva | Cualquier licencia | No | +| MPL-2.0 | Copyleft a nivel de archivo | Los archivos modificados deben ser MPL | Sí | + +## Colaboración comunitaria {#community-collaboration} + +### Contribuir a proyectos {#contributing-to-projects} + +Contribuir al código abierto comienza por comprender el flujo de trabajo del proyecto. La mayoría de los proyectos utilizan issue trackers para coordinar el trabajo y pull requests para proponer cambios. + +Comience a contribuir hoy + + + Cómo contribuir a este proyecto + + + + Únase a nuestra comunidad + + +Antes de enviar una contribución, siempre revise el archivo `CONTRIBUTING.md` del proyecto para conocer las pautas. El estilo de código, los requisitos de prueba y los procesos de revisión varían entre proyectos. + +```md +## Plantilla de Pull Request + +**Descripción:** Breve resumen de los cambios +**Issue relacionado:** Enlace al issue que esto aborda +**Pruebas:** ¿Cómo se probó esto? +``` + +### Mejores prácticas para la revisión de código {#code-review} + +La revisión de código es esencial para mantener la calidad en proyectos colaborativos. 
Los revisores deben verificar la exactitud, la consistencia del estilo y los posibles problemas de seguridad. + + + + + + Las buenas revisiones de código se centran en la lógica y el diseño, no solo en el formato. Utilice herramientas automatizadas como linters para la aplicación del estilo, y reserve la revisión humana para decisiones arquitectónicas y casos extremos. + + + + + + +## Cumplimiento y auditoría {#compliance-and-auditing} + +Las organizaciones que utilizan software de código abierto deben rastrear sus dependencias y garantizar el cumplimiento de las licencias1. Herramientas como [FOSSA](https://fossa.com/) y [Snyk](https://snyk.io/) pueden automatizar este proceso. + + + + + + +### Escaneo de licencias {#license-scanning} + +El escaneo automatizado de licencias debe ser parte de cada CI/CD pipeline. Detecta licencias incompatibles antes de que entren en su árbol de dependencias. + +```bash +# Ejecutar un escaneo de licencias en tu proyecto +npx license-checker --production --json > licenses.json + +# Comprobar si hay licencias copyleft en las dependencias de producción +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +Considere usar Dependabot para mantener las dependencias actualizadas y monitorear los cambios de licencia. + +### Generación de SBOM {#sbom-generation} + +Una lista de materiales de software (SBOM) enumera todos los componentes de su software. La generación de un SBOM es cada vez más necesaria para el cumplimiento normativo, especialmente en industrias sensibles a la seguridad. + + + +### Términos clave {#key-terms} + + + + + +Comprender estos términos es esencial para tomar decisiones informadas sobre las licencias. + + + +
    + +Revise el [glosario de términos](/glossary/) completo para obtener definiciones adicionales. Este recurso es mantenido por la comunidad y se actualiza regularmente. + +
    +
    + +## Lecturas adicionales {#further-reading} + +_Esta guía está adaptada de materiales de la [Open Source Initiative](https://opensource.org/) y la [Free Software Foundation](https://www.fsf.org/)._ + +- [Guía Choose a License](https://choosealicense.com/) - _Herramienta sencilla para ayudar a elegir la licencia adecuada para su proyecto_ +- [Lista de licencias SPDX](https://spdx.org/licenses/) - _Identificadores estandarizados para más de 500 licencias de código abierto_ +- [Guía de código abierto](https://opensource.guide/) - _Recursos mantenidos por la comunidad para ejecutar y contribuir a proyectos_ +- Herramientas de cumplimiento en [FOSSA](https://fossa.com/) - _Escaneo automatizado de licencias y gestión de dependencias_ \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-a/ko/fixture.json b/tests/fixtures/incremental/locale-a/ko/fixture.json new file mode 100644 index 00000000000..598cb518051 --- /dev/null +++ b/tests/fixtures/incremental/locale-a/ko/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "오픈 소스 라이선스", + "page-description": "자유 소프트웨어 라이선스에 대해 알아보고 프로젝트에 적합한 라이선스를 선택하는 방법을 알아보세요.", + "hero-cta-primary": "시작하기", + "hero-cta-secondary": "라이선스 비교 보기", + "stat-label-projects": "추적된 오픈 소스 프로젝트", + "stat-label-contributors": "활성 기여자", + "banner-text": "오픈 소스가 처음이신가요? 기여 가이드시작해 보세요.", + "footer-note": "라이선스 데이터는 SPDX에서 제공합니다. 컴플라이언스 도구는 FOSSA에서 제공합니다.", + "filter-label": "라이선스 유형별 필터링", + "empty-results": "조건에 맞는 라이선스를 찾을 수 없습니다. 
필터를 조정해 보세요.", + "welcome-user": "다시 오신 것을 환영합니다, {username}님!", + "project-count": "{count, plural, =0 {프로젝트를 찾을 수 없습니다} one {#개의 프로젝트를 찾았습니다} other {#개의 프로젝트를 찾았습니다}}", + "contribution-status": "{status, select, pending {기여 내역이 검토 대기 중입니다} approved {기여 내역이 승인되었습니다} rejected {기여 내역에 수정이 필요합니다} other {알 수 없는 상태입니다}}", + "nested": { + "section-title": "추가 리소스", + "section-description": "오픈 소스 라이선스 및 컴플라이언스에 대해 더 자세히 알아보세요.", + "link-text": "모든 리소스 보기" + }, + "multi-link": "오픈 소스 정의를 읽고, 라이선스 선택기를 검토하며, SPDX 식별자를 확인하세요.", + "markdown-description": "## 빠른 참조\n\n- **MIT**: 가장 허용적이며 최소한의 제한\n- **Apache-2.0**: 특허 부여가 포함된 허용적 라이선스\n- **GPL-3.0**: 강력한 카피레프트, 파생물은 GPL이어야 함\n\n자세한 내용은 [전체 비교](/open-source/#comparison-table)를 참조하세요." +} \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-a/ko/fixture.md b/tests/fixtures/incremental/locale-a/ko/fixture.md new file mode 100644 index 00000000000..3e8f482f49d --- /dev/null +++ b/tests/fixtures/incremental/locale-a/ko/fixture.md @@ -0,0 +1,222 @@ +--- +title: "오픈 소스 라이선스 이해하기" +description: "자유 소프트웨어 라이선스 및 협업 개발을 위한 실용적인 가이드" +image: /images/open-source/hero-licensing.png +alt: "다양한 라이선스 유형을 보여주는 다이어그램" +template: tutorial +lang: ko +published: 2025-06-15 +tags: + - 오픈 소스 + - 라이선스 + - 컴플라이언스 +summaryPoints: + - 오픈 소스 라이선스는 코드를 사용, 수정 및 공유하는 방법을 정의합니다. + - 카피레프트 라이선스는 파생 저작물도 오픈 소스로 유지할 것을 요구합니다. + - 퍼미시브 라이선스는 최소한의 제한으로 독점적 사용을 허용합니다. +--- + +# 오픈 소스 라이선스 이해하기 {#understanding-open-source-licensing} + +오픈 소스 소프트웨어는 코드를 자유롭게 공유하고, 연구하며, 개선해야 한다는 원칙을 바탕으로 만들어집니다. 이 가이드는 주요 라이선스 제품군, 라이선스 선택 방법, 그리고 협업 개발을 위한 모범 사례를 다룹니다. + +**기억하세요: 라이선스는 법적인 문제입니다. 이 가이드는 교육 목적이며 법률적 조언이 아닙니다. 구체적인 상황에 대해서는 자격을 갖춘 변호사와 상담하세요.** + + + +## 오픈 소스란 무엇인가요? {#what-is-open-source} + +소프트웨어 라이선스는 다른 사람들이 여러분의 코드를 사용, 수정 및 배포하는 방법을 결정합니다. 이를 제한하는 독점 라이선스와 달리, 오픈 소스 라이선스는 이러한 권리를 명시적으로 부여합니다. 
+ +[오픈 소스 이니셔티브(Open Source Initiative)](https://opensource.org/osd)는 공식 오픈 소스 정의를 유지 관리하며, 이 정의에 따르면 라이선스는 자유로운 재배포, 소스 코드 접근 및 파생 저작물을 허용해야 합니다. + +_명확한 라이선스가 없다면_, 모든 프로젝트는 맞춤형 법률 검토가 필요할 것입니다. 리포지토리의 `LICENSE` 파일은 `README.md` 파일이 프로젝트의 목적을 설명하는 것과 유사하게 어떤 권한이 부여되는지 정확히 알려줍니다. + +Choose a License에서 프로젝트의 라이선스를 확인하여 허용되는 사항을 이해할 수 있습니다. + +![License comparison chart](/images/open-source/license-comparison.png) + +### 4가지 자유 {#the-four-freedoms} + +[자유 소프트웨어 재단(Free Software Foundation)](https://www.fsf.org/about/what-is-free-software)은 4가지 필수적인 자유를 정의합니다: + +- **자유 0**: 어떤 목적으로든 프로그램을 실행할 수 있는 자유 +- **자유 1**: 프로그램이 어떻게 작동하는지 연구하고 이를 수정할 수 있는 자유 +- **자유 2**: 복사본을 재배포할 수 있는 자유 +- **자유 3**: 프로그램을 개선하고 그 개선 사항을 배포할 수 있는 자유 + +이러한 자유는 모든 자유 오픈 소스 소프트웨어(FOSS) 라이선스의 기반을 형성합니다. + + + +자유 소프트웨어에서 "자유(free)"라는 단어는 비용이 아니라 자유(liberty)를 의미합니다. 독점 소프트웨어도 무료일 수 있으며, 자유 소프트웨어도 상업적으로 판매될 수 있습니다. 자세한 설명은 [GNU 철학](https://www.gnu.org/philosophy/free-sw.html)을 참조하세요. + + + +## 라이선스 선택하기 {#choosing-a-license} + +### 카피레프트 라이선스 {#copyleft-licenses} + +`GPL-3.0`와 같은 카피레프트 라이선스는 파생 저작물도 동일한 라이선스를 사용하도록 요구합니다. 이는 소프트웨어와 모든 수정 사항이 자유롭게 유지되도록 보장합니다. `AGPL-3.0`는 이 요구 사항을 네트워크를 통해 액세스하는 소프트웨어로 확장합니다. + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// 이 컨트랙트는 간단한 레지스트리를 보여줍니다. +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Name required"); + projects[msg.sender] = name; + } +} +``` + +카피레프트의 주요 장점은 개선 사항을 커뮤니티와 다시 공유해야 한다는 것입니다. 누군가 여러분의 GPL 라이선스 라이브러리를 기반으로 개발한다면, 그들의 수정 사항 역시 GPL 라이선스를 따라야 합니다. + + + +다음과 같은 경우 카피레프트를 사용하세요: + +1. 모든 파생 저작물이 오픈 소스로 유지되기를 원할 때 +2. 다른 사람들이 확장할 라이브러리나 프레임워크를 구축할 때 +3. 작업물의 독점적인 포크(fork)를 방지하고 싶을 때 + +단점은 라이선스의 "바이러스성" 특성 때문에 일부 기업이 카피레프트 라이선스가 적용된 종속성을 피한다는 것입니다. + +카피레프트 컴플라이언스에 대한 일반적인 질문은 GPL FAQ를 확인하세요. + + + +### 퍼미시브 라이선스 {#permissive-licenses} + +`MIT` 및 `Apache-2.0`와 같은 퍼미시브 라이선스는 독점적인 파생물을 허용합니다. 
`BSD-2-Clause`는 제한이 최소화된 또 다른 인기 있는 퍼미시브 옵션입니다. + +```python +# 예제: 프로젝트의 라이선스 파일 읽기 +def read_license(path: str) -> str: + """LICENSE 파일의 내용을 읽고 반환합니다.""" + with open(path, "r") as f: + return f.read() + +# 라이선스가 허용적인지 확인합니다. +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +퍼미시브 라이선스의 주요 장점은 채택을 극대화할 수 있다는 것입니다. 코드 사용 방법에 대한 제한이 없기 때문에 기업들이 퍼미시브 라이선스 프로젝트를 사용하고 기여할 가능성이 더 높습니다. + +[GitHub](https://github.com/)을 사용하여 모든 호스팅 플랫폼에 프로젝트를 배포할 수 있으며, [SPDX](https://spdx.org/)로 컴플라이언스를 확인할 수 있습니다. + +프로덕션 환경에 배포하기 전에 [Sepolia](https://sepolia.dev/)에서 [블록 탐색기](https://eth.blockscout.com/)와 함께 [Remix](https://remix.ethereum.org/)를 사용하여 [스마트 컨트랙트](/glossary/#smart-contract)를 테스트하세요. + +### 비교 표 {#comparison-table} + +| 라이선스 | 유형 | 파생 저작물 | 특허 부여 | +|---------|------|-----------------|-------------| +| GPL-3.0 | 카피레프트 | GPL이어야 함 | 예 | +| AGPL-3.0 | 네트워크 카피레프트 | AGPL이어야 함 | 예 | +| LGPL-3.0 | 약한 카피레프트 | 라이브러리는 독점적일 수 있음 | 예 | +| MIT | 퍼미시브 | 모든 라이선스 | 아니요 | +| Apache-2.0 | 퍼미시브 | 모든 라이선스 | 예 | +| BSD-2-Clause | 퍼미시브 | 모든 라이선스 | 아니요 | +| MPL-2.0 | 파일 수준 카피레프트 | 수정된 파일은 MPL이어야 함 | 예 | + +## 커뮤니티 협업 {#community-collaboration} + +### 프로젝트에 기여하기 {#contributing-to-projects} + +오픈 소스에 기여하는 것은 프로젝트의 워크플로우를 이해하는 것에서 시작됩니다. 대부분의 프로젝트는 이슈 트래커를 사용하여 작업을 조율하고 풀 리퀘스트(pull request)를 통해 변경 사항을 제안합니다. + +오늘 바로 기여를 시작하세요 + + + 이 프로젝트에 기여하는 방법 + + + + 커뮤니티 참여하기 + + +기여를 제출하기 전에 항상 프로젝트의 `CONTRIBUTING.md` 파일에서 가이드라인을 확인하세요. 코드 스타일, 테스트 요구 사항 및 리뷰 프로세스는 프로젝트마다 다릅니다. + +```md +## 풀 리퀘스트 템플릿 + +**설명:** 변경 사항에 대한 간략한 요약 +**관련 이슈:** 이 작업이 해결하는 이슈 링크 +**테스트:** 어떻게 테스트되었나요? +``` + +### 코드 리뷰 모범 사례 {#code-review} + +코드 리뷰는 협업 프로젝트에서 품질을 유지하는 데 필수적입니다. 리뷰어는 정확성, 스타일 일관성 및 잠재적인 보안 문제를 확인해야 합니다. + + + + + + 좋은 코드 리뷰는 단순한 포맷팅이 아니라 로직과 설계에 중점을 둡니다. 스타일 적용을 위해서는 린터(linter)와 같은 자동화된 도구를 사용하고, 아키텍처 결정이나 엣지 케이스(edge case)에 대해서는 사람이 직접 리뷰하도록 하세요. 
+ + + + + + +## 컴플라이언스 및 감사 {#compliance-and-auditing} + +오픈 소스 소프트웨어를 사용하는 조직은 종속성을 추적하고 라이선스 컴플라이언스를 보장해야 합니다1. [FOSSA](https://fossa.com/) 및 [Snyk](https://snyk.io/)와 같은 도구를 사용하면 이 프로세스를 자동화할 수 있습니다. + + + + + + +### 라이선스 스캐닝 {#license-scanning} + +자동화된 라이선스 스캐닝은 모든 CI/CD 파이프라인의 일부가 되어야 합니다. 이는 호환되지 않는 라이선스가 종속성 트리에 들어가기 전에 잡아냅니다. + +```bash +# 프로젝트에서 라이선스 스캔을 실행합니다. +npx license-checker --production --json > licenses.json + +# 프로덕션 의존성에서 카피레프트 라이선스를 확인합니다. +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +종속성을 최신 상태로 유지하고 라이선스 변경 사항을 모니터링하려면 Dependabot 사용을 고려해 보세요. + +### SBOM 생성 {#sbom-generation} + +소프트웨어 자재 명세서(Software Bill of Materials, SBOM)는 소프트웨어의 모든 구성 요소를 나열합니다. 특히 보안에 민감한 산업에서는 규제 준수를 위해 SBOM 생성이 점점 더 요구되고 있습니다. + + + +### 주요 용어 {#key-terms} + + + + + +라이선스에 대해 정보에 입각한 결정을 내리려면 이러한 용어를 이해하는 것이 필수적입니다. + + + +
    + +추가적인 정의는 전체 [용어집](/glossary/)을 검토하세요. 이 리소스는 커뮤니티에서 유지 관리하며 정기적으로 업데이트됩니다. + +
    +
    + +## 추가 자료 {#further-reading} + +_이 가이드는 [오픈 소스 이니셔티브(Open Source Initiative)](https://opensource.org/) 및 [자유 소프트웨어 재단(Free Software Foundation)](https://www.fsf.org/)의 자료를 바탕으로 작성되었습니다._ + +- [Choose a License 가이드](https://choosealicense.com/) - _프로젝트에 적합한 라이선스를 선택하는 데 도움이 되는 간단한 도구_ +- [SPDX 라이선스 목록](https://spdx.org/licenses/) - _500개 이상의 오픈 소스 라이선스에 대한 표준화된 식별자_ +- [오픈 소스 가이드](https://opensource.guide/) - _프로젝트 운영 및 기여를 위해 커뮤니티에서 유지 관리하는 리소스_ +- [FOSSA](https://fossa.com/)의 컴플라이언스 도구 - _자동화된 라이선스 스캐닝 및 종속성 관리_ \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-a/ur/fixture.json b/tests/fixtures/incremental/locale-a/ur/fixture.json new file mode 100644 index 00000000000..bc13f24392d --- /dev/null +++ b/tests/fixtures/incremental/locale-a/ur/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "اوپن سورس لائسنسنگ", + "page-description": "مفت سافٹ ویئر لائسنسز کے بارے میں جانیں اور یہ کہ اپنے پروجیکٹ کے لیے صحیح کا انتخاب کیسے کریں۔", + "hero-cta-primary": "شروع کریں", + "hero-cta-secondary": "لائسنس کا موازنہ دیکھیں", + "stat-label-projects": "ٹریک کیے گئے اوپن سورس پروجیکٹس", + "stat-label-contributors": "فعال حصہ دار", + "banner-text": "اوپن سورس میں نئے ہیں؟ ہماری شراکت کی گائیڈ کے ساتھ شروع کریں۔", + "footer-note": "لائسنس کا ڈیٹا SPDX سے حاصل کیا گیا ہے۔ تعمیل کے ٹولز FOSSA کے ذریعے فراہم کیے گئے ہیں۔", + "filter-label": "لائسنس کی قسم کے لحاظ سے فلٹر کریں", + "empty-results": "آپ کے معیار سے مماثل کوئی لائسنس نہیں ملا۔ اپنے فلٹرز کو ایڈجسٹ کرنے کی کوشش کریں۔", + "welcome-user": "خوش آمدید، {username}!", + "project-count": "{count, plural, =0 {کوئی پروجیکٹ نہیں ملا} one {# پروجیکٹ ملا} other {# پروجیکٹس ملے}}", + "contribution-status": "{status, select, pending {آپ کی شراکت کا جائزہ زیر التوا ہے} approved {آپ کی شراکت منظور کر لی گئی ہے} rejected {آپ کی شراکت میں ترامیم کی ضرورت ہے} other {نامعلوم حیثیت}}", + "nested": { + "section-title": "اضافی وسائل", + "section-description": "اوپن سورس لائسنسنگ اور تعمیل کے بارے میں مزید دریافت 
کریں۔", + "link-text": "تمام وسائل دیکھیں" + }, + "multi-link": "اوپن سورس کی تعریف پڑھیں، لائسنس کے انتخاب کنندہ کا جائزہ لیں، اور SPDX شناخت کنندگان چیک کریں۔", + "markdown-description": "## فوری حوالہ\n\n- **MIT**: سب سے زیادہ اجازت دینے والا، کم از کم پابندیاں\n- **Apache-2.0**: پیٹنٹ گرانٹ کے ساتھ اجازت دینے والا\n- **GPL-3.0**: مضبوط کاپی لیفٹ، مشتقات کا GPL ہونا لازمی ہے\n\nمزید تفصیلات کے لیے، [مکمل موازنہ](/open-source/#comparison-table) دیکھیں۔" +} \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-a/ur/fixture.md b/tests/fixtures/incremental/locale-a/ur/fixture.md new file mode 100644 index 00000000000..3eac3c4132e --- /dev/null +++ b/tests/fixtures/incremental/locale-a/ur/fixture.md @@ -0,0 +1,219 @@ +--- +title: "اوپن سورس لائسنسنگ کو سمجھنا" +description: "مفت سافٹ ویئر لائسنسز اور باہمی ترقی کے لیے ایک عملی گائیڈ" +image: /images/open-source/hero-licensing.png +alt: "مختلف لائسنس کی اقسام کو ظاہر کرنے والا خاکہ" +template: tutorial +lang: ur +published: 2025-06-15 +tags: ["اوپن سورس", "لائسنسنگ", "تعمیل"] +summaryPoints: + - اوپن سورس لائسنس اس بات کا تعین کرتے ہیں کہ کوڈ کو کیسے استعمال، تبدیل اور شیئر کیا جا سکتا ہے + - کاپی لیفٹ لائسنس اس بات کا تقاضا کرتے ہیں کہ ماخوذ کام اوپن سورس رہیں + - اجازت دینے والے لائسنس کم از کم پابندیوں کے ساتھ ملکیتی استعمال کی اجازت دیتے ہیں +--- + +# اوپن سورس لائسنسنگ کو سمجھنا {#understanding-open-source-licensing} + +اوپن سورس سافٹ ویئر اس اصول پر بنایا گیا ہے کہ کوڈ کو آزادانہ طور پر شیئر کیا جانا چاہیے، اس کا مطالعہ کیا جانا چاہیے اور اسے بہتر بنایا جانا چاہیے۔ یہ گائیڈ بڑے لائسنس خاندانوں، ان کے درمیان انتخاب کرنے کے طریقے، اور باہمی ترقی کے لیے بہترین طریقوں کا احاطہ کرتی ہے۔ + +**یاد رکھیں: لائسنسنگ ایک قانونی معاملہ ہے۔ یہ گائیڈ تعلیمی ہے، قانونی مشورہ نہیں۔ اپنی مخصوص صورتحال کے لیے کسی مستند وکیل سے مشورہ کریں۔** + + + +## اوپن سورس کیا ہے؟ {#what-is-open-source} + +ایک سافٹ ویئر لائسنس اس بات کا تعین کرتا ہے کہ دوسرے آپ کے کوڈ کو کیسے استعمال، تبدیل اور تقسیم کر سکتے ہیں۔ اوپن 
سورس لائسنس ان حقوق کو واضح طور پر دیتے ہیں، ملکیتی لائسنسوں کے برعکس جو ان پر پابندی لگاتے ہیں۔ + +[اوپن سورس انیشی ایٹو (Open Source Initiative)](https://opensource.org/osd) سرکاری اوپن سورس تعریف کو برقرار رکھتا ہے، جس کا تقاضا ہے کہ لائسنس مفت دوبارہ تقسیم، سورس کوڈ تک رسائی، اور ماخوذ کاموں کی اجازت دیں۔ + +_واضح لائسنسنگ کے بغیر_، ہر پروجیکٹ کو حسب ضرورت قانونی جائزے کی ضرورت ہوگی۔ ریپوزٹری میں `LICENSE` فائل بالکل یہ بتاتی ہے کہ کون سی اجازتیں دی گئی ہیں، بالکل اسی طرح جیسے `README.md` فائل پروجیکٹ کا مقصد بیان کرتی ہے۔ + +آپ یہ سمجھنے کے لیے کہ یہ کس چیز کی اجازت دیتا ہے، Choose a License پر کسی پروجیکٹ کے لائسنس کی تصدیق کر سکتے ہیں۔ + +![License comparison chart](/images/open-source/license-comparison.png) + +### چار آزادیاں {#the-four-freedoms} + +[فری سافٹ ویئر فاؤنڈیشن (Free Software Foundation)](https://www.fsf.org/about/what-is-free-software) چار ضروری آزادیوں کی وضاحت کرتی ہے: + +- **آزادی 0**: کسی بھی مقصد کے لیے پروگرام چلانے کی آزادی +- **آزادی 1**: یہ مطالعہ کرنے کی آزادی کہ پروگرام کیسے کام کرتا ہے اور اسے اپنانے کی آزادی +- **آزادی 2**: کاپیاں دوبارہ تقسیم کرنے کی آزادی +- **آزادی 3**: پروگرام کو بہتر بنانے اور بہتری جاری کرنے کی آزادی + +یہ آزادیاں تمام مفت اور اوپن سورس سافٹ ویئر (FOSS) لائسنسنگ کی بنیاد بناتی ہیں۔ + + + +مفت سافٹ ویئر میں لفظ "مفت" سے مراد آزادی ہے، قیمت نہیں۔ ملکیتی سافٹ ویئر مفت ہو سکتا ہے، اور مفت سافٹ ویئر تجارتی طور پر فروخت کیا جا سکتا ہے۔ تفصیلی وضاحت کے لیے [GNU فلسفہ](https://www.gnu.org/philosophy/free-sw.html) دیکھیں۔ + + + +## لائسنس کا انتخاب کرنا {#choosing-a-license} + +### کاپی لیفٹ لائسنس {#copyleft-licenses} + +`GPL-3.0` جیسے کاپی لیفٹ لائسنس اس بات کا تقاضا کرتے ہیں کہ ماخوذ کام وہی لائسنس استعمال کریں۔ یہ یقینی بناتا ہے کہ سافٹ ویئر اور تمام ترامیم مفت رہیں۔ `AGPL-3.0` اس ضرورت کو نیٹ ورک پر رسائی حاصل کرنے والے سافٹ ویئر تک بڑھاتا ہے۔ + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// یہ معاہدہ ایک سادہ رجسٹری کا مظاہرہ کرتا ہے +pragma solidity ^0.8.0; + +contract ProjectRegistry { + 
mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Name required"); + projects[msg.sender] = name; + } +} +``` + +کاپی لیفٹ کا بنیادی فائدہ یہ ہے کہ بہتری کو کمیونٹی کے ساتھ واپس شیئر کیا جانا چاہیے۔ اگر کوئی آپ کی GPL-لائسنس یافتہ لائبریری پر کام کرتا ہے، تو ان کی ترامیم بھی GPL-لائسنس یافتہ ہوتی ہیں۔ + + + +کاپی لیفٹ کا استعمال کریں جب: + +1. آپ یہ یقینی بنانا چاہتے ہیں کہ تمام ماخوذ کام اوپن سورس رہیں +2. آپ ایک لائبریری یا فریم ورک بنا رہے ہیں جسے دوسرے بڑھائیں گے +3. آپ اپنے کام کے ملکیتی فورکس کو روکنا چاہتے ہیں + +اس کا نقصان یہ ہے کہ کچھ کمپنیاں لائسنس کی "وائرل" نوعیت کی وجہ سے کاپی لیفٹ-لائسنس یافتہ انحصار سے گریز کرتی ہیں۔ + +کاپی لیفٹ کی تعمیل کے بارے میں عام سوالات کے لیے GPL FAQ دیکھیں۔ + + + +### اجازت دینے والے لائسنس {#permissive-licenses} + +`MIT` اور `Apache-2.0` جیسے اجازت دینے والے لائسنس ملکیتی ماخوذات کی اجازت دیتے ہیں۔ `BSD-2-Clause` کم از کم پابندیوں کے ساتھ ایک اور مقبول اجازت دینے والا آپشن ہے۔ + +```python +# مثال: کسی پروجیکٹ کی لائسنس فائل پڑھنا +def read_license(path: str) -> str: + """لائسنس فائل کے مندرجات پڑھیں اور واپس کریں۔""" + with open(path, "r") as f: + return f.read() + +# چیک کریں کہ آیا لائسنس اجازت دینے والا ہے +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +اجازت دینے والے لائسنسوں کا بنیادی فائدہ زیادہ سے زیادہ اپنانا ہے۔ کمپنیوں کے اجازت دینے والے لائسنس یافتہ پروجیکٹس کو استعمال کرنے اور ان میں حصہ ڈالنے کا زیادہ امکان ہوتا ہے کیونکہ اس بات پر کوئی پابندی نہیں ہوتی کہ کوڈ کو کیسے استعمال کیا جا سکتا ہے۔ + +آپ کسی بھی ہوسٹنگ پلیٹ فارم پر [GitHub](https://github.com/) کا استعمال کرتے ہوئے پروجیکٹس تعینات کر سکتے ہیں، اور [SPDX](https://spdx.org/) کے ساتھ تعمیل کی تصدیق کر سکتے ہیں۔ + +پروڈکشن میں تعینات کرنے سے پہلے سمارٹ کنٹریکٹس کی جانچ کرنے کے لیے بلاک ایکسپلورر کے ساتھ [Sepolia](https://sepolia.dev/) پر 
[Remix](https://remix.ethereum.org/) کا استعمال کریں۔ + +### موازنہ جدول {#comparison-table} + +| لائسنس | قسم | ماخوذ کام | پیٹنٹ گرانٹ | +|---------|------|-----------------|-------------| +| GPL-3.0 | کاپی لیفٹ | GPL ہونا چاہیے | ہاں | +| AGPL-3.0 | نیٹ ورک کاپی لیفٹ | AGPL ہونا چاہیے | ہاں | +| LGPL-3.0 | کمزور کاپی لیفٹ | لائبریری ملکیتی ہو سکتی ہے | ہاں | +| MIT | اجازت دینے والا | کوئی بھی لائسنس | نہیں | +| Apache-2.0 | اجازت دینے والا | کوئی بھی لائسنس | ہاں | +| BSD-2-Clause | اجازت دینے والا | کوئی بھی لائسنس | نہیں | +| MPL-2.0 | فائل لیول کاپی لیفٹ | تبدیل شدہ فائلیں MPL ہونی چاہئیں | ہاں | + +## کمیونٹی کا تعاون {#community-collaboration} + +### پروجیکٹس میں حصہ ڈالنا {#contributing-to-projects} + +اوپن سورس میں حصہ ڈالنے کا آغاز پروجیکٹ کے ورک فلو کو سمجھنے سے ہوتا ہے۔ زیادہ تر پروجیکٹس کام کو مربوط کرنے کے لیے ایشو ٹریکرز اور تبدیلیاں تجویز کرنے کے لیے پل ریکوئسٹس کا استعمال کرتے ہیں۔ + +آج ہی حصہ ڈالنا شروع کریں + + + اس پروجیکٹ میں کیسے حصہ ڈالیں + + + + ہماری کمیونٹی میں شامل ہوں + + +کوئی بھی حصہ ڈالنے سے پہلے، ہمیشہ گائیڈ لائنز کے لیے پروجیکٹ کی `CONTRIBUTING.md` فائل چیک کریں۔ کوڈ کا انداز، ٹیسٹ کی ضروریات، اور جائزے کے عمل پروجیکٹس کے درمیان مختلف ہوتے ہیں۔ + +```md +## پل ریکوئسٹ ٹیمپلیٹ + +**تفصیل:** تبدیلیوں کا مختصر خلاصہ +**متعلقہ ایشو:** اس ایشو کا لنک جسے یہ حل کرتا ہے +**ٹیسٹنگ:** اس کا ٹیسٹ کیسے کیا گیا؟ +``` + +### کوڈ کے جائزے کے بہترین طریقے {#code-review} + +باہمی پروجیکٹس میں معیار کو برقرار رکھنے کے لیے کوڈ کا جائزہ لینا ضروری ہے۔ جائزہ لینے والوں کو درستگی، انداز کی مستقل مزاجی، اور ممکنہ سیکیورٹی مسائل کی جانچ کرنی چاہیے۔ + + + + + + کوڈ کے اچھے جائزے صرف فارمیٹنگ پر نہیں بلکہ منطق اور ڈیزائن پر توجہ مرکوز کرتے ہیں۔ انداز کے نفاذ کے لیے لنٹرز جیسے خودکار ٹولز کا استعمال کریں، اور انسانی جائزے کو آرکیٹیکچرل فیصلوں اور غیر معمولی صورتحال (edge cases) کے لیے مخصوص رکھیں۔ + + + + + + +## تعمیل اور آڈیٹنگ {#compliance-and-auditing} + +اوپن سورس سافٹ ویئر استعمال کرنے والی تنظیموں کو اپنے انحصار کو ٹریک کرنا چاہیے اور لائسنس کی 
تعمیل کو یقینی بنانا چاہیے1۔ [FOSSA](https://fossa.com/) اور [Snyk](https://snyk.io/) جیسے ٹولز اس عمل کو خودکار بنا سکتے ہیں۔ + + + + + + +### لائسنس اسکیننگ {#license-scanning} + +خودکار لائسنس اسکیننگ ہر CI/CD پائپ لائن کا حصہ ہونی چاہیے۔ یہ غیر مطابقت پذیر لائسنسوں کو آپ کے انحصار کے درخت (dependency tree) میں داخل ہونے سے پہلے پکڑ لیتی ہے۔ + +```bash +# اپنے پروجیکٹ پر لائسنس اسکین چلائیں +npx license-checker --production --json > licenses.json + +# پروڈکشن کے انحصار میں کاپی لیفٹ لائسنس چیک کریں +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +انحصار کو اپ ڈیٹ رکھنے اور لائسنس کی تبدیلیوں کی نگرانی کے لیے Dependabot استعمال کرنے پر غور کریں۔ + +### SBOM جنریشن {#sbom-generation} + +ایک سافٹ ویئر بل آف میٹریلز (SBOM) آپ کے سافٹ ویئر کے تمام اجزاء کی فہرست بناتا ہے۔ SBOM تیار کرنا ریگولیٹری تعمیل کے لیے تیزی سے ضروری ہوتا جا رہا ہے، خاص طور پر سیکیورٹی کے لحاظ سے حساس صنعتوں میں۔ + + + +### کلیدی اصطلاحات {#key-terms} + + + + + +لائسنسنگ کے بارے میں باخبر فیصلے کرنے کے لیے ان اصطلاحات کو سمجھنا ضروری ہے۔ + + + +
    + +اضافی تعریفوں کے لیے مکمل [اصطلاحات کی فرہنگ](/glossary/) کا جائزہ لیں۔ یہ وسیلہ کمیونٹی کے ذریعے برقرار رکھا جاتا ہے اور اسے باقاعدگی سے اپ ڈیٹ کیا جاتا ہے۔ + +
    +
    + +## مزید مطالعہ {#further-reading} + +_یہ گائیڈ [اوپن سورس انیشی ایٹو](https://opensource.org/) اور [فری سافٹ ویئر فاؤنڈیشن](https://www.fsf.org/) کے مواد سے اخذ کی گئی ہے۔_ + +- [لائسنس گائیڈ کا انتخاب کریں](https://choosealicense.com/) - _آپ کے پروجیکٹ کے لیے صحیح لائسنس چننے میں مدد کرنے والا ایک سادہ ٹول_ +- [SPDX لائسنس کی فہرست](https://spdx.org/licenses/) - _500 سے زیادہ اوپن سورس لائسنسوں کے لیے معیاری شناخت کنندگان_ +- [اوپن سورس گائیڈ](https://opensource.guide/) - _پروجیکٹس چلانے اور ان میں حصہ ڈالنے کے لیے کمیونٹی کے زیر انتظام وسائل_ +- [FOSSA](https://fossa.com/) پر تعمیل کی ٹولنگ - _خودکار لائسنس اسکیننگ اور انحصار کا انتظام_ \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-b/es/fixture.json b/tests/fixtures/incremental/locale-b/es/fixture.json new file mode 100644 index 00000000000..b231b0d4cd5 --- /dev/null +++ b/tests/fixtures/incremental/locale-b/es/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "Licencias de código abierto", + "page-description": "Descubre las licencias de código abierto, los requisitos de cumplimiento y cómo elegir la adecuada.", + "hero-cta-primary": "Comenzar", + "hero-cta-secondary": "Ver comparación de licencias", + "stat-label-projects": "Proyectos de código abierto rastreados", + "stat-label-contributors": "Colaboradores activos en todo el mundo", + "banner-text": "¿Eres nuevo en el código abierto? Empieza con nuestra guía de contribución.", + "footer-note": "Datos de licencias obtenidos de SPDX. 
Herramientas de cumplimiento proporcionadas por FOSSA.", + "filter-label": "Filtrar por tipo de licencia", + "welcome-user": "¡Te damos la bienvenida de nuevo, {displayName}!", + "project-count": "{total, plural, =0 {No se encontraron proyectos} one {# proyecto encontrado} other {# proyectos encontrados}}", + "contribution-status": "{status, select, pending {Tu contribución está pendiente de revisión} approved {Tu contribución ha sido aprobada} rejected {Tu contribución necesita revisiones} other {Estado desconocido}}", + "nested": { + "section-title": "Recursos adicionales", + "section-description": "Explora más sobre las licencias de código abierto y su cumplimiento.", + "link-text": "Explorar todos los recursos" + }, + "multi-link": "Lee la Definición de código abierto, revisa el selector de licencias y consulta los identificadores SPDX.", + "markdown-description": "## Referencia rápida\n\n- **MIT**: La más permisiva, restricciones mínimas\n- **Apache-2.0**: Permisiva con concesión de patentes\n- **GPL-3.0**: Copyleft fuerte, los derivados deben ser GPL\n\nPara obtener más detalles, consulta la [comparación completa](/open-source/#comparison-table).", + "new-key": "Esta es una cadena de traducción completamente nueva añadida en la versión B." 
+} \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-b/es/fixture.md b/tests/fixtures/incremental/locale-b/es/fixture.md new file mode 100644 index 00000000000..6aafdd7c0e5 --- /dev/null +++ b/tests/fixtures/incremental/locale-b/es/fixture.md @@ -0,0 +1,231 @@ +--- +title: "Comprender las licencias de código abierto" +description: "Una guía exhaustiva sobre licencias de código abierto, cumplimiento y colaboración" +image: /images/open-source/hero-licensing-v2.png +alt: "Diagrama que muestra diferentes tipos de licencias" +template: tutorial +lang: es +published: 2025-06-15 +tags: ["código abierto", "licencias", "cumplimiento"] +summaryPoints: + - Las licencias de código abierto definen cómo se puede usar, modificar y compartir el código + - Las licencias copyleft requieren que las obras derivadas sigan siendo de código abierto + - Las licencias permisivas permiten el uso privativo con restricciones mínimas +--- + +# Comprender las licencias de código abierto {#understanding-open-source-licensing} + +El software de código abierto se basa en el principio de que el código debe compartirse, estudiarse y mejorarse libremente. Esta guía cubre las principales familias de licencias, cómo elegir entre ellas y las mejores prácticas para el desarrollo colaborativo. + +**Recuerde: las licencias son un asunto legal. Esta guía es educativa, no asesoramiento legal. Consulte a un abogado calificado para su situación específica.** + + +## ¿Qué es el código abierto? {#what-is-open-source} + +Una licencia de software determina cómo otros pueden usar, modificar y distribuir su código. Las licencias de código abierto otorgan estos derechos explícitamente, a diferencia de las licencias privativas que los restringen. + +La [Open Source Initiative](https://opensource.org/osd/annotated) mantiene la Definición de Código Abierto oficial, que requiere que las licencias permitan la libre redistribución, el acceso al código fuente y las obras derivadas. 
+ +_Sin licencias claras_, cada proyecto necesitaría una revisión legal personalizada. El archivo `LICENSE.md` en un repositorio señala exactamente qué permisos se otorgan, de manera similar a cómo el archivo `README.md` explica el propósito del proyecto. + +Puede verificar la licencia de un proyecto en Choose a License para comprender qué permite. + +![License comparison chart](/images/open-source/license-comparison-v2.png) + +### Las cuatro libertades {#the-four-freedoms} + +La [Free Software Foundation](https://www.fsf.org/about/what-is-free-software) define cuatro libertades esenciales: + +- **Libertad 0**: La libertad de ejecutar el programa para cualquier propósito +- **Libertad 1**: La libertad de estudiar cómo funciona el programa y adaptarlo +- **Libertad 2**: La libertad de redistribuir copias +- **Libertad 3**: La libertad de mejorar el programa y publicar las mejoras + +Estas cuatro libertades son la base filosófica de todo el movimiento del software libre y de código abierto (FOSS). + + + +La palabra "free" (libre/gratis) en el software libre se refiere a la libertad, no al costo. El software privativo puede ser gratuito y el software libre puede venderse comercialmente. Consulte la [filosofía de GNU](https://www.gnu.org/philosophy/free-sw.html) para obtener una explicación detallada. + + + +## Elegir una licencia {#choosing-a-license} + +### Licencias copyleft {#copyleft-licenses} + +Las licencias copyleft como `GPL-3.0` requieren que las obras derivadas utilicen la misma licencia. Esto garantiza que el software y todas las modificaciones sigan siendo libres. La `AGPL-3.0` extiende este requisito al software al que se accede a través de una red. 
+ +```solidity +// SPDX-License-Identifier: GPL-3.0 +// Este contrato implementa un registro básico de proyectos +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Project name cannot be empty"); + projects[msg.sender] = name; + } +} +``` + +La ventaja clave del copyleft es que las mejoras deben compartirse con la comunidad. Si alguien desarrolla sobre su biblioteca con licencia GPL, sus modificaciones también tendrán licencia GPL. + + + +Use copyleft cuando: + +1. Desee asegurarse de que todas las obras derivadas sigan siendo de código abierto +2. Esté construyendo una biblioteca o marco de trabajo (framework) que otros ampliarán +3. Desee evitar bifurcaciones (forks) privativas de su trabajo + +La desventaja es que algunas empresas evitan las dependencias con licencia copyleft debido a la naturaleza "viral" de la licencia. + +Consulte las preguntas frecuentes sobre la GPL para conocer las dudas comunes sobre el cumplimiento del copyleft. + + + +### Licencias permisivas {#permissive-licenses} + +Las licencias permisivas como `MIT` y `Apache-2.0` permiten derivados privativos. La `BSD-2-Clause` es otra opción permisiva popular con restricciones mínimas. + +```python +# Ejemplo: cargar la licencia de un proyecto desde el disco +def read_license(path: str) -> str: + """Leer y devolver el contenido de un archivo LICENSE.""" + with open(path, "r") as f: + return f.read() + +# Verificar si la licencia es permisiva +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +La principal ventaja de las licencias permisivas es la máxima adopción. Es más probable que las empresas utilicen y contribuyan a proyectos con licencias permisivas porque no hay restricciones sobre cómo se puede usar el código. 
+ +Puede desplegar proyectos usando [Repositorios de GitHub](https://github.com/new) en cualquier plataforma de alojamiento, y verificar el cumplimiento con [SPDX](https://spdx.org/). + +Use [Remix](https://remix.ethereum.org/) en [Holesky](https://holesky.dev/) con un [explorador de bloques](https://eth.blockscout.com/) para probar [contratos inteligentes](/glossary/#smart-contract) antes de desplegar en producción. + +### Tabla comparativa {#comparison-table} + +| Licencia | Tipo | Obras derivadas | Concesión de patentes | +|---------|------|-----------------|-------------| +| GPL-3.0 | Copyleft | Debe ser GPL | Sí | +| AGPL-3.0 | Copyleft de red | Debe ser AGPL | Sí | +| LGPL-3.0 | Copyleft débil | La biblioteca puede ser privativa | Sí | +| MIT | Permisiva | Cualquier licencia | No | +| Apache-2.0 | Permisiva | Cualquier licencia | Sí | +| BSD-2-Clause | Permisiva | Cualquier licencia | No | +| MPL-2.0 | Copyleft a nivel de archivo | Los archivos modificados deben ser MPL | Sí | + +## Cumplimiento y auditoría {#compliance-and-auditing} + +Las organizaciones que utilizan software de código abierto deben rastrear sus dependencias y garantizar el cumplimiento de las licencias1. Herramientas como [FOSSA](https://fossa.com/) y [Snyk](https://snyk.io/) pueden automatizar este proceso. + + + + + + +### Escaneo de licencias {#license-scanning} + +El escaneo automatizado de licencias debe ser parte de cada canalización (pipeline) de CI/CD. Detecta licencias incompatibles antes de que ingresen a su árbol de dependencias. + +```bash +# Ejecutar un escaneo de licencias en tu proyecto +npx license-checker --production --json > licenses.json + +# Verificar si hay licencias copyleft en las dependencias de producción +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +Considere usar Dependabot para mantener las dependencias actualizadas y monitorear los cambios de licencia. 
+ +```json +{ + "license-scan": { + "production": true, + "allowed": ["MIT", "Apache-2.0", "BSD-2-Clause"] + } +} +``` + +### Generación de SBOM {#sbom-generation} + +Una lista de materiales de software (SBOM, por sus siglas en inglés) enumera todos los componentes de su software. La generación de un SBOM es cada vez más necesaria para el cumplimiento normativo, especialmente en industrias sensibles a la seguridad. + + + +### Términos clave {#key-terms} + + + + + +Comprender estos términos es esencial para tomar decisiones informadas sobre las licencias. + + + +
    + +Revise el [glosario de términos](/glossary/) completo para obtener definiciones adicionales. Este recurso es mantenido por la comunidad y se actualiza regularmente. + +
    +
    + +## Colaboración comunitaria {#community-collaboration} + +### Contribuir a proyectos {#how-to-contribute} + +Contribuir al código abierto comienza por comprender el flujo de trabajo del proyecto. La mayoría de los proyectos utilizan rastreadores de problemas (issue trackers) para coordinar el trabajo y solicitudes de extracción (pull requests) para proponer cambios. + +Comience a contribuir hoy + + + Cómo contribuir a este proyecto + + + + Únase a nuestra comunidad + + +Antes de enviar una contribución, siempre revise el archivo `CONTRIBUTING.md` del proyecto para conocer las pautas. El estilo de código, los requisitos de prueba y los procesos de revisión varían entre proyectos. + +```md +## Plantilla de Pull Request + +**Descripción:** Breve resumen de los cambios +**Problema relacionado:** Enlace al problema que esto aborda +**Pruebas:** ¿Cómo se probó esto? +``` + +### Mejores prácticas para la revisión de código {#code-review} + +La revisión de código es esencial para mantener la calidad en proyectos colaborativos. Los revisores deben verificar la corrección, la consistencia del estilo y los posibles problemas de seguridad. + + + + + + Las buenas revisiones de código se centran en la lógica y el diseño, no solo en el formato. Utilice herramientas automatizadas como linters para hacer cumplir el estilo, y reserve la revisión humana para decisiones arquitectónicas y casos extremos. + + + + + + +## Licencia dual {#dual-licensing} + +Algunos proyectos ofrecen su código bajo dos licencias simultáneamente. Esto permite a los usuarios comerciales comprar una licencia privativa mientras mantienen la versión de código abierto disponible bajo términos copyleft. Proyectos como [Qt](https://www.qt.io/licensing/) y [MySQL](https://www.mysql.com/about/legal/licensing/) utilizan este modelo. 
+ +## Lecturas adicionales {#further-reading} + +_Esta guía está adaptada de materiales de la [Open Source Initiative](https://opensource.org/) y la [Free Software Foundation](https://www.fsf.org/)._ + +- [Guía Choose a License](https://choosealicense.com/) - _Herramienta sencilla para ayudar a elegir la licencia adecuada para su proyecto_ +- [Lista de licencias SPDX](https://spdx.org/licenses/) - _Identificadores estandarizados para más de 500 licencias de código abierto_ +- [Guía de código abierto](https://opensource.guide/) - _Recursos mantenidos por la comunidad para ejecutar y contribuir a proyectos_ +- Herramientas de cumplimiento en [FOSSA](https://fossa.com/) - _Escaneo automatizado de licencias y gestión de dependencias_ \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-b/ko/fixture.json b/tests/fixtures/incremental/locale-b/ko/fixture.json new file mode 100644 index 00000000000..76e7aabbcf8 --- /dev/null +++ b/tests/fixtures/incremental/locale-b/ko/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "오픈 소스 라이선스", + "page-description": "오픈 소스 라이선스, 규정 준수 요건 및 올바른 라이선스를 선택하는 방법을 알아보세요.", + "hero-cta-primary": "시작하기", + "hero-cta-secondary": "라이선스 비교 보기", + "stat-label-projects": "추적된 오픈 소스 프로젝트", + "stat-label-contributors": "전 세계 활성 기여자", + "banner-text": "오픈 소스가 처음이신가요? 기여 가이드시작해 보세요.", + "footer-note": "라이선스 데이터는 SPDX에서 제공합니다. 
규정 준수 도구는 FOSSA에서 제공합니다.", + "filter-label": "라이선스 유형별 필터링", + "welcome-user": "다시 오신 것을 환영합니다, {displayName}님!", + "project-count": "{total, plural, =0 {프로젝트를 찾을 수 없습니다} one {#개의 프로젝트를 찾았습니다} other {#개의 프로젝트를 찾았습니다}}", + "contribution-status": "{status, select, pending {기여하신 내용이 검토 대기 중입니다} approved {기여하신 내용이 승인되었습니다} rejected {기여하신 내용에 수정이 필요합니다} other {알 수 없는 상태}}", + "nested": { + "section-title": "추가 자료", + "section-description": "오픈 소스 라이선스 및 규정 준수에 대해 자세히 알아보세요.", + "link-text": "모든 자료 찾아보기" + }, + "multi-link": "오픈 소스 정의를 읽고, 라이선스 선택기를 검토하며, SPDX 식별자를 확인하세요.", + "markdown-description": "## 빠른 참조\n\n- **MIT**: 가장 관대하며 최소한의 제한\n- **Apache-2.0**: 특허 부여가 포함된 관대한 라이선스\n- **GPL-3.0**: 강력한 카피레프트, 파생물은 GPL이어야 함\n\n자세한 내용은 [전체 비교](/open-source/#comparison-table)를 참조하세요.", + "new-key": "버전 B에 추가된 완전히 새로운 번역 문자열입니다." +} \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-b/ko/fixture.md b/tests/fixtures/incremental/locale-b/ko/fixture.md new file mode 100644 index 00000000000..17ec8e5b0cc --- /dev/null +++ b/tests/fixtures/incremental/locale-b/ko/fixture.md @@ -0,0 +1,231 @@ +--- +title: "오픈 소스 라이선스 이해하기" +description: "오픈 소스 라이선스, 규정 준수 및 협업에 대한 종합 가이드" +image: /images/open-source/hero-licensing-v2.png +alt: "다양한 라이선스 유형을 보여주는 다이어그램" +template: tutorial +lang: ko +published: 2025-06-15 +tags: ["오픈 소스", "라이선스", "규정 준수"] +summaryPoints: + - 오픈 소스 라이선스는 코드를 사용, 수정 및 공유하는 방법을 정의합니다. + - 카피레프트 라이선스는 파생 저작물도 오픈 소스로 유지할 것을 요구합니다. + - 퍼미시브 라이선스는 최소한의 제한으로 독점적 사용을 허용합니다. +--- + +# 오픈 소스 라이선스 이해하기 {#understanding-open-source-licensing} + +오픈 소스 소프트웨어는 코드를 자유롭게 공유하고, 연구하며, 개선해야 한다는 원칙을 바탕으로 구축됩니다. 이 가이드에서는 주요 라이선스 제품군, 라이선스 선택 방법 및 협업 개발을 위한 모범 사례를 다룹니다. + +**기억하세요: 라이선스는 법적인 문제입니다. 이 가이드는 교육용이며 법적 조언이 아닙니다. 구체적인 상황에 대해서는 자격을 갖춘 변호사와 상담하세요.** + + +## 오픈 소스란 무엇인가요? {#what-is-open-source} + +소프트웨어 라이선스는 다른 사람이 코드를 사용, 수정 및 배포하는 방법을 결정합니다. 이를 제한하는 독점 라이선스와 달리, 오픈 소스 라이선스는 이러한 권리를 명시적으로 부여합니다. 
+ +[오픈 소스 이니셔티브(Open Source Initiative)](https://opensource.org/osd/annotated)는 공식 오픈 소스 정의를 유지 관리하며, 이 정의에 따르면 라이선스는 자유로운 재배포, 소스 코드에 대한 접근 및 파생 저작물을 허용해야 합니다. + +_명확한 라이선스가 없다면_ 모든 프로젝트는 맞춤형 법적 검토가 필요할 것입니다. 리포지토리의 `LICENSE.md` 파일은 `README.md` 파일이 프로젝트의 목적을 설명하는 것과 유사하게 어떤 권한이 부여되는지 정확히 알려줍니다. + +Choose a License에서 프로젝트의 라이선스를 확인하여 허용되는 사항을 이해할 수 있습니다. + +![License comparison chart](/images/open-source/license-comparison-v2.png) + +### 4가지 자유 {#the-four-freedoms} + +[자유 소프트웨어 재단(Free Software Foundation)](https://www.fsf.org/about/what-is-free-software)은 4가지 필수적인 자유를 정의합니다: + +- **자유 0**: 어떤 목적으로든 프로그램을 실행할 수 있는 자유 +- **자유 1**: 프로그램의 작동 방식을 연구하고 수정할 수 있는 자유 +- **자유 2**: 복사본을 재배포할 수 있는 자유 +- **자유 3**: 프로그램을 개선하고 그 개선 사항을 배포할 수 있는 자유 + +이 4가지 자유는 전체 자유 오픈 소스 소프트웨어(FOSS) 운동의 철학적 기반입니다. + + + +자유 소프트웨어(free software)에서 "자유(free)"라는 단어는 비용이 아니라 자유(liberty)를 의미합니다. 독점 소프트웨어도 무료일 수 있으며, 자유 소프트웨어도 상업적으로 판매될 수 있습니다. 자세한 설명은 [GNU 철학](https://www.gnu.org/philosophy/free-sw.html)을 참조하세요. + + + +## 라이선스 선택하기 {#choosing-a-license} + +### 카피레프트 라이선스 {#copyleft-licenses} + +`GPL-3.0`와 같은 카피레프트 라이선스는 파생 저작물도 동일한 라이선스를 사용하도록 요구합니다. 이를 통해 소프트웨어와 모든 수정 사항이 자유롭게 유지되도록 보장합니다. `AGPL-3.0`는 이 요구 사항을 네트워크를 통해 접근하는 소프트웨어로 확장합니다. + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// 이 컨트랙트는 기본 프로젝트 레지스트리를 구현합니다 +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Project name cannot be empty"); + projects[msg.sender] = name; + } +} +``` + +카피레프트의 주요 장점은 개선 사항을 커뮤니티와 다시 공유해야 한다는 것입니다. 누군가 귀하의 GPL 라이선스 라이브러리를 기반으로 개발한다면, 그들의 수정 사항 역시 GPL 라이선스를 따라야 합니다. + + + +다음과 같은 경우 카피레프트를 사용하세요: + +1. 모든 파생 저작물이 오픈 소스로 유지되도록 보장하고 싶을 때 +2. 다른 사람들이 확장할 라이브러리나 프레임워크를 구축할 때 +3. 작업물의 독점적인 포크(fork)를 방지하고 싶을 때 + +단점은 일부 기업들이 라이선스의 "바이러스성(viral)" 특성 때문에 카피레프트 라이선스가 적용된 종속성을 피한다는 것입니다. + +카피레프트 규정 준수에 대한 일반적인 질문은 GPL FAQ를 확인하세요. 
+ + + +### 퍼미시브 라이선스 {#permissive-licenses} + +`MIT` 및 `Apache-2.0`와 같은 퍼미시브 라이선스는 독점적인 파생물을 허용합니다. `BSD-2-Clause`는 제한이 최소화된 또 다른 인기 있는 퍼미시브 옵션입니다. + +```python +# 예시: 디스크에서 프로젝트의 라이선스 불러오기 +def read_license(path: str) -> str: + """LICENSE 파일의 내용을 읽고 반환합니다.""" + with open(path, "r") as f: + return f.read() + +# 라이선스가 허용적인지 확인합니다 +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +퍼미시브 라이선스의 주요 장점은 채택을 극대화할 수 있다는 것입니다. 코드 사용 방법에 대한 제한이 없기 때문에 기업들이 퍼미시브 라이선스 프로젝트를 사용하고 기여할 가능성이 더 높습니다. + +[GitHub 리포지토리](https://github.com/new)를 사용하여 모든 호스팅 플랫폼에 프로젝트를 배포할 수 있으며, [SPDX](https://spdx.org/)를 통해 규정 준수를 확인할 수 있습니다. + +프로덕션 환경에 배포하기 전에 [Holesky](https://holesky.dev/)에서 [블록 탐색기](https://eth.blockscout.com/)와 함께 [Remix](https://remix.ethereum.org/)를 사용하여 [스마트 컨트랙트](/glossary/#smart-contract)를 테스트하세요. + +### 비교표 {#comparison-table} + +| 라이선스 | 유형 | 파생 저작물 | 특허 부여 | +|---------|------|-----------------|-------------| +| GPL-3.0 | 카피레프트 | GPL이어야 함 | 예 | +| AGPL-3.0 | 네트워크 카피레프트 | AGPL이어야 함 | 예 | +| LGPL-3.0 | 약한 카피레프트 | 라이브러리는 독점일 수 있음 | 예 | +| MIT | 퍼미시브 | 모든 라이선스 | 아니요 | +| Apache-2.0 | 퍼미시브 | 모든 라이선스 | 예 | +| BSD-2-Clause | 퍼미시브 | 모든 라이선스 | 아니요 | +| MPL-2.0 | 파일 수준 카피레프트 | 수정된 파일은 MPL이어야 함 | 예 | + +## 규정 준수 및 감사 {#compliance-and-auditing} + +오픈 소스 소프트웨어를 사용하는 조직은 종속성을 추적하고 라이선스 규정 준수를 보장해야 합니다1. [FOSSA](https://fossa.com/) 및 [Snyk](https://snyk.io/)와 같은 도구는 이 프로세스를 자동화할 수 있습니다. + + + + + + +### 라이선스 스캐닝 {#license-scanning} + +자동화된 라이선스 스캐닝은 모든 CI/CD 파이프라인의 일부여야 합니다. 이는 호환되지 않는 라이선스가 종속성 트리에 들어가기 전에 잡아냅니다. + +```bash +# 프로젝트에서 라이선스 스캔을 실행합니다 +npx license-checker --production --json > licenses.json + +# 프로덕션 의존성에서 카피레프트 라이선스를 확인합니다 +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +종속성을 최신 상태로 유지하고 라이선스 변경 사항을 모니터링하려면 Dependabot 사용을 고려해 보세요. 
+ +```json +{ + "license-scan": { + "production": true, + "allowed": ["MIT", "Apache-2.0", "BSD-2-Clause"] + } +} +``` + +### SBOM 생성 {#sbom-generation} + +소프트웨어 자재 명세서(SBOM)는 소프트웨어의 모든 구성 요소를 나열합니다. 특히 보안에 민감한 산업에서는 규정 준수를 위해 SBOM 생성이 점점 더 요구되고 있습니다. + + + +### 주요 용어 {#key-terms} + + + + + +라이선스에 대해 정보에 입각한 결정을 내리려면 이러한 용어를 이해하는 것이 필수적입니다. + + + +
    + +추가적인 정의는 전체 [용어집](/glossary/)을 검토하세요. 이 리소스는 커뮤니티에서 유지 관리하며 정기적으로 업데이트됩니다. + +
    +
    + +## 커뮤니티 협업 {#community-collaboration} + +### 프로젝트에 기여하기 {#how-to-contribute} + +오픈 소스에 기여하는 것은 프로젝트의 워크플로우를 이해하는 것에서 시작됩니다. 대부분의 프로젝트는 작업을 조율하기 위해 이슈 트래커를 사용하고 변경 사항을 제안하기 위해 풀 리퀘스트(pull request)를 사용합니다. + +오늘 바로 기여를 시작하세요 + + + 이 프로젝트에 기여하는 방법 + + + + 커뮤니티 참여하기 + + +기여를 제출하기 전에 항상 프로젝트의 `CONTRIBUTING.md` 파일에서 가이드라인을 확인하세요. 코드 스타일, 테스트 요구 사항 및 검토 프로세스는 프로젝트마다 다릅니다. + +```md +## 풀 리퀘스트 템플릿 + +**설명:** 변경 사항에 대한 간략한 요약 +**관련 이슈:** 이 작업이 해결하는 이슈 링크 +**테스트:** 어떻게 테스트되었나요? +``` + +### 코드 리뷰 모범 사례 {#code-review} + +코드 리뷰는 협업 프로젝트에서 품질을 유지하는 데 필수적입니다. 리뷰어는 정확성, 스타일 일관성 및 잠재적인 보안 문제를 확인해야 합니다. + + + + + + 좋은 코드 리뷰는 단순한 포맷팅이 아니라 로직과 설계에 중점을 둡니다. 스타일 적용을 위해서는 린터(linter)와 같은 자동화된 도구를 사용하고, 아키텍처 결정 및 엣지 케이스에 대해서는 사람의 검토를 위해 남겨두세요. + + + + + + +## 이중 라이선스 {#dual-licensing} + +일부 프로젝트는 코드를 두 가지 라이선스로 동시에 제공합니다. 이를 통해 상업적 사용자는 독점 라이선스를 구매할 수 있으며, 오픈 소스 버전은 카피레프트 조건에 따라 계속 사용할 수 있습니다. [Qt](https://www.qt.io/licensing/) 및 [MySQL](https://www.mysql.com/about/legal/licensing/)과 같은 프로젝트가 이 모델을 사용합니다. 
+ +## 더 읽어보기 {#further-reading} + +_이 가이드는 [오픈 소스 이니셔티브(Open Source Initiative)](https://opensource.org/) 및 [자유 소프트웨어 재단(Free Software Foundation)](https://www.fsf.org/)의 자료를 바탕으로 작성되었습니다._ + +- [Choose a License 가이드](https://choosealicense.com/) - _프로젝트에 적합한 라이선스를 선택하는 데 도움이 되는 간단한 도구_ +- [SPDX 라이선스 목록](https://spdx.org/licenses/) - _500개 이상의 오픈 소스 라이선스에 대한 표준화된 식별자_ +- [오픈 소스 가이드](https://opensource.guide/) - _프로젝트 운영 및 기여를 위해 커뮤니티에서 유지 관리하는 리소스_ +- [FOSSA](https://fossa.com/)의 규정 준수 도구 - _자동화된 라이선스 스캐닝 및 종속성 관리_ \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-b/ur/fixture.json b/tests/fixtures/incremental/locale-b/ur/fixture.json new file mode 100644 index 00000000000..5e3789d47c8 --- /dev/null +++ b/tests/fixtures/incremental/locale-b/ur/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "اوپن سورس لائسنسنگ", + "page-description": "اوپن سورس لائسنسز، تعمیل کے تقاضوں، اور صحیح لائسنس کا انتخاب کرنے کا طریقہ دریافت کریں۔", + "hero-cta-primary": "شروع کریں", + "hero-cta-secondary": "لائسنس کا موازنہ دیکھیں", + "stat-label-projects": "ٹریک کیے گئے اوپن سورس پروجیکٹس", + "stat-label-contributors": "دنیا بھر میں فعال حصہ دار", + "banner-text": "اوپن سورس میں نئے ہیں؟ ہماری شراکت کی گائیڈ کے ساتھ شروع کریں۔", + "footer-note": "لائسنس کا ڈیٹا SPDX سے حاصل کیا گیا ہے۔ تعمیل کے ٹولز FOSSA کی طرف سے فراہم کیے گئے ہیں۔", + "filter-label": "لائسنس کی قسم کے لحاظ سے فلٹر کریں", + "welcome-user": "خوش آمدید، {displayName}!", + "project-count": "{total, plural, =0 {کوئی پروجیکٹ نہیں ملا} one {# پروجیکٹ ملا} other {# پروجیکٹس ملے}}", + "contribution-status": "{status, select, pending {آپ کی شراکت کا جائزہ زیر التوا ہے} approved {آپ کی شراکت منظور کر لی گئی ہے} rejected {آپ کی شراکت میں ترامیم کی ضرورت ہے} other {نامعلوم حیثیت}}", + "nested": { + "section-title": "اضافی وسائل", + "section-description": "اوپن سورس لائسنسنگ اور تعمیل کے بارے میں مزید دریافت کریں۔", + "link-text": "تمام وسائل براؤز کریں" + }, + "multi-link": "اوپن سورس کی تعریف پڑھیں، 
لائسنس کا انتخاب کرنے والے کا جائزہ لیں، اور SPDX شناخت کنندگان کو چیک کریں۔", + "markdown-description": "## فوری حوالہ\n\n- **MIT**: سب سے زیادہ اجازت دینے والا، کم از کم پابندیاں\n- **Apache-2.0**: پیٹنٹ گرانٹ کے ساتھ اجازت دینے والا\n- **GPL-3.0**: مضبوط کاپی لیفٹ، مشتقات کا GPL ہونا لازمی ہے\n\nمزید تفصیلات کے لیے، [مکمل موازنہ](/open-source/#comparison-table) دیکھیں۔", + "new-key": "یہ ورژن B میں شامل کی گئی ایک بالکل نئی ترجمہ شدہ سٹرنگ ہے۔" +} \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-b/ur/fixture.md b/tests/fixtures/incremental/locale-b/ur/fixture.md new file mode 100644 index 00000000000..4492f7a5f43 --- /dev/null +++ b/tests/fixtures/incremental/locale-b/ur/fixture.md @@ -0,0 +1,231 @@ +--- +title: "اوپن سورس لائسنسنگ کو سمجھنا" +description: "اوپن سورس لائسنسز، تعمیل، اور تعاون کے لیے ایک جامع گائیڈ" +image: /images/open-source/hero-licensing-v2.png +alt: "مختلف لائسنس کی اقسام کو ظاہر کرنے والا خاکہ" +template: tutorial +lang: ur +published: 2025-06-15 +tags: ["اوپن سورس", "لائسنسنگ", "تعمیل"] +summaryPoints: + - اوپن سورس لائسنس اس بات کا تعین کرتے ہیں کہ کوڈ کو کس طرح استعمال، تبدیل اور شیئر کیا جا سکتا ہے + - کاپی لیفٹ لائسنسز کا تقاضا ہے کہ ماخوذ کام اوپن سورس ہی رہیں + - اجازت دینے والے لائسنس کم سے کم پابندیوں کے ساتھ ملکیتی استعمال کی اجازت دیتے ہیں +--- + +# اوپن سورس لائسنسنگ کو سمجھنا {#understanding-open-source-licensing} + +اوپن سورس سافٹ ویئر اس اصول پر بنایا گیا ہے کہ کوڈ کو آزادانہ طور پر شیئر کیا جانا چاہیے، اس کا مطالعہ کیا جانا چاہیے اور اسے بہتر بنایا جانا چاہیے۔ یہ گائیڈ لائسنس کے بڑے خاندانوں، ان کے درمیان انتخاب کرنے کے طریقے، اور باہمی تعاون پر مبنی ترقی کے لیے بہترین طریقوں کا احاطہ کرتی ہے۔ + +**یاد رکھیں: لائسنسنگ ایک قانونی معاملہ ہے۔ یہ گائیڈ تعلیمی ہے، قانونی مشورہ نہیں۔ اپنی مخصوص صورتحال کے لیے کسی مستند وکیل سے مشورہ کریں۔** + + +## اوپن سورس کیا ہے؟ {#what-is-open-source} + +ایک سافٹ ویئر لائسنس اس بات کا تعین کرتا ہے کہ دوسرے آپ کے کوڈ کو کس طرح استعمال، تبدیل اور تقسیم کر سکتے ہیں۔ اوپن 
سورس لائسنس واضح طور پر یہ حقوق دیتے ہیں، ملکیتی لائسنسز کے برعکس جو ان پر پابندی لگاتے ہیں۔ + +[Open Source Initiative](https://opensource.org/osd/annotated) آفیشل اوپن سورس ڈیفینیشن کو برقرار رکھتی ہے، جس کا تقاضا ہے کہ لائسنس مفت دوبارہ تقسیم، سورس کوڈ تک رسائی، اور ماخوذ کاموں کی اجازت دیں۔ + +_واضح لائسنسنگ کے بغیر_، ہر پروجیکٹ کو کسٹم قانونی جائزے کی ضرورت ہوگی۔ ریپوزٹری میں موجود `LICENSE.md` فائل بالکل واضح کرتی ہے کہ کون سی اجازتیں دی گئی ہیں، بالکل اسی طرح جیسے `README.md` فائل پروجیکٹ کا مقصد بیان کرتی ہے۔ + +آپ یہ سمجھنے کے لیے کہ یہ کس چیز کی اجازت دیتا ہے، Choose a License پر کسی پروجیکٹ کے لائسنس کی تصدیق کر سکتے ہیں۔ + +![License comparison chart](/images/open-source/license-comparison-v2.png) + +### چار آزادیاں {#the-four-freedoms} + +[Free Software Foundation](https://www.fsf.org/about/what-is-free-software) چار ضروری آزادیوں کی وضاحت کرتی ہے: + +- **آزادی 0**: کسی بھی مقصد کے لیے پروگرام چلانے کی آزادی +- **آزادی 1**: یہ مطالعہ کرنے کی آزادی کہ پروگرام کیسے کام کرتا ہے اور اسے اپنانے کی آزادی +- **آزادی 2**: کاپیاں دوبارہ تقسیم کرنے کی آزادی +- **آزادی 3**: پروگرام کو بہتر بنانے اور بہتری جاری کرنے کی آزادی + +یہ چار آزادیاں پوری فری اینڈ اوپن سورس سافٹ ویئر (FOSS) تحریک کی فلسفیانہ بنیاد ہیں۔ + + + +فری سافٹ ویئر میں لفظ "فری" سے مراد آزادی ہے، قیمت نہیں۔ ملکیتی سافٹ ویئر مفت ہو سکتا ہے، اور فری سافٹ ویئر تجارتی طور پر فروخت کیا جا سکتا ہے۔ تفصیلی وضاحت کے لیے [GNU philosophy](https://www.gnu.org/philosophy/free-sw.html) دیکھیں۔ + + + +## لائسنس کا انتخاب {#choosing-a-license} + +### کاپی لیفٹ لائسنسز {#copyleft-licenses} + +`GPL-3.0` جیسے کاپی لیفٹ لائسنسز کا تقاضا ہے کہ ماخوذ کام ایک ہی لائسنس استعمال کریں۔ یہ یقینی بناتا ہے کہ سافٹ ویئر اور تمام ترامیم مفت رہیں۔ `AGPL-3.0` اس تقاضے کو نیٹ ورک پر رسائی حاصل کرنے والے سافٹ ویئر تک بڑھاتا ہے۔ + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// یہ کنٹریکٹ ایک بنیادی پروجیکٹ رجسٹری کو نافذ کرتا ہے +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public 
projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Project name cannot be empty"); + projects[msg.sender] = name; + } +} +``` + +کاپی لیفٹ کا بنیادی فائدہ یہ ہے کہ بہتری کو کمیونٹی کے ساتھ واپس شیئر کیا جانا چاہیے۔ اگر کوئی آپ کی GPL-لائسنس یافتہ لائبریری پر کام کرتا ہے، تو ان کی ترامیم بھی GPL-لائسنس یافتہ ہوتی ہیں۔ + + + +کاپی لیفٹ کا استعمال کریں جب: + +1. آپ یہ یقینی بنانا چاہتے ہیں کہ تمام ماخوذ کام اوپن سورس رہیں +2. آپ ایک لائبریری یا فریم ورک بنا رہے ہیں جسے دوسرے بڑھائیں گے +3. آپ اپنے کام کے ملکیتی فورکس کو روکنا چاہتے ہیں + +اس کا نقصان یہ ہے کہ کچھ کمپنیاں لائسنس کی "وائرل" نوعیت کی وجہ سے کاپی لیفٹ-لائسنس یافتہ انحصار (dependencies) سے گریز کرتی ہیں۔ + +کاپی لیفٹ کی تعمیل کے بارے میں عام سوالات کے لیے GPL FAQ دیکھیں۔ + + + +### اجازت دینے والے لائسنس {#permissive-licenses} + +`MIT` اور `Apache-2.0` جیسے اجازت دینے والے لائسنس ملکیتی ماخوذات کی اجازت دیتے ہیں۔ `BSD-2-Clause` کم سے کم پابندیوں کے ساتھ ایک اور مقبول اجازت دینے والا آپشن ہے۔ + +```python +# مثال: ڈسک سے پروجیکٹ کا لائسنس لوڈ کرنا +def read_license(path: str) -> str: + """LICENSE فائل کے مندرجات کو پڑھیں اور واپس کریں۔""" + with open(path, "r") as f: + return f.read() + +# چیک کریں کہ آیا لائسنس پرمیسیو ہے +def is_permissive(license_text: str) -> bool: + permissive_keywords = ["MIT", "Apache", "BSD"] + return any(kw in license_text for kw in permissive_keywords) +``` + +اجازت دینے والے لائسنسز کا بنیادی فائدہ زیادہ سے زیادہ اپنانا ہے۔ کمپنیوں کے اجازت دینے والے لائسنس یافتہ پروجیکٹس کو استعمال کرنے اور ان میں حصہ ڈالنے کا زیادہ امکان ہوتا ہے کیونکہ اس بات پر کوئی پابندی نہیں ہوتی کہ کوڈ کو کس طرح استعمال کیا جا سکتا ہے۔ + +آپ کسی بھی ہوسٹنگ پلیٹ فارم پر [GitHub Repositories](https://github.com/new) کا استعمال کرتے ہوئے پروجیکٹس تعینات کر سکتے ہیں، اور [SPDX](https://spdx.org/) کے ساتھ تعمیل کی تصدیق کر سکتے ہیں۔ + +پروڈکشن میں تعینات کرنے سے پہلے سمارٹ کنٹریکٹس کو ٹیسٹ کرنے کے لیے [Holesky](https://holesky.dev/) پر [بلاک 
ایکسپلورر](https://eth.blockscout.com/) کے ساتھ [Remix](https://remix.ethereum.org/) کا استعمال کریں۔ + +### موازنہ ٹیبل {#comparison-table} + +| لائسنس | قسم | ماخوذ کام | پیٹنٹ گرانٹ | +|---------|------|-----------------|-------------| +| GPL-3.0 | کاپی لیفٹ | GPL ہونا چاہیے | ہاں | +| AGPL-3.0 | نیٹ ورک کاپی لیفٹ | AGPL ہونا چاہیے | ہاں | +| LGPL-3.0 | کمزور کاپی لیفٹ | لائبریری ملکیتی ہو سکتی ہے | ہاں | +| MIT | اجازت دینے والا | کوئی بھی لائسنس | نہیں | +| Apache-2.0 | اجازت دینے والا | کوئی بھی لائسنس | ہاں | +| BSD-2-Clause | اجازت دینے والا | کوئی بھی لائسنس | نہیں | +| MPL-2.0 | فائل لیول کاپی لیفٹ | ترمیم شدہ فائلیں MPL ہونی چاہئیں | ہاں | + +## تعمیل اور آڈیٹنگ {#compliance-and-auditing} + +اوپن سورس سافٹ ویئر استعمال کرنے والی تنظیموں کو اپنے انحصار (dependencies) کو ٹریک کرنا چاہیے اور لائسنس کی تعمیل کو یقینی بنانا چاہیے1۔ [FOSSA](https://fossa.com/) اور [Snyk](https://snyk.io/) جیسے ٹولز اس عمل کو خودکار بنا سکتے ہیں۔ + + + + + + +### لائسنس اسکیننگ {#license-scanning} + +خودکار لائسنس اسکیننگ ہر CI/CD پائپ لائن کا حصہ ہونی چاہیے۔ یہ غیر مطابقت پذیر لائسنسوں کو آپ کے انحصار کے درخت (dependency tree) میں داخل ہونے سے پہلے ہی پکڑ لیتی ہے۔ + +```bash +# اپنے پروجیکٹ پر لائسنس اسکین چلائیں +npx license-checker --production --json > licenses.json + +# پروڈکشن ڈیپینڈینسیز میں کاپی لیفٹ لائسنسز کی جانچ کریں +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +انحصار کو اپ ڈیٹ رکھنے اور لائسنس کی تبدیلیوں کی نگرانی کے لیے Dependabot استعمال کرنے پر غور کریں۔ + +```json +{ + "license-scan": { + "production": true, + "allowed": ["MIT", "Apache-2.0", "BSD-2-Clause"] + } +} +``` + +### SBOM جنریشن {#sbom-generation} + +ایک سافٹ ویئر بل آف میٹریلز (SBOM) آپ کے سافٹ ویئر کے تمام اجزاء کی فہرست بناتا ہے۔ ریگولیٹری تعمیل کے لیے، خاص طور پر سیکیورٹی کے لحاظ سے حساس صنعتوں میں، SBOM تیار کرنے کا تقاضا تیزی سے بڑھ رہا ہے۔ + + + +### کلیدی اصطلاحات {#key-terms} + + + + + +لائسنسنگ کے بارے میں باخبر فیصلے کرنے کے لیے ان اصطلاحات کو 
سمجھنا ضروری ہے۔ + + + +
    + +اضافی تعریفوں کے لیے مکمل [اصطلاحات کی فرہنگ](/glossary/) کا جائزہ لیں۔ یہ وسیلہ کمیونٹی کے زیر انتظام ہے اور اسے باقاعدگی سے اپ ڈیٹ کیا جاتا ہے۔ + +
    +
    + +## کمیونٹی کا تعاون {#community-collaboration} + +### پروجیکٹس میں حصہ ڈالنا {#how-to-contribute} + +اوپن سورس میں حصہ ڈالنے کا آغاز پروجیکٹ کے ورک فلو کو سمجھنے سے ہوتا ہے۔ زیادہ تر پروجیکٹس کام کو مربوط کرنے کے لیے ایشو ٹریکرز اور تبدیلیاں تجویز کرنے کے لیے پل ریکوئسٹس (pull requests) کا استعمال کرتے ہیں۔ + +آج ہی حصہ ڈالنا شروع کریں + + + اس پروجیکٹ میں کیسے حصہ ڈالیں + + + + ہماری کمیونٹی میں شامل ہوں + + +کوئی بھی حصہ ڈالنے سے پہلے، ہمیشہ گائیڈ لائنز کے لیے پروجیکٹ کی `CONTRIBUTING.md` فائل چیک کریں۔ کوڈ کا انداز، ٹیسٹ کے تقاضے، اور جائزے کے عمل پروجیکٹس کے درمیان مختلف ہوتے ہیں۔ + +```md +## پل ریکوئسٹ ٹیمپلیٹ + +**تفصیل:** تبدیلیوں کا مختصر خلاصہ +**متعلقہ ایشو:** اس ایشو کا لنک جسے یہ حل کرتا ہے +**ٹیسٹنگ:** اسے کیسے ٹیسٹ کیا گیا؟ +``` + +### کوڈ ریویو کے بہترین طریقے {#code-review} + +باہمی تعاون پر مبنی پروجیکٹس میں معیار کو برقرار رکھنے کے لیے کوڈ کا جائزہ (code review) ضروری ہے۔ جائزہ لینے والوں کو درستگی، اسٹائل کی مستقل مزاجی، اور ممکنہ سیکیورٹی مسائل کی جانچ کرنی چاہیے۔ + + + + + + اچھے کوڈ جائزے صرف فارمیٹنگ پر نہیں، بلکہ منطق اور ڈیزائن پر توجہ مرکوز کرتے ہیں۔ اسٹائل کے نفاذ کے لیے لنٹرز (linters) جیسے خودکار ٹولز کا استعمال کریں، اور انسانی جائزے کو آرکیٹیکچرل فیصلوں اور غیر معمولی صورتحال (edge cases) کے لیے مخصوص رکھیں۔ + + + + + + +## دوہرا لائسنسنگ {#dual-licensing} + +کچھ پروجیکٹس بیک وقت دو لائسنسوں کے تحت اپنا کوڈ پیش کرتے ہیں۔ یہ تجارتی صارفین کو ملکیتی لائسنس خریدنے کی اجازت دیتا ہے جبکہ اوپن سورس ورژن کو کاپی لیفٹ شرائط کے تحت دستیاب رکھتا ہے۔ [Qt](https://www.qt.io/licensing/) اور [MySQL](https://www.mysql.com/about/legal/licensing/) جیسے پروجیکٹس اس ماڈل کا استعمال کرتے ہیں۔ + +## مزید مطالعہ {#further-reading} + +_یہ گائیڈ [Open Source Initiative](https://opensource.org/) اور [Free Software Foundation](https://www.fsf.org/) کے مواد سے اخذ کی گئی ہے۔_ + +- [Choose a License گائیڈ](https://choosealicense.com/) - _آپ کے پروجیکٹ کے لیے صحیح لائسنس منتخب کرنے میں مدد کرنے والا ایک سادہ ٹول_ +- [SPDX لائسنس 
لسٹ](https://spdx.org/licenses/) - _500 سے زیادہ اوپن سورس لائسنسوں کے لیے معیاری شناخت کنندگان_ +- [اوپن سورس گائیڈ](https://opensource.guide/) - _پروجیکٹس چلانے اور ان میں حصہ ڈالنے کے لیے کمیونٹی کے زیر انتظام وسائل_ +- [FOSSA](https://fossa.com/) پر تعمیل کی ٹولنگ - _خودکار لائسنس اسکیننگ اور انحصار کا انتظام_ \ No newline at end of file diff --git a/tests/fixtures/incremental/locale-expected/es/fixture.json b/tests/fixtures/incremental/locale-expected/es/fixture.json new file mode 100644 index 00000000000..38ccdd3a54d --- /dev/null +++ b/tests/fixtures/incremental/locale-expected/es/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "Licencias de código abierto", + "page-description": "Descubre las licencias de código abierto, los requisitos de cumplimiento y cómo elegir la adecuada.", + "hero-cta-primary": "Comenzar", + "hero-cta-secondary": "Ver comparación de licencias", + "stat-label-projects": "Proyectos de código abierto rastreados", + "stat-label-contributors": "Colaboradores activos en todo el mundo", + "banner-text": "¿Es nuevo en el código abierto? Comience con nuestra guía de contribución.", + "footer-note": "Datos de licencias obtenidos de SPDX. 
Herramientas de cumplimiento proporcionadas por FOSSA.", + "filter-label": "Filtrar por tipo de licencia", + "welcome-user": "¡Bienvenido de nuevo, {displayName}!", + "project-count": "{total, plural, =0 {No se encontraron proyectos} one {# proyecto encontrado} other {# proyectos encontrados}}", + "contribution-status": "{status, select, pending {Su contribución está pendiente de revisión} approved {Su contribución ha sido aprobada} rejected {Su contribución necesita revisiones} other {Estado desconocido}}", + "nested": { + "section-title": "Recursos adicionales", + "section-description": "Explora más sobre las licencias de código abierto y su cumplimiento.", + "link-text": "Explorar todos los recursos" + }, + "multi-link": "Lea la Definición de código abierto, revise el selector de licencias y consulte los identificadores SPDX.", + "markdown-description": "## Referencia rápida\n\n- **MIT**: La más permisiva, restricciones mínimas\n- **Apache-2.0**: Permisiva con concesión de patentes\n- **GPL-3.0**: Copyleft fuerte, los derivados deben ser GPL\n\nPara obtener más detalles, consulte la [comparación completa](/open-source/#comparison-table).", + "new-key": "Esta es una cadena de traducción completamente nueva añadida en la versión B." 
+} diff --git a/tests/fixtures/incremental/locale-expected/es/fixture.md b/tests/fixtures/incremental/locale-expected/es/fixture.md new file mode 100644 index 00000000000..6e1f4f3f49b --- /dev/null +++ b/tests/fixtures/incremental/locale-expected/es/fixture.md @@ -0,0 +1,209 @@ +--- +title: "Comprender las licencias de código abierto" +description: "Una guía práctica sobre licencias de software libre y desarrollo colaborativo" +image: /images/open-source/hero-licensing-v2.png +alt: "Diagrama que muestra diferentes tipos de licencias" +template: tutorial +lang: es +published: 2025-06-15 +tags: ["código abierto", "licencias", "cumplimiento"] +summaryPoints: + - Las licencias de código abierto definen cómo se puede usar, modificar y compartir el código + - Las licencias copyleft requieren que las obras derivadas sigan siendo de código abierto + - Las licencias permisivas permiten el uso privativo con restricciones mínimas +--- + +# Comprender las licencias de código abierto {#understanding-open-source-licensing} + +El software de código abierto se basa en el principio de que el código debe compartirse, estudiarse y mejorarse libremente. Esta guía cubre las principales familias de licencias, cómo elegir entre ellas y las mejores prácticas para el desarrollo colaborativo. + +**Recuerde: la concesión de licencias es un asunto legal. Esta guía es educativa, no un asesoramiento legal. Consulte a un abogado calificado para su situación específica.** + +## ¿Qué es el código abierto? {#what-is-open-source} + +Una licencia de software determina cómo otros pueden usar, modificar y distribuir su código. Las licencias de código abierto otorgan estos derechos explícitamente, a diferencia de las licencias privativas que los restringen. + +La [Open Source Initiative](https://opensource.org/osd/annotated) mantiene la Definición de Código Abierto oficial, que requiere que las licencias permitan la libre redistribución, el acceso al código fuente y las obras derivadas. 
+ +_Sin licencias claras_, cada proyecto necesitaría una revisión legal personalizada. El archivo `LICENSE.md` en un repositorio indica exactamente qué permisos se otorgan, de manera similar a cómo el archivo `README.md` explica el propósito del proyecto. + +Puede verificar la licencia de un proyecto en Choose a License para comprender lo que permite. + +![License comparison chart](/images/open-source/license-comparison-v2.png) + +### Las cuatro libertades {#the-four-freedoms} + +La [Free Software Foundation](https://www.fsf.org/about/what-is-free-software) define cuatro libertades esenciales: + +- **Libertad 0**: La libertad de ejecutar el programa para cualquier propósito +- **Libertad 1**: La libertad de estudiar cómo funciona el programa y adaptarlo +- **Libertad 2**: La libertad de redistribuir copias +- **Libertad 3**: La libertad de mejorar el programa y publicar las mejoras + +Estas cuatro libertades son la base filosófica de todo el movimiento del software libre y de código abierto (FOSS). + + + +La palabra "free" (libre/gratis) en el software libre se refiere a la libertad, no al costo. El software privativo puede ser gratuito y el software libre puede venderse comercialmente. Consulte la [filosofía de GNU](https://www.gnu.org/philosophy/free-sw.html) para obtener una explicación detallada. + + +## Elegir una licencia {#choosing-a-license} + +### Licencias copyleft {#copyleft-licenses} + +Las licencias copyleft como `GPL-3.0` requieren que las obras derivadas utilicen la misma licencia. Esto garantiza que el software y todas las modificaciones sigan siendo libres. La `AGPL-3.0` extiende este requisito al software al que se accede a través de una red. 
+ +```solidity +// SPDX-License-Identifier: GPL-3.0 +// Este contrato implementa un registro básico de proyectos +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Project name cannot be empty"); + projects[msg.sender] = name; + } +} +``` + +La ventaja clave del copyleft es que las mejoras deben compartirse con la comunidad. Si alguien desarrolla sobre su biblioteca con licencia GPL, sus modificaciones también tendrán licencia GPL. + + + +Use copyleft cuando: + +1. Desee asegurarse de que todas las obras derivadas sigan siendo de código abierto +2. Esté construyendo una biblioteca o marco de trabajo (framework) que otros ampliarán +3. Desee evitar bifurcaciones (forks) privativas de su trabajo + +La desventaja es que algunas empresas evitan las dependencias con licencia copyleft debido a la naturaleza "viral" de la licencia. + +Consulte las preguntas frecuentes sobre la GPL para conocer las dudas comunes sobre el cumplimiento del copyleft. + + +### Licencias permisivas {#permissive-licenses} + +Las licencias permisivas como `MIT` y `Apache-2.0` permiten derivados privativos. La `BSD-2-Clause` es otra opción permisiva popular con restricciones mínimas. 
+ +```python +### Tabla comparativa {#comparison-table} + +| Licencia | Tipo | Obras derivadas | Concesión de patentes | +|---------|------|-----------------|-------------| +| GPL-3.0 | Copyleft | Debe ser GPL | Sí | +| AGPL-3.0 | Copyleft de red | Debe ser AGPL | Sí | +| LGPL-3.0 | Copyleft débil | La biblioteca puede ser privativa | Sí | +| MIT | Permisiva | Cualquier licencia | No | +| Apache-2.0 | Permisiva | Cualquier licencia | Sí | +| BSD-2-Clause | Permisiva | Cualquier licencia | No | +| MPL-2.0 | Copyleft a nivel de archivo | Los archivos modificados deben ser MPL | Sí | + +## Colaboración comunitaria {#community-collaboration} + +### Contribuir a proyectos {#how-to-contribute} + +Contribuir al código abierto comienza por comprender el flujo de trabajo del proyecto. La mayoría de los proyectos utilizan issue trackers para coordinar el trabajo y pull requests para proponer cambios. + +Comience a contribuir hoy + + + Cómo contribuir a este proyecto + + + + Únase a nuestra comunidad + + +Antes de enviar una contribución, siempre revise el archivo `CONTRIBUTING.md` del proyecto para conocer las pautas. El estilo de código, los requisitos de prueba y los procesos de revisión varían entre proyectos. + +```md +## Plantilla de Pull Request + +**Descripción:** Breve resumen de los cambios +**Issue relacionado:** Enlace al issue que esto aborda +**Pruebas:** ¿Cómo se probó esto? +``` + +### Mejores prácticas para la revisión de código {#code-review} + +La revisión de código es esencial para mantener la calidad en proyectos colaborativos. Los revisores deben verificar la exactitud, la consistencia del estilo y los posibles problemas de seguridad. + + + + + + Las buenas revisiones de código se centran en la lógica y el diseño, no solo en el formato. Utilice herramientas automatizadas como linters para la aplicación del estilo, y reserve la revisión humana para decisiones arquitectónicas y casos extremos. 
+ + + + + + + +## Licencia dual {#dual-licensing} + +Algunos proyectos ofrecen su código bajo dos licencias simultáneamente. Esto permite a los usuarios comerciales comprar una licencia privativa mientras mantienen la versión de código abierto disponible bajo términos copyleft. Proyectos como [Qt](https://www.qt.io/licensing/) y [MySQL](https://www.mysql.com/about/legal/licensing/) utilizan este modelo. +## Cumplimiento y auditoría {#compliance-and-auditing} + +Las organizaciones que utilizan software de código abierto deben rastrear sus dependencias y garantizar el cumplimiento de las licencias1. Herramientas como [FOSSA](https://fossa.com/) y [Snyk](https://snyk.io/) pueden automatizar este proceso. + + + + + + +### Escaneo de licencias {#license-scanning} + +El escaneo automatizado de licencias debe ser parte de cada CI/CD pipeline. Detecta licencias incompatibles antes de que entren en su árbol de dependencias. + +```bash +# Ejecutar un escaneo de licencias en tu proyecto +npx license-checker --production --json > licenses.json + +# Comprobar si hay licencias copyleft en las dependencias de producción +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +Considere usar Dependabot para mantener las dependencias actualizadas y monitorear los cambios de licencia. + +```json +{ + "license-scan": { + "production": true, + "allowed": ["MIT", "Apache-2.0", "BSD-2-Clause"] + } +} +``` +### Generación de SBOM {#sbom-generation} + +Una lista de materiales de software (SBOM) enumera todos los componentes de su software. La generación de un SBOM es cada vez más necesaria para el cumplimiento normativo, especialmente en industrias sensibles a la seguridad. + + + +### Términos clave {#key-terms} + + + + + +Comprender estos términos es esencial para tomar decisiones informadas sobre las licencias. + + + +
    + +Revise el [glosario de términos](/glossary/) completo para obtener definiciones adicionales. Este recurso es mantenido por la comunidad y se actualiza regularmente. + +
    +
    + +## Lecturas adicionales {#further-reading} + +_Esta guía está adaptada de materiales de la [Open Source Initiative](https://opensource.org/) y la [Free Software Foundation](https://www.fsf.org/)._ + +- [Guía Choose a License](https://choosealicense.com/) - _Herramienta sencilla para ayudar a elegir la licencia adecuada para su proyecto_ +- [Lista de licencias SPDX](https://spdx.org/licenses/) - _Identificadores estandarizados para más de 500 licencias de código abierto_ +- [Guía de código abierto](https://opensource.guide/) - _Recursos mantenidos por la comunidad para ejecutar y contribuir a proyectos_ +- Herramientas de cumplimiento en [FOSSA](https://fossa.com/) - _Escaneo automatizado de licencias y gestión de dependencias_ diff --git a/tests/fixtures/incremental/locale-expected/ko/fixture.json b/tests/fixtures/incremental/locale-expected/ko/fixture.json new file mode 100644 index 00000000000..d7a2910e9d0 --- /dev/null +++ b/tests/fixtures/incremental/locale-expected/ko/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "오픈 소스 라이선스", + "page-description": "오픈 소스 라이선스, 규정 준수 요건 및 올바른 라이선스를 선택하는 방법을 알아보세요.", + "hero-cta-primary": "시작하기", + "hero-cta-secondary": "라이선스 비교 보기", + "stat-label-projects": "추적된 오픈 소스 프로젝트", + "stat-label-contributors": "전 세계 활성 기여자", + "banner-text": "오픈 소스가 처음이신가요? 기여 가이드시작해 보세요.", + "footer-note": "라이선스 데이터는 SPDX에서 제공합니다. 
컴플라이언스 도구는 FOSSA에서 제공합니다.", + "filter-label": "라이선스 유형별 필터링", + "welcome-user": "다시 오신 것을 환영합니다, {displayName}님!", + "project-count": "{total, plural, =0 {프로젝트를 찾을 수 없습니다} one {#개의 프로젝트를 찾았습니다} other {#개의 프로젝트를 찾았습니다}}", + "contribution-status": "{status, select, pending {기여 내역이 검토 대기 중입니다} approved {기여 내역이 승인되었습니다} rejected {기여 내역에 수정이 필요합니다} other {알 수 없는 상태입니다}}", + "nested": { + "section-title": "추가 자료", + "section-description": "오픈 소스 라이선스 및 규정 준수에 대해 자세히 알아보세요.", + "link-text": "모든 자료 찾아보기" + }, + "multi-link": "오픈 소스 정의를 읽고, 라이선스 선택기를 검토하며, SPDX 식별자를 확인하세요.", + "markdown-description": "## 빠른 참조\n\n- **MIT**: 가장 허용적이며 최소한의 제한\n- **Apache-2.0**: 특허 부여가 포함된 허용적 라이선스\n- **GPL-3.0**: 강력한 카피레프트, 파생물은 GPL이어야 함\n\n자세한 내용은 [전체 비교](/open-source/#comparison-table)를 참조하세요.", + "new-key": "버전 B에 추가된 완전히 새로운 번역 문자열입니다." +} diff --git a/tests/fixtures/incremental/locale-expected/ko/fixture.md b/tests/fixtures/incremental/locale-expected/ko/fixture.md new file mode 100644 index 00000000000..003b25023e3 --- /dev/null +++ b/tests/fixtures/incremental/locale-expected/ko/fixture.md @@ -0,0 +1,212 @@ +--- +title: "오픈 소스 라이선스 이해하기" +description: "자유 소프트웨어 라이선스 및 협업 개발을 위한 실용적인 가이드" +image: /images/open-source/hero-licensing-v2.png +alt: "다양한 라이선스 유형을 보여주는 다이어그램" +template: tutorial +lang: ko +published: 2025-06-15 +tags: + - 오픈 소스 + - 라이선스 + - 컴플라이언스 +summaryPoints: + - 오픈 소스 라이선스는 코드를 사용, 수정 및 공유하는 방법을 정의합니다. + - 카피레프트 라이선스는 파생 저작물도 오픈 소스로 유지할 것을 요구합니다. + - 퍼미시브 라이선스는 최소한의 제한으로 독점적 사용을 허용합니다. +--- + +# 오픈 소스 라이선스 이해하기 {#understanding-open-source-licensing} + +오픈 소스 소프트웨어는 코드를 자유롭게 공유하고, 연구하며, 개선해야 한다는 원칙을 바탕으로 만들어집니다. 이 가이드는 주요 라이선스 제품군, 라이선스 선택 방법, 그리고 협업 개발을 위한 모범 사례를 다룹니다. + +**기억하세요: 라이선스는 법적인 문제입니다. 이 가이드는 교육 목적이며 법률적 조언이 아닙니다. 구체적인 상황에 대해서는 자격을 갖춘 변호사와 상담하세요.** + +## 오픈 소스란 무엇인가요? {#what-is-open-source} + +소프트웨어 라이선스는 다른 사람들이 여러분의 코드를 사용, 수정 및 배포하는 방법을 결정합니다. 이를 제한하는 독점 라이선스와 달리, 오픈 소스 라이선스는 이러한 권리를 명시적으로 부여합니다. 
+ +[오픈 소스 이니셔티브(Open Source Initiative)](https://opensource.org/osd/annotated)는 공식 오픈 소스 정의를 유지 관리하며, 이 정의에 따르면 라이선스는 자유로운 재배포, 소스 코드 접근 및 파생 저작물을 허용해야 합니다. + +_명확한 라이선스가 없다면_, 모든 프로젝트는 맞춤형 법률 검토가 필요할 것입니다. 리포지토리의 `LICENSE.md` 파일은 `README.md` 파일이 프로젝트의 목적을 설명하는 것과 유사하게 어떤 권한이 부여되는지 정확히 알려줍니다. + +Choose a License에서 프로젝트의 라이선스를 확인하여 허용되는 사항을 이해할 수 있습니다. + +![License comparison chart](/images/open-source/license-comparison-v2.png) + +### 4가지 자유 {#the-four-freedoms} + +[자유 소프트웨어 재단(Free Software Foundation)](https://www.fsf.org/about/what-is-free-software)은 4가지 필수적인 자유를 정의합니다: + +- **자유 0**: 어떤 목적으로든 프로그램을 실행할 수 있는 자유 +- **자유 1**: 프로그램의 작동 방식을 연구하고 수정할 수 있는 자유 +- **자유 2**: 복사본을 재배포할 수 있는 자유 +- **자유 3**: 프로그램을 개선하고 그 개선 사항을 배포할 수 있는 자유 + +이 4가지 자유는 전체 자유 오픈 소스 소프트웨어(FOSS) 운동의 철학적 기반입니다. + + + +자유 소프트웨어(free software)에서 "자유(free)"라는 단어는 비용이 아니라 자유(liberty)를 의미합니다. 독점 소프트웨어도 무료일 수 있으며, 자유 소프트웨어도 상업적으로 판매될 수 있습니다. 자세한 설명은 [GNU 철학](https://www.gnu.org/philosophy/free-sw.html)을 참조하세요. + + +## 라이선스 선택하기 {#choosing-a-license} + +### 카피레프트 라이선스 {#copyleft-licenses} + +`GPL-3.0`와 같은 카피레프트 라이선스는 파생 저작물도 동일한 라이선스를 사용하도록 요구합니다. 이를 통해 소프트웨어와 모든 수정 사항이 자유롭게 유지되도록 보장합니다. `AGPL-3.0`는 이 요구 사항을 네트워크를 통해 접근하는 소프트웨어로 확장합니다. + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// 이 컨트랙트는 기본 프로젝트 레지스트리를 구현합니다 +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + function register(string memory name) public { + require(bytes(name).length > 0, "Project name cannot be empty"); + projects[msg.sender] = name; + } +} +``` + +카피레프트의 주요 장점은 개선 사항을 커뮤니티와 다시 공유해야 한다는 것입니다. 누군가 귀하의 GPL 라이선스 라이브러리를 기반으로 개발한다면, 그들의 수정 사항 역시 GPL 라이선스를 따라야 합니다. + + + +다음과 같은 경우 카피레프트를 사용하세요: + +1. 모든 파생 저작물이 오픈 소스로 유지되도록 보장하고 싶을 때 +2. 다른 사람들이 확장할 라이브러리나 프레임워크를 구축할 때 +3. 작업물의 독점적인 포크(fork)를 방지하고 싶을 때 + +단점은 일부 기업들이 라이선스의 "바이러스성(viral)" 특성 때문에 카피레프트 라이선스가 적용된 종속성을 피한다는 것입니다. + +카피레프트 규정 준수에 대한 일반적인 질문은 GPL FAQ를 확인하세요. 
+ + +### 퍼미시브 라이선스 {#permissive-licenses} + +`MIT` 및 `Apache-2.0`와 같은 퍼미시브 라이선스는 독점적인 파생물을 허용합니다. `BSD-2-Clause`는 제한이 최소화된 또 다른 인기 있는 퍼미시브 옵션입니다. + +```python +### 비교 표 {#comparison-table} + +| 라이선스 | 유형 | 파생 저작물 | 특허 부여 | +|---------|------|-----------------|-------------| +| GPL-3.0 | 카피레프트 | GPL이어야 함 | 예 | +| AGPL-3.0 | 네트워크 카피레프트 | AGPL이어야 함 | 예 | +| LGPL-3.0 | 약한 카피레프트 | 라이브러리는 독점적일 수 있음 | 예 | +| MIT | 퍼미시브 | 모든 라이선스 | 아니요 | +| Apache-2.0 | 퍼미시브 | 모든 라이선스 | 예 | +| BSD-2-Clause | 퍼미시브 | 모든 라이선스 | 아니요 | +| MPL-2.0 | 파일 수준 카피레프트 | 수정된 파일은 MPL이어야 함 | 예 | + +## 커뮤니티 협업 {#community-collaboration} + +### 프로젝트에 기여하기 {#how-to-contribute} + +오픈 소스에 기여하는 것은 프로젝트의 워크플로우를 이해하는 것에서 시작됩니다. 대부분의 프로젝트는 이슈 트래커를 사용하여 작업을 조율하고 풀 리퀘스트(pull request)를 통해 변경 사항을 제안합니다. + +오늘 바로 기여를 시작하세요 + + + 이 프로젝트에 기여하는 방법 + + + + 커뮤니티 참여하기 + + +기여를 제출하기 전에 항상 프로젝트의 `CONTRIBUTING.md` 파일에서 가이드라인을 확인하세요. 코드 스타일, 테스트 요구 사항 및 리뷰 프로세스는 프로젝트마다 다릅니다. + +```md +## 풀 리퀘스트 템플릿 + +**설명:** 변경 사항에 대한 간략한 요약 +**관련 이슈:** 이 작업이 해결하는 이슈 링크 +**테스트:** 어떻게 테스트되었나요? +``` + +### 코드 리뷰 모범 사례 {#code-review} + +코드 리뷰는 협업 프로젝트에서 품질을 유지하는 데 필수적입니다. 리뷰어는 정확성, 스타일 일관성 및 잠재적인 보안 문제를 확인해야 합니다. + + + + + + 좋은 코드 리뷰는 단순한 포맷팅이 아니라 로직과 설계에 중점을 둡니다. 스타일 적용을 위해서는 린터(linter)와 같은 자동화된 도구를 사용하고, 아키텍처 결정이나 엣지 케이스(edge case)에 대해서는 사람이 직접 리뷰하도록 하세요. + + + + + + + +## 이중 라이선스 {#dual-licensing} + +일부 프로젝트는 코드를 두 가지 라이선스로 동시에 제공합니다. 이를 통해 상업적 사용자는 독점 라이선스를 구매할 수 있으며, 오픈 소스 버전은 카피레프트 조건에 따라 계속 사용할 수 있습니다. [Qt](https://www.qt.io/licensing/) 및 [MySQL](https://www.mysql.com/about/legal/licensing/)과 같은 프로젝트가 이 모델을 사용합니다. +## 컴플라이언스 및 감사 {#compliance-and-auditing} + +오픈 소스 소프트웨어를 사용하는 조직은 종속성을 추적하고 라이선스 컴플라이언스를 보장해야 합니다1. [FOSSA](https://fossa.com/) 및 [Snyk](https://snyk.io/)와 같은 도구를 사용하면 이 프로세스를 자동화할 수 있습니다. + + + + + + +### 라이선스 스캐닝 {#license-scanning} + +자동화된 라이선스 스캐닝은 모든 CI/CD 파이프라인의 일부가 되어야 합니다. 이는 호환되지 않는 라이선스가 종속성 트리에 들어가기 전에 잡아냅니다. + +```bash +# 프로젝트에서 라이선스 스캔을 실행합니다. 
+npx license-checker --production --json > licenses.json + +# 프로덕션 의존성에서 카피레프트 라이선스를 확인합니다. +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +종속성을 최신 상태로 유지하고 라이선스 변경 사항을 모니터링하려면 Dependabot 사용을 고려해 보세요. + +```json +{ + "license-scan": { + "production": true, + "allowed": ["MIT", "Apache-2.0", "BSD-2-Clause"] + } +} +``` +### SBOM 생성 {#sbom-generation} + +소프트웨어 자재 명세서(Software Bill of Materials, SBOM)는 소프트웨어의 모든 구성 요소를 나열합니다. 특히 보안에 민감한 산업에서는 규제 준수를 위해 SBOM 생성이 점점 더 요구되고 있습니다. + + + +### 주요 용어 {#key-terms} + + + + + +라이선스에 대해 정보에 입각한 결정을 내리려면 이러한 용어를 이해하는 것이 필수적입니다. + + + +
    + +추가적인 정의는 전체 [용어집](/glossary/)을 검토하세요. 이 리소스는 커뮤니티에서 유지 관리하며 정기적으로 업데이트됩니다. + +
    +
    + +## 추가 자료 {#further-reading} + +_이 가이드는 [오픈 소스 이니셔티브(Open Source Initiative)](https://opensource.org/) 및 [자유 소프트웨어 재단(Free Software Foundation)](https://www.fsf.org/)의 자료를 바탕으로 작성되었습니다._ + +- [Choose a License 가이드](https://choosealicense.com/) - _프로젝트에 적합한 라이선스를 선택하는 데 도움이 되는 간단한 도구_ +- [SPDX 라이선스 목록](https://spdx.org/licenses/) - _500개 이상의 오픈 소스 라이선스에 대한 표준화된 식별자_ +- [오픈 소스 가이드](https://opensource.guide/) - _프로젝트 운영 및 기여를 위해 커뮤니티에서 유지 관리하는 리소스_ +- [FOSSA](https://fossa.com/)의 컴플라이언스 도구 - _자동화된 라이선스 스캐닝 및 종속성 관리_ diff --git a/tests/fixtures/incremental/locale-expected/ur/fixture.json b/tests/fixtures/incremental/locale-expected/ur/fixture.json new file mode 100644 index 00000000000..e33392f15c6 --- /dev/null +++ b/tests/fixtures/incremental/locale-expected/ur/fixture.json @@ -0,0 +1,22 @@ +{ + "page-title": "اوپن سورس لائسنسنگ", + "page-description": "اوپن سورس لائسنسز، تعمیل کے تقاضوں، اور صحیح لائسنس کا انتخاب کرنے کا طریقہ دریافت کریں۔", + "hero-cta-primary": "شروع کریں", + "hero-cta-secondary": "لائسنس کا موازنہ دیکھیں", + "stat-label-projects": "ٹریک کیے گئے اوپن سورس پروجیکٹس", + "stat-label-contributors": "دنیا بھر میں فعال حصہ دار", + "banner-text": "اوپن سورس میں نئے ہیں؟ ہماری شراکت کی گائیڈ کے ساتھ شروع کریں۔", + "footer-note": "لائسنس کا ڈیٹا SPDX سے حاصل کیا گیا ہے۔ تعمیل کے ٹولز FOSSA کے ذریعے فراہم کیے گئے ہیں۔", + "filter-label": "لائسنس کی قسم کے لحاظ سے فلٹر کریں", + "welcome-user": "خوش آمدید، {displayName}!", + "project-count": "{total, plural, =0 {کوئی پروجیکٹ نہیں ملا} one {# پروجیکٹ ملا} other {# پروجیکٹس ملے}}", + "contribution-status": "{status, select, pending {آپ کی شراکت کا جائزہ زیر التوا ہے} approved {آپ کی شراکت منظور کر لی گئی ہے} rejected {آپ کی شراکت میں ترامیم کی ضرورت ہے} other {نامعلوم حیثیت}}", + "nested": { + "section-title": "اضافی وسائل", + "section-description": "اوپن سورس لائسنسنگ اور تعمیل کے بارے میں مزید دریافت کریں۔", + "link-text": "تمام وسائل براؤز کریں" + }, + "multi-link": "اوپن سورس کی تعریف پڑھیں، لائسنس کے 
انتخاب کنندہ کا جائزہ لیں، اور SPDX شناخت کنندگان چیک کریں۔", + "markdown-description": "## فوری حوالہ\n\n- **MIT**: سب سے زیادہ اجازت دینے والا، کم از کم پابندیاں\n- **Apache-2.0**: پیٹنٹ گرانٹ کے ساتھ اجازت دینے والا\n- **GPL-3.0**: مضبوط کاپی لیفٹ، مشتقات کا GPL ہونا لازمی ہے\n\nمزید تفصیلات کے لیے، [مکمل موازنہ](/open-source/#comparison-table) دیکھیں۔", + "new-key": "یہ ورژن B میں شامل کی گئی ایک بالکل نئی ترجمہ شدہ سٹرنگ ہے۔" +} diff --git a/tests/fixtures/incremental/locale-expected/ur/fixture.md b/tests/fixtures/incremental/locale-expected/ur/fixture.md new file mode 100644 index 00000000000..d589530cade --- /dev/null +++ b/tests/fixtures/incremental/locale-expected/ur/fixture.md @@ -0,0 +1,209 @@ +--- +title: "اوپن سورس لائسنسنگ کو سمجھنا" +description: "مفت سافٹ ویئر لائسنسز اور باہمی ترقی کے لیے ایک عملی گائیڈ" +image: /images/open-source/hero-licensing-v2.png +alt: "مختلف لائسنس کی اقسام کو ظاہر کرنے والا خاکہ" +template: tutorial +lang: ur +published: 2025-06-15 +tags: ["اوپن سورس", "لائسنسنگ", "تعمیل"] +summaryPoints: + - اوپن سورس لائسنس اس بات کا تعین کرتے ہیں کہ کوڈ کو کیسے استعمال، تبدیل اور شیئر کیا جا سکتا ہے + - کاپی لیفٹ لائسنس اس بات کا تقاضا کرتے ہیں کہ ماخوذ کام اوپن سورس رہیں + - اجازت دینے والے لائسنس کم از کم پابندیوں کے ساتھ ملکیتی استعمال کی اجازت دیتے ہیں +--- + +# اوپن سورس لائسنسنگ کو سمجھنا {#understanding-open-source-licensing} + +اوپن سورس سافٹ ویئر اس اصول پر بنایا گیا ہے کہ کوڈ کو آزادانہ طور پر شیئر کیا جانا چاہیے، اس کا مطالعہ کیا جانا چاہیے اور اسے بہتر بنایا جانا چاہیے۔ یہ گائیڈ بڑے لائسنس خاندانوں، ان کے درمیان انتخاب کرنے کے طریقے، اور باہمی ترقی کے لیے بہترین طریقوں کا احاطہ کرتی ہے۔ + +**یاد رکھیں: لائسنسنگ ایک قانونی معاملہ ہے۔ یہ گائیڈ تعلیمی ہے، قانونی مشورہ نہیں۔ اپنی مخصوص صورتحال کے لیے کسی مستند وکیل سے مشورہ کریں۔** + +## اوپن سورس کیا ہے؟ {#what-is-open-source} + +ایک سافٹ ویئر لائسنس اس بات کا تعین کرتا ہے کہ دوسرے آپ کے کوڈ کو کیسے استعمال، تبدیل اور تقسیم کر سکتے ہیں۔ اوپن سورس لائسنس ان حقوق کو واضح طور پر 
دیتے ہیں، ملکیتی لائسنسوں کے برعکس جو ان پر پابندی لگاتے ہیں۔ + +[اوپن سورس انیشی ایٹو (Open Source Initiative)](https://opensource.org/osd/annotated) سرکاری اوپن سورس تعریف کو برقرار رکھتا ہے، جس کا تقاضا ہے کہ لائسنس مفت دوبارہ تقسیم، سورس کوڈ تک رسائی، اور ماخوذ کاموں کی اجازت دیں۔ + +_واضح لائسنسنگ کے بغیر_، ہر پروجیکٹ کو حسب ضرورت قانونی جائزے کی ضرورت ہوگی۔ ریپوزٹری میں `LICENSE.md` فائل بالکل یہ بتاتی ہے کہ کون سی اجازتیں دی گئی ہیں، بالکل اسی طرح جیسے `README.md` فائل پروجیکٹ کا مقصد بیان کرتی ہے۔ + +آپ یہ سمجھنے کے لیے کہ یہ کس چیز کی اجازت دیتا ہے، Choose a License پر کسی پروجیکٹ کے لائسنس کی تصدیق کر سکتے ہیں۔ + +![License comparison chart](/images/open-source/license-comparison-v2.png) + +### چار آزادیاں {#the-four-freedoms} + +[Free Software Foundation](https://www.fsf.org/about/what-is-free-software) چار ضروری آزادیوں کی وضاحت کرتی ہے: + +- **آزادی 0**: کسی بھی مقصد کے لیے پروگرام چلانے کی آزادی +- **آزادی 1**: یہ مطالعہ کرنے کی آزادی کہ پروگرام کیسے کام کرتا ہے اور اسے اپنانے کی آزادی +- **آزادی 2**: کاپیاں دوبارہ تقسیم کرنے کی آزادی +- **آزادی 3**: پروگرام کو بہتر بنانے اور بہتری جاری کرنے کی آزادی + +یہ چار آزادیاں پوری فری اینڈ اوپن سورس سافٹ ویئر (FOSS) تحریک کی فلسفیانہ بنیاد ہیں۔ + + + +فری سافٹ ویئر میں لفظ "فری" سے مراد آزادی ہے، قیمت نہیں۔ ملکیتی سافٹ ویئر مفت ہو سکتا ہے، اور فری سافٹ ویئر تجارتی طور پر فروخت کیا جا سکتا ہے۔ تفصیلی وضاحت کے لیے [GNU philosophy](https://www.gnu.org/philosophy/free-sw.html) دیکھیں۔ + + +## لائسنس کا انتخاب کرنا {#choosing-a-license} + +### کاپی لیفٹ لائسنسز {#copyleft-licenses} + +`GPL-3.0` جیسے کاپی لیفٹ لائسنسز کا تقاضا ہے کہ ماخوذ کام ایک ہی لائسنس استعمال کریں۔ یہ یقینی بناتا ہے کہ سافٹ ویئر اور تمام ترامیم مفت رہیں۔ `AGPL-3.0` اس تقاضے کو نیٹ ورک پر رسائی حاصل کرنے والے سافٹ ویئر تک بڑھاتا ہے۔ + +```solidity +// SPDX-License-Identifier: GPL-3.0 +// یہ کنٹریکٹ ایک بنیادی پروجیکٹ رجسٹری کو نافذ کرتا ہے +pragma solidity ^0.8.0; + +contract ProjectRegistry { + mapping(address => string) public projects; + + 
function register(string memory name) public { + require(bytes(name).length > 0, "Project name cannot be empty"); + projects[msg.sender] = name; + } +} +``` + +کاپی لیفٹ کا بنیادی فائدہ یہ ہے کہ بہتری کو کمیونٹی کے ساتھ واپس شیئر کیا جانا چاہیے۔ اگر کوئی آپ کی GPL-لائسنس یافتہ لائبریری پر کام کرتا ہے، تو ان کی ترامیم بھی GPL-لائسنس یافتہ ہوتی ہیں۔ + + + +کاپی لیفٹ کا استعمال کریں جب: + +1. آپ یہ یقینی بنانا چاہتے ہیں کہ تمام ماخوذ کام اوپن سورس رہیں +2. آپ ایک لائبریری یا فریم ورک بنا رہے ہیں جسے دوسرے بڑھائیں گے +3. آپ اپنے کام کے ملکیتی فورکس کو روکنا چاہتے ہیں + +اس کا نقصان یہ ہے کہ کچھ کمپنیاں لائسنس کی "وائرل" نوعیت کی وجہ سے کاپی لیفٹ-لائسنس یافتہ انحصار (dependencies) سے گریز کرتی ہیں۔ + +کاپی لیفٹ کی تعمیل کے بارے میں عام سوالات کے لیے GPL FAQ دیکھیں۔ + + +### اجازت دینے والے لائسنس {#permissive-licenses} + +`MIT` اور `Apache-2.0` جیسے اجازت دینے والے لائسنس ملکیتی ماخوذات کی اجازت دیتے ہیں۔ `BSD-2-Clause` کم سے کم پابندیوں کے ساتھ ایک اور مقبول اجازت دینے والا آپشن ہے۔ + +```python +### موازنہ جدول {#comparison-table} + +| لائسنس | قسم | ماخوذ کام | پیٹنٹ گرانٹ | +|---------|------|-----------------|-------------| +| GPL-3.0 | کاپی لیفٹ | GPL ہونا چاہیے | ہاں | +| AGPL-3.0 | نیٹ ورک کاپی لیفٹ | AGPL ہونا چاہیے | ہاں | +| LGPL-3.0 | کمزور کاپی لیفٹ | لائبریری ملکیتی ہو سکتی ہے | ہاں | +| MIT | اجازت دینے والا | کوئی بھی لائسنس | نہیں | +| Apache-2.0 | اجازت دینے والا | کوئی بھی لائسنس | ہاں | +| BSD-2-Clause | اجازت دینے والا | کوئی بھی لائسنس | نہیں | +| MPL-2.0 | فائل لیول کاپی لیفٹ | تبدیل شدہ فائلیں MPL ہونی چاہئیں | ہاں | + +## کمیونٹی کا تعاون {#community-collaboration} + +### پروجیکٹس میں حصہ ڈالنا {#how-to-contribute} + +اوپن سورس میں حصہ ڈالنے کا آغاز پروجیکٹ کے ورک فلو کو سمجھنے سے ہوتا ہے۔ زیادہ تر پروجیکٹس کام کو مربوط کرنے کے لیے ایشو ٹریکرز اور تبدیلیاں تجویز کرنے کے لیے پل ریکوئسٹس کا استعمال کرتے ہیں۔ + +آج ہی حصہ ڈالنا شروع کریں + + + اس پروجیکٹ میں کیسے حصہ ڈالیں + + + + ہماری کمیونٹی میں شامل ہوں + + +کوئی بھی حصہ ڈالنے سے پہلے، ہمیشہ 
گائیڈ لائنز کے لیے پروجیکٹ کی `CONTRIBUTING.md` فائل چیک کریں۔ کوڈ کا انداز، ٹیسٹ کی ضروریات، اور جائزے کے عمل پروجیکٹس کے درمیان مختلف ہوتے ہیں۔ + +```md +## پل ریکوئسٹ ٹیمپلیٹ + +**تفصیل:** تبدیلیوں کا مختصر خلاصہ +**متعلقہ ایشو:** اس ایشو کا لنک جسے یہ حل کرتا ہے +**ٹیسٹنگ:** اس کا ٹیسٹ کیسے کیا گیا؟ +``` + +### کوڈ کے جائزے کے بہترین طریقے {#code-review} + +باہمی پروجیکٹس میں معیار کو برقرار رکھنے کے لیے کوڈ کا جائزہ لینا ضروری ہے۔ جائزہ لینے والوں کو درستگی، انداز کی مستقل مزاجی، اور ممکنہ سیکیورٹی مسائل کی جانچ کرنی چاہیے۔ + + + + + + کوڈ کے اچھے جائزے صرف فارمیٹنگ پر نہیں بلکہ منطق اور ڈیزائن پر توجہ مرکوز کرتے ہیں۔ انداز کے نفاذ کے لیے لنٹرز جیسے خودکار ٹولز کا استعمال کریں، اور انسانی جائزے کو آرکیٹیکچرل فیصلوں اور غیر معمولی صورتحال (edge cases) کے لیے مخصوص رکھیں۔ + + + + + + + +## دوہرا لائسنسنگ {#dual-licensing} + +کچھ پروجیکٹس بیک وقت دو لائسنسوں کے تحت اپنا کوڈ پیش کرتے ہیں۔ یہ تجارتی صارفین کو ملکیتی لائسنس خریدنے کی اجازت دیتا ہے جبکہ اوپن سورس ورژن کو کاپی لیفٹ شرائط کے تحت دستیاب رکھتا ہے۔ [Qt](https://www.qt.io/licensing/) اور [MySQL](https://www.mysql.com/about/legal/licensing/) جیسے پروجیکٹس اس ماڈل کا استعمال کرتے ہیں۔ +## تعمیل اور آڈیٹنگ {#compliance-and-auditing} + +اوپن سورس سافٹ ویئر استعمال کرنے والی تنظیموں کو اپنے انحصار کو ٹریک کرنا چاہیے اور لائسنس کی تعمیل کو یقینی بنانا چاہیے1۔ [FOSSA](https://fossa.com/) اور [Snyk](https://snyk.io/) جیسے ٹولز اس عمل کو خودکار بنا سکتے ہیں۔ + + + + + + +### لائسنس اسکیننگ {#license-scanning} + +خودکار لائسنس اسکیننگ ہر CI/CD پائپ لائن کا حصہ ہونی چاہیے۔ یہ غیر مطابقت پذیر لائسنسوں کو آپ کے انحصار کے درخت (dependency tree) میں داخل ہونے سے پہلے پکڑ لیتی ہے۔ + +```bash +# اپنے پروجیکٹ پر لائسنس اسکین چلائیں +npx license-checker --production --json > licenses.json + +# پروڈکشن کے انحصار میں کاپی لیفٹ لائسنس چیک کریں +npx license-checker --production --onlyAllow "MIT;Apache-2.0;BSD-2-Clause;ISC" +``` + +انحصار کو اپ ڈیٹ رکھنے اور لائسنس کی تبدیلیوں کی نگرانی کے لیے Dependabot استعمال کرنے پر غور کریں۔ 
+ +```json +{ + "license-scan": { + "production": true, + "allowed": ["MIT", "Apache-2.0", "BSD-2-Clause"] + } +} +``` +### SBOM جنریشن {#sbom-generation} + +ایک سافٹ ویئر بل آف میٹریلز (SBOM) آپ کے سافٹ ویئر کے تمام اجزاء کی فہرست بناتا ہے۔ SBOM تیار کرنا ریگولیٹری تعمیل کے لیے تیزی سے ضروری ہوتا جا رہا ہے، خاص طور پر سیکیورٹی کے لحاظ سے حساس صنعتوں میں۔ + + + +### کلیدی اصطلاحات {#key-terms} + + + + + +لائسنسنگ کے بارے میں باخبر فیصلے کرنے کے لیے ان اصطلاحات کو سمجھنا ضروری ہے۔ + + + +
    + +اضافی تعریفوں کے لیے مکمل [اصطلاحات کی فرہنگ](/glossary/) کا جائزہ لیں۔ یہ وسیلہ کمیونٹی کے ذریعے برقرار رکھا جاتا ہے اور اسے باقاعدگی سے اپ ڈیٹ کیا جاتا ہے۔ + +
    +
    + +## مزید مطالعہ {#further-reading} + +_یہ گائیڈ [اوپن سورس انیشی ایٹو](https://opensource.org/) اور [فری سافٹ ویئر فاؤنڈیشن](https://www.fsf.org/) کے مواد سے اخذ کی گئی ہے۔_ + +- [لائسنس گائیڈ کا انتخاب کریں](https://choosealicense.com/) - _آپ کے پروجیکٹ کے لیے صحیح لائسنس چننے میں مدد کرنے والا ایک سادہ ٹول_ +- [SPDX لائسنس کی فہرست](https://spdx.org/licenses/) - _500 سے زیادہ اوپن سورس لائسنسوں کے لیے معیاری شناخت کنندگان_ +- [اوپن سورس گائیڈ](https://opensource.guide/) - _پروجیکٹس چلانے اور ان میں حصہ ڈالنے کے لیے کمیونٹی کے زیر انتظام وسائل_ +- [FOSSA](https://fossa.com/) پر تعمیل کی ٹولنگ - _خودکار لائسنس اسکیننگ اور انحصار کا انتظام_ diff --git a/tests/specs/CONCURRENCY-SPEC.md b/tests/specs/CONCURRENCY-SPEC.md new file mode 100644 index 00000000000..ea2151fe66e --- /dev/null +++ b/tests/specs/CONCURRENCY-SPEC.md @@ -0,0 +1,201 @@ +# Concurrency, Chunking, and Commit Strategy Spec + +## What we're building + +The translation pipeline currently processes files sequentially (one file, one language at a time). For production use with 25 languages and hundreds of content files, we need concurrent Gemini API calls, smarter chunking for large files, and clean commit history via per-language squashing. 
+ +## What success looks like + +Given N files and M languages with a concurrency limit of C: +- All file/language pairs are processed with up to C concurrent Gemini API calls +- Large files are chunked by byte size (not key count) so no single Gemini call exceeds safe limits +- Each language's output is squashed into one commit as soon as that language completes +- Partial failures (one language fails) don't corrupt the target branch +- Zero-drift files produce zero Gemini calls and zero commits (already working) + +## Gemini 3.1 Pro limits (reference) + +- Input context: 1,048,576 tokens (~1M) +- Output limit: 65,536 tokens (~65K) +- English: ~4 chars per token +- CJK (Korean, Chinese, Japanese): ~1-2 chars per token + +--- + +## Part 1: Concurrency + +### Goal + +Process all (file, language) translation tasks through a shared semaphore pool. The pool size is configurable via `GEMINI_CONCURRENCY` (workflow input, default 16). + +### Task granularity + +- Each Gemini API call is one task in the pool +- For files that need chunking: each chunk is its own task +- A 10-chunk file takes 10 pool slots; remaining slots serve other tasks +- Chunks for the same file are independent (they translate different portions) + +### Assembly + +- Chunks are assembled back into the complete file BEFORE committing +- A chunked file = N Gemini calls, 1 commit with the merged result +- The chunk is the task unit for the semaphore pool, NOT the commit unit + +### Execution flow + +1. Build task list: enumerate all (file, language) pairs +2. For each pair, determine if chunking is needed. If yes, expand into chunk tasks. +3. Submit all tasks to the shared semaphore pool +4. Track per-language completion: when all tasks for a language finish, trigger squash +5. 
After all languages complete: merge working branch into target branch + +### Test assertions + +- With concurrency=2 and 4 tasks, at most 2 run simultaneously +- All tasks eventually complete regardless of submission order +- Token stats accumulate correctly across concurrent tasks +- Per-language completion callback fires exactly once per language, after all its tasks finish + +--- + +## Part 2: Chunking + +### Goal + +Split large files into chunks that stay safely within Gemini's output token limit. Prefer more, smaller calls over fewer, larger calls for reliability. + +### Chunk size budget + +``` +MAX_CHUNK_BYTES = 65_536 (64KB) +``` + +The English source tokenizes at ~4 chars/token, so a 64KB chunk is ~16K input tokens. The translated output is the binding constraint: even if a CJK translation of similar size tokenizes at ~1-2 chars/token, 64KB of output is ~32-64K tokens -- still within the 65K output limit. This is deliberately conservative: more calls, fewer failures. + +### JSON chunking (replaces current key-count approach) + +**Current:** Split at 100 keys regardless of value size. Breaks when values are long strings. + +**New algorithm:** +1. Iterate top-level keys in order +2. For each key, measure byte size: `key.length + JSON.stringify(value).length + overhead` +3. Accumulate into current chunk +4. When accumulated bytes exceed MAX_CHUNK_BYTES, start a new chunk +5. Minimum: at least 1 key per chunk (handles single keys exceeding budget) +6. Nested objects: measure the entire nested value as one unit + +**Backward compatibility:** The HTML placeholder extraction pass runs BEFORE chunking (unchanged). Chunking operates on the placeholder-replaced content. 
+ +**Test assertions:** +- A JSON file with 50 keys averaging 2KB each (~100KB total) produces 2 chunks +- A JSON file with 3 keys where one value is 200KB produces 3 chunks (one per key) +- A JSON file under 64KB produces 1 chunk (no splitting) +- Key order is preserved across chunks +- Merged output matches original structure + +### Markdown prose chunking (enhancement to current approach) + +**Current:** Split at heading boundaries when > 40,000 chars (`PROSE_SIZE_THRESHOLD`). + +**New:** Replace `PROSE_SIZE_THRESHOLD` with `MAX_CHUNK_BYTES` (64KB). Additionally, if a single section exceeds MAX_CHUNK_BYTES, split on paragraph boundaries within that section. + +**Paragraph splitting algorithm:** +1. Split section on blank lines (`\n\n`) +2. Accumulate paragraphs into chunks up to MAX_CHUNK_BYTES +3. Each chunk includes the section heading for context +4. Minimum: at least 1 paragraph per chunk + +**Test assertions:** +- A markdown file under 64KB produces 1 chunk +- A markdown file with 3 sections of 40KB each produces 3 chunks (one per section) +- A single section of 100KB splits on paragraph boundaries into 2 chunks +- Heading context is included in each chunk of a split section +- Reassembled output matches original content + +### Incremental section batching + +**Current:** All changed sections batched into one Gemini prompt (no size limit). + +**New:** If the total byte size of TRANSLATE sections exceeds MAX_CHUNK_BYTES, split into multiple Gemini calls. Each call includes relevant CONTEXT sections for translation quality. + +**Test assertions:** +- 5 small changed sections (total 10KB) produce 1 Gemini call +- 3 large changed sections (total 200KB) produce multiple calls +- CONTEXT sections are included in each call for quality +- All translated sections are available for assembly + +--- + +## Part 3: Commit Strategy + +### Goal + +Protect the target branch from partial failures while preserving crash safety during long-running translations. 
+ +### Temp branch pattern + +1. Pipeline creates `tmp-intl/run-MMDD-HHMM` as the working branch +2. All commit-as-you-go writes go to this branch (crash safety) +3. As each language completes all its files, immediately squash that language's commits into one +4. After ALL languages complete: merge temp branch into target branch +5. On success: delete temp branch +6. On failure: temp branch preserved with partial progress, target branch untouched + +### Target branch + +- Default: `intl/pending` (or user-specified via `TARGET_BRANCH`) +- Never receives partial work directly +- Only receives merged results from successful runs + +### Per-language squashing + +When a language completes: +1. Collect all blob SHAs committed for that language (tracked by `SharedCommitter`) +2. Create a single tree containing all files for that language +3. Create one commit: `i18n(lang): translate N files` +4. Log completion + +**Squash triggers per-language, not at end of run.** This means: +- If ko finishes before es, ko is squashed immediately while es continues +- The squashed commit is on the temp branch +- Final merge to target happens after all languages are squashed + +### Progress tracking + +Per-language state: +``` +pending -> in_progress -> complete +``` + +Log entry on language completion: +``` +[pipeline] [ko] Complete: 5 files, 12,450 input tokens, 8,200 output tokens +[pipeline] [ko] Squashed to 1 commit +``` + +### Test assertions + +- Single language run: temp branch created, 1 squashed commit, merged to target, temp deleted +- Multi-language run: each language squashed independently, final merge has N commits (one per language) +- Failed run: temp branch exists with partial commits, target branch unchanged +- Zero-drift run: no temp branch created, no commits + +--- + +## Part 4: Implementation order + +1. **Chunking** -- byte-size-aware splitting (can test independently with unit tests) +2. 
**Commit strategy** -- temp branch + per-language squash (needs GH Action test) +3. **Concurrency** -- task pool (builds on both chunking and commit strategy) + +Each phase should have tests passing before moving to the next. + +--- + +## What this spec does NOT cover + +- PR creation (separate workflow step, post-merge) +- Glossary loading (already implemented) +- Sanitization (already implemented, runs post-translation) +- Manifest generation (already implemented) +- Retry logic for individual Gemini calls (already implemented in `callGeminiRaw`) diff --git a/tests/specs/PIPELINE-SPEC.md b/tests/specs/PIPELINE-SPEC.md new file mode 100644 index 00000000000..fba2ec00a4d --- /dev/null +++ b/tests/specs/PIPELINE-SPEC.md @@ -0,0 +1,323 @@ +# Incremental Translation Pipeline Spec + +## What we're building + +A developer changes an English content file. The pipeline runs. For every locale: +- If the change was a URL, path, attribute, code, or structural element: the locale file is updated instantly by a script. No LLM. No cost. No delay. +- If the change was actual prose: only the specific changed section is sent to the LLM. The response is spliced back in. Everything else in the file is untouched. +- The developer doesn't think about translations. It just works. + +## What success looks like + +Given the test fixtures (28 markdown mutations, 10 JSON mutations across inert, translatable, structural, rename, add, remove, and reorder change types), the pipeline: +- Produces correct locale-B from locale-A for all three test languages (es, ko, ur) +- Makes zero unnecessary LLM calls (nothing sent to the LLM that a script could handle) +- Corrupts zero existing translations (unchanged content is byte-for-byte identical) +- Handles SOV languages correctly (Korean, Urdu -- inline element order differs from English) +- Handles RTL correctly (Urdu) + +## Goal + +Given an English content change (A -> B), update all locale translations with minimum LLM usage. 
Changes that don't affect translatable prose are propagated deterministically by scripts. Only actual prose changes go to the LLM. + +## Inputs + +- **english-A**: the previous English content (retrieved via `git show {sha}:{path}`) +- **english-B**: the current English content (on disk) +- **locale-A**: the existing translation (on disk, corresponds to english-A) +- **source manifest**: Merkle tree hashes of the English content at the time of last pipeline run (used for quick "did anything change?" check via rootHash comparison; `sourceCommitSha` enables retrieval of english-A) +- **translation manifest**: Merkle tree of the locale content, mirroring the English tree structure. Tracks per-section hashes so the pipeline knows which sections are up to date in each locale. + +## Output + +- **locale-B**: the updated translation reflecting all changes from A -> B + +## Principles + +1. **Deterministic where possible.** If no prose changed, no LLM call. Ever. +2. **Minimal LLM scope.** When prose changes, send only the specific changed section, not the whole file. Include surrounding context but request only the replacement text. +3. **Do no harm.** Unchanged sections must be byte-for-byte identical to locale-A. The pipeline must never corrupt existing translations. +4. **Hash-based detection.** Use intl-content-tree's Merkle tree to detect exactly what changed at the node level. No regex heuristics for change detection. +5. **Structural integrity.** The locale file must maintain structural parity with English. If the English has 4 links in a sentence, the locale must have 4 links. Structural mismatches (translator dropped a link, reformatted a component) are flagged for human review, not silently ignored. + +--- + +## Phase 1: Change Detection + +**Input:** english-A (string), english-B (string), format ("markdown" | "json") +**Output:** ChangeSet (from intl-content-tree's `extractChanges`), DiffResult (from `diff`) + +**What it does:** +1. 
Parse english-A into a tree: `parseMarkdown(englishA, config)` or `parseJson(englishA, config)` +2. Parse english-B into a tree: same +3. Run `extractChanges(treeA, treeB)` to get a `ChangeSet` +4. Run `diff(treeA, treeB)` to get the section-level `DiffResult` + +**ChangeSet contains:** +- `changes`: array of `NodeChange` (action: update/add/remove, with path, elementType, contentType, old/new values) +- `relocations`: array of `NodeRelocation` (same hash, different path) +- `sectionRenames`: array of `SectionRename` (heading ID changed, content overlaps) + +**DiffResult contains (section-level classifications):** +- `unchanged`: sections with no changes at all +- `inertDrift`: sections where only inert content changed +- `structuralDrift`: sections where structure changed but no prose changed +- `translatableDrift`: sections where prose actually changed +- `added`: new sections +- `removed`: sections that no longer exist +- `renamed`: sections with same content but different ID +- `reordered`: sections in different positions + +**Test assertions:** +- Given fixture-a.md and fixture-b.md, the ChangeSet contains exactly the expected changes per SPEC.md mutation table +- Given fixture-a.json and fixture-b.json, same +- No false positives (unchanged content reported as changed) +- No false negatives (actual changes missed) + +--- + +## Phase 2: Routing + +**Input:** ChangeSet, DiffResult +**Output:** Three work lists: + 1. **deterministic**: sections/changes that can be applied by script (inert-only sections, structural-only sections, renames, removals, reorders) + 2. **llm-required**: sections that need the LLM (sections with translatable prose changes, new sections) + 3. 
**no-op**: unchanged sections (do nothing) + +**What it does:** +Classify each section into one of the three lists based on the DiffResult: + +| DiffResult classification | Route | Reason | +|--------------------------|-------|--------| +| unchanged | no-op | Nothing changed | +| inertDrift | deterministic | Only URLs/paths/attributes changed | +| structuralDrift | deterministic | Nodes added/removed but no prose changed | +| translatableDrift | llm-required | Prose text changed (may also contain inert changes) | +| added | llm-required | Brand new section, needs fresh translation | +| removed | deterministic | Delete from locale | +| renamed | deterministic | Update heading ID in locale | +| reordered | deterministic | Reorder sections in locale | + +**Mixed sections (both inert and translatable changes):** When a section has `translatableDrift`, it goes entirely to the llm-required list. The LLM receives english-B content which already contains the correct inert values. Phase 3 does NOT apply inert changes to llm-required sections -- the LLM handoff handles everything for that section, similar to a full translation. + +**Section depth and deduplication:** `diff()` recurses into nested sections. A change in an h3 will also flag its parent h2 and grandparent h1 as `translatableDrift` (because their merkled hashes changed). The pipeline must deduplicate by using only the **deepest** (most specific) sections. If a `translatableDrift` entry's path is a prefix of another entry's path, it's a parent -- skip it. Only the leaf-level entries go to the LLM. + +For example, if diff returns `translatableDrift` for both `choosing-a-license` (h2) and `choosing-a-license/copyleft-licenses` (h3), only `copyleft-licenses` is sent for retranslation. The parent `choosing-a-license` is flagged only because its child changed -- its own prose may be unchanged and should be preserved from locale-A. 
+ +**Caveat:** A parent section may have BOTH its own prose change AND a child section change. In that case, both the parent's own content (excluding children) and the child section are separate LLM entries. The pipeline must handle parent-level prose separately from nested child sections. + +**Frontmatter fields** are classified independently (each field is its own node in the tree): +- `description`, `title`, `alt`, `summaryPoints`, `tags` -> translatable (llm-required if changed) +- `image`, `lang`, `template`, `published` -> inert (deterministic if changed) + +**Test assertions:** +- Given the ChangeSet from Phase 1, routing produces the correct work lists +- Each of the 28 MD mutations and 10 JSON mutations lands in the correct list +- Sections with mixed changes (e.g., #copyleft-licenses with both inert and translatable) go entirely to llm-required +- Frontmatter fields are classified independently +- Zero translatable changes in the deterministic list + +--- + +## Phase 3: Deterministic Propagation + +**Input:** locale-A (string), deterministic work list, english-B (string), format +**Output:** locale-A with all deterministic changes applied + +**Important:** Phase 3 ONLY operates on sections in the deterministic list. Sections routed to llm-required are left untouched -- their changes (including any inert changes within them) are handled entirely by the LLM in Phase 4. + +**Operation ordering for markdown:** +1. Heading ID renames (so subsequent operations can find sections by their new IDs) +2. Section removals +3. Inert value updates (scoped to deterministic sections only) +4. Structural changes (component add/remove, attribute add) +5. Section reorders (last, since it rearranges what's already been updated) + +**Inert value replacement algorithm (markdown):** +Replacements must be precise, not naive find/replace: +1. Scope the replacement to the narrowest heading section (by `{#id}`) +2. 
Use element-type context patterns to disambiguate (e.g., `href="oldValue"` for links, `` `oldValue` `` for inline code) +3. Match by OLD VALUE, not by position. Translated sentences may reorder inline elements due to different word order (SOV languages like Korean, Urdu). The 2nd link in English may be the 4th link in Korean. Always find the element by its current value. +4. Verify the old value exists before replacing; skip with warning if not found + +**SOV / inline element reordering:** +In languages with different word order (Korean, Urdu, etc.), inline elements within a sentence may appear in a different order than English. For example: + +English: `Use [Remix](url1) on [Sepolia](url2) with a [block explorer](url3) to test [smart contracts](url4)` +Korean: `[smart contracts](url4)를 테스트하려면 [block explorer](url3)와 함께 [Sepolia](url2)에서 [Remix](url1)를 사용하세요` + +The links are in reverse order. If `url2` changes from `sepolia.dev` to `holesky.dev`, the pipeline must find `sepolia.dev` by value, not by "the 2nd link." This applies to all inline elements: links, images, inline code, bold/emphasis, HTML tags. + +**What it does for markdown:** + +| Change type | How to apply | +|------------|-------------| +| Inert value update (href, src, id, etc.) 
| Scoped find/replace using context pattern + occurrence counting | +| Heading ID rename | Find `{#old-id}` in locale file, replace with `{#new-id}` | +| Component removal (``) | Find the component in locale file, remove it and normalize surrounding whitespace | +| Component/code fence addition | Find insertion point by section heading, insert content from english-B | +| Attribute addition (className) | Find the component tag in locale file by existing attributes, add the new attribute | +| Section removal | Find section by heading ID, remove heading + content until next same-level heading | +| Section reorder | Reorder sections in locale file to match english-B section order | +| Relocation | Move content from old position to new position | + +**What it does for JSON:** +- Inert value updates: `JSON.parse`, walk to key, apply targeted replacement within the value string (for hrefs, ICU variables), `JSON.stringify` with 2-space indent +- Key removal: delete the key from the parsed object +- Key reorder: reorder keys to match english-B key order +- ICU variable rename: find `{oldName}` or `{oldName,` in the value string, replace with `{newName}` or `{newName,` +- HTML href/attribute change in value: find `attr="oldValue"`, replace with `attr="newValue"` + +**Frontmatter (markdown):** +Frontmatter is a distinct scope (YAML key-value pairs between `---` markers). 
For inert frontmatter fields: +- Find the key line (e.g., `image: /old/path.png`) +- Replace the value portion only, preserving the key and any YAML formatting + +**Test assertions:** +- Each inert change from the mutation table is correctly applied +- Unchanged prose is byte-for-byte identical to locale-A +- No locale prose is modified by this phase +- Component attributes are correctly added/updated +- Heading IDs are correctly renamed +- Removed components/sections are gone +- Added structural content (code fences) is present +- Whitespace is normalized after removals (no double blank lines) +- Section order matches english-B after reorder +- Sections in the llm-required list are NOT modified by this phase + +--- + +## Phase 4: LLM Translation + +**Input:** english-B sections that need translation, locale-A (for context), llm-required work list +**Output:** translated sections (strings) + +**What it does:** +For each section in the llm-required list: +1. Extract the section content from english-B (by heading ID) +2. Include surrounding context from locale-A (neighboring sections) for translation quality +3. Send to LLM with instructions: + - Translate only the provided section + - Preserve all markdown formatting, components, links, code fences + - Do NOT translate component attributes, code bodies, URLs, or heading IDs + - Return ONLY the translated section content +4. Receive translated section + +The LLM receives english-B content, which already contains all inert values (new URLs, new attributes, etc.). The LLM is expected to preserve these as-is. This is the same behavior as a full translation -- the LLM never sees old inert values. + +For new sections (added): +1. Extract the new section from english-B +2. Send to LLM for fresh translation +3. Receive translated section + +For frontmatter translatable fields: +1. Extract the field value from english-B (e.g., new description text) +2. Send to LLM with the locale-A version as context +3. 
Receive translated value + +**Test assertions (with mocked LLM):** +- The mock receives exactly the sections classified as llm-required in Phase 2 +- The mock receives the correct english-B content for each section +- The mock does NOT receive unchanged or inert-only sections +- The mock does NOT receive structural-only sections +- Frontmatter translatable fields are sent individually, not as part of a section + +--- + +## Phase 5: Assembly + +**Input:** locale file with deterministic changes applied (from Phase 3), translated sections (from Phase 4), english-B (for section ordering reference) +**Output:** locale-B (final updated translation) + +**What it does:** +Splice the LLM-translated sections into the deterministically-updated locale file: +1. For each translated section, find its position in the locale file (by heading ID) +2. **Section replacement**: replace the section content from its heading line to the line before the next heading (any level). This is safe because Phase 3 did not modify llm-required sections. +3. For new sections, insert at the correct position (matching english-B section order) +4. For translated frontmatter fields, replace the value in the frontmatter block + +**Section boundaries:** A section's content in the locale file is defined as: from its heading line (inclusive) to the line before the next heading line of any level (exclusive). This avoids the complexity of same-level vs nested heading tracking. 
+ +**Test assertions:** +- Final output contains all deterministic changes from Phase 3 +- Final output contains all LLM translations from Phase 4 +- Unchanged sections are byte-for-byte identical to locale-A +- Section order matches english-B +- No orphaned content (sections present in output but not in english-B) +- No missing content (sections in english-B but not in output) +- No conflict between Phase 3 and Phase 4 outputs (they operate on disjoint sections) + +--- + +## Phase 6: Manifest Update + +**Input:** english-B tree, output locale file +**Output:** updated source manifest + updated translation manifest + +There are TWO manifests per file: +- **Source manifest** (`.manifest-source.json`): Merkle tree of the English content. Used to detect what changed in English between runs. Contains `sourceCommitSha` for retrieving old English via git. +- **Translation manifest** (`.manifest-translation.json`): Merkle tree of the locale content, mirroring the English tree structure. Tracks per-section hashes so the pipeline knows which sections are up to date in each locale. Also records the structural mapping between English and locale elements, enabling the pipeline to target the correct element in the locale file when the corresponding English element changes. + +**What it does:** +1. Serialize english-B tree as the new source manifest +2. Record the current git SHA as `sourceCommitSha` in the source manifest +3. Update the translation manifest to reflect the new locale state +4. Store both manifests for next incremental run + +**On partial failure:** If Phase 4 fails for some sections (LLM error), do NOT update either manifest. The next run should re-detect those changes and retry. Only stamp manifests when all changes are successfully applied. 
+ +**Test assertions:** +- Source manifest rootHash matches english-B tree hash +- sourceCommitSha is populated +- Manifest version is correct +- Neither manifest is written if any phase failed + +--- + +## End-to-End Test + +Run against all three test languages to cover different script types and word orders: +- **es** (Spanish): Latin script, SVO word order (similar to English) +- **ko** (Korean): Hangul script, SOV word order (inline elements reordered) +- **ur** (Urdu): Arabic script, RTL, SOV word order + +**Input:** fixture-a.md, fixture-b.md, locale-a/{lang}/fixture.md +**Mock:** LLM returns corresponding sections from locale-b/{lang}/fixture.md +**Expected output (locale-B):** locale file that has: +- All 28 MD mutations correctly reflected in the output: + - Inert mutations in deterministic-only sections: applied by Phase 3 scripts + - Inert mutations in llm-required sections: present because the LLM received english-B (which has the new values) + - Structural mutations: applied by Phase 3 scripts + - Translatable mutations: handled by mock LLM (returns from locale-B fixtures) + - Section rename: heading ID updated by Phase 3 + - Section reorder: sections in english-B order + - New section: mock LLM response spliced in +- All unchanged sections preserved byte-for-byte from locale-A + +Same pattern for JSON fixtures (per language). + +**SOV-specific assertions (ko, ur):** +- Multi-link inert propagation: when one of four links changes its href, verify the correct link is updated even though inline element order differs from English +- Inline code replacement: verify the correct inline code is updated regardless of position in the translated sentence + +--- + +## Open questions + +- **Structural mismatch handling:** When a locale file has fewer inline elements than English (e.g., Urdu drops 2 of 4 links in a sentence), this is a structural integrity violation. The pipeline should flag this for human review rather than silently skipping. 
How this flag is surfaced (PR comment, log warning, separate report) is TBD. +- **Duplicate inert values in one section:** If the same URL appears twice in a section and only one instance changed, match-by-value is ambiguous. May need surrounding context (the paragraph) as a tiebreaker. +- **Code fence insertion point:** When adding a structural code fence to a section, where exactly within the section? English-B position relative to other elements? Needs implementation-level definition. +- **Partial failure strategy:** Currently all-or-nothing (don't stamp manifests if any LLM call fails). Acceptable for v1 but wasteful on retry. May revisit for per-section stamping later. + +--- + +## What this spec does NOT cover + +- Gemini API integration (mocked in tests) +- GitHub Actions workflow (tested separately) +- Git operations (file retrieval via sha, committing results) +- Multi-file batching (test is per-file) +- Chunking for large files +- Post-import sanitization +- PR creation +- Image alt text translation (known gap; alt text in markdown images is not currently classified as translatable by the parser) diff --git a/tests/specs/SPEC.md b/tests/specs/SPEC.md new file mode 100644 index 00000000000..b11b5fe7bc4 --- /dev/null +++ b/tests/specs/SPEC.md @@ -0,0 +1,123 @@ +# Incremental Translation Test Spec + +Test fixtures for validating the incremental translation pipeline. Version A is the "before" English content; version B has mutations applied. Both are translated via Gemini to produce locale files. 
+ +## Fixture files + +- `public/content/test-spec-fixture-a/index.md` -- English markdown (before) +- `public/content/test-spec-fixture-b/index.md` -- English markdown (after mutations) +- `src/intl/en/page-test-spec-fixture-a.json` -- English JSON (before) +- `src/intl/en/page-test-spec-fixture-b.json` -- English JSON (after mutations) + +After Gemini translation: +- Locale A files: translation of the A fixtures (pipeline input) +- Locale B files: translation of the B fixtures (ground truth for prose) + +## Markdown mutations (A -> B) + +| # | Section | Change | Classification | +|---|---------|--------|----------------| +| 1 | Frontmatter | `image` path -> hero-licensing-v2.png | inert | +| 2 | Frontmatter | `description` reworded | translatable | +| 3 | #understanding-open-source-licensing | Remove `` | structural | +| 4 | #what-is-open-source | OSI link href appended `/annotated` | inert | +| 5 | #what-is-open-source | Inline code `LICENSE` -> `LICENSE.md` | inert | +| 6 | #what-is-open-source | `<a>` href appended `?lang=en` | inert | +| 7 | #what-is-open-source | Image path -> license-comparison-v2.png | inert | +| 8 | #the-four-freedoms | Prose sentence reworded | translatable | +| 9 | #the-four-freedoms | InfoBanner `title` attr "Important distinction" -> "Key concept" | translatable | +| 10 | #copyleft-licenses | Solidity code comment changed | translatable | +| 11 | #copyleft-licenses | Solidity code body changed (require message) | inert | +| 12 | #copyleft-licenses | ExpandableCard `eventCategory` appended `-guide` | inert | +| 13 | #copyleft-licenses | `<a>` href inside ExpandableCard appended `#AllCompatibility` | inert | +| 14 | #permissive-licenses | Python code comment changed | translatable | +| 15 | #permissive-licenses | Link text + URL: [GitHub](github.com) -> [GitHub Repositories](github.com/new) | translatable | +| 16 | #permissive-licenses | Multi-link: [Sepolia](sepolia.dev) -> [Holesky](holesky.dev) (1 of 4 links, text + href) | translatable |
+| 17 | #contributing-to-projects | Heading ID rename -> `#how-to-contribute` | rename | +| 18 | #how-to-contribute | DocLink href appended `/getting-started/` | inert | +| 19 | #code-review | AlertEmoji `text` attr `:mag:` -> `:eyes:` | inert | +| 20 | #community-collaboration | DocLink `className="featured"` attribute added | structural | +| 21 | #license-scanning | New JSON code fence added | structural | +| 22 | #sbom-generation | YouTube `id` attr changed | inert | +| 23 | (new section) | `#dual-licensing` section added | added (translatable) | +| 24 | top-level | `#compliance-and-auditing` and `#community-collaboration` sections swapped | reordered | +| 25 | #how-to-contribute | ButtonLink href -> `/quick-start/` | inert | +| 26 | #code-review | Emoji `text` attr in heading `:bulb:` -> `:star:` | inert | +| 27 | #compliance-and-auditing | Card `href` attr -> `/license-audit/` | inert | +| 28 | #code-review | QuizWidget `quizKey` attr changed | inert | + +## JSON mutations (A -> B) + +| # | Key | Change | Classification | +|---|-----|--------|----------------| +| 1 | page-description | Prose reworded | translatable | +| 2 | stat-label-contributors | "Active contributors" -> "Active contributors worldwide" | translatable | +| 3 | banner-text | `<a>` href `/contributing/` -> `/contributing/getting-started/` | inert | +| 4 | footer-note | First `<a>` href appended `?sortBy=name` | inert | +| 5 | welcome-user | ICU variable `{username}` -> `{displayName}` | inert | +| 6 | project-count | ICU variable `{count}` -> `{total}` | inert | +| 7 | nested.link-text | "View all resources" -> "Browse all resources" | translatable | +| 8 | multi-link | Third `<a>` href `spdx.org/` -> `spdx.org/licenses/` | inert | +| 9 | new-key | Brand new key added | added (translatable) | +| 10 | empty-results | Key removed | removed | + +## Classification summary + +### Markdown +- **Inert**: 15 (mutations 1, 4, 5, 6, 7, 11, 12, 13, 18, 19, 22, 25, 26, 27, 28) +- **Translatable**: 7 (mutations 2,
8, 9, 10, 14, 15, 16) +- **Structural**: 3 (mutations 3, 20, 21) +- **Rename**: 1 (mutation 17) +- **Added**: 1 (mutation 23) +- **Reordered**: 1 (mutation 24) + +### JSON +- **Inert**: 5 (mutations 3, 4, 5, 6, 8) +- **Translatable**: 3 (mutations 1, 2, 7) +- **Added**: 1 (mutation 9) +- **Removed**: 1 (mutation 10) + +## Test assertion rules + +1. **Inert changes**: Pipeline propagates deterministically. No Gemini call. Old value replaced with new value in all locale files. +2. **Translatable changes**: Pipeline sends the affected section to Gemini for retranslation. Splices the result back. +3. **Structural changes**: Pipeline propagates structure changes (component add/remove) without Gemini. +4. **Renames**: Pipeline updates heading `{#id}` in locale files. No Gemini. +5. **Added sections**: Pipeline sends new section to Gemini for fresh translation. +6. **Reordered sections**: Pipeline reorders sections in locale files to match English. No Gemini. +7. **Removed keys (JSON)**: Pipeline deletes the key from all locale JSON files. +8. **Unchanged sections**: Must be byte-for-byte identical after pipeline runs. "Do no harm." + +## Components covered + +- `` (self-closing, inert) +- `` (block, translatable + inert attrs) +- `` (block, translatable + inert attrs, inner content with `` tags) +- `` (block, inert href, translatable children) +- `` / `` / `` / `` (nested block) +- `` (self-closing, inert id) +- `` (block, inert href, translatable children) +- `` (self-closing, mixed attrs, used inside heading) +- `` (self-closing, inert quizKey) +- `` / `` (nested, translatable title/description, inert href) +- `` (self-closing, inert term) + +## Inline patterns covered + +- Markdown links `[text](url)` +- Markdown images `![alt](path)` +- Inline code `` `code` `` +- HTML `text` with target attr +- HTML ``, `` inline +- HTML `` footnote +- HTML `
    ` wrapping inside components +- Multi-link sentences (4 links in one paragraph, SOV testing) +- Bold `**text**` and italic `_text_` + +## Code fence patterns covered + +- Solidity (with translatable `//` comments + inert code body) +- Python (with translatable `#` comments + inert code body) +- Bash (with translatable `#` comments + inert code body) +- JSON (pure inert, no comments) +- Markdown `md` fence (translatable prose content) diff --git a/tests/unit/intl-pipeline/chunking.spec.ts b/tests/unit/intl-pipeline/chunking.spec.ts new file mode 100644 index 00000000000..21de51c7041 --- /dev/null +++ b/tests/unit/intl-pipeline/chunking.spec.ts @@ -0,0 +1,375 @@ +/** + * Chunking Tests -- CONCURRENCY-SPEC.md Part 2 + * + * Validates byte-size-aware chunking for JSON, Markdown, and incremental sections. + * MAX_CHUNK_BYTES = 65,536 (64KB) + * + * Tests with stubs will throw until real implementations are wired in. + */ + +import { expect, test } from "@playwright/test" + +import { MAX_CHUNK_BYTES } from "../../../src/scripts/intl-pipeline/constants" +import { chunkMarkdownProse } from "../../../src/scripts/intl-pipeline/lib/llm/code-block-extractor" +import { batchSections } from "../../../src/scripts/intl-pipeline/lib/llm/incremental-translate" +// --------------------------------------------------------------------------- +// Imports -- real for implemented modules, stubs for pending +// --------------------------------------------------------------------------- +import { + chunkJson, + mergeJsonBatches as mergeJsonChunks, +} from "../../../src/scripts/intl-pipeline/lib/llm/json-batcher" + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function makeJsonWithKeys(count: number, valueSize: number): string { + const obj: Record = {} + for (let i = 0; i < count; i++) { + obj[`key-${String(i).padStart(3, "0")}`] = "x".repeat(valueSize) + } + 
return JSON.stringify(obj, null, 2) +} + +function makeMarkdownSections(count: number, sizePerSection: number): string { + const sections: string[] = [] + for (let i = 0; i < count; i++) { + const heading = `## Section ${i + 1} {#section-${i + 1}}` + const para = "Lorem ipsum dolor sit amet. ".repeat( + Math.ceil(sizePerSection / 28) + ) + sections.push(`${heading}\n\n${para.slice(0, sizePerSection)}`) + } + return sections.join("\n\n") +} + +function byteSize(s: string): number { + return Buffer.byteLength(s, "utf-8") +} + +// =================================================================== +// PART 2A: JSON Chunking -- byte-size-aware (Spec lines 75-94) +// =================================================================== + +test.describe("JSON chunking (byte-size-aware)", () => { + test("file under 64KB produces 1 chunk", () => { + const small = makeJsonWithKeys(10, 100) + expect(byteSize(small)).toBeLessThan(MAX_CHUNK_BYTES) + const chunks = chunkJson(small) + expect(chunks).toHaveLength(1) + }) + + test("50 keys averaging 2KB each (~100KB) produces 2 chunks", () => { + const large = makeJsonWithKeys(50, 2000) + expect(byteSize(large)).toBeGreaterThan(MAX_CHUNK_BYTES) + const chunks = chunkJson(large) + expect(chunks).toHaveLength(2) + for (const chunk of chunks) { + expect(byteSize(chunk)).toBeLessThanOrEqual(MAX_CHUNK_BYTES + 1024) + } + }) + + test("3 keys with one 200KB value produces 3 chunks (1 per key)", () => { + const obj: Record = { + small1: "short value", + huge: "x".repeat(200_000), + small2: "another short value", + } + const json = JSON.stringify(obj, null, 2) + const chunks = chunkJson(json) + expect(chunks).toHaveLength(3) + }) + + test("single key exceeding budget produces 1 chunk (minimum guarantee)", () => { + const obj = { "only-key": "x".repeat(100_000) } + const json = JSON.stringify(obj, null, 2) + const chunks = chunkJson(json) + expect(chunks).toHaveLength(1) + }) + + test("key order preserved across chunks", () => { + const keys 
= Array.from( + { length: 30 }, + (_, i) => `key-${String(i).padStart(2, "0")}` + ) + const obj: Record = {} + for (const k of keys) { + obj[k] = "x".repeat(3000) + } + const json = JSON.stringify(obj, null, 2) + const chunks = chunkJson(json) + expect(chunks.length).toBeGreaterThan(1) + + const allKeys: string[] = [] + for (const chunk of chunks) { + allKeys.push(...Object.keys(JSON.parse(chunk))) + } + expect(allKeys).toEqual(keys) + }) + + test("merged output matches original structure", () => { + const obj: Record = {} + for (let i = 0; i < 40; i++) { + obj[`key-${i}`] = `value-${i}-${"padding".repeat(500)}` + } + const json = JSON.stringify(obj, null, 2) + const chunks = chunkJson(json) + expect(chunks.length).toBeGreaterThan(1) + + const merged = mergeJsonChunks(chunks) + expect(JSON.parse(merged)).toEqual(JSON.parse(json)) + }) + + test("nested objects measured as one unit", () => { + const obj: Record = { + simple: "x".repeat(20_000), + nested: { + a: "x".repeat(40_000), + b: "y".repeat(40_000), + }, + another: "y".repeat(20_000), + } + const json = JSON.stringify(obj, null, 2) + // Total: ~120KB. 
/**
 * Exercises `chunkMarkdownProse` against the `MAX_CHUNK_BYTES` budget (the
 * test titles refer to it as 64KB): small documents stay whole, large ones
 * are split, and every produced chunk stays well-formed.
 */
test.describe("Markdown chunking (heading + paragraph aware)", () => {
  /** Produces `count` paragraphs, each built by `make(index)`. */
  const makeParagraphs = (
    count: number,
    make: (index: number) => string
  ): string[] => Array.from({ length: count }, (_, index) => make(index))

  test("file under 64KB produces 1 chunk", () => {
    const doc = makeMarkdownSections(3, 5000)
    expect(byteSize(doc)).toBeLessThan(MAX_CHUNK_BYTES)

    expect(chunkMarkdownProse(doc)).toHaveLength(1)
  })

  test("3 sections of 40KB each produces 3 chunks", () => {
    const doc = makeMarkdownSections(3, 40_000)
    expect(byteSize(doc)).toBeGreaterThan(MAX_CHUNK_BYTES)

    expect(chunkMarkdownProse(doc)).toHaveLength(3)
  })

  test("single 100KB section splits on paragraph boundaries", () => {
    const heading = "## Large section {#large-section}"
    const body = makeParagraphs(20, () => "Lorem ipsum. ".repeat(400))
    const section = `${heading}\n\n${body.join("\n\n")}`
    expect(byteSize(section)).toBeGreaterThan(MAX_CHUNK_BYTES)

    const pieces = chunkMarkdownProse(section)
    expect(pieces.length).toBeGreaterThan(1)
  })

  test("heading context included in each chunk of a split section", () => {
    const heading = "## Split section {#split-section}"
    const body = makeParagraphs(
      20,
      (i) => `Paragraph ${i}. ${"x".repeat(5000)}`
    )
    const section = `${heading}\n\n${body.join("\n\n")}`

    // Every piece must carry the heading anchor of the section it came from.
    chunkMarkdownProse(section).forEach((piece) => {
      expect(piece).toContain("{#split-section}")
    })
  })

  test("reassembled output preserves all headings and content", () => {
    const pieces = chunkMarkdownProse(makeMarkdownSections(5, 20_000))
    expect(pieces.length).toBeGreaterThan(1)

    const rejoined = pieces.join("\n\n")
    for (let sectionNumber = 1; sectionNumber <= 5; sectionNumber++) {
      expect(rejoined).toContain(`{#section-${sectionNumber}}`)
    }
  })

  test("code fences not split mid-fence", () => {
    const heading = "## Code section {#code-section}"
    const codeFence = "```solidity\n" + "// line\n".repeat(3000) + "```"
    const prose = "Some text before.\n\n" + codeFence + "\n\nSome text after."
    const section = `${heading}\n\n${prose}`

    // An odd number of fence markers means a chunk boundary cut a fence open.
    chunkMarkdownProse(section).forEach((piece) => {
      const fenceMarkers = piece.match(/```/g) ?? []
      expect(fenceMarkers.length % 2).toBe(0)
    })
  })

  test("flat prose with no headings still chunks on paragraphs", () => {
    const flatProse = makeParagraphs(20, () =>
      "No heading here. ".repeat(300)
    ).join("\n\n")
    expect(byteSize(flatProse)).toBeGreaterThan(MAX_CHUNK_BYTES)

    const pieces = chunkMarkdownProse(flatProse)
    expect(pieces.length).toBeGreaterThan(1)
  })

  test("empty sections do not produce empty chunks", () => {
    const lines = [
      "## Empty one {#empty-one}",
      "",
      "## Empty two {#empty-two}",
      "",
      "## Has content {#has-content}",
      "",
      "x".repeat(10_000),
    ]

    chunkMarkdownProse(lines.join("\n")).forEach((piece) => {
      expect(piece.trim().length).toBeGreaterThan(0)
    })
  })

  test("single paragraph exceeding 64KB still produces a chunk (minimum guarantee)", () => {
    const heading = "## Giant para {#giant-para}"
    const giant = "word ".repeat(20_000)

    const pieces = chunkMarkdownProse(`${heading}\n\n${giant}`)
    expect(pieces.length).toBeGreaterThanOrEqual(1)
    expect(pieces[0]).toContain("{#giant-para}")
  })
})
"CONTEXT" as const, + }, + { id: "tr-1", content: "x".repeat(40_000), action: "TRANSLATE" as const }, + { id: "tr-2", content: "x".repeat(40_000), action: "TRANSLATE" as const }, + ] + const batches = batchSections(sections) + expect(batches.length).toBeGreaterThan(1) + for (const batch of batches) { + expect(batch.some((s) => s.action === "CONTEXT")).toBe(true) + } + }) + + test("all TRANSLATE sections present across batches", () => { + const translateIds = ["a", "b", "c", "d", "e"] + const sections = translateIds.map((id) => ({ + id, + content: "x".repeat(20_000), + action: "TRANSLATE" as const, + })) + const batches = batchSections(sections) + const allIds = batches.flatMap((b) => + b.filter((s) => s.action === "TRANSLATE").map((s) => s.id) + ) + expect(allIds.sort()).toEqual(translateIds.sort()) + }) + + test("single oversized TRANSLATE section gets its own batch", () => { + const sections = [ + { id: "small", content: "short", action: "TRANSLATE" as const }, + { + id: "huge", + content: "x".repeat(100_000), + action: "TRANSLATE" as const, + }, + { id: "small2", content: "also short", action: "TRANSLATE" as const }, + ] + const batches = batchSections(sections) + const hugeBatch = batches.find((b) => + b.some((s) => s.id === "huge" && s.action === "TRANSLATE") + ) + expect(hugeBatch).toBeDefined() + expect(hugeBatch!.filter((s) => s.action === "TRANSLATE")).toHaveLength(1) + }) + + test("all CONTEXT, no TRANSLATE produces 0 batches", () => { + const sections = [ + { id: "ctx-1", content: "context only", action: "CONTEXT" as const }, + { id: "ctx-2", content: "more context", action: "CONTEXT" as const }, + ] + const batches = batchSections(sections) + expect(batches).toHaveLength(0) + }) + + test("single small TRANSLATE section produces 1 batch", () => { + const sections = [ + { id: "only", content: "short text", action: "TRANSLATE" as const }, + ] + const batches = batchSections(sections) + expect(batches).toHaveLength(1) + }) + + test("large CONTEXT sections 
/**
 * Unit-level contract for the temporary working-branch name produced by
 * `generateTempBranchName()`. Only the name's shape is checked here.
 */
test.describe("Commit strategy", () => {
  test("temp branch name format: tmp-intl/run-MMDD-HHMM", () => {
    expect(generateTempBranchName()).toMatch(/^tmp-intl\/run-\d{4}-\d{4}$/)
  })

  test("temp branch name uses UTC", () => {
    // NOTE(review): despite the title, this only verifies that each field is a
    // plausible month/day/hour/minute; it does not compare against the
    // current UTC clock.
    const name = generateTempBranchName()
    const match = /run-(\d{2})(\d{2})-(\d{2})(\d{2})/.exec(name)
    expect(match).not.toBeNull()
    // Unreachable when the assertion above passes; narrows `match` for the
    // compiler so no non-null assertions are needed below.
    if (match === null) return

    // Always parse with an explicit radix so zero-padded fields are decimal.
    const [month, day, hour, minute] = match
      .slice(1)
      .map((digits) => Number.parseInt(digits, 10))

    expect(month).toBeGreaterThanOrEqual(1)
    expect(month).toBeLessThanOrEqual(12)
    expect(day).toBeGreaterThanOrEqual(1)
    expect(day).toBeLessThanOrEqual(31)
    expect(hour).toBeGreaterThanOrEqual(0)
    expect(hour).toBeLessThanOrEqual(23)
    expect(minute).toBeGreaterThanOrEqual(0)
    expect(minute).toBeLessThanOrEqual(59)
  })
})
and documented here as contracts. They require real GitHub API + * interactions that cannot be unit-tested without mock infrastructure. + * + * Validated by: test-manual-11a (full, ko+es) and test-manual-11b (incremental, ko+es) + * Run URLs: + * 11a: https://github.com/ethereum/ethereum-org-website/actions/runs/24324935910 + * 11b: https://github.com/ethereum/ethereum-org-website/actions/runs/24325006864 + * + * Verified behaviors: + * + * 1. Single and multi-language squash: + * - Each language produces one squashed commit: "i18n(es): Gemini translation", "i18n(ko): Gemini translation" + * - Individual per-file commits are collapsed into the squashed commit + * - Commit history: es commit -> ko commit -> sanitize -> merge (clean, no interleaving) + * + * 2. Temp branch + merge: + * - Pipeline creates tmp-intl/run-MMDD-HHMM as working branch + * - All commits land on temp branch during execution + * - On success: temp branch merged into target with "i18n: merge tmp-intl/run-... into ..." + * + * 3. Sanitizer runs after squash, before merge: + * - "i18n: sanitize translation output" commit appears after language commits, before merge + * + * 4. Zero-drift produces no commits: + * - Validated by test-manual-10c (https://github.com/ethereum/ethereum-org-website/actions/runs/24298937958) + * - Branch created but zero new commits when manifests are current + * + * 5. 
/**
 * Behavioural tests for the pool returned by `createTaskPool`: the
 * concurrency cap, completion of every submitted task, per-language token
 * accounting, and the per-language completion callback.
 */
test.describe("Concurrency pool", () => {
  /** Resolves after `ms` milliseconds; stands in for real task work. */
  const sleep = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms))

  test("with concurrency=2 and 4 tasks, at most 2 run simultaneously", async () => {
    let activeTasks = 0
    let maxActive = 0

    const pool = createTaskPool({ concurrency: 2, delayBetweenMs: 0 })

    for (let i = 0; i < 4; i++) {
      pool.submit("ko", async () => {
        activeTasks++
        // Record the high-water mark while this task is counted as running.
        maxActive = Math.max(maxActive, activeTasks)
        await sleep(50)
        activeTasks--
      })
    }

    await pool.drain()
    expect(maxActive).toBeLessThanOrEqual(2)
    expect(maxActive).toBeGreaterThan(0)
  })

  test("all tasks complete regardless of submission order", async () => {
    const completed: number[] = []
    const pool = createTaskPool({ concurrency: 3, delayBetweenMs: 0 })

    // Differing delays make tasks finish in a different order than submitted.
    const delays = [30, 10, 50, 20, 40, 5]
    delays.forEach((delay, i) => {
      pool.submit("ko", async () => {
        await sleep(delay)
        completed.push(i)
      })
    })

    await pool.drain()
    // Sort a copy numerically: the default comparator is lexicographic and
    // would misorder ids once there are ten or more tasks.
    const finishedIds = [...completed].sort((a, b) => a - b)
    expect(finishedIds).toEqual([0, 1, 2, 3, 4, 5])
  })

  test("token stats accumulate correctly across concurrent tasks", async () => {
    const pool = createTaskPool({ concurrency: 4, delayBetweenMs: 0 })

    pool.submit("ko", async () => ({ tokens: { input: 100, output: 50 } }))
    pool.submit("ko", async () => ({ tokens: { input: 200, output: 80 } }))
    pool.submit("es", async () => ({ tokens: { input: 150, output: 60 } }))

    await pool.drain()
    const stats = pool.getStats()

    // Totals are keyed by the language passed to submit().
    expect(stats["ko"].totalInputTokens).toBe(300)
    expect(stats["ko"].totalOutputTokens).toBe(130)
    expect(stats["es"].totalInputTokens).toBe(150)
    expect(stats["es"].totalOutputTokens).toBe(60)
  })

  test("per-language completion callback fires exactly once per language", async () => {
    const completions: string[] = []

    const pool = createTaskPool({
      concurrency: 4,
      delayBetweenMs: 0,
      onLanguageComplete: (lang) => completions.push(lang),
    })

    pool.submit("ko", async () => {
      await sleep(10)
    })
    pool.submit("ko", async () => {
      await sleep(20)
    })
    pool.submit("es", async () => {
      await sleep(5)
    })

    await pool.drain()

    expect(completions).toContain("ko")
    expect(completions).toContain("es")
    expect(completions.filter((l) => l === "ko")).toHaveLength(1)
    expect(completions.filter((l) => l === "es")).toHaveLength(1)
  })
})
Step\n\n ```sh\n pnpm install\n ```\n\n2. Next" + const { normalized } = normalizeContent(input) + // Placeholder should NOT have leading spaces + const lines = normalized.split("\n") + const phLine = lines.find((l) => l.includes("HTML-PLACEHOLDER-CODEBLOCK")) + expect(phLine).toBeDefined() + expect(phLine!.startsWith(" ")).toBe(false) + }) + + test("reconstruction restores original indentation", () => { + const input = "1. Step\n\n ```sh\n pnpm install\n ```\n\n2. Next" + const { normalized, extractions } = normalizeContent(input) + // Simulate reconstruction + let result = normalized + extractions.forEach((original, placeholder) => { + result = result.split(placeholder).join(original) + }) + expect(result).toContain(" ```sh") + expect(result).toContain(" ```") + }) + }) + + test.describe("duplicate content-addressed placeholders", () => { + test("same inline code produces same placeholder ID", () => { + const input = "Use `base fee` and then `base fee` again" + const { normalized } = normalizeContent(input) + const matches = normalized.match(/HTML-PLACEHOLDER-CODE-[a-f0-9]+/g) || [] + expect(matches.length).toBe(2) + expect(matches[0]).toBe(matches[1]) + }) + + test("same link produces same placeholder ID", () => { + const input = "See [docs](/docs/) and also [docs](/docs/)" + const { normalized } = normalizeContent(input) + const matches = normalized.match(/HTML-PLACEHOLDER-LINK-[a-f0-9]+/g) || [] + // 2 open + 2 close = 4 matches, all same hash + expect(matches.length).toBe(4) + expect(matches[0]).toBe(matches[1]) + }) + }) + + test.describe("component children normalization", () => { + test("recursively normalizes children into sub-nodes", () => { + const input = + '\n\nSee [link](/url/) for details.\n\n' + const { tree } = normalizeContent(input) + // Should have a component node + const component = tree.find((n) => n.type === "component") + expect(component).toBeDefined() + // Component children should include link sub-nodes + if (component && "children" in 
component && component.children) { + const linkNode = component.children.find((c) => c.type === "link") + expect(linkNode).toBeDefined() + } + }) + + test("component children text is visible in normalized output (wrapper style)", () => { + const input = + '\n\nThis is translatable.\n\n' + const { normalized } = normalizeContent(input) + expect(normalized).toContain("This is translatable.") + expect(normalized).toContain("HTML-PLACEHOLDER-COMPONENT") + }) + }) + + test.describe("prose code fences", () => { + test("md-tagged fences use wrapper placeholders", () => { + const input = "```md\nSome prose content\n```" + const { normalized } = normalizeContent(input) + // Should be wrapper, not self-closing + expect(normalized).toContain(" { + const input = "```markdown\n# Heading\n\nParagraph\n```" + const { normalized } = normalizeContent(input) + expect(normalized).toContain(" { + const input = "```solidity\ncontract Foo {}\n```" + const { normalized } = normalizeContent(input) + expect(normalized).toContain("HTML-PLACEHOLDER-CODEBLOCK-") + expect(normalized).toContain("/>") + expect(normalized).not.toContain("contract Foo") + }) + }) + + // Hash stability tests removed -- relied on deleted manifest-generator. + // Hash behavior now validated by intl-content-tree package (182 tests). 
+ + test.describe("placeholder pre-validation", () => { + test("rejects content containing reserved placeholder syntax", () => { + const input = "Text with in it" + expect(() => normalizeContent(input)).toThrow("reserved") + }) + + test("rejects placeholder syntax even inside backticks", () => { + // Pre-validation runs on raw content before any extraction + const input = "Use `` as example" + expect(() => normalizeContent(input)).toThrow("reserved") + }) + }) +}) diff --git a/tests/unit/intl-pipeline/incremental-pipeline.spec.ts b/tests/unit/intl-pipeline/incremental-pipeline.spec.ts new file mode 100644 index 00000000000..d2f7350e7ad --- /dev/null +++ b/tests/unit/intl-pipeline/incremental-pipeline.spec.ts @@ -0,0 +1,637 @@ +/** + * Incremental Translation Pipeline Tests + * + * Contract: pipeline(englishA, englishB, localeA, format, llmMock?) -> localeB + * + * Phase 1-2 tests verify the intl-content-tree package produces correct + * detection and classification. These pass now. + * + * Phase 3-5 tests verify the pipeline implementation produces correct output. + * These call pipeline() which must be wired to the real implementation. + * + * LLM is mocked: returns corresponding sections from locale-B fixtures. 
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------

/** Root directory of the incremental-pipeline fixtures, relative to this spec. */
const FIXTURES = join(__dirname, "../../fixtures/incremental")

/** Reads a fixture file (path relative to FIXTURES) as UTF-8 text. */
const read = (p: string) => readFileSync(join(FIXTURES, p), "utf-8")

// English source before (A) and after (B) the change, in both formats.
const EN_A_MD = read("english/fixture-a.md")
const EN_B_MD = read("english/fixture-b.md")
const EN_A_JSON = read("english/fixture-a.json")
const EN_B_JSON = read("english/fixture-b.json")

/** Locale fixture passed to pipeline() as the existing translation. */
const locA = (lang: string, ext: string) =>
  read(`locale-a/${lang}/fixture.${ext}`)

/** Locale fixture the mocked LLM callbacks read their "translations" from. */
const locB = (lang: string, ext: string) =>
  read(`locale-b/${lang}/fixture.${ext}`)

/** Locale fixture the end-to-end tests compare pipeline output against. */
const locExpected = (lang: string, ext: string) =>
  read(`locale-expected/${lang}/fixture.${ext}`)

// Parser configuration handed to parseMarkdown/parseJson. The type argument
// is required: a bare `Partial` does not type-check.
const CONFIG: Partial<ContentTreeConfig> = {
  depth: "element",
  translatableAttributes: [
    "title",
    "description",
    "alt",
    "label",
    "aria-label",
    "placeholder",
    "buttonLabel",
    "name",
    "caption",
    "contentPreview",
    "location",
  ],
}

/** Locales that have fixture files; the pipeline tests loop over these. */
const LANGS = ["es", "ko", "ur"] as const
treeB) + const dr = diff(treeA, treeB) + + // --- DiffResult section-level --- + + test("28 unchanged entries", () => { + expect(dr.unchanged).toHaveLength(28) + }) + + test("7 inertDrift entries", () => { + expect(dr.inertDrift).toHaveLength(7) + const ids = dr.inertDrift.map((e) => e.id) + expect(ids).toContain("frontmatter:image") + expect(ids).toContain("code-review") + expect(ids).toContain("sbom-generation") + }) + + test("3 structuralDrift entries", () => { + expect(dr.structuralDrift).toHaveLength(3) + const ids = dr.structuralDrift.map((e) => e.id) + expect(ids).toContain("community-collaboration") + expect(ids).toContain("compliance-and-auditing") + expect(ids).toContain("license-scanning") + }) + + test("7 translatableDrift entries (includes parent sections)", () => { + expect(dr.translatableDrift).toHaveLength(7) + expect( + dr.translatableDrift.some((e) => e.id === "frontmatter:description") + ).toBe(true) + expect(dr.translatableDrift.some((e) => e.id === "copyleft-licenses")).toBe( + true + ) + expect( + dr.translatableDrift.some((e) => e.id === "permissive-licenses") + ).toBe(true) + expect(dr.translatableDrift.some((e) => e.id === "the-four-freedoms")).toBe( + true + ) + }) + + test("2 added entries", () => { + expect(dr.added).toHaveLength(2) + const ids = dr.added.map((e) => e.id) + expect(ids).toContain("dual-licensing") + expect(ids).toContain("how-to-contribute") + }) + + test("2 removed entries", () => { + expect(dr.removed).toHaveLength(2) + const ids = dr.removed.map((e) => e.id) + expect(ids).toContain("contributing-to-projects") + expect(ids).toContain("component:2") // Divider + }) + + test("comparison-table, key-terms, further-reading are unchanged", () => { + const ids = dr.unchanged.map((e) => e.id) + expect(ids).toContain("comparison-table") + expect(ids).toContain("key-terms") + expect(ids).toContain("further-reading") + }) + + // --- ChangeSet node-level --- + + test("30 node-level changes", () => { + 
expect(cs.changes).toHaveLength(30) + }) + + test("1 section rename: contributing-to-projects -> how-to-contribute", () => { + expect(cs.sectionRenames).toHaveLength(1) + expect(cs.sectionRenames[0].oldId).toBe("contributing-to-projects") + expect(cs.sectionRenames[0].newId).toBe("how-to-contribute") + }) + + test("1 relocation detected", () => { + expect(cs.relocations).toHaveLength(1) + }) + + test("specific inert changes detected", () => { + // M4: OSI href + expect( + cs.changes.some( + (c) => + c.oldValue === "https://opensource.org/osd" && + c.newValue === "https://opensource.org/osd/annotated" + ) + ).toBe(true) + + // M5: inline code LICENSE -> LICENSE.md + expect( + cs.changes.some( + (c) => + c.elementType === "inline-code" && + c.oldValue === "LICENSE" && + c.newValue === "LICENSE.md" + ) + ).toBe(true) + + // M6: choosealicense href + expect( + cs.changes.some( + (c) => + c.oldValue === "https://choosealicense.com/" && + c.newValue === "https://choosealicense.com/?lang=en" + ) + ).toBe(true) + }) + + test("specific translatable changes detected", () => { + // M10: solidity code comment + expect( + cs.changes.some( + (c) => + c.contentType === "translatable" && + c.elementType === "code-comment" && + c.oldValue?.includes("demonstrates a simple registry") + ) + ).toBe(true) + + // M9: InfoBanner title attr + expect( + cs.changes.some( + (c) => + c.key === "title" && + c.oldValue === "Important distinction" && + c.newValue === "Key concept" + ) + ).toBe(true) + }) +}) + +test.describe("Phase 1: JSON change detection", () => { + const treeA = parseJson(EN_A_JSON, CONFIG) + const treeB = parseJson(EN_B_JSON, CONFIG) + const cs = extractChanges(treeA, treeB) + const dr = diff(treeA, treeB) + + test("5 unchanged keys", () => { + expect(dr.unchanged).toHaveLength(5) + const ids = dr.unchanged.map((e) => e.id) + expect(ids).toContain("page-title") + expect(ids).toContain("hero-cta-primary") + expect(ids).toContain("filter-label") + }) + + test("5 inertDrift 
/**
 * Checks how the markdown diff between the two English fixtures classifies
 * entries: hash flags per drift bucket, bucket exclusivity, and which
 * translatable entries are the deepest ones on their path.
 */
test.describe("Phase 2: Routing", () => {
  const dr = diff(
    parseMarkdown(EN_A_MD, CONFIG),
    parseMarkdown(EN_B_MD, CONFIG)
  )

  test("inertDrift entries have anchorHashChanged but not contentHashChanged", () => {
    dr.inertDrift.forEach((entry) => {
      expect(entry.contentHashChanged).toBe(false)
      expect(entry.anchorHashChanged).toBe(true)
    })
  })

  test("translatableDrift entries have contentHashChanged", () => {
    dr.translatableDrift.forEach((entry) => {
      expect(entry.contentHashChanged).toBe(true)
    })
  })

  test("no section in both inertDrift and translatableDrift", () => {
    const inertIds = new Set(dr.inertDrift.map((entry) => entry.id))
    dr.translatableDrift.forEach((entry) => {
      expect(inertIds.has(entry.id)).toBe(false)
    })
  })

  test("leaf-level deduplication: deepest translatableDrift entries", () => {
    const paths = dr.translatableDrift.map((entry) => entry.path)
    // A path is a leaf when no *other* path lives underneath it.
    const leaves = paths.filter((candidate) =>
      paths.every(
        (other) => other === candidate || !other.startsWith(candidate + "/")
      )
    )

    // Leaves should include the deepest sections, not their parents
    const expectedLeaves = [
      "understanding-open-source-licensing/what-is-open-source/the-four-freedoms",
      "understanding-open-source-licensing/choosing-a-license/copyleft-licenses",
      "understanding-open-source-licensing/choosing-a-license/permissive-licenses",
    ]
    for (const leaf of expectedLeaves) {
      expect(leaves).toContain(leaf)
    }

    // Parents should NOT be leaves
    const expectedParents = [
      "understanding-open-source-licensing",
      "understanding-open-source-licensing/choosing-a-license",
    ]
    for (const parent of expectedParents) {
      expect(leaves).not.toContain(parent)
    }
  })
})
expect(result).not.toContain("hero-licensing.png") + }) + + test("M4: OSI link href updated", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("opensource.org/osd/annotated") + }) + + test("M5: inline code LICENSE -> LICENSE.md", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("`LICENSE.md`") + }) + + test("M6: HTML href choosealicense ?lang=en", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("choosealicense.com/?lang=en") + }) + + test("M7: image path updated to v2", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("license-comparison-v2.png") + }) + + test("M12: ExpandableCard eventCategory updated", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("copyleft-guide") + }) + + test("M13: GPL FAQ href #AllCompatibility", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("gpl-faq.html#AllCompatibility") + }) + + test("M18: DocLink href /getting-started/", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("/contributing/getting-started/") + }) + + test("M19: AlertEmoji text :eyes:", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain(":eyes:") + expect(result).not.toContain(":mag:") + }) + + test("M22: YouTube id spec-fixture-002", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("spec-fixture-002") + expect(result).not.toContain("spec-fixture-001") + }) + + test("M25: ButtonLink href /quick-start/", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("/contributing/quick-start/") + 
}) + + test("M26: Emoji text :star: in heading", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain(":star:") + expect(result).not.toContain(":bulb:") + }) + + test("M27: Card href /license-audit/", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("/tools/license-audit/") + }) + + test("M28: QuizWidget quizKey oss-licensing-v2", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("oss-licensing-v2") + }) + + // --- Structural changes (deterministic, no LLM) --- + + test("M3: removed", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).not.toContain("") + }) + + test("M20: className='featured' added to DocLink", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain('className="featured"') + }) + + test("M21: JSON code fence added", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain('"license-scan"') + }) + + // --- Rename --- + + test("M17: heading ID renamed to how-to-contribute", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + expect(result).toContain("{#how-to-contribute}") + expect(result).not.toContain("{#contributing-to-projects}") + }) + + // --- Reorder --- + + test("M24: compliance-and-auditing before community-collaboration", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + const compIdx = result.indexOf("{#compliance-and-auditing}") + const commIdx = result.indexOf("{#community-collaboration}") + expect(compIdx).toBeGreaterThan(-1) + expect(commIdx).toBeGreaterThan(-1) + expect(compIdx).toBeLessThan(commIdx) + }) + + // --- Added section --- + + test("M23: #dual-licensing section present", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, 
"md"), "markdown") + expect(result).toContain("{#dual-licensing}") + }) + + // --- Do no harm --- + + test("unchanged: comparison-table section preserved from locale-A", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + const loc = locA(lang, "md") + // Extract the comparison-table section from both + const extract = (s: string) => { + const start = s.indexOf("{#comparison-table}") + if (start === -1) return "" + const lineStart = s.lastIndexOf("\n", start) + 1 + const nextH = s.indexOf("\n#", start + 1) + return s.slice(lineStart, nextH > -1 ? nextH : undefined).trim() + } + expect(extract(result)).toBe(extract(loc)) + }) + + test("unchanged: further-reading section preserved from locale-A", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + const loc = locA(lang, "md") + const extract = (s: string) => { + const start = s.indexOf("{#further-reading}") + if (start === -1) return "" + const lineStart = s.lastIndexOf("\n", start) + 1 + return s.slice(lineStart).trim() + } + expect(extract(result)).toBe(extract(loc)) + }) + }) + + test.describe(`Pipeline output [${lang}] JSON`, () => { + test("J3: banner-text href /getting-started/", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + expect(result["banner-text"]).toContain("/contributing/getting-started/") + expect(result["banner-text"]).not.toContain('"/contributing/"') + }) + + test("J4: footer-note href ?sortBy=name", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + expect(result["footer-note"]).toContain("?sortBy=name") + }) + + test("J5: ICU {username} -> {displayName}", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + expect(result["welcome-user"]).toContain("{displayName}") + expect(result["welcome-user"]).not.toContain("{username}") + }) + + test("J6: ICU {count} -> {total}", () => { 
+ const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + expect(result["project-count"]).toContain("{total,") + expect(result["project-count"]).not.toContain("{count,") + }) + + test("J8: multi-link third href spdx.org/licenses/", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + expect(result["multi-link"]).toContain("spdx.org/licenses/") + }) + + test("J9: new-key added", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + expect(result["new-key"]).toBeDefined() + }) + + test("J10: empty-results removed", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + expect(result["empty-results"]).toBeUndefined() + }) + + test("unchanged: page-title preserved from locale-A", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + const orig = JSON.parse(locA(lang, "json")) + expect(result["page-title"]).toBe(orig["page-title"]) + }) + + test("unchanged: hero-cta-primary preserved from locale-A", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + const orig = JSON.parse(locA(lang, "json")) + expect(result["hero-cta-primary"]).toBe(orig["hero-cta-primary"]) + }) + + test("unchanged: contribution-status preserved from locale-A", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + const orig = JSON.parse(locA(lang, "json")) + expect(result["contribution-status"]).toBe(orig["contribution-status"]) + }) + + test("unchanged: filter-label preserved from locale-A", () => { + const result = JSON.parse( + pipeline(EN_A_JSON, EN_B_JSON, locA(lang, "json"), "json") + ) + const orig = JSON.parse(locA(lang, "json")) + expect(result["filter-label"]).toBe(orig["filter-label"]) + }) + }) +} + +// 
=================================================================== +// SOV: Inline element ordering -- fails until implemented +// =================================================================== + +for (const lang of ["ko", "ur"] as const) { + test.describe(`SOV [${lang}]`, () => { + test("multi-link: correct href updated regardless of element order", () => { + const result = pipeline(EN_A_MD, EN_B_MD, locA(lang, "md"), "markdown") + // Sepolia href replaced with Holesky + expect(result).toContain("holesky.dev") + expect(result).not.toContain("sepolia.dev") + // Other three links untouched + expect(result).toContain("remix.ethereum.org") + expect(result).toContain("eth.blockscout.com") + expect(result).toContain("/glossary/#smart-contract") + }) + }) +} + +// =================================================================== +// E2E: full pipeline output matches expected fixtures +// =================================================================== + +for (const lang of LANGS) { + test(`E2E [${lang}] markdown: output matches expected`, () => { + const expected = locExpected(lang, "md") + const result = pipeline( + EN_A_MD, + EN_B_MD, + locA(lang, "md"), + "markdown", + (sectionId, _englishContent) => { + // Mock LLM: extract section from locale-B by heading ID + const lb = locB(lang, "md") + const pattern = new RegExp( + `(^#{1,6}\\s+[^\\n]*\\{#${sectionId}\\}[^\\n]*$)`, + "m" + ) + const match = lb.match(pattern) + if (!match) return _englishContent + const start = lb.indexOf(match[0]) + const nextH = lb.indexOf("\n#", start + 1) + return lb.slice(start, nextH > -1 ? nextH : undefined).trim() + } + ) + expect(result).toBe(expected) + }) + + test(`E2E [${lang}] JSON: output matches expected`, () => { + const expected = locExpected(lang, "json") + const result = pipeline( + EN_A_JSON, + EN_B_JSON, + locA(lang, "json"), + "json", + (key, _englishContent) => { + const lb = JSON.parse(locB(lang, "json")) + return lb[key] ?? 
_englishContent + } + ) + expect(result).toBe(expected) + }) +} diff --git a/tests/unit/intl-pipeline/incremental-translate.spec.ts b/tests/unit/intl-pipeline/incremental-translate.spec.ts new file mode 100644 index 00000000000..fbe653d33a8 --- /dev/null +++ b/tests/unit/intl-pipeline/incremental-translate.spec.ts @@ -0,0 +1,486 @@ +/** + * Tests for incremental section-level translation. + * Tests response parsing and section replacement logic. + * Does NOT test Gemini API calls. + */ + +import { expect, test } from "@playwright/test" + +import { + buildSectionList, + extractJsonSections, + extractSections, + parseIncrementalResponse, + removeMarkdownSection, + replaceJsonValues, + replaceSections, +} from "@/scripts/intl-pipeline/lib/llm/incremental-translate" + +test.describe("parseIncrementalResponse", () => { + test("parses valid JSON response", () => { + const response = JSON.stringify({ + "my-section": "Translated content here.", + "other-section": "More translated content.", + }) + const result = parseIncrementalResponse(response) + expect(result["my-section"]).toBe("Translated content here.") + expect(result["other-section"]).toBe("More translated content.") + }) + + test("strips markdown json code block wrapping", () => { + const response = '```json\n{"my-section": "Translated."}\n```' + const result = parseIncrementalResponse(response) + expect(result["my-section"]).toBe("Translated.") + }) + + test("strips plain code block wrapping", () => { + const response = '```\n{"my-section": "Translated."}\n```' + const result = parseIncrementalResponse(response) + expect(result["my-section"]).toBe("Translated.") + }) + + test("throws on non-object response", () => { + expect(() => parseIncrementalResponse('"just a string"')).toThrow( + "Expected a JSON object" + ) + }) + + test("throws on array response", () => { + expect(() => parseIncrementalResponse("[]")).toThrow( + "Expected a JSON object" + ) + }) + + test("throws on non-string values", () => { + expect(() 
=> parseIncrementalResponse('{"section": 42}')).toThrow( + 'Section "section" value is not a string' + ) + }) + + test("throws on invalid JSON", () => { + expect(() => parseIncrementalResponse("not json at all")).toThrow( + "Failed to parse" + ) + }) + + test("handles response with whitespace", () => { + const response = ' \n{"my-section": "Content."}\n ' + const result = parseIncrementalResponse(response) + expect(result["my-section"]).toBe("Content.") + }) +}) + +test.describe("replaceSections", () => { + const locale = [ + "---", + "title: Prueba", + "lang: es", + "---", + "", + "## Primera seccion {#first}", + "", + "Contenido de la primera seccion.", + "", + "## Segunda seccion {#second}", + "", + "Contenido de la segunda seccion.", + "", + "## Tercera seccion {#third}", + "", + "Contenido de la tercera seccion.", + ].join("\n") + + test("replaces a single section by ID", () => { + const result = replaceSections(locale, { + second: "Nuevo contenido de la segunda seccion.", + }) + expect(result).toContain("Contenido de la primera seccion.") + expect(result).toContain("Nuevo contenido de la segunda seccion.") + expect(result).toContain("Contenido de la tercera seccion.") + expect(result).toContain("{#second}") + }) + + test("replaces multiple sections", () => { + const result = replaceSections(locale, { + first: "Nuevo primero.", + third: "Nuevo tercero.", + }) + expect(result).toContain("Nuevo primero.") + expect(result).toContain("Contenido de la segunda seccion.") + expect(result).toContain("Nuevo tercero.") + }) + + test("preserves heading line with {#id}", () => { + const result = replaceSections(locale, { first: "Replacement content." 
}) + expect(result).toContain("## Primera seccion {#first}") + expect(result).toContain("Replacement content.") + expect(result).not.toContain("Contenido de la primera seccion.") + }) + + test("returns unchanged content when no translations provided", () => { + const result = replaceSections(locale, {}) + expect(result).toBe(locale) + }) + + test("handles section with no matching ID gracefully", () => { + const result = replaceSections(locale, { + nonexistent: "This should not appear anywhere problematic.", + }) + // The nonexistent ID is simply ignored + expect(result).toContain("Contenido de la primera seccion.") + expect(result).toContain("Contenido de la segunda seccion.") + expect(result).toContain("Contenido de la tercera seccion.") + }) + + test("handles nested headings (h3 under h2)", () => { + const nestedLocale = [ + "## Parent {#parent}", + "", + "Parent content.", + "", + "### Child {#child}", + "", + "Child content.", + "", + "## Next {#next}", + "", + "Next content.", + ].join("\n") + + const result = replaceSections(nestedLocale, { + child: "New child content.", + }) + expect(result).toContain("Parent content.") + expect(result).toContain("New child content.") + expect(result).toContain("Next content.") + expect(result).toContain("### Child {#child}") + }) + + test("does not duplicate content when parent and child are both replaced", () => { + const nestedLocale = [ + "## Parent {#parent}", + "", + "Parent content.", + "", + "### Child {#child}", + "", + "Child content.", + "", + "## Next {#next}", + "", + "Next content.", + ].join("\n") + + const result = replaceSections(nestedLocale, { + parent: "New parent content.", + child: "New child content.", + }) + // Each should appear exactly once (regression: previously duplicated) + const parentMatches = result.match(/New parent content\./g) + const childMatches = result.match(/New child content\./g) + expect(parentMatches).toHaveLength(1) + expect(childMatches).toHaveLength(1) + // Structure preserved + 
expect(result).toContain("## Parent {#parent}") + expect(result).toContain("### Child {#child}") + expect(result).toContain("Next content.") + }) + + test("preserves frontmatter", () => { + const result = replaceSections(locale, { first: "Replaced." }) + expect(result).toContain("title: Prueba") + expect(result).toContain("lang: es") + }) + + test("handles last section (no following heading)", () => { + const result = replaceSections(locale, { third: "Final replacement." }) + expect(result).toContain("Final replacement.") + expect(result).not.toContain("Contenido de la tercera seccion.") + }) +}) + +test.describe("extractSections", () => { + test("extracts direct body only, not nested subsections", () => { + const md = [ + "## Parent {#parent}", + "", + "Parent body.", + "", + "### Child {#child}", + "", + "Child body.", + "", + "## Sibling {#sibling}", + "", + "Sibling body.", + ].join("\n") + + const sections = extractSections(md) + const parent = sections.find((s) => s.id === "parent") + const child = sections.find((s) => s.id === "child") + + expect(parent).toBeDefined() + expect(parent!.body).toContain("Parent body.") + expect(parent!.body).not.toContain("Child body.") + + expect(child).toBeDefined() + expect(child!.body).toContain("Child body.") + }) +}) + +// =================================================================== +// buildSectionList -- P1-10 +// =================================================================== + +test.describe("buildSectionList", () => { + const englishSections = [ + { + id: "intro", + level: 2, + headingText: "Introduction", + body: "English intro.", + }, + { + id: "details", + level: 2, + headingText: "Details", + body: "English details.", + }, + { + id: "conclusion", + level: 2, + headingText: "Conclusion", + body: "English conclusion.", + }, + ] + + const localeSections = [ + { + id: "intro", + level: 2, + headingText: "Introduccion", + body: "Intro en espanol.", + }, + { + id: "details", + level: 2, + headingText: 
"Detalles", + body: "Detalles en espanol.", + }, + { + id: "conclusion", + level: 2, + headingText: "Conclusion", + body: "Conclusion en espanol.", + }, + ] + + test("marks specified IDs as TRANSLATE, others as CONTEXT", () => { + const result = buildSectionList(englishSections, localeSections, [ + "details", + ]) + expect(result).toHaveLength(3) + expect(result.find((s) => s.id === "intro")!.action).toBe("CONTEXT") + expect(result.find((s) => s.id === "details")!.action).toBe("TRANSLATE") + expect(result.find((s) => s.id === "conclusion")!.action).toBe("CONTEXT") + }) + + test("TRANSLATE sections use English content", () => { + const result = buildSectionList(englishSections, localeSections, [ + "details", + ]) + expect(result.find((s) => s.id === "details")!.content).toBe( + "English details." + ) + }) + + test("CONTEXT sections use locale content", () => { + const result = buildSectionList(englishSections, localeSections, [ + "details", + ]) + expect(result.find((s) => s.id === "intro")!.content).toBe( + "Intro en espanol." + ) + }) + + test("sections without locale match are omitted from CONTEXT", () => { + const result = buildSectionList(englishSections, [], ["details"]) + // intro and conclusion have no locale match, so only TRANSLATE section appears + expect(result).toHaveLength(1) + expect(result[0].action).toBe("TRANSLATE") + }) + + test("preserves document order", () => { + const result = buildSectionList(englishSections, localeSections, [ + "conclusion", + "intro", + ]) + expect(result.map((s) => s.id)).toEqual(["intro", "details", "conclusion"]) + }) +}) + +// =================================================================== +// removeMarkdownSection -- P1-10 +// =================================================================== + +test.describe("removeMarkdownSection", () => { + test("removes section by heading ID", () => { + const content = `## Keep me {#keep} + +Keep content. + +## Remove me {#remove} + +Remove content. 
+ +## Also keep {#also-keep} + +Also keep content.` + + const result = removeMarkdownSection(content, "remove") + expect(result).toContain("{#keep}") + expect(result).toContain("Keep content.") + expect(result).not.toContain("{#remove}") + expect(result).not.toContain("Remove content.") + expect(result).toContain("{#also-keep}") + }) + + test("removes section and its subsections", () => { + const content = `## Parent {#parent} + +Parent content. + +### Child {#child} + +Child content. + +## Next {#next} + +Next content.` + + const result = removeMarkdownSection(content, "parent") + expect(result).not.toContain("Parent content.") + expect(result).not.toContain("Child content.") + expect(result).toContain("Next content.") + }) + + test("returns content unchanged if ID not found", () => { + const content = "## Section {#exists}\n\nContent." + expect(removeMarkdownSection(content, "nonexistent")).toBe(content) + }) + + test("removes section at end of file", () => { + const content = `## First {#first} + +First content. 
+ +## Last {#last} + +Last content.` + + const result = removeMarkdownSection(content, "last") + expect(result).toContain("First content.") + expect(result).not.toContain("Last content.") + }) +}) + +// =================================================================== +// extractJsonSections -- P1-10 +// =================================================================== + +test.describe("extractJsonSections", () => { + test("extracts top-level string keys", () => { + const json = JSON.stringify({ title: "Hello", desc: "World" }, null, 2) + const sections = extractJsonSections(json) + expect(sections).toHaveLength(2) + expect(sections[0].id).toBe("title") + expect(sections[0].body).toBe("Hello") + expect(sections[1].id).toBe("desc") + expect(sections[1].body).toBe("World") + }) + + test("flattens nested objects with / separator", () => { + const json = JSON.stringify( + { + nested: { title: "Nested Title", desc: "Nested Desc" }, + }, + null, + 2 + ) + const sections = extractJsonSections(json) + expect(sections).toHaveLength(2) + expect(sections[0].id).toBe("nested/title") + expect(sections[1].id).toBe("nested/desc") + }) + + test("skips non-string values (numbers, booleans, arrays)", () => { + const json = JSON.stringify( + { + text: "translatable", + count: 42, + active: true, + items: ["a", "b"], + }, + null, + 2 + ) + const sections = extractJsonSections(json) + expect(sections).toHaveLength(1) + expect(sections[0].id).toBe("text") + }) + + test("handles empty object", () => { + const sections = extractJsonSections("{}") + expect(sections).toHaveLength(0) + }) +}) + +// =================================================================== +// replaceJsonValues -- P1-10 +// =================================================================== + +test.describe("replaceJsonValues", () => { + test("replaces top-level key value", () => { + const json = JSON.stringify({ title: "Hello", desc: "World" }, null, 2) + const result = replaceJsonValues(json, { title: "Hola" 
}) + const parsed = JSON.parse(result) + expect(parsed.title).toBe("Hola") + expect(parsed.desc).toBe("World") + }) + + test("replaces nested key with / path", () => { + const json = JSON.stringify( + { + nested: { title: "Hello", desc: "World" }, + }, + null, + 2 + ) + const result = replaceJsonValues(json, { "nested/title": "Hola" }) + const parsed = JSON.parse(result) + expect(parsed.nested.title).toBe("Hola") + expect(parsed.nested.desc).toBe("World") + }) + + test("replaces multiple keys at once", () => { + const json = JSON.stringify({ a: "1", b: "2", c: "3" }, null, 2) + const result = replaceJsonValues(json, { a: "uno", c: "tres" }) + const parsed = JSON.parse(result) + expect(parsed.a).toBe("uno") + expect(parsed.b).toBe("2") + expect(parsed.c).toBe("tres") + }) + + test("skips invalid key path gracefully", () => { + const json = JSON.stringify({ title: "Hello" }, null, 2) + const result = replaceJsonValues(json, { "nonexistent/deep/path": "Hola" }) + const parsed = JSON.parse(result) + expect(parsed.title).toBe("Hello") + }) + + test("output ends with newline", () => { + const json = JSON.stringify({ title: "Hello" }, null, 2) + const result = replaceJsonValues(json, { title: "Hola" }) + expect(result.endsWith("\n")).toBe(true) + }) +}) diff --git a/tests/unit/intl-pipeline/output-validation.spec.ts b/tests/unit/intl-pipeline/output-validation.spec.ts new file mode 100644 index 00000000000..f440c06ba87 --- /dev/null +++ b/tests/unit/intl-pipeline/output-validation.spec.ts @@ -0,0 +1,190 @@ +/** + * Output Validation Tests -- P1-9 + * + * Tests for pre-commit validation of LLM translation output. 
+ */ + +import { expect, test } from "@playwright/test" + +import { + validateTranslatedJson, + validateTranslatedMarkdown, +} from "../../../src/scripts/intl-pipeline/lib/llm/output-validation" + +// =================================================================== +// JSON Validation +// =================================================================== + +test.describe("validateTranslatedJson", () => { + const english = JSON.stringify({ title: "Hello", desc: "World" }, null, 2) + + test("valid translation passes", () => { + const translated = JSON.stringify({ title: "Hola", desc: "Mundo" }, null, 2) + expect(validateTranslatedJson(translated, english)).toEqual({ valid: true }) + }) + + test("empty output fails", () => { + const result = validateTranslatedJson("", english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Empty") + }) + + test("invalid JSON fails", () => { + const result = validateTranslatedJson("{broken", english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Invalid JSON") + }) + + test("missing key fails", () => { + // Same count but different keys + const translated = JSON.stringify({ title: "Hola", wrong: "Mal" }, null, 2) + const result = validateTranslatedJson(translated, english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Missing keys") + }) + + test("key count mismatch fails", () => { + const translated = JSON.stringify( + { title: "Hola", desc: "Mundo", extra: "Oops" }, + null, + 2 + ) + const result = validateTranslatedJson(translated, english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Key count mismatch") + }) + + test("Gemini refusal detected", () => { + const result = validateTranslatedJson( + "I cannot translate this content", + english + ) + expect(result.valid).toBe(false) + expect(result.error).toContain("refusal") + }) +}) + +// =================================================================== +// Markdown Validation +// 
=================================================================== + +test.describe("validateTranslatedMarkdown", () => { + const english = `--- +title: Test Page +description: A test description +--- + +# Heading {#heading} + +Some content here. +` + + test("valid translation passes", () => { + const translated = `--- +title: Pagina de prueba +description: Una descripcion de prueba +--- + +# Encabezado {#heading} + +Contenido aqui. +` + expect(validateTranslatedMarkdown(translated, english)).toEqual({ + valid: true, + }) + }) + + test("empty output fails", () => { + const result = validateTranslatedMarkdown("", english) + expect(result.valid).toBe(false) + }) + + test("missing frontmatter fails when English has it", () => { + const result = validateTranslatedMarkdown("# No frontmatter", english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Missing frontmatter") + }) + + test("unclosed frontmatter fails", () => { + const result = validateTranslatedMarkdown("---\ntitle: Broken\n", english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Unclosed frontmatter") + }) + + test("suspiciously short output fails", () => { + const result = validateTranslatedMarkdown("---\ntitle: X\n---\nY", english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Suspiciously short") + }) + + test("untranslated frontmatter (both title and desc) fails", () => { + const translated = `--- +title: Test Page +description: A test description +--- + +# Contenido traducido +` + const result = validateTranslatedMarkdown(translated, english) + expect(result.valid).toBe(false) + expect(result.error).toContain("not translated") + }) + + test("untranslated title only is OK (technical titles)", () => { + const translated = `--- +title: Test Page +description: Una descripcion traducida que es lo suficientemente larga +--- + +# Contenido traducido aqui con suficiente texto para no ser sospechosamente corto y pasar la validacion +` + const 
result = validateTranslatedMarkdown(translated, english) + expect(result.valid).toBe(true) + }) + + test("missing code block placeholder fails", () => { + const englishWithCode = + "Some text\n\nMore text\n" + const translated = "Texto\n\nMas texto" + const result = validateTranslatedMarkdown(translated, englishWithCode) + expect(result.valid).toBe(false) + expect(result.error).toContain("CODE_BLOCK_1") + }) + + test("hallucinated code fences fail when placeholders expected", () => { + const englishWithCode = "Text\n\nMore" + const translated = + "Texto\n\n```solidity\nfake code\n```" + const result = validateTranslatedMarkdown(translated, englishWithCode) + expect(result.valid).toBe(false) + expect(result.error).toContain("hallucinating") + }) + + test("Gemini refusal at start detected", () => { + const result = validateTranslatedMarkdown( + "I'm sorry, I cannot translate this.", + english + ) + expect(result.valid).toBe(false) + expect(result.error).toContain("refusal") + }) + + test("mid-content refusal detected", () => { + const translated = `--- +title: Pagina +description: Desc traducida con suficiente longitud para pasar +--- + +# Seccion uno + +Contenido normal aqui. + +I cannot translate this section due to policy restrictions. + +Mas contenido. 
+` + const result = validateTranslatedMarkdown(translated, english) + expect(result.valid).toBe(false) + expect(result.error).toContain("Mid-content refusal") + }) +}) diff --git a/tests/unit/sanitizer/code-block-extractor.spec.ts b/tests/unit/intl-pipeline/sanitizer/code-block-extractor.spec.ts similarity index 87% rename from tests/unit/sanitizer/code-block-extractor.spec.ts rename to tests/unit/intl-pipeline/sanitizer/code-block-extractor.spec.ts index d622d966fde..f59a25e602b 100644 --- a/tests/unit/sanitizer/code-block-extractor.spec.ts +++ b/tests/unit/intl-pipeline/sanitizer/code-block-extractor.spec.ts @@ -12,7 +12,7 @@ import { getCommentSyntax, restoreCodeBlocks, restoreComments, -} from "@/scripts/i18n/lib/ai/code-block-extractor" +} from "@/scripts/intl-pipeline/lib/llm/code-block-extractor" // --------------------------------------------------------------------------- // extractCodeBlocks + restoreCodeBlocks @@ -403,6 +403,13 @@ test.describe("getCommentSyntax", () => { expect(getCommentSyntax("Solidity")).toBe("js") expect(getCommentSyntax("PYTHON")).toBe("python") }) + + test("strips metadata after language name (e.g., 'sh copy')", () => { + expect(getCommentSyntax("sh copy")).toBe("shell") + expect(getCommentSyntax("bash copy")).toBe("shell") + expect(getCommentSyntax("solidity showLineNumbers")).toBe("js") + expect(getCommentSyntax("python {1,3-5}")).toBe("python") + }) }) // --------------------------------------------------------------------------- @@ -441,6 +448,60 @@ y = 2` const result = restoreComments(code, [], "js") expect(result).toBe(code) }) + + test("restores multi-line JS comment into stripped code without duplication", () => { + const original = `/** + * @dev Returns the amount of tokens in existence. 
+ */ +function totalSupply() external view returns (uint256);` + + // Extract comments (produces stripped code with empties) + const { strippedCode, comments } = extractComments(original, "solidity") + + // Simulate translation + const translated = comments.map((c) => ({ + ...c, + text: "@dev Mengembalikan jumlah token yang ada.", + })) + + // Restore into STRIPPED code (not original) + const result = restoreComments(strippedCode, translated, "js") + + // Should have the Indonesian comment + expect(result).toContain("Mengembalikan jumlah token yang ada") + // Should NOT have the English comment + expect(result).not.toContain("Returns the amount of tokens") + // Should still have the function + expect(result).toContain("function totalSupply()") + }) + + test("extract-translate-restore round trip produces clean NatSpec", () => { + const original = ` /** + * @dev Moves tokens from caller to recipient. + * + * Returns a boolean value. + */ + function transfer(address to, uint256 amount) external returns (bool);` + + const { strippedCode, comments } = extractComments(original, "solidity") + + const translated = comments.map((c) => ({ + ...c, + text: "@dev Memindahkan token dari pemanggil ke penerima.\n *\n * Mengembalikan nilai boolean.", + })) + + const result = restoreComments(strippedCode, translated, "js") + + // Should contain translated text + expect(result).toContain("Memindahkan token") + // Should NOT contain English text + expect(result).not.toContain("Moves tokens from caller") + // Should have proper comment syntax + expect(result).toContain("/*") + expect(result).toContain("*/") + // Should have the function declaration + expect(result).toContain("function transfer") + }) }) // --------------------------------------------------------------------------- diff --git a/tests/unit/sanitizer/english-comparison.spec.ts b/tests/unit/intl-pipeline/sanitizer/english-comparison.spec.ts similarity index 99% rename from tests/unit/sanitizer/english-comparison.spec.ts 
rename to tests/unit/intl-pipeline/sanitizer/english-comparison.spec.ts index dfa7b88e01f..68b9f0b07f6 100644 --- a/tests/unit/sanitizer/english-comparison.spec.ts +++ b/tests/unit/intl-pipeline/sanitizer/english-comparison.spec.ts @@ -5,7 +5,7 @@ import { expect, test } from "@playwright/test" -import { _testOnly } from "@/scripts/i18n/post_import_sanitize" +import { _testOnly } from "@/scripts/intl-pipeline/intl-sanitizer" const { syncHeaderIdsWithEnglish, diff --git a/tests/unit/sanitizer/integration.spec.ts b/tests/unit/intl-pipeline/sanitizer/integration.spec.ts similarity index 98% rename from tests/unit/sanitizer/integration.spec.ts rename to tests/unit/intl-pipeline/sanitizer/integration.spec.ts index 5e3215e583b..da9130a84fa 100644 --- a/tests/unit/sanitizer/integration.spec.ts +++ b/tests/unit/intl-pipeline/sanitizer/integration.spec.ts @@ -5,7 +5,7 @@ import { expect, test } from "@playwright/test" -import { _testOnly } from "@/scripts/i18n/post_import_sanitize" +import { _testOnly } from "@/scripts/intl-pipeline/intl-sanitizer" const { processMarkdownFile, processJsonFile } = _testOnly diff --git a/tests/unit/sanitizer/standalone-fixes.spec.ts b/tests/unit/intl-pipeline/sanitizer/standalone-fixes.spec.ts similarity index 93% rename from tests/unit/sanitizer/standalone-fixes.spec.ts rename to tests/unit/intl-pipeline/sanitizer/standalone-fixes.spec.ts index 03f76e572db..076c5ec6c3c 100644 --- a/tests/unit/sanitizer/standalone-fixes.spec.ts +++ b/tests/unit/intl-pipeline/sanitizer/standalone-fixes.spec.ts @@ -5,7 +5,7 @@ import { expect, test } from "@playwright/test" -import { _testOnly } from "@/scripts/i18n/post_import_sanitize" +import { _testOnly } from "@/scripts/intl-pipeline/intl-sanitizer" const { fixDuplicatedHeadings, @@ -57,6 +57,9 @@ const { fixBoldWrappedOrderedListNumerals, fixEscapedQuotesInJsxAttributes, fixTranslatedJsonPlaceholders, + fixBareRtlValues, + fixUnitOutsideSpan, + fixMisalignedCodeFences, } = _testOnly 
test.describe("Standalone Fixes", () => { @@ -2769,4 +2772,189 @@ author: Ori Pomerantz expect(fixCount).toBe(0) }) }) + + test.describe("fixBareRtlValues", () => { + test("wraps number with Latin unit", () => { + const input = "ایتھر کی مقدار 32 ETH ہے" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe('ایتھر کی مقدار 32 ETH ہے') + expect(fixCount).toBe(1) + }) + + test("wraps percentage", () => { + const input = "تقریباً 12.5% کمی" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe('تقریباً 12.5% کمی') + expect(fixCount).toBe(1) + }) + + test("wraps currency with symbol", () => { + const input = "قیمت $2,500 USD ہے" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe('قیمت $2,500 USD ہے') + expect(fixCount).toBe(1) + }) + + test("wraps version/protocol ID", () => { + const input = "اپ گریڈ EIP-1559 کے بعد" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe('اپ گریڈ EIP-1559 کے بعد') + expect(fixCount).toBe(1) + }) + + test("wraps large formatted number", () => { + const input = "گیس کے 21,000 یونٹس" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe('گیس کے 21,000 یونٹس') + expect(fixCount).toBe(1) + }) + + test("skips already-wrapped content", () => { + const input = 'مقدار 32 ETH ہے' + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips content inside backticks", () => { + const input = "استعمال کریں `32 ETH` جمع کرنے کے لیے" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips code fences", () => { + const input = "```\n32 ETH\n```" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("no-ops for non-RTL locales", () => { + const input = "costs 
32 ETH to stake" + const { content, fixCount } = fixBareRtlValues(input, "es") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("handles multiple values in one line", () => { + const input = "فیس 10 Gwei اور ٹپ 2 Gwei ہے" + const { content, fixCount } = fixBareRtlValues(input, "ar") + expect(content).toContain('10 Gwei') + expect(content).toContain('2 Gwei') + expect(fixCount).toBe(2) + }) + + test("skips frontmatter", () => { + const input = "---\ntitle: 32 ETH staking\n---\n\n32 ETH جمع کریں" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toContain("title: 32 ETH staking") + expect(content).toContain('32 ETH') + expect(fixCount).toBe(1) + }) + + test("wraps multiplier", () => { + const input = "سائز 2x ہے" + const { content, fixCount } = fixBareRtlValues(input, "ur") + expect(content).toBe('سائز 2x ہے') + expect(fixCount).toBe(1) + }) + }) + + test.describe("fixUnitOutsideSpan", () => { + test("moves unit inside span", () => { + const input = '$100,000 USD' + const { content, fixCount } = fixUnitOutsideSpan(input, "ur") + expect(content).toBe('$100,000 USD') + expect(fixCount).toBe(1) + }) + + test("moves ETH inside span", () => { + const input = '0.000252 ETH' + const { content, fixCount } = fixUnitOutsideSpan(input, "ur") + expect(content).toBe('0.000252 ETH') + expect(fixCount).toBe(1) + }) + + test("moves Gwei inside span", () => { + const input = '10 Gwei' + const { content, fixCount } = fixUnitOutsideSpan(input, "ar") + expect(content).toBe('10 Gwei') + expect(fixCount).toBe(1) + }) + + test("leaves correct wrapping unchanged", () => { + const input = '$100,000 USD' + const { content, fixCount } = fixUnitOutsideSpan(input, "ur") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("no-ops for non-RTL locales", () => { + const input = '$100,000 USD' + const { content, fixCount } = fixUnitOutsideSpan(input, "es") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + 
test("handles multiple instances", () => { + const input = + '100 Gwei اور 32 ETH' + const { content, fixCount } = fixUnitOutsideSpan(input, "ur") + expect(content).toContain('100 Gwei') + expect(content).toContain('32 ETH') + expect(fixCount).toBe(2) + }) + }) + + test.describe("fixMisalignedCodeFences", () => { + test("fixes unindented closing fence when opening is indented", () => { + const input = "1. Step one\n\n ```sh\n pnpm install\n```" + const { content, fixCount } = fixMisalignedCodeFences(input) + expect(content).toBe( + "1. Step one\n\n ```sh\n pnpm install\n ```" + ) + expect(fixCount).toBe(1) + }) + + test("fixes multiple misaligned fences", () => { + const input = + " ```sh\n cmd1\n```\n\ntext\n\n ```sh\n cmd2\n```" + const { content, fixCount } = fixMisalignedCodeFences(input) + expect(content).toBe( + " ```sh\n cmd1\n ```\n\ntext\n\n ```sh\n cmd2\n ```" + ) + expect(fixCount).toBe(2) + }) + + test("leaves correctly indented fences unchanged", () => { + const input = " ```sh\n pnpm install\n ```" + const { content, fixCount } = fixMisalignedCodeFences(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("leaves non-indented fences unchanged", () => { + const input = "```solidity\ncontract Foo {}\n```" + const { content, fixCount } = fixMisalignedCodeFences(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("handles mixed indented and non-indented fences", () => { + const input = " ```sh\n cmd\n```\n\n```typescript\nconst x = 1\n```" + const { content, fixCount } = fixMisalignedCodeFences(input) + expect(content).toBe( + " ```sh\n cmd\n ```\n\n```typescript\nconst x = 1\n```" + ) + expect(fixCount).toBe(1) + }) + + test("handles tilde fences", () => { + const input = " ~~~sh\n cmd\n~~~" + const { content, fixCount } = fixMisalignedCodeFences(input) + expect(content).toBe(" ~~~sh\n cmd\n ~~~") + expect(fixCount).toBe(1) + }) + }) }) diff --git a/tests/unit/sanitizer/warnings.spec.ts 
b/tests/unit/intl-pipeline/sanitizer/warnings.spec.ts similarity index 94% rename from tests/unit/sanitizer/warnings.spec.ts rename to tests/unit/intl-pipeline/sanitizer/warnings.spec.ts index ebe6a2d88bf..9c45c27e3fd 100644 --- a/tests/unit/sanitizer/warnings.spec.ts +++ b/tests/unit/intl-pipeline/sanitizer/warnings.spec.ts @@ -5,7 +5,7 @@ import { expect, test } from "@playwright/test" -import { _testOnly } from "@/scripts/i18n/post_import_sanitize" +import { _testOnly } from "@/scripts/intl-pipeline/intl-sanitizer" const { warnPunctuationOnlyHeadings, @@ -50,12 +50,37 @@ test.describe("Warning Functions", () => { expect(warnings).toHaveLength(0) }) - test("warns when code content was translated", () => { - const english = "```js\nconst x = 1\n```" - const translated = "```js\nconst x = 1\u306E\u5024\n```" + test("no warning when only comments differ (JS //)", () => { + const english = "```js\n// This is a comment\nconst x = 1\n```" + const translated = + "```js\n// \u06CC\u06C1 \u0627\u06CC\u06A9 \u062A\u0628\u0635\u0631\u06C1 \u06C1\u06D2\nconst x = 1\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + expect(warnings).toHaveLength(0) + }) + + test("no warning when only comments differ (JS /* */)", () => { + const english = + "```solidity\n/* @dev Returns the balance */\nfunction balanceOf() {}\n```" + const translated = + "```solidity\n/* @dev \u0628\u06CC\u0644\u0646\u0633 \u0648\u0627\u067E\u0633 \u06A9\u0631\u062A\u0627 \u06C1\u06D2 */\nfunction balanceOf() {}\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + expect(warnings).toHaveLength(0) + }) + + test("no warning when only comments differ (Python #)", () => { + const english = "```python\n# This is a helper\ndef foo():\n pass\n```" + const translated = + "```python\n# \u06CC\u06C1 \u0627\u06CC\u06A9 \u06C1\u06CC\u0644\u067E\u0631 \u06C1\u06D2\ndef foo():\n pass\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + 
expect(warnings).toHaveLength(0) + }) + + test("warns when functional code differs", () => { + const english = "```js\nconst x = node_hash\n```" + const translated = "```js\nconst x = node\n```" const warnings = warnCodeFenceContentDrift(translated, english) expect(warnings.length).toBe(1) - expect(warnings[0]).toContain("content differs") + expect(warnings[0]).toContain("functional code differs") }) test("warns on fence count mismatch", () => {
  • that would consume nested children. + */ +const HTML_TAG_WITH_CHILDREN_RE = new RegExp( + `<(${HTML_TAGS})(\\s[^>]+)>([\\s\\S]*?)<\\/\\1>`, + "gi" +) + +/** Self-closing HTML tags with attributes: , etc. */ +const HTML_SELF_CLOSING_RE = new RegExp( + `<(${HTML_TAGS})(\\s[^>]+)\\s*\\/?>`, + "gi" +) + +function extractHtmlTags( + markdown: string, + tree: ContentNode[], + extractions: Map +): string { + // Loop until no more matches -- handles nested tags (e.g., inside