From 09c3d57d8e0a266a961739c7b745be633144f37e Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Mon, 23 Mar 2026 21:37:37 +0000 Subject: [PATCH 01/23] feat(i18n): add timestamped logging + token summary Add timestamped REQUEST/RESPONSE logging to Gemini API calls with model, duration, and token counts. Add verbose prompt logging behind VERBOSE flag. Add per-language timing and formatted token usage summary table at end of pipeline run. Co-Authored-By: Claude Opus 4.6 (1M context) Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/lib/ai/gemini-translate.ts | 97 +++++++++++++++---- .../lib/workflows/gemini-translate-files.ts | 8 +- src/scripts/i18n/main-gemini.ts | 73 ++++++++++++-- 3 files changed, 148 insertions(+), 30 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts index 988ebd7d654..109018715a2 100644 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ b/src/scripts/i18n/lib/ai/gemini-translate.ts @@ -60,6 +60,15 @@ export interface TranslateFileResult { tokensUsed: { input: number; output: number } } +/** Optional metadata for richer Gemini API call logging */ +interface GeminiCallMetadata { + filePath?: string + targetLanguage?: string + chunkIndex?: number + totalChunks?: number + label?: string +} + /** * Translate a single file via Gemini. 
* @@ -81,7 +90,7 @@ export async function translateFile( // JSON files: translate directly, no extraction needed if (fileType === "json") { - return callGemini({ ...options, fileContent }) + return callGemini({ ...options, fileContent }, { filePath, targetLanguage }) } // Markdown: extract code blocks first @@ -100,10 +109,10 @@ export async function translateFile( if (chunks.length === 1) { // Single chunk: translate normally - const result = await callGemini({ - ...options, - fileContent: prose, - }) + const result = await callGemini( + { ...options, fileContent: prose }, + { filePath, targetLanguage } + ) translatedProse = result.translatedContent totalTokens = result.tokensUsed } else { @@ -111,10 +120,10 @@ export async function translateFile( console.log(` [chunk] ${filePath}: split into ${chunks.length} chunks`) const translatedChunks: string[] = [] for (let i = 0; i < chunks.length; i++) { - const result = await callGemini({ - ...options, - fileContent: chunks[i], - }) + const result = await callGemini( + { ...options, fileContent: chunks[i] }, + { filePath, targetLanguage, chunkIndex: i, totalChunks: chunks.length } + ) translatedChunks.push(result.translatedContent) totalTokens.input += result.tokensUsed.input totalTokens.output += result.tokensUsed.output @@ -132,7 +141,8 @@ export async function translateFile( finalContent, blocks, targetLanguage, - glossaryTerms + glossaryTerms, + filePath ) } catch (error) { console.warn( @@ -155,7 +165,8 @@ async function translateCodeComments( content: string, blocks: CodeBlock[], targetLanguage: string, - glossaryTerms: Map + glossaryTerms: Map, + filePath: string ): Promise { // Extract comments from all blocks const allComments: CodeComment[] = [] @@ -200,7 +211,11 @@ async function translateCodeComments( ${JSON.stringify(commentPayload, null, 2)}` - const result = await callGeminiRaw(commentPrompt) + const result = await callGeminiRaw(commentPrompt, { + filePath, + targetLanguage, + label: "code-comments", + }) 
let translatedMap: Record try { @@ -243,7 +258,8 @@ ${JSON.stringify(commentPayload, null, 2)}` * Used by both prose translation and comment translation. */ async function callGemini( - options: TranslateFileOptions + options: TranslateFileOptions, + metadata?: GeminiCallMetadata ): Promise { const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } = options @@ -260,7 +276,7 @@ async function callGemini( // Retry loop for validation failures (API call retries are in callGeminiRaw) for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { - const result = await callGeminiRaw(prompt) + const result = await callGeminiRaw(prompt, metadata) let text = result.text text = stripCodeBlockWrapping(text, fileType) @@ -294,18 +310,37 @@ async function callGemini( } /** - * Raw Gemini API call with retries and model fallback. + * Raw Gemini API call with retries, model fallback, and verbose logging. + * + * Logging behavior: + * - Always: timestamped REQUEST/RESPONSE lines with model, duration, tokens + * - Verbose: full prompt content between === PROMPT START/END === markers + * * Returns the raw text response and token usage. */ async function callGeminiRaw( - prompt: string + prompt: string, + metadata?: GeminiCallMetadata ): Promise<{ text: string; tokensUsed: { input: number; output: number } }> { const client = getGeminiClient() + const verbose = process.env.VERBOSE === "true" + const ts = () => new Date().toISOString() const modelsToTry = process.env.GEMINI_MODEL ? [process.env.GEMINI_MODEL] : GEMINI_MODELS + // Build context string for log lines + const ctx = [ + metadata?.filePath && `file=${metadata.filePath}`, + metadata?.targetLanguage && `lang=${metadata.targetLanguage}`, + metadata?.chunkIndex != null && + `chunk=${(metadata.chunkIndex ?? 
0) + 1}/${metadata.totalChunks}`, + metadata?.label, + ] + .filter(Boolean) + .join(" ") + let lastError: Error | null = null const modelNotFound = new Set() @@ -313,6 +348,20 @@ async function callGeminiRaw( let modelFailed = false for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { + const startTime = Date.now() + + console.log( + `[${ts()}] [gemini] REQUEST model=${modelId} ${ctx}${attempt > 1 ? ` attempt=${attempt}` : ""}` + ) + + if (verbose) { + console.log( + `[${ts()}] [gemini] === PROMPT START (${prompt.length} chars) ===` + ) + console.log(prompt) + console.log(`[${ts()}] [gemini] === PROMPT END ===`) + } + try { const response = await client.models.generateContent({ model: modelId, @@ -320,6 +369,11 @@ async function callGeminiRaw( config: { temperature: 0 }, }) const usage = response.usageMetadata + const duration = ((Date.now() - startTime) / 1000).toFixed(1) + + console.log( + `[${ts()}] [gemini] RESPONSE model=${modelId} ${ctx} duration=${duration}s tokens_in=${usage?.promptTokenCount || 0} tokens_out=${usage?.candidatesTokenCount || 0}` + ) return { text: response.text ?? "", @@ -329,6 +383,7 @@ async function callGeminiRaw( }, } } catch (error) { + const duration = ((Date.now() - startTime) / 1000).toFixed(1) lastError = error instanceof Error ? error : new Error(String(error)) if ( @@ -337,7 +392,7 @@ async function callGeminiRaw( lastError.message.includes("deprecated") ) { console.warn( - `[WARN] Model ${modelId} unavailable: ${lastError.message}. Trying next model...` + `[${ts()}] [gemini] MODEL_UNAVAILABLE model=${modelId} duration=${duration}s error="${lastError.message}"` ) modelNotFound.add(modelId) modelFailed = true @@ -350,7 +405,7 @@ async function callGeminiRaw( ) { const backoff = RETRY_DELAY_MS * Math.pow(2, attempt) console.warn( - `[WARN] Rate limited (${modelId}). 
Waiting ${backoff / 1000}s...` + `[${ts()}] [gemini] RATE_LIMITED model=${modelId} ${ctx} duration=${duration}s backoff=${backoff / 1000}s` ) await delay(backoff) continue @@ -358,11 +413,15 @@ async function callGeminiRaw( if (attempt < MAX_RETRIES) { console.warn( - `[WARN] Attempt ${attempt} (${modelId}) failed: ${lastError.message}. Retrying...` + `[${ts()}] [gemini] ERROR model=${modelId} ${ctx} attempt=${attempt} duration=${duration}s error="${lastError.message.slice(0, 200)}"` ) await delay(RETRY_DELAY_MS * attempt) continue } + + console.error( + `[${ts()}] [gemini] FAILED model=${modelId} ${ctx} duration=${duration}s error="${lastError.message.slice(0, 200)}"` + ) } } diff --git a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts b/src/scripts/i18n/lib/workflows/gemini-translate-files.ts index 987e4405b31..b73c01cb9ae 100644 --- a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts +++ b/src/scripts/i18n/lib/workflows/gemini-translate-files.ts @@ -34,6 +34,7 @@ interface TranslationStats { filesFailed: number totalInputTokens: number totalOutputTokens: number + durationSeconds: number } /** @@ -60,6 +61,7 @@ export async function geminiTranslateFiles( for (const language of targetLanguages) { logSection(`Translating: ${language}`) + const langStartTime = Date.now() if (isLanguageCompleted(progress, language)) { console.log(`[translate] ${language} already completed, skipping`) @@ -80,6 +82,7 @@ export async function geminiTranslateFiles( progress ) + stats.durationSeconds = (Date.now() - langStartTime) / 1000 allStats[language] = stats allCommittedFiles.push(...files) allFailedFiles.push(...failedFiles.map((f) => `${language}:${f}`)) @@ -89,10 +92,10 @@ export async function geminiTranslateFiles( } console.log( - `[translate] ${language} done: ${stats.filesTranslated} translated, ${stats.filesSkipped} skipped, ${stats.filesFailed} failed` + `[translate] ${language} done: ${stats.filesTranslated} translated, ${stats.filesSkipped} skipped, 
${stats.filesFailed} failed (${stats.durationSeconds.toFixed(1)}s)` ) console.log( - `[translate] ${language} tokens: ${stats.totalInputTokens} in, ${stats.totalOutputTokens} out` + `[translate] ${language} tokens: ${stats.totalInputTokens.toLocaleString("en-US")} in, ${stats.totalOutputTokens.toLocaleString("en-US")} out` ) } @@ -145,6 +148,7 @@ async function translateLanguage( filesFailed: 0, totalInputTokens: 0, totalOutputTokens: 0, + durationSeconds: 0, } const translatedFiles: CommitFile[] = [] diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts index 10b496323de..fc35f96f084 100644 --- a/src/scripts/i18n/main-gemini.ts +++ b/src/scripts/i18n/main-gemini.ts @@ -35,6 +35,7 @@ import { logSection } from "./lib/workflows/utils" import { config } from "./config" async function main() { + const pipelineStartTime = Date.now() logSection("Gemini Translation Pipeline") // Preflight checks @@ -132,15 +133,8 @@ async function main() { // Cleanup progress manifest on success cleanupProgress({ runId, startedAt: "", languages: {} }) - logSection("Complete") - console.log("[main] Gemini translation pipeline finished.") - - // Print summary - for (const [lang, s] of Object.entries(stats)) { - console.log( - ` ${lang}: ${s.filesTranslated} translated, ${s.filesFailed} failed, ${s.totalInputTokens + s.totalOutputTokens} tokens` - ) - } + // Print token usage summary table + printTokenSummary(stats, Date.now() - pipelineStartTime) if (failedFiles.length > 0) { console.warn( @@ -150,6 +144,67 @@ async function main() { console.warn(` - ${f}`) } } + + logSection("Complete") + console.log("[main] Gemini translation pipeline finished.") +} + +/** + * Print a formatted token usage summary table with per-language breakdown + * and approximate cost estimation. 
+ */ +function printTokenSummary( + stats: Record, + pipelineDurationMs: number +): void { + logSection("Token Usage Summary") + + const fmt = (n: number) => n.toLocaleString("en-US") + const pad = (s: string, w: number) => s.padStart(w) + + // Column headers + console.log( + `${"Language".padEnd(10)}| ${"Files".padStart(5)} | ${"Input".padStart(10)} | ${"Output".padStart(10)} | ${"Total".padStart(10)} | ${"Duration".padStart(9)}` + ) + const sep = `${"-".repeat(10)}|${"-".repeat(7)}|${"-".repeat(12)}|${"-".repeat(12)}|${"-".repeat(12)}|${"-".repeat(10)}` + console.log(sep) + + let grandInput = 0 + let grandOutput = 0 + let grandFiles = 0 + + for (const [lang, s] of Object.entries(stats)) { + const total = s.totalInputTokens + s.totalOutputTokens + grandInput += s.totalInputTokens + grandOutput += s.totalOutputTokens + grandFiles += s.filesTranslated + + console.log( + `${lang.padEnd(10)}| ${pad(String(s.filesTranslated), 5)} | ${pad(fmt(s.totalInputTokens), 10)} | ${pad(fmt(s.totalOutputTokens), 10)} | ${pad(fmt(total), 10)} | ${pad((s.durationSeconds || 0).toFixed(1) + "s", 9)}` + ) + } + + console.log(sep) + const grandTotal = grandInput + grandOutput + const pipelineSecs = (pipelineDurationMs / 1000).toFixed(1) + console.log( + `${"TOTAL".padEnd(10)}| ${pad(String(grandFiles), 5)} | ${pad(fmt(grandInput), 10)} | ${pad(fmt(grandOutput), 10)} | ${pad(fmt(grandTotal), 10)} | ${pad(pipelineSecs + "s", 9)}` + ) + + // Approximate cost estimation + // Rates based on Gemini Pro pricing ($/1M tokens) -- update as pricing changes + const APPROX_INPUT_RATE = 1.25 + const APPROX_OUTPUT_RATE = 10.0 + const estCost = + (grandInput / 1_000_000) * APPROX_INPUT_RATE + + (grandOutput / 1_000_000) * APPROX_OUTPUT_RATE + + console.log( + `\n Estimated cost: ~$${estCost.toFixed(2)} (approximate -- based on Gemini Pro rates: $${APPROX_INPUT_RATE}/1M input, $${APPROX_OUTPUT_RATE}/1M output)` + ) + console.log( + ` Pipeline wall time: ${pipelineSecs}s` + ) } main().catch((error) => { 
From dd76d9aba0d239cea4ab76d05dca29ab78df4113 Mon Sep 17 00:00:00 2001 From: wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 23 Mar 2026 20:44:50 -0700 Subject: [PATCH 02/23] fix(i18n): code comment translation bugs Strip metadata from fence language tag before syntax lookup so "sh copy" maps to shell, not js (avoids treating // in URLs as comments). Use strippedCode instead of original block content when restoring translated comments to prevent duplication. Co-Authored-By: Claude Opus 4.6 --- .../i18n/lib/ai/code-block-extractor.ts | 3 +- src/scripts/i18n/lib/ai/gemini-translate.ts | 6 +- .../sanitizer/code-block-extractor.spec.ts | 61 +++++++++++++++++++ 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/ai/code-block-extractor.ts b/src/scripts/i18n/lib/ai/code-block-extractor.ts index 715e7d64fef..96d240a0549 100644 --- a/src/scripts/i18n/lib/ai/code-block-extractor.ts +++ b/src/scripts/i18n/lib/ai/code-block-extractor.ts @@ -100,7 +100,8 @@ type CommentSyntax = "js" | "python" | "shell" /** Map fence language tags to comment syntax family */ function getCommentSyntax(language: string): CommentSyntax { - const lang = language.toLowerCase() + // Strip metadata after the language name (e.g., "sh copy", "solidity showLineNumbers") + const lang = language.toLowerCase().split(/\s+/)[0] // JS/Solidity/TS family: // and /* */ if ( diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts index 109018715a2..786d79ab833 100644 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ b/src/scripts/i18n/lib/ai/gemini-translate.ts @@ -227,7 +227,7 @@ ${JSON.stringify(commentPayload, null, 2)}` } // Restore translated comments into the code blocks within content - for (const { block, comments } of blockData) { + for (const { block, strippedCode, comments } of blockData) { if (comments.length === 0) continue const syntax = getCommentSyntax(block.language) @@ -239,10 +239,12 @@ 
${JSON.stringify(commentPayload, null, 2)}` }) // Find and replace the code block in content + // Use strippedCode (English comments removed) instead of block.content + // to avoid duplicating English comments alongside translated ones const fence = "```" const originalBlock = `${fence}${block.language}\n${block.content}\n${fence}` const restoredCode = restoreComments( - block.content, + strippedCode, translatedComments, syntax ) diff --git a/tests/unit/sanitizer/code-block-extractor.spec.ts b/tests/unit/sanitizer/code-block-extractor.spec.ts index d622d966fde..b25e7554d33 100644 --- a/tests/unit/sanitizer/code-block-extractor.spec.ts +++ b/tests/unit/sanitizer/code-block-extractor.spec.ts @@ -403,6 +403,13 @@ test.describe("getCommentSyntax", () => { expect(getCommentSyntax("Solidity")).toBe("js") expect(getCommentSyntax("PYTHON")).toBe("python") }) + + test("strips metadata after language name (e.g., 'sh copy')", () => { + expect(getCommentSyntax("sh copy")).toBe("shell") + expect(getCommentSyntax("bash copy")).toBe("shell") + expect(getCommentSyntax("solidity showLineNumbers")).toBe("js") + expect(getCommentSyntax("python {1,3-5}")).toBe("python") + }) }) // --------------------------------------------------------------------------- @@ -441,6 +448,60 @@ y = 2` const result = restoreComments(code, [], "js") expect(result).toBe(code) }) + + test("restores multi-line JS comment into stripped code without duplication", () => { + const original = `/** + * @dev Returns the amount of tokens in existence. 
+ */ +function totalSupply() external view returns (uint256);` + + // Extract comments (produces stripped code with empties) + const { strippedCode, comments } = extractComments(original, "solidity") + + // Simulate translation + const translated = comments.map((c) => ({ + ...c, + text: "@dev Mengembalikan jumlah token yang ada.", + })) + + // Restore into STRIPPED code (not original) + const result = restoreComments(strippedCode, translated, "js") + + // Should have the Indonesian comment + expect(result).toContain("Mengembalikan jumlah token yang ada") + // Should NOT have the English comment + expect(result).not.toContain("Returns the amount of tokens") + // Should still have the function + expect(result).toContain("function totalSupply()") + }) + + test("extract-translate-restore round trip produces clean NatSpec", () => { + const original = ` /** + * @dev Moves tokens from caller to recipient. + * + * Returns a boolean value. + */ + function transfer(address to, uint256 amount) external returns (bool);` + + const { strippedCode, comments } = extractComments(original, "solidity") + + const translated = comments.map((c) => ({ + ...c, + text: "@dev Memindahkan token dari pemanggil ke penerima.\n *\n * Mengembalikan nilai boolean.", + })) + + const result = restoreComments(strippedCode, translated, "js") + + // Should contain translated text + expect(result).toContain("Memindahkan token") + // Should NOT contain English text + expect(result).not.toContain("Moves tokens from caller") + // Should have proper comment syntax + expect(result).toContain("/*") + expect(result).toContain("*/") + // Should have the function declaration + expect(result).toContain("function transfer") + }) }) // --------------------------------------------------------------------------- From 9563b1abca5ca9adce664c4cb17319d67e772d69 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 16:00:05 +0000 Subject: [PATCH 
03/23] fix(i18n): clean up workflow log output - Remove duplicate "Creating Pull Request" banner - Wrap verbose prompt output in collapsible groups Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/lib/ai/gemini-translate.ts | 20 ++++++++++++++++---- src/scripts/i18n/main-gemini.ts | 2 -- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts index 786d79ab833..eeec389bbcd 100644 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ b/src/scripts/i18n/lib/ai/gemini-translate.ts @@ -357,11 +357,23 @@ async function callGeminiRaw( ) if (verbose) { - console.log( - `[${ts()}] [gemini] === PROMPT START (${prompt.length} chars) ===` + // Split prompt into sections for collapsible groups + const sourceMatch = prompt.match( + /([\s\S]*?)(=== SOURCE FILE ===[\s\S]*?=== END SOURCE FILE ===)([\s\S]*)/ ) - console.log(prompt) - console.log(`[${ts()}] [gemini] === PROMPT END ===`) + if (sourceMatch) { + const [, preamble, sourceFile] = sourceMatch + console.log(`::group::Prompt preamble: ${ctx} (rules, glossary, hints)`) + console.log(preamble.trim()) + console.log("::endgroup::") + console.log(`::group::Source file: ${ctx} (${prompt.length} chars)`) + console.log(sourceFile) + console.log("::endgroup::") + } else { + console.log(`::group::Prompt: ${ctx} (${prompt.length} chars)`) + console.log(prompt) + console.log("::endgroup::") + } } try { diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts index fc35f96f084..fbe6b8ca0ff 100644 --- a/src/scripts/i18n/main-gemini.ts +++ b/src/scripts/i18n/main-gemini.ts @@ -115,8 +115,6 @@ async function main() { ) if (!skipPr) { - logSection("Creating Pull Request") - const languagePairs = Object.keys(stats).map((code) => ({ crowdinId: code, internalLanguageCode: code, From 94ed35649b8dc11c2c043654eeb582ff11345473 Mon Sep 17 00:00:00 2001 From: 
myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 16:23:01 +0000 Subject: [PATCH 04/23] feat(i18n): add exclude_path workflow input Support comma-separated exclude paths to skip specific files or directories from translation. Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- .github/workflows/gemini-translations.yml | 5 +++++ src/scripts/i18n/config.ts | 12 +++++++++--- src/scripts/i18n/lib/github/files.ts | 12 +++++------- src/scripts/i18n/main-gemini.ts | 1 + 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/.github/workflows/gemini-translations.yml b/.github/workflows/gemini-translations.yml index c3a6029d50b..02af2632eff 100644 --- a/.github/workflows/gemini-translations.yml +++ b/.github/workflows/gemini-translations.yml @@ -7,6 +7,10 @@ on: description: "Path(s) to translate (comma-separated files, single directory, or blank for all)" required: false type: string + exclude_path: + description: "Path(s) to exclude (comma-separated files or directories)" + required: false + type: string target_languages: description: "Comma-separated language codes (blank for all locales)" required: false @@ -62,6 +66,7 @@ jobs: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} TARGET_PATH: ${{ github.event.inputs.target_path }} + EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} GEMINI_CONCURRENCY: ${{ github.event.inputs.concurrency }} diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index cb78d2559e8..31fae316902 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -72,7 +72,13 @@ const targetPaths = targetPathRaw .map((p) => p.trim()) .filter(Boolean) : [] -const excludePath = process.env.EXCLUDE_PATH?.trim() || "" +const excludePathRaw = 
process.env.EXCLUDE_PATH?.trim() || "" +const excludePaths = excludePathRaw + ? excludePathRaw + .split(",") + .map((p) => p.trim()) + .filter(Boolean) + : [] // Skip awaiting pre-translation completion (exit early with ID for manual resume) const skipAwait = ["1", "true", "yes", "on"].includes( @@ -114,7 +120,7 @@ if (verbose) { console.log( `[DEBUG] - Target path: ${targetPath || "none (full translation)"}` ) - console.log(`[DEBUG] - Exclude path: ${excludePath || "none"}`) + console.log(`[DEBUG] - Exclude paths: ${excludePaths.length ? excludePaths.join(", ") : "none"}`) console.log(`[DEBUG] - Skip await: ${skipAwait}`) console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) if (existingPreTranslationIds.length > 0) { @@ -141,7 +147,7 @@ export const config = { baseBranch, targetPath, targetPaths, - excludePath, + excludePaths, skipAwait, pretranslateTimeoutMs, pretranslatePollBaseMs, diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts index 091d38b9f47..9035847c0f1 100644 --- a/src/scripts/i18n/lib/github/files.ts +++ b/src/scripts/i18n/lib/github/files.ts @@ -37,18 +37,16 @@ function isFilePath(targetPath: string): boolean { export const getAllEnglishFiles = async (): Promise< GitHubQueryResponseItem[] > => { - const { targetPath, excludePath } = config + const { targetPath, excludePaths } = config - // Add runtime exclusion if specified - const allExcludedPaths = excludePath - ? 
[...doNotTranslatePaths, excludePath] - : doNotTranslatePaths + // Add runtime exclusions if specified + const allExcludedPaths = [...doNotTranslatePaths, ...excludePaths] debugLog( `Do-not-translate paths loaded: ${doNotTranslatePaths.length} entries` ) - if (excludePath) { - debugLog(`Runtime path exclusions: ${excludePath}`) + if (excludePaths.length > 0) { + debugLog(`Runtime path exclusions: ${excludePaths.join(", ")}`) } // Multi-file mode: comma-separated paths each fetched individually diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts index fbe6b8ca0ff..b67796a952c 100644 --- a/src/scripts/i18n/main-gemini.ts +++ b/src/scripts/i18n/main-gemini.ts @@ -12,6 +12,7 @@ * GEMINI_API_KEY - Gemini API key (required) * I18N_GITHUB_API_KEY - GitHub API key (required) * TARGET_PATH - Comma-separated file paths or single directory + * EXCLUDE_PATH - Comma-separated paths to exclude from translation * TARGET_LANGUAGES - Comma-separated language codes (blank = all) * GEMINI_CONCURRENCY - Max parallel Gemini requests per language (default: 3) * RESUME_RUN_ID - Resume an interrupted run by ID From f4ed2f0fbe3c3f8eff1a7256bc28ab82e564b299 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 18:05:48 +0000 Subject: [PATCH 05/23] feat(i18n): bump default concurrency to 6 Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- .github/workflows/gemini-translations.yml | 2 +- src/scripts/i18n/lib/workflows/gemini-translate-files.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gemini-translations.yml b/.github/workflows/gemini-translations.yml index 02af2632eff..a49d92d862e 100644 --- a/.github/workflows/gemini-translations.yml +++ b/.github/workflows/gemini-translations.yml @@ -23,7 +23,7 @@ on: concurrency: description: "Max parallel Gemini requests per language" required: false - 
default: "3" + default: "6" type: string resume_run_id: description: "Resume an interrupted run by its ID" diff --git a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts b/src/scripts/i18n/lib/workflows/gemini-translate-files.ts index b73c01cb9ae..58172e957de 100644 --- a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts +++ b/src/scripts/i18n/lib/workflows/gemini-translate-files.ts @@ -53,7 +53,7 @@ export async function geminiTranslateFiles( failedFiles: string[] }> { const { englishFiles, glossary, targetLanguages } = context - const concurrency = Number(process.env.GEMINI_CONCURRENCY) || 3 + const concurrency = Number(process.env.GEMINI_CONCURRENCY) || 6 const progress = initProgress(runId, targetLanguages) const allStats: Record = {} const allCommittedFiles: CommitFile[] = [] From 04fbb7d7a9f40d5b57c6a050dc71b7a453846c7b Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 19:30:08 +0000 Subject: [PATCH 06/23] fix(i18n): resolve sanitizer paths in CI Relative paths passed to runSanitizer caused English source lookups to fail silently in GitHub Actions, skipping all English-comparison fixes. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/main-gemini.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts index b67796a952c..b808be8fcc4 100644 --- a/src/scripts/i18n/main-gemini.ts +++ b/src/scripts/i18n/main-gemini.ts @@ -20,6 +20,8 @@ * SKIP_PR_CREATION - Skip PR creation (default: false) */ +import * as path from "path" + import { isGeminiAvailable } from "./lib/ai/gemini" import { cleanupProgress } from "./lib/ai/progress-tracker" import { @@ -82,7 +84,7 @@ async function main() { // Phase 3: Post-import sanitization const sanitizerInput = committedFiles.map((f) => ({ - path: f.path, + path: path.resolve(f.path), content: f.content, })) const sanitizeResult = await runPostImportSanitization( From 4245a37dab03797b968538acb2e7fe9a32322fcd Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 19:31:35 +0000 Subject: [PATCH 07/23] fix(i18n): re-run sanitizer after JSX step JSX attribute translation runs after the sanitizer and can reintroduce issues. Add a second sanitizer pass after Phase 4 to catch these. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/main-gemini.ts | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts index b808be8fcc4..cc7a9da3cb4 100644 --- a/src/scripts/i18n/main-gemini.ts +++ b/src/scripts/i18n/main-gemini.ts @@ -112,7 +112,19 @@ async function main() { } } - // Phase 5: Create PR + // Phase 5: Re-run sanitizer after JSX translation + const resanitizeResult = await runPostImportSanitization( + sanitizerInput, + branchName + ) + + // Merge changed files from both sanitizer passes for PR summary + const allSanitizerChanges = [ + ...sanitizeResult.changedFiles, + ...resanitizeResult.changedFiles, + ] + + // Phase 6: Create PR const skipPr = ["1", "true", "yes", "on"].includes( (process.env.SKIP_PR_CREATION || "").toLowerCase() ) @@ -126,7 +138,7 @@ async function main() { await createTranslationPR( branchName, sanitizerInput, - sanitizeResult.changedFiles, + allSanitizerChanges, languagePairs ) } From 7dd307be580894ef307585091bec5fa3a9e6b00d Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 19:58:47 +0000 Subject: [PATCH 08/23] feat(i18n): shared concurrency pool Replace per-language sequential processing with a single shared Gemini concurrency pool. Languages dispatch files simultaneously; commits serialized via SharedCommitter then squashed one-per-language. Bump default concurrency from 6 to 16. Add parallel JSX attribute translation. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- .github/workflows/gemini-translations.yml | 2 +- src/scripts/i18n/lib/ai/gemini.ts | 50 ++-- src/scripts/i18n/lib/github/commits.ts | 274 ++++++++++++++++++ .../lib/workflows/gemini-translate-files.ts | 87 +++--- .../i18n/lib/workflows/jsx-translation.ts | 13 +- 5 files changed, 358 insertions(+), 68 deletions(-) diff --git a/.github/workflows/gemini-translations.yml b/.github/workflows/gemini-translations.yml index a49d92d862e..288867e4094 100644 --- a/.github/workflows/gemini-translations.yml +++ b/.github/workflows/gemini-translations.yml @@ -23,7 +23,7 @@ on: concurrency: description: "Max parallel Gemini requests per language" required: false - default: "6" + default: "16" type: string resume_run_id: description: "Resume an interrupted run by its ID" diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts index 29fc3022b83..a804db5a3cd 100644 --- a/src/scripts/i18n/lib/ai/gemini.ts +++ b/src/scripts/i18n/lib/ai/gemini.ts @@ -7,6 +7,7 @@ import { GoogleGenAI } from "@google/genai" import i18nConfig from "../../../../../i18n.config.json" import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes" import { delay } from "../workflows/utils" +import { createRateLimiter } from "./rate-limiter" /** Gemini API configuration */ const GEMINI_MODEL = "gemini-2.5-pro" @@ -211,33 +212,42 @@ export async function translateAttributesWithRetry( } /** - * Translate attributes grouped by file, processing each file's batch sequentially - * to avoid rate limits while maximizing context per request. + * Translate attributes grouped by file, processing files with bounded + * concurrency to speed up the JSX translation phase. 
*/ export async function translateAttributesByFile( attributesByFile: Map, targetLanguage: string, - glossaryTerms?: Map + glossaryTerms?: Map, + concurrency?: number ): Promise> { const results = new Map() - - for (const [filePath, attributes] of attributesByFile) { - try { - const translated = await translateAttributesWithRetry( - attributes, - targetLanguage, - glossaryTerms - ) - results.set(filePath, translated) - console.log( - `[GEMINI] ✓ Translated ${translated.length} attributes in ${filePath}` - ) - } catch (error) { - console.error(`[GEMINI] ✗ Failed to translate ${filePath}:`, error) - // Continue with other files even if one fails - results.set(filePath, []) + const maxConcurrent = concurrency || Number(process.env.GEMINI_CONCURRENCY) || 16 + const limiter = createRateLimiter(maxConcurrent) + + const tasks = Array.from(attributesByFile.entries()).map( + ([filePath, attributes]) => async () => { + await limiter.acquire() + try { + const translated = await translateAttributesWithRetry( + attributes, + targetLanguage, + glossaryTerms + ) + results.set(filePath, translated) + console.log( + `[GEMINI] ✓ Translated ${translated.length} attributes in ${filePath}` + ) + } catch (error) { + console.error(`[GEMINI] ✗ Failed to translate ${filePath}:`, error) + results.set(filePath, []) + } finally { + limiter.release() + } } - } + ) + + await Promise.all(tasks.map((task) => task())) return results } diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/i18n/lib/github/commits.ts index 01b5cb6aa3b..c219144d94b 100644 --- a/src/scripts/i18n/lib/github/commits.ts +++ b/src/scripts/i18n/lib/github/commits.ts @@ -163,6 +163,280 @@ export class IncrementalCommitter { } } +/** + * Shared committer for parallel language translation. + * + * Unlike IncrementalCommitter (one amending commit per language), this + * creates individual chained commits (each parented on the previous) + * so multiple languages can interleave safely. 
Each file appears on the + * branch immediately for crash safety. + * + * After all translations complete, call squashByLanguage() to collapse + * the individual commits into one per language for a clean history. + */ +export class SharedCommitter { + private currentCommitSha = "" + private currentTreeSha = "" + private queue: Promise = Promise.resolve() + private baseUrl = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}` + /** Track blob SHAs per language for squashing */ + private blobsByLanguage = new Map() + /** SHA of the original base before any translations */ + private originalBaseSha = "" + + constructor(private branch: string) {} + + /** Snapshot the current branch state. */ + async init(): Promise { + const refRes = await fetchWithRetry( + `${this.baseUrl}/git/ref/heads/${this.branch}`, + { headers: gitHubBearerHeaders } + ) + if (!refRes.ok) { + const body = await refRes.text().catch(() => "") + throw new Error( + `SharedCommitter init ref (${refRes.status}): ${body}` + ) + } + const refData: { object: { sha: string } } = await refRes.json() + this.currentCommitSha = refData.object.sha + this.originalBaseSha = refData.object.sha + + const commitRes = await fetchWithRetry( + `${this.baseUrl}/git/commits/${this.currentCommitSha}`, + { headers: gitHubBearerHeaders } + ) + if (!commitRes.ok) { + const body = await commitRes.text().catch(() => "") + throw new Error( + `SharedCommitter init commit (${commitRes.status}): ${body}` + ) + } + const commitData: { tree: { sha: string } } = await commitRes.json() + this.currentTreeSha = commitData.tree.sha + } + + /** + * Queue a file commit. Serialized so concurrent languages don't race. + * Each commit chains on the previous (not amending). 
+ */ + commitFile( + filePath: string, + content: string, + language: string + ): Promise { + const result = this.queue.then(() => + this._doCommit(filePath, content, language) + ) + this.queue = result.then( + () => {}, + () => {} + ) + return result + } + + private async _doCommit( + filePath: string, + content: string, + language: string + ): Promise { + // 1. Create blob + const blobRes = await fetchWithRetry(`${this.baseUrl}/git/blobs`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + content: Buffer.from(content, "utf8").toString("base64"), + encoding: "base64", + }), + }) + if (!blobRes.ok) { + const body = await blobRes.text().catch(() => "") + throw new Error( + `Failed to create blob for ${filePath} (${blobRes.status}): ${body}` + ) + } + const blobData: { sha: string } = await blobRes.json() + + const item: TreeItem = { + path: filePath, + mode: "100644", + type: "blob", + sha: blobData.sha, + } + + // Track blob for squashing + if (!this.blobsByLanguage.has(language)) { + this.blobsByLanguage.set(language, []) + } + this.blobsByLanguage.get(language)!.push(item) + + // 2. Create tree on top of current tree + const treeRes = await fetchWithRetry(`${this.baseUrl}/git/trees`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + base_tree: this.currentTreeSha, + tree: [item], + }), + }) + if (!treeRes.ok) { + const body = await treeRes.text().catch(() => "") + throw new Error(`Failed to create tree (${treeRes.status}): ${body}`) + } + const treeData: { sha: string } = await treeRes.json() + + // 3. 
Create commit parented on the current tip (chaining, not amending) + const commitRes = await fetchWithRetry(`${this.baseUrl}/git/commits`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + message: `i18n(${language}): ${filePath.split("/").pop()}`, + tree: treeData.sha, + parents: [this.currentCommitSha], + }), + }) + if (!commitRes.ok) { + const body = await commitRes.text().catch(() => "") + throw new Error(`Failed to create commit (${commitRes.status}): ${body}`) + } + const commitData: { sha: string } = await commitRes.json() + + // 4. Update branch ref (no force needed -- linear chain) + const updateRes = await fetchWithRetry( + `${this.baseUrl}/git/refs/heads/${this.branch}`, + { + method: "PATCH", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ sha: commitData.sha }), + } + ) + if (!updateRes.ok) { + const body = await updateRes.text().catch(() => "") + throw new Error(`Failed to update ref (${updateRes.status}): ${body}`) + } + + // Advance internal state + this.currentCommitSha = commitData.sha + this.currentTreeSha = treeData.sha + + debugLog( + `SharedCommitter [${language}]: committed ${filePath}` + ) + } + + /** + * Squash all individual commits into one per language. + * Builds a new commit chain from the original base: + * base -> lang1 (all files) -> lang2 (all files) -> ... + * Then force-updates the branch ref. 
+ */ + async squashByLanguage(): Promise { + const languages = Array.from(this.blobsByLanguage.keys()).sort() + if (languages.length === 0) return + + console.log( + `[SharedCommitter] Squashing ${languages.length} language(s): ${languages.join(", ")}` + ) + + let parentSha = this.originalBaseSha + // Get the original base tree + const baseCommitRes = await fetchWithRetry( + `${this.baseUrl}/git/commits/${this.originalBaseSha}`, + { headers: gitHubBearerHeaders } + ) + if (!baseCommitRes.ok) { + const body = await baseCommitRes.text().catch(() => "") + throw new Error( + `Failed to get base commit for squash (${baseCommitRes.status}): ${body}` + ) + } + const baseCommitData: { tree: { sha: string } } = await baseCommitRes.json() + let currentTree = baseCommitData.tree.sha + + for (const lang of languages) { + const blobs = this.blobsByLanguage.get(lang)! + + // Create tree with all blobs for this language on top of current tree + const treeRes = await fetchWithRetry(`${this.baseUrl}/git/trees`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + base_tree: currentTree, + tree: blobs, + }), + }) + if (!treeRes.ok) { + const body = await treeRes.text().catch(() => "") + throw new Error( + `Failed to create squash tree for ${lang} (${treeRes.status}): ${body}` + ) + } + const treeData: { sha: string } = await treeRes.json() + + // Create squashed commit + const commitRes = await fetchWithRetry(`${this.baseUrl}/git/commits`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + message: `i18n(${lang}): Gemini translation`, + tree: treeData.sha, + parents: [parentSha], + }), + }) + if (!commitRes.ok) { + const body = await commitRes.text().catch(() => "") + throw new Error( + `Failed to create squash commit for ${lang} (${commitRes.status}): ${body}` + ) + } + const commitData: { sha: string } = await commitRes.json() + + parentSha = 
commitData.sha + currentTree = treeData.sha + + console.log( + `[SharedCommitter] Squashed ${blobs.length} files for ${lang}` + ) + } + + // Force-update branch to squashed chain + const updateRes = await fetchWithRetry( + `${this.baseUrl}/git/refs/heads/${this.branch}`, + { + method: "PATCH", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ sha: parentSha, force: true }), + } + ) + if (!updateRes.ok) { + const body = await updateRes.text().catch(() => "") + throw new Error( + `Failed to update ref after squash (${updateRes.status}): ${body}` + ) + } + + // Update internal state + this.currentCommitSha = parentSha + this.currentTreeSha = currentTree + + console.log( + `[SharedCommitter] Squash complete: ${languages.length} commits` + ) + } + + get totalFiles(): number { + let count = 0 + for (const blobs of this.blobsByLanguage.values()) { + count += blobs.length + } + return count + } + + get languageCount(): number { + return this.blobsByLanguage.size + } +} + /** * Commit multiple files in a single commit using GitHub's Git Data API. * This avoids creating one commit per file. diff --git a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts b/src/scripts/i18n/lib/workflows/gemini-translate-files.ts index 58172e957de..f1f5bfb6235 100644 --- a/src/scripts/i18n/lib/workflows/gemini-translate-files.ts +++ b/src/scripts/i18n/lib/workflows/gemini-translate-files.ts @@ -1,9 +1,14 @@ /** - * Orchestrate file translation per language. + * Orchestrate file translation across all languages. * - * Each successful translation is committed immediately (amend pattern: - * one growing commit per language). Failed files are skipped, not fatal. - * The pipeline only throws if ALL files for ALL languages fail. + * All languages share a single Gemini concurrency pool (GEMINI_CONCURRENCY, + * default 16). Languages don't wait for each other -- as slots free up, + * the next pending file from any language fills the slot. 
This naturally + * pipelines across languages when there are fewer files than slots. + * + * Each translated file is committed immediately via a SharedCommitter + * (serialized internally) for crash safety. After all translations + * complete, individual commits are squashed into one per language. */ import { translateFile } from "../ai/gemini-translate" @@ -16,8 +21,8 @@ import { markLanguageCompleted, type TranslationProgress, } from "../ai/progress-tracker" -import { createRateLimiter } from "../ai/rate-limiter" -import { getDestinationFromPath, IncrementalCommitter } from "../github/commits" +import { createRateLimiter, type RateLimiter } from "../ai/rate-limiter" +import { getDestinationFromPath, SharedCommitter } from "../github/commits" import { getGlossaryForLanguage } from "../supabase/glossary" import type { GeminiWorkflowContext } from "./gemini-initialize" @@ -37,11 +42,6 @@ interface TranslationStats { durationSeconds: number } -/** - * Translate all files for all target languages. - * Files are committed as they complete (no work lost on partial failure). - * Throws only if zero files were translated across all languages. 
- */ export async function geminiTranslateFiles( context: GeminiWorkflowContext, branchName: string, @@ -53,32 +53,42 @@ export async function geminiTranslateFiles( failedFiles: string[] }> { const { englishFiles, glossary, targetLanguages } = context - const concurrency = Number(process.env.GEMINI_CONCURRENCY) || 6 + const concurrency = Number(process.env.GEMINI_CONCURRENCY) || 16 const progress = initProgress(runId, targetLanguages) const allStats: Record = {} const allCommittedFiles: CommitFile[] = [] const allFailedFiles: string[] = [] - for (const language of targetLanguages) { - logSection(`Translating: ${language}`) - const langStartTime = Date.now() + // One shared committer for all languages -- serializes ref updates + const committer = new SharedCommitter(branchName) + await committer.init() + + // One shared Gemini concurrency pool across all languages + const limiter = createRateLimiter(concurrency) + console.log( + `[translate] ${targetLanguages.length} language(s), ${englishFiles.length} file(s) each, concurrency ${concurrency}` + ) + + // Dispatch all languages concurrently -- they share the limiter + const languageTasks = targetLanguages.map((language) => async () => { if (isLanguageCompleted(progress, language)) { console.log(`[translate] ${language} already completed, skipping`) - continue + return } + const langStartTime = Date.now() const glossaryTerms = getGlossaryForLanguage(glossary, language) console.log( - `[translate] ${language}: ${englishFiles.length} files, ${glossaryTerms.size} glossary terms, concurrency ${concurrency}` + `[translate] ${language}: ${englishFiles.length} files, ${glossaryTerms.size} glossary terms` ) const { stats, files, failedFiles } = await translateLanguage( englishFiles, language, glossaryTerms, - branchName, - concurrency, + committer, + limiter, progress ) @@ -97,7 +107,10 @@ export async function geminiTranslateFiles( console.log( `[translate] ${language} tokens: 
${stats.totalInputTokens.toLocaleString("en-US")} in, ${stats.totalOutputTokens.toLocaleString("en-US")} out` ) - } + }) + + // All languages run concurrently, bounded by the shared limiter + await Promise.all(languageTasks.map((task) => task())) // Fail if nothing was translated at all const totalTranslated = Object.values(allStats).reduce( @@ -116,6 +129,12 @@ export async function geminiTranslateFiles( ) } + // Squash individual file commits into one per language + if (committer.totalFiles > 0) { + logSection("Squashing Commits") + await committer.squashByLanguage() + } + return { branch: branchName, stats: allStats, @@ -126,22 +145,20 @@ export async function geminiTranslateFiles( /** * Translate all files for a single language. - * Each success is committed immediately via IncrementalCommitter. - * Failures are logged and skipped. + * Uses the shared rate limiter so slots are shared across languages. */ async function translateLanguage( englishFiles: GeminiWorkflowContext["englishFiles"], language: string, glossaryTerms: Map, - branchName: string, - concurrency: number, + committer: SharedCommitter, + limiter: RateLimiter, progress: TranslationProgress ): Promise<{ stats: TranslationStats files: CommitFile[] failedFiles: string[] }> { - const limiter = createRateLimiter(concurrency) const stats: TranslationStats = { filesTranslated: 0, filesSkipped: 0, @@ -154,16 +171,7 @@ async function translateLanguage( const translatedFiles: CommitFile[] = [] const failedFiles: string[] = [] - // Incremental committer: one amending commit per language - const committer = new IncrementalCommitter( - branchName, - `i18n(${language}): Gemini translation` - ) - await committer.init() - - // Process files with bounded concurrency const tasks = englishFiles.map((file) => async () => { - // Skip already completed if (isFileCompleted(progress, language, file.path)) { stats.filesSkipped++ return @@ -181,8 +189,8 @@ async function translateLanguage( const destPath = 
getDestinationFromPath(file.path, language) - // Commit immediately -- serialized internally by the committer - await committer.commitFile(destPath, result.translatedContent) + // Commit immediately -- serialized by the shared committer's queue + await committer.commitFile(destPath, result.translatedContent, language) translatedFiles.push({ path: destPath, @@ -209,14 +217,7 @@ async function translateLanguage( } }) - // Execute all tasks (concurrency handled by limiter) await Promise.all(tasks.map((task) => task())) - if (committer.fileCount > 0) { - console.log( - `[translate] ${language}: ${committer.fileCount} files committed to branch` - ) - } - return { stats, files: translatedFiles, failedFiles } } diff --git a/src/scripts/i18n/lib/workflows/jsx-translation.ts b/src/scripts/i18n/lib/workflows/jsx-translation.ts index c0bad47561a..a82ef36465e 100644 --- a/src/scripts/i18n/lib/workflows/jsx-translation.ts +++ b/src/scripts/i18n/lib/workflows/jsx-translation.ts @@ -21,6 +21,8 @@ export interface JsxTranslationResult { /** * Translate JSX attributes in markdown files via Gemini. + * All languages are dispatched concurrently -- the per-file concurrency + * inside translateJsxAttributes is bounded by GEMINI_CONCURRENCY. * Updates committedFiles in-place with translated content. 
*/ export async function runJsxTranslation( @@ -45,8 +47,9 @@ export async function runJsxTranslation( let totalAttributesTranslated = 0 let totalFilesUpdated = 0 - // Process each language separately - for (const langPair of languagePairs) { + // Dispatch all languages concurrently -- Gemini concurrency is + // bounded inside translateAttributesByFile via the shared pool + const languageTasks = languagePairs.map((langPair) => async () => { const langCode = langPair.internalLanguageCode // Filter files for this language (markdown only) @@ -57,7 +60,7 @@ export async function runJsxTranslation( if (langFiles.length === 0) { console.log(`[JSX-TRANSLATE] No markdown files for ${langCode}`) - continue + return } console.log( @@ -108,7 +111,9 @@ export async function runJsxTranslation( ) } } - } + }) + + await Promise.all(languageTasks.map((task) => task())) return { geminiSkipped: false, From 11492fe1b716bf8eab39d23d5a2861aab341a102 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 22:07:22 +0000 Subject: [PATCH 09/23] fix(i18n): sanitizer uses BASE_BRANCH English Pass English content map (fetched from BASE_BRANCH via GitHub API) to the sanitizer instead of reading from disk. Ensures English comparison matches the same branch used for translation, not whatever the CI runner checked out. Disk fallback preserved for local/CLI usage. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- .../i18n/lib/workflows/sanitization.ts | 9 +++- src/scripts/i18n/main-gemini.ts | 13 +++++- src/scripts/i18n/post_import_sanitize.ts | 44 ++++++++++++++----- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts b/src/scripts/i18n/lib/workflows/sanitization.ts index 548871af2dc..a32b62eb230 100644 --- a/src/scripts/i18n/lib/workflows/sanitization.ts +++ b/src/scripts/i18n/lib/workflows/sanitization.ts @@ -19,13 +19,18 @@ export interface SanitizationResult { */ export async function runPostImportSanitization( committedFiles: CommittedFile[], - branch: string + branch: string, + englishContentMap?: Map ): Promise { logSection("Running Post-Import Sanitizer") console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`) - const sanitizeResult = await runSanitizer(committedFiles) + const sanitizeResult = await runSanitizer( + committedFiles, + undefined, + englishContentMap + ) const changedFiles = sanitizeResult.changedFiles || [] if (changedFiles.length) { diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts index cc7a9da3cb4..f4af298090d 100644 --- a/src/scripts/i18n/main-gemini.ts +++ b/src/scripts/i18n/main-gemini.ts @@ -87,9 +87,17 @@ async function main() { path: path.resolve(f.path), content: f.content, })) + + // Build English content map so the sanitizer compares against + // the same BASE_BRANCH English files used for translation (not disk) + const englishContentMap = new Map( + context.englishFiles.map((f) => [f.path, f.content]) + ) + const sanitizeResult = await runPostImportSanitization( sanitizerInput, - branchName + branchName, + englishContentMap ) // Phase 4: JSX attribute translation (reuse existing Gemini JSX flow) @@ -115,7 +123,8 @@ async function main() { // Phase 5: Re-run sanitizer after JSX translation const resanitizeResult = await 
runPostImportSanitization( sanitizerInput, - branchName + branchName, + englishContentMap ) // Merge changed files from both sanitizer passes for PR summary diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index f982e9b3b47..23712df3919 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -3681,7 +3681,8 @@ function detectUntranslatedContent(content: string, locale: string): string[] { function processMarkdownFile( mdPath: string, - providedContent?: string + providedContent?: string, + englishContentMap?: Map ): { fixed: boolean issues: string[] @@ -3700,14 +3701,28 @@ function processMarkdownFile( if (idx === -1 || idx + 2 >= parts.length) { issues.push("No translations segment found; skipping formatting sync") } else { - // Use path.resolve to preserve absolute paths (path.join loses leading /) - const englishPath = path.resolve( - path.sep, - ...parts.slice(0, idx), - ...parts.slice(idx + 2) // drop translations/ - ) - if (fs.existsSync(englishPath)) { - englishMd = fs.readFileSync(englishPath, "utf8") + // Derive the relative English path (e.g. public/content/bridges/index.md) + const englishRelPath = [...parts.slice(0, idx), ...parts.slice(idx + 2)] + .join(path.sep) + // Strip leading absolute prefix to get repo-relative path + .replace(/^.*?public\/content\//, "public/content/") + + // Try in-memory map first (from GitHub API), then fall back to disk + if (englishContentMap?.has(englishRelPath)) { + englishMd = englishContentMap.get(englishRelPath)! 
+ } else { + // Absolute path for disk fallback (local/CLI usage) + const englishPath = path.resolve( + path.sep, + ...parts.slice(0, idx), + ...parts.slice(idx + 2) // drop translations/ + ) + if (fs.existsSync(englishPath)) { + englishMd = fs.readFileSync(englishPath, "utf8") + } + } + + if (englishMd) { // Fix detached heading anchors BEFORE syncing IDs { const snapshot = content @@ -3719,6 +3734,11 @@ function processMarkdownFile( } content = syncHeaderIdsWithEnglish(content, englishMd) } else { + const englishPath = path.resolve( + path.sep, + ...parts.slice(0, idx), + ...parts.slice(idx + 2) + ) issues.push(`English source missing: ${path.relative(ROOT, englishPath)}`) } } @@ -4195,7 +4215,8 @@ function languagesFromEnv(): string[] | undefined { export async function runSanitizer( filesWithContent?: Array<{ path: string; content: string }>, - langs?: string[] + langs?: string[], + englishContentMap?: Map ) { console.log("[SANITIZE] Starting post-import sanitizer") await loadFranc() @@ -4322,7 +4343,8 @@ export async function runSanitizer( : null const { fixed, issues, content } = processMarkdownFile( fileInfo.path, - fileInfo.content + fileInfo.content, + englishContentMap ) if (fixed) { mdFixed++ From bfef94ebfa8c469268905c02415cc6276a05a404 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Wed, 25 Mar 2026 05:22:36 +0000 Subject: [PATCH 10/23] fix(i18n): fence indent, prompt, validation Restore closing code fence indentation lost during code block extraction/restoration. Explicitly instruct Gemini to translate frontmatter values and transliterate author names for non-Latin scripts. Add validation to reject untranslated frontmatter. Collapse blank lines left by multi-line comment restoration. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- .../i18n/lib/ai/code-block-extractor.ts | 29 ++++++++++++-- .../i18n/lib/ai/gemini-output-validation.ts | 40 +++++++++++++++++++ src/scripts/i18n/lib/ai/gemini-translate.ts | 5 ++- src/scripts/i18n/lib/ai/prompt-builder.ts | 23 ++++++++--- 4 files changed, 87 insertions(+), 10 deletions(-) diff --git a/src/scripts/i18n/lib/ai/code-block-extractor.ts b/src/scripts/i18n/lib/ai/code-block-extractor.ts index 96d240a0549..466fd7a308e 100644 --- a/src/scripts/i18n/lib/ai/code-block-extractor.ts +++ b/src/scripts/i18n/lib/ai/code-block-extractor.ts @@ -12,6 +12,8 @@ export interface CodeBlock { index: number language: string content: string + /** Indentation prefix (spaces/tabs) of the original fence */ + indent: string } /** Result of extracting code blocks from markdown */ @@ -26,6 +28,8 @@ export interface ExtractionResult { export interface CodeComment { blockIndex: number line: number + /** Last line of a multi-line comment (for collapsing placeholder lines) */ + endLine?: number type: "single" | "multi" text: string } @@ -66,6 +70,7 @@ export function extractCodeBlocks(markdown: string): ExtractionResult { index, language: lang, content, + indent: ind, }) return `${ind}${makePlaceholder(index)}` } @@ -85,7 +90,9 @@ export function restoreCodeBlocks(prose: string, blocks: CodeBlock[]): string { const placeholder = makePlaceholder(block.index) const fence = "```" const langTag = block.language ? block.language : "" - const restored = `${fence}${langTag}\n${block.content}\n${fence}` + // Opening fence gets indent from the prose context (placeholder was indented). + // Closing fence needs explicit indent since it's on a new line in the replacement. 
+ const restored = `${fence}${langTag}\n${block.content}\n${block.indent}${fence}` result = result.replace(placeholder, restored) } @@ -187,6 +194,7 @@ export function extractComments( comments.push({ blockIndex: -1, // filled in by caller line: multiLineStart, + endLine: i, type: "multi", text: multiLineBuffer.trim(), }) @@ -372,9 +380,24 @@ export function restoreComments( // Multi-line: wrap in block comment syntax const indent = existing.match(/^(\s*)/)?.[1] || "" if (syntax === "js") { - lines[comment.line] = `${indent}/* ${comment.text} */\n${existing}` + lines[comment.line] = `${indent}/* ${comment.text} */` } else { - lines[comment.line] = `${indent}# ${comment.text}\n${existing}` + lines[comment.line] = `${indent}# ${comment.text}` + } + // Collapse empty placeholder lines left by multi-line comment extraction. + // Preserve the endLine if it has code after the comment close (e.g., "*/ doSomething()"). + if (comment.endLine != null && comment.endLine > comment.line) { + const endContent = lines[comment.endLine]?.trim() || "" + if (endContent) { + // Keep endLine (has code after */), remove only the middle lines + const removeCount = comment.endLine - comment.line - 1 + if (removeCount > 0) { + lines.splice(comment.line + 1, removeCount) + } + } else { + // endLine is empty too -- remove all placeholder lines + lines.splice(comment.line + 1, comment.endLine - comment.line) + } } } } diff --git a/src/scripts/i18n/lib/ai/gemini-output-validation.ts b/src/scripts/i18n/lib/ai/gemini-output-validation.ts index 1c5d9163efe..40da280e022 100644 --- a/src/scripts/i18n/lib/ai/gemini-output-validation.ts +++ b/src/scripts/i18n/lib/ai/gemini-output-validation.ts @@ -86,6 +86,12 @@ export function validateTranslatedMarkdown( } } + // Frontmatter title/description should be translated, not left in English + const untranslatedFm = checkFrontmatterTranslated(translated, english) + if (untranslatedFm) { + return { valid: false, error: untranslatedFm } + } + return { valid: 
true } } @@ -141,3 +147,37 @@ function validateCommon(translated: string): ValidationResult { return { valid: true } } + +/** + * Extract a frontmatter field value from raw markdown. + * Returns undefined if the field is not found. + */ +function extractFrontmatterField( + content: string, + field: string +): string | undefined { + const fmMatch = content.match(/^---\n([\s\S]*?)\n---/) + if (!fmMatch) return undefined + const re = new RegExp(`^${field}:\\s*"?(.+?)"?\\s*$`, "m") + const match = fmMatch[1].match(re) + return match?.[1] +} + +/** + * Check that key frontmatter fields (title, description) were actually + * translated and not left identical to the English source. + * Returns an error string if untranslated, or undefined if OK. + */ +function checkFrontmatterTranslated( + translated: string, + english: string +): string | undefined { + for (const field of ["title", "description"]) { + const enValue = extractFrontmatterField(english, field) + const trValue = extractFrontmatterField(translated, field) + if (enValue && trValue && enValue === trValue) { + return `Frontmatter "${field}" was not translated (identical to English)` + } + } + return undefined +} diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts index eeec389bbcd..0f8e51c883a 100644 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ b/src/scripts/i18n/lib/ai/gemini-translate.ts @@ -242,13 +242,14 @@ ${JSON.stringify(commentPayload, null, 2)}` // Use strippedCode (English comments removed) instead of block.content // to avoid duplicating English comments alongside translated ones const fence = "```" - const originalBlock = `${fence}${block.language}\n${block.content}\n${fence}` + const ind = block.indent || "" + const originalBlock = `${fence}${block.language}\n${block.content}\n${ind}${fence}` const restoredCode = restoreComments( strippedCode, translatedComments, syntax ) - const newBlock = 
`${fence}${block.language}\n${restoredCode}\n${fence}` + const newBlock = `${fence}${block.language}\n${restoredCode}\n${ind}${fence}` content = content.replace(originalBlock, newBlock) } diff --git a/src/scripts/i18n/lib/ai/prompt-builder.ts b/src/scripts/i18n/lib/ai/prompt-builder.ts index 7b0934566ac..eb4775268c9 100644 --- a/src/scripts/i18n/lib/ai/prompt-builder.ts +++ b/src/scripts/i18n/lib/ai/prompt-builder.ts @@ -10,7 +10,11 @@ * transliteration norms, etc. better than any regex. */ -import { getLanguageGroup, getSiteSpecificNotes } from "./language-groups" +import { + type LanguageGroup, + getLanguageGroup, + getSiteSpecificNotes, +} from "./language-groups" interface PromptOptions { filePath: string @@ -37,7 +41,7 @@ export function buildTranslationPrompt(options: PromptOptions): string { const group = getLanguageGroup(targetLanguage) const siteNotes = getSiteSpecificNotes(group) const glossarySection = formatGlossary(glossaryTerms) - const formatRules = getFormatRules(fileType) + const formatRules = getFormatRules(fileType, group) const sanitizerHints = getSanitizerHints() return `Translate this ${fileType} file from English to ${languageName} (${targetLanguage}). @@ -59,7 +63,10 @@ ${fileContent} Output ONLY the translated file content. No explanations, no markdown wrapping, no commentary.` } -function getFormatRules(fileType: "markdown" | "json"): string { +function getFormatRules( + fileType: "markdown" | "json", + group: LanguageGroup +): string { if (fileType === "json") { return `Format rules: - Output valid JSON with identical key structure. @@ -69,9 +76,15 @@ function getFormatRules(fileType: "markdown" | "json"): string { - Internal href paths (/developers/docs/...) must stay in English.` } + // Author handling differs by script family + const authorRule = + group === "latin" + ? "Keep the author field unchanged." + : "Transliterate the author field into the target script (phonetic, not semantic). 
Pseudonyms or GitHub handles (e.g., qbzzt, jdourlens) must stay in Latin." + return `Format rules: -- Preserve all frontmatter fields and structure exactly. -- Preserve all markdown syntax (headings, lists, links, code blocks). +- Frontmatter: translate the values of title, description, and breadcrumb. ${authorRule} Keep all other fields (tags, skill, published, lang, sidebarDepth) unchanged. Preserve YAML structure exactly. +- Preserve all markdown syntax (headings, lists, links, code blocks) and their indentation exactly. - Preserve all JSX/HTML components and their attributes exactly. - Preserve heading anchor IDs exactly as in English ({#anchor-id}). - Never translate content inside code fences (\`\`\` blocks). From 7482ad23e7395ce7939028589c6d36c273bc56d2 Mon Sep 17 00:00:00 2001 From: wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 24 Mar 2026 23:36:35 -0700 Subject: [PATCH 11/23] chore: autofix to sort imports --- src/scripts/i18n/lib/ai/gemini.ts | 41 ++++++++++++----------- src/scripts/i18n/lib/ai/prompt-builder.ts | 2 +- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts index a804db5a3cd..b6db3c0af53 100644 --- a/src/scripts/i18n/lib/ai/gemini.ts +++ b/src/scripts/i18n/lib/ai/gemini.ts @@ -7,6 +7,7 @@ import { GoogleGenAI } from "@google/genai" import i18nConfig from "../../../../../i18n.config.json" import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes" import { delay } from "../workflows/utils" + import { createRateLimiter } from "./rate-limiter" /** Gemini API configuration */ @@ -222,29 +223,31 @@ export async function translateAttributesByFile( concurrency?: number ): Promise> { const results = new Map() - const maxConcurrent = concurrency || Number(process.env.GEMINI_CONCURRENCY) || 16 + const maxConcurrent = + concurrency || Number(process.env.GEMINI_CONCURRENCY) || 16 const limiter = createRateLimiter(maxConcurrent) const 
tasks = Array.from(attributesByFile.entries()).map( - ([filePath, attributes]) => async () => { - await limiter.acquire() - try { - const translated = await translateAttributesWithRetry( - attributes, - targetLanguage, - glossaryTerms - ) - results.set(filePath, translated) - console.log( - `[GEMINI] ✓ Translated ${translated.length} attributes in ${filePath}` - ) - } catch (error) { - console.error(`[GEMINI] ✗ Failed to translate ${filePath}:`, error) - results.set(filePath, []) - } finally { - limiter.release() + ([filePath, attributes]) => + async () => { + await limiter.acquire() + try { + const translated = await translateAttributesWithRetry( + attributes, + targetLanguage, + glossaryTerms + ) + results.set(filePath, translated) + console.log( + `[GEMINI] ✓ Translated ${translated.length} attributes in ${filePath}` + ) + } catch (error) { + console.error(`[GEMINI] ✗ Failed to translate ${filePath}:`, error) + results.set(filePath, []) + } finally { + limiter.release() + } } - } ) await Promise.all(tasks.map((task) => task())) diff --git a/src/scripts/i18n/lib/ai/prompt-builder.ts b/src/scripts/i18n/lib/ai/prompt-builder.ts index eb4775268c9..fdc4af15981 100644 --- a/src/scripts/i18n/lib/ai/prompt-builder.ts +++ b/src/scripts/i18n/lib/ai/prompt-builder.ts @@ -11,9 +11,9 @@ */ import { - type LanguageGroup, getLanguageGroup, getSiteSpecificNotes, + type LanguageGroup, } from "./language-groups" interface PromptOptions { From a04ebdc23a1f150b06f1dae1d7e43827682d8b9b Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:22:02 +0000 Subject: [PATCH 12/23] fix(i18n): tell Gemini to set lang to target code The prompt told Gemini to keep `lang` unchanged, preserving `lang: en` from the English source. Now explicitly instructs it to set the lang field to the target language code. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/lib/ai/prompt-builder.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/ai/prompt-builder.ts b/src/scripts/i18n/lib/ai/prompt-builder.ts index fdc4af15981..418b980725b 100644 --- a/src/scripts/i18n/lib/ai/prompt-builder.ts +++ b/src/scripts/i18n/lib/ai/prompt-builder.ts @@ -41,7 +41,7 @@ export function buildTranslationPrompt(options: PromptOptions): string { const group = getLanguageGroup(targetLanguage) const siteNotes = getSiteSpecificNotes(group) const glossarySection = formatGlossary(glossaryTerms) - const formatRules = getFormatRules(fileType, group) + const formatRules = getFormatRules(fileType, group, targetLanguage) const sanitizerHints = getSanitizerHints() return `Translate this ${fileType} file from English to ${languageName} (${targetLanguage}). @@ -65,7 +65,8 @@ Output ONLY the translated file content. No explanations, no markdown wrapping, function getFormatRules( fileType: "markdown" | "json", - group: LanguageGroup + group: LanguageGroup, + targetLanguage: string ): string { if (fileType === "json") { return `Format rules: @@ -83,7 +84,7 @@ function getFormatRules( : "Transliterate the author field into the target script (phonetic, not semantic). Pseudonyms or GitHub handles (e.g., qbzzt, jdourlens) must stay in Latin." return `Format rules: -- Frontmatter: translate the values of title, description, and breadcrumb. ${authorRule} Keep all other fields (tags, skill, published, lang, sidebarDepth) unchanged. Preserve YAML structure exactly. +- Frontmatter: translate the values of title, description, and breadcrumb. ${authorRule} Change the \`lang\` field to \`${targetLanguage}\`. Keep all other fields (tags, skill, published, sidebarDepth) unchanged. Preserve YAML structure exactly. - Preserve all markdown syntax (headings, lists, links, code blocks) and their indentation exactly. 
- Preserve all JSX/HTML components and their attributes exactly. - Preserve heading anchor IDs exactly as in English ({#anchor-id}). From 08561aa5619878b99c47c2fb262854cf7f0830a4 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:22:09 +0000 Subject: [PATCH 13/23] fix(i18n): sanitizer forces lang to match locale Add fixFrontmatterLang() as a deterministic backup that forces the frontmatter `lang` field to match the locale derived from the file path (public/content/translations/LANG_CODE/**/*.md). - 10 unit tests covering edge cases - Exported via _testOnly for testing Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/post_import_sanitize.ts | 38 +++++++++- tests/unit/sanitizer/standalone-fixes.spec.ts | 74 +++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 23712df3919..f4a56113aa0 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -2054,7 +2054,7 @@ function syncProtectedFrontmatterFields( ): { content: string; fixCount: number } { // Fields that should never be translated - sync from English canonical // Note: 'buttons' array needs special handling (content translatable, toId/isSecondary not) - // Note: 'lang' must NOT be protected - it must remain as target language code + // Note: 'lang' must NOT be protected - it is handled by fixFrontmatterLang() // Note: 'author' is excluded for non-Latin locales -- author names render to readers // and should be transliterated for reading flow const isTranslitLang = locale ? 
/**
 * Force the frontmatter `lang` field to equal the given locale.
 *
 * The caller supplies the locale (per the original patch notes it is derived
 * from the translation file path, public/content/translations/<locale>/...),
 * so the correct value is known deterministically and no heuristics are used.
 *
 * Only the first `lang:` line inside the leading `---` frontmatter block is
 * touched; `lang:`-like text in the document body is never modified.
 *
 * @param content - Full markdown file content, expected to start with `---\n`.
 * @param locale - Target language code (e.g. "ur", "ja", "zh-tw").
 * @returns The (possibly rewritten) content and `fixCount` of 1 when the
 *   `lang` line was rewritten, otherwise the original content and 0 (empty
 *   locale, no frontmatter, no `lang` field, or value already correct).
 */
function fixFrontmatterLang(
  content: string,
  locale: string
): { content: string; fixCount: number } {
  if (!locale) return { content, fixCount: 0 }

  // Anchored at the very start of the file (no `m` flag), so only a leading
  // frontmatter block can match.
  const frontmatterRe = /^---\n([\s\S]*?)\n---/
  const match = content.match(frontmatterRe)
  if (!match) return { content, fixCount: 0 }

  const frontmatter = match[1] ?? ""
  const langRe = /^lang:\s*(.+)$/m
  const langMatch = frontmatter.match(langRe)
  if (!langMatch) return { content, fixCount: 0 }

  const currentLang = (langMatch[1] ?? "").trim()
  if (currentLang === locale) return { content, fixCount: 0 }

  // Replacer *functions* are used on purpose: a replacement *string* would
  // have `$1`, `$&`, `$$`, etc. expanded as substitution patterns, which
  // corrupts frontmatter that legitimately contains `$` (prices, shell vars).
  const fixedFrontmatter = frontmatter.replace(langRe, () => `lang: ${locale}`)
  return {
    content: content.replace(
      frontmatterRe,
      () => `---\n${fixedFrontmatter}\n---`
    ),
    fixCount: 1,
  }
}
@@ -3864,6 +3895,10 @@ function processMarkdownFile( () => quoteFrontmatterNonAscii(content), (n) => `Quoted ${n} frontmatter values with non-ASCII chars` ) + applyFix( + () => fixFrontmatterLang(content, locale), + (n) => `Fixed ${n} frontmatter lang field to match locale "${locale}"` + ) applyFix( () => fixAsciiGuillemets(content), (n) => `Fixed ${n} ASCII guillemets (<< >>) to Unicode (« »)` @@ -4560,6 +4595,7 @@ export const _testOnly = { removeOrphanedClosingTags, normalizeFrontmatterDates, quoteFrontmatterNonAscii, + fixFrontmatterLang, normalizeBlockHtmlLines, fixAsymmetricBackticks, // English-comparison fixes diff --git a/tests/unit/sanitizer/standalone-fixes.spec.ts b/tests/unit/sanitizer/standalone-fixes.spec.ts index dbc6df27835..29eaf0f38b3 100644 --- a/tests/unit/sanitizer/standalone-fixes.spec.ts +++ b/tests/unit/sanitizer/standalone-fixes.spec.ts @@ -49,6 +49,7 @@ const { fixMissingComponentClosingTags, fixMangledDocLinks, fixBrandCapitalization, + fixFrontmatterLang, } = _testOnly test.describe("Standalone Fixes", () => { @@ -2195,4 +2196,77 @@ author: Ori Pomerantz expect(fixCount).toBe(2) }) }) + + test.describe("fixFrontmatterLang", () => { + test("fixes lang: en to target locale", () => { + const input = `---\ntitle: "Test"\nlang: en\n---\n\nBody text` + const { content, fixCount } = fixFrontmatterLang(input, "ur") + expect(content).toBe(`---\ntitle: "Test"\nlang: ur\n---\n\nBody text`) + expect(fixCount).toBe(1) + }) + + test("fixes wrong locale to correct one", () => { + const input = `---\ntitle: "Test"\nlang: ja\n---\n\nBody` + const { content, fixCount } = fixFrontmatterLang(input, "ko") + expect(content).toBe(`---\ntitle: "Test"\nlang: ko\n---\n\nBody`) + expect(fixCount).toBe(1) + }) + + test("leaves correct locale unchanged", () => { + const input = `---\ntitle: "Test"\nlang: ur\n---\n\nBody` + const { content, fixCount } = fixFrontmatterLang(input, "ur") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("returns 
unchanged when no frontmatter", () => { + const input = `# Just a heading\n\nNo frontmatter here` + const { content, fixCount } = fixFrontmatterLang(input, "ur") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("returns unchanged when no lang field in frontmatter", () => { + const input = `---\ntitle: "Test"\nskill: beginner\n---\n\nBody` + const { content, fixCount } = fixFrontmatterLang(input, "ur") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("returns unchanged when locale is empty string", () => { + const input = `---\ntitle: "Test"\nlang: en\n---\n\nBody` + const { content, fixCount } = fixFrontmatterLang(input, "") + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("handles lang with extra whitespace", () => { + const input = `---\ntitle: "Test"\nlang: en \n---\n\nBody` + const { content, fixCount } = fixFrontmatterLang(input, "ar") + expect(content).toContain("lang: ar") + expect(fixCount).toBe(1) + }) + + test("handles quoted lang value", () => { + const input = `---\ntitle: "Test"\nlang: "en"\n---\n\nBody` + const { content, fixCount } = fixFrontmatterLang(input, "hi") + expect(content).toContain("lang: hi") + expect(fixCount).toBe(1) + }) + + test("does not modify lang-like text in body", () => { + const input = `---\ntitle: "Test"\nlang: ur\n---\n\nlang: en appears in body` + const { content, fixCount } = fixFrontmatterLang(input, "ur") + expect(content).toBe(input) + expect(content).toContain("lang: en appears in body") + expect(fixCount).toBe(0) + }) + + test("handles hyphenated locale codes", () => { + const input = `---\ntitle: "Test"\nlang: en\n---\n\nBody` + const { content, fixCount } = fixFrontmatterLang(input, "zh-tw") + expect(content).toContain("lang: zh-tw") + expect(fixCount).toBe(1) + }) + }) }) From 8c1a01a986ef2fb6cba78e8e84443b21c1217126 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Wed, 25 Mar 2026 
21:29:47 +0000 Subject: [PATCH 14/23] fix(i18n): protect code block placeholders from Gemini Gemini was dropping CODE_BLOCK placeholders and hallucinating replacement code from training data, producing wrong language tags and modified code content. - Prompt: tell Gemini placeholders are sacrosanct - Prompt: fallback rules if a real fence slips through - Validation: reject output with missing placeholders - Validation: reject output with hallucinated code fences Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- .../i18n/lib/ai/gemini-output-validation.ts | 27 +++++++++++++++++++ src/scripts/i18n/lib/ai/prompt-builder.ts | 6 ++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini-output-validation.ts b/src/scripts/i18n/lib/ai/gemini-output-validation.ts index 40da280e022..3691722b4c2 100644 --- a/src/scripts/i18n/lib/ai/gemini-output-validation.ts +++ b/src/scripts/i18n/lib/ai/gemini-output-validation.ts @@ -92,6 +92,33 @@ export function validateTranslatedMarkdown( return { valid: false, error: untranslatedFm } } + // Code block placeholders must survive translation intact. + // The pipeline extracts code blocks before sending to Gemini and restores + // them afterward. If Gemini drops or corrupts a placeholder, the code + // block is lost and Gemini may hallucinate replacement code. 
+ const expectedPlaceholders = ( + english.match(//g) || [] + ) + for (const placeholder of expectedPlaceholders) { + if (!translated.includes(placeholder)) { + return { + valid: false, + error: `Missing code block placeholder: ${placeholder}`, + } + } + } + + // Gemini must not introduce code fences -- all code was extracted + if (expectedPlaceholders.length > 0) { + const fenceCount = (translated.match(/^```/gm) || []).length + if (fenceCount > 0) { + return { + valid: false, + error: `Output contains ${fenceCount} code fences but code blocks were extracted -- Gemini is hallucinating code`, + } + } + } + return { valid: true } } diff --git a/src/scripts/i18n/lib/ai/prompt-builder.ts b/src/scripts/i18n/lib/ai/prompt-builder.ts index 418b980725b..432533dc36d 100644 --- a/src/scripts/i18n/lib/ai/prompt-builder.ts +++ b/src/scripts/i18n/lib/ai/prompt-builder.ts @@ -85,11 +85,11 @@ function getFormatRules( return `Format rules: - Frontmatter: translate the values of title, description, and breadcrumb. ${authorRule} Change the \`lang\` field to \`${targetLanguage}\`. Keep all other fields (tags, skill, published, sidebarDepth) unchanged. Preserve YAML structure exactly. -- Preserve all markdown syntax (headings, lists, links, code blocks) and their indentation exactly. +- Preserve all markdown syntax (headings, lists, links) and their indentation exactly. - Preserve all JSX/HTML components and their attributes exactly. - Preserve heading anchor IDs exactly as in English ({#anchor-id}). -- Never translate content inside code fences (\`\`\` blocks). -- Code comments inside code fences may be translated. +- HTML comment placeholders like \`\` are code block stand-ins managed by our pipeline. You MUST preserve them EXACTLY as-is -- same text, same position, same line. Do NOT remove, translate, modify, or replace them with code. They will be restored automatically after translation. 
+- If a true code fence (\`\`\` block) is encountered in the source, never translate the functional code inside it. Only code comments (// or /* */ or #) within fences may be translated. Never change the language identifier after the opening fence (e.g. \`\`\`python, \`\`\`solidity, \`\`\`bash must stay exactly as-is). - Internal links (href starting with /) must match English exactly. - Image paths must match English exactly.` } From e73a3f9cf92e1f4210a02311672cbb57030e25c8 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:18:16 +0000 Subject: [PATCH 15/23] fix(i18n): compare functional code only in fence drift warning warnCodeFenceContentDrift was flagging every code block with translated comments as "differs from English" -- noise that obscured real code corruption. Now strips comments (// /* */ # and docstrings) before comparing, so only functional code differences trigger the warning. Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/post_import_sanitize.ts | 56 ++++++++++++++++++++++-- tests/unit/sanitizer/warnings.spec.ts | 29 ++++++++++-- 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index f4a56113aa0..6b95805fa7e 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -1054,6 +1054,51 @@ function warnPunctuationOnlyHeadings(content: string): string[] { * Code inside fences should never be translated (variable names, keywords, etc.). * Catches issues like `or` → `または` inside code fences. */ +/** + * Strip comments from a code block body so we can compare functional code only. + * Handles JS-family (// and block comments), Python/Vyper (# and docstrings), and shell (#). 
+ */ +function stripCodeComments(body: string, lang: string): string { + const l = lang.toLowerCase().split(/\s+/)[0] + + const isPython = ["python", "py", "vyper", "ruby", "rb"].includes(l) + const isShell = [ + "bash", + "sh", + "shell", + "zsh", + "fish", + "yaml", + "yml", + "toml", + ].includes(l) + const isJs = !isPython && !isShell // default to JS-family + + let result = body + + if (isPython) { + // Remove """ ... """ docstrings (multiline) + result = result.replace(/"""[\s\S]*?"""/g, '""""""') + // Remove # comments (preserve the line structure) + result = result.replace(/#[^\n]*/g, "#") + } else if (isShell) { + result = result.replace(/#[^\n]*/g, "#") + } else if (isJs) { + // Remove /* ... */ block comments (preserve as marker) + result = result.replace(/\/\*[\s\S]*?\*\//g, "/**/") + // Remove // line comments + result = result.replace(/\/\/[^\n]*/g, "//") + } + + // Normalize whitespace for comparison: collapse blank lines, trim each line + return result + .split("\n") + .map((line) => line.trimEnd()) + .join("\n") + .replace(/\n{3,}/g, "\n\n") + .trim() +} + function warnCodeFenceContentDrift( translatedContent: string, englishContent: string @@ -1083,10 +1128,14 @@ function warnCodeFenceContentDrift( } for (let i = 0; i < engFences.length; i++) { - if (engFences[i].body !== transFences[i].body) { - const preview = transFences[i].body.substring(0, 60).replace(/\n/g, "\\n") + const lang = engFences[i].lang || transFences[i].lang + const engStripped = stripCodeComments(engFences[i].body, lang) + const transStripped = stripCodeComments(transFences[i].body, lang) + + if (engStripped !== transStripped) { + const preview = transStripped.substring(0, 60).replace(/\n/g, "\\n") warnings.push( - `Code fence #${i + 1} content differs from English: "${preview}..."` + `Code fence #${i + 1} functional code differs from English: "${preview}..."` ) } } @@ -4648,6 +4697,7 @@ export const _testOnly = { warnTranslatedTechnicalNumerals, warnTranslatedInlineCode, 
warnCodeFenceContentDrift, + stripCodeComments, warnCatastrophicCodeFenceDrift, detectCrossScriptContamination, // Utilities diff --git a/tests/unit/sanitizer/warnings.spec.ts b/tests/unit/sanitizer/warnings.spec.ts index ebe6a2d88bf..bc285f2d33b 100644 --- a/tests/unit/sanitizer/warnings.spec.ts +++ b/tests/unit/sanitizer/warnings.spec.ts @@ -50,12 +50,33 @@ test.describe("Warning Functions", () => { expect(warnings).toHaveLength(0) }) - test("warns when code content was translated", () => { - const english = "```js\nconst x = 1\n```" - const translated = "```js\nconst x = 1\u306E\u5024\n```" + test("no warning when only comments differ (JS //)", () => { + const english = "```js\n// This is a comment\nconst x = 1\n```" + const translated = "```js\n// \u06CC\u06C1 \u0627\u06CC\u06A9 \u062A\u0628\u0635\u0631\u06C1 \u06C1\u06D2\nconst x = 1\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + expect(warnings).toHaveLength(0) + }) + + test("no warning when only comments differ (JS /* */)", () => { + const english = "```solidity\n/* @dev Returns the balance */\nfunction balanceOf() {}\n```" + const translated = "```solidity\n/* @dev \u0628\u06CC\u0644\u0646\u0633 \u0648\u0627\u067E\u0633 \u06A9\u0631\u062A\u0627 \u06C1\u06D2 */\nfunction balanceOf() {}\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + expect(warnings).toHaveLength(0) + }) + + test("no warning when only comments differ (Python #)", () => { + const english = "```python\n# This is a helper\ndef foo():\n pass\n```" + const translated = "```python\n# \u06CC\u06C1 \u0627\u06CC\u06A9 \u06C1\u06CC\u0644\u067E\u0631 \u06C1\u06D2\ndef foo():\n pass\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + expect(warnings).toHaveLength(0) + }) + + test("warns when functional code differs", () => { + const english = "```js\nconst x = node_hash\n```" + const translated = "```js\nconst x = node\n```" const warnings = warnCodeFenceContentDrift(translated, 
english) expect(warnings.length).toBe(1) - expect(warnings[0]).toContain("content differs") + expect(warnings[0]).toContain("functional code differs") }) test("warns on fence count mismatch", () => { From 8c6663b636548b0219eda655a8288103e5e1f03d Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Thu, 26 Mar 2026 05:03:34 +0000 Subject: [PATCH 16/23] patch: ai/language-groups instructions Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/lib/ai/language-groups.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/lib/ai/language-groups.ts b/src/scripts/i18n/lib/ai/language-groups.ts index 44a7edf5ac5..15d39e4ede5 100644 --- a/src/scripts/i18n/lib/ai/language-groups.ts +++ b/src/scripts/i18n/lib/ai/language-groups.ts @@ -66,8 +66,9 @@ Site-specific rules for ethereum.org: switch (group) { case "rtl": return `${common} +- Content inside backticks (\`inline code\`) is already rendered as LTR monospace. Do NOT wrap backtick content in -- this breaks MDX rendering. Use ... ONLY for bare mathematical expressions or bare numeric dates that are NOT already inside backticks. - Wrap bare numeric dates (YYYY-MM-DD, DD/MM/YYYY) in ... to prevent BiDi flipping. -- Wrap mathematical equations with operators in .... +- Wrap mathematical equations with operators in ..., but only when they are NOT inside backticks. - Use Western Arabic numerals (1, 2, 3) for Arabic. Urdu uses native numerals for prose but Western for technical identifiers. - Never convert Gregorian dates to Hijri calendar. 
- The word "state" in blockchain context means computational state, not political state.` From 1b3fabb568f61fab11c96a7c6b8a2551bf1ae57c Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Fri, 27 Mar 2026 20:06:38 +0000 Subject: [PATCH 17/23] feat(i18n): post failed files as PR comment When files fail to translate, post the list as a comment on the newly created PR for easy follow-up tracking. Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/main-gemini.ts | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/scripts/i18n/main-gemini.ts b/src/scripts/i18n/main-gemini.ts index f4af298090d..bd256c0d318 100644 --- a/src/scripts/i18n/main-gemini.ts +++ b/src/scripts/i18n/main-gemini.ts @@ -29,6 +29,7 @@ import { createBranchName, getBranchObject, } from "./lib/github/branches" +import { postPullRequestComment } from "./lib/github/pull-requests" import { geminiInitialize } from "./lib/workflows/gemini-initialize" import { geminiTranslateFiles } from "./lib/workflows/gemini-translate-files" import { runJsxTranslation } from "./lib/workflows/jsx-translation" @@ -144,12 +145,29 @@ async function main() { internalLanguageCode: code, })) - await createTranslationPR( + const pr = await createTranslationPR( branchName, sanitizerInput, allSanitizerChanges, languagePairs ) + + // Post failed files as a PR comment for follow-up + if (failedFiles.length > 0) { + const commentBody = + `Failed files noted for follow-up:\n\n` + + "```\n" + + failedFiles.join("\n") + + "\n```" + try { + await postPullRequestComment(pr.number, commentBody) + console.log(`[main] Posted failed file${failedFiles.length > 1 ? "s" : ""} comment on PR #${pr.number}`) + } catch (error) { + console.warn( + `[main] Could not post failed files comment: ${error instanceof Error ? 
error.message : String(error)}` + ) + } + } } // Cleanup progress manifest on success @@ -160,7 +178,7 @@ async function main() { if (failedFiles.length > 0) { console.warn( - `\n[main] ${failedFiles.length} file(s) could not be translated:` + `\n[main] ${failedFiles.length} file${failedFiles.length > 1 ? "s" : ""} could not be translated:` ) for (const f of failedFiles) { console.warn(` - ${f}`) From 470d076d9227e41aaf9616368e9f405dd162e956 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Sat, 28 Mar 2026 02:58:35 +0000 Subject: [PATCH 18/23] feat(i18n): add JSON batching and HTML extraction Adds batching for large JSON translation files (~100 keys per Gemini request) and pre-translation HTML placeholder extraction/restoration for values with embedded HTML tags. This targets the 8+ language failures on glossary.json (406 keys, 595 HTML tags) and learn-quizzes.json (696 keys). Also updates the translation roadmap with the agreed-upon priority plan for v3 fixes and v4 infrastructure. Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- docs/gemini-translation-roadmap.md | 365 ++++++++++++++++++++ src/scripts/i18n/lib/ai/gemini-translate.ts | 111 +++++- src/scripts/i18n/lib/ai/json-batcher.ts | 290 ++++++++++++++++ src/scripts/i18n/lib/ai/prompt-builder.ts | 14 +- 4 files changed, 773 insertions(+), 7 deletions(-) create mode 100644 docs/gemini-translation-roadmap.md create mode 100644 src/scripts/i18n/lib/ai/json-batcher.ts diff --git a/docs/gemini-translation-roadmap.md b/docs/gemini-translation-roadmap.md new file mode 100644 index 00000000000..a6f2ea17116 --- /dev/null +++ b/docs/gemini-translation-roadmap.md @@ -0,0 +1,365 @@ +# Gemini Translation Pipeline -- Roadmap + +Status: Active plan +Last updated: 2026-03-27 + +--- + +## Current state + +The initial full-repo translation pass is ~97-99% complete across 24 non-English +languages. 
The Gemini translation pipeline (`gemini-translations.yml`) works well +for full-file translation but has limitations as we shift to ongoing maintenance. + +### What works today + +- Full-file translation with glossary enforcement +- Code block extraction/restoration (`` placeholders) +- Comment translation within code blocks +- Incremental commit per language (no work lost on partial failure) +- Progress tracking and run resumption +- Post-import sanitization and transliteration +- Configurable concurrency, include/exclude paths, per-language targeting +- 100% custom header ID coverage (`{#custom-id}`) across all markdown files, + preserved identically in translations (verified 2026-03-27) + +### Gaps (being addressed) + +1. ~42 file/language pairs failed during the initial pass (see "Failed files") +2. No drift detection (no way to know which translations are stale) +3. No incremental translation (every run retranslates from scratch) +4. Manual triggering (no automation for ongoing maintenance) +5. Limited error logging from the `@google/gen-ai` SDK +6. Some existing translations done with Gemini 2.5 Pro before current sanitizer + improvements, transliteration banks, and glossary enhancements + +### Cost context + +- Initial full-repo pass: ~$1,500 (via Crowdin + Gemini 2.5 Pro) +- Current pipeline (direct Gemini, bypassing Crowdin): ~80% cheaper +- Estimated full sweep with current pipeline: ~$300-500 +- Gemini Pro pricing (approximate): + - Input: ~$1.25 / 1M tokens + - Output: ~$10.00 / 1M tokens (output dominates cost) + +--- + +## Priority 1: Fix failed files (branch: `gemini-v3`) + +Close the initial pass from ~97-99% to ~100%. This is the most urgent work item. + +### Failed file inventory + +42 file/language pairs failed. 
Full list: + +``` +ar: glossary.json +bn: json-rpc/index.md, ethash/index.md, ethereum-forks/index.md, + fusaka/peerdas/index.md, glossary.json, learn-quizzes.json, + page-resources.json, page-trillion-dollar-security.json +de: ethereum-forks/index.md, whitepaper/index.md, glossary.json, + learn-quizzes.json +id: nodes-and-clients/index.md, glamsterdam/index.md, merge/index.md, + glossary.json, learn-quizzes.json +it: hello-world-smart-contract-fullstack/index.md, glossary.json, + learn-quizzes.json +sw: glossary.json +ta: translatathon/index.md, json-rpc/index.md, poa/index.md, + pos-vs-pow/index.md, ethash/index.md, web2-vs-web3/index.md, + fusaka/peerdas/index.md, glamsterdam/index.md, glossary.json, + learn-quizzes.json +ur: json-rpc/index.md, dagger-hashimoto/index.md, ethash/index.md, + dapps/index.md, secret-state/index.md, ethereum-forks/index.md, + fusaka/peerdas/index.md, pectra/maxeb/index.md, glossary.json, + page-what-is-the-ethereum-network.json +``` + +### Failure pattern analysis + +| Root cause | Files affected | Details | +|------------------------|----------------|--------------------------------------------| +| Token overload (>15k) | ~7 | whitepaper (90KB), json-rpc (75KB), etc. | +| Code block density | ~5 | json-rpc (172 blocks), hello-world (128) | +| Table/component density| ~5 | ethereum-forks (60 JSX + 33 tables) | +| JSON with embedded HTML| ~3 | glossary.json (317 anchors, 540 escapes) | + +**Repeat offenders:** +- `glossary.json` -- fails for 8 languages (ar, bn, de, id, it, sw, ta, ur) +- `learn-quizzes.json` -- fails for 5 languages (bn, de, id, ta, ur) + +**Languages with most failures:** Tamil (10), Urdu (10), Bengali (8) + +### Fixes to implement on `gemini-v3` + +#### A. Markdown: header ID-based chunking + +Replace token-count-based chunking (which failed) with structure-aware chunking +using the `{#custom-id}` header anchors. 
+ +- Split at heading boundaries, grouping sections up to a token budget per chunk +- Each chunk carries its header IDs for deterministic reassembly +- Header IDs are 100% consistent across the repo and preserved in translations +- Intro content before the first heading gets a synthetic `_intro` key + +#### B. JSON: namespace batching with HTML placeholder pre-parsing + +Two improvements for large/complex JSON files: + +1. **Batch by top-level keys**: Send ~100 key-value pairs per request (with a + ~20 key buffer to avoid wasteful tiny final batches -- e.g., a file with 110 + keys sends one batch of 110, not 100 + 10) + +2. **HTML placeholder pre-parsing**: Before translation, replace embedded HTML + in JSON values with numbered placeholders (similar to Crowdin's `<0>` + pattern but more descriptive). Restore after translation. + + ``` + Before: "A DAO is..." + After: "A DAO is..." + (with restoration map stored separately) + ``` + + Validation: after restoration, verify all placeholders were preserved. Flag + chunks with missing/duplicated placeholders for retry. + +#### C. Code fence extraction audit + +The `` extraction works on successful files. Investigate why +it fails on code-dense files: +- Run the extractor in isolation on failing files, inspect output +- Check for edge cases: nested fences, non-standard fence syntax, very high + placeholder counts (>100 per file) +- May be interaction between chunking failure + code blocks (if chunking fails, + the entire code-heavy file hits Gemini as one blob) + +#### D. Error logging improvements + +Add structured error logging from the `@google/gen-ai` SDK: +- Capture failure reason, response status, partial output if available +- Log per-file/per-language so failures can be triaged without re-running +- Distinguish error types: rate limit vs. content filter vs. malformed output + vs. timeout (each needs different retry strategy) + +#### E. 
Validation + +- Retranslate the ~42 failed file/language pairs as the test case +- Compare output quality against successfully translated files of similar size + +--- + +## Priority 2: Section hash manifest (branch: `gemini-v4`) + +Build per-section content hashing infrastructure. This is the foundation for both +drift detection and incremental translation. + +### Markdown: header ID-keyed section hashes + +Parse each English markdown file into a tree of sections keyed by `{#custom-id}`. +Hash each section's content. Structure: + +```json +{ + "public/content/roadmap/index.md": { + "fileHash": "abc123", + "sections": { + "_intro": "def456", + "what-is-the-roadmap": "ghi789", + "why-does-ethereum-need-a-roadmap": "jkl012", + ... + } + } +} +``` + +**Possible future optimization**: merkle trie structure where leaf hashes bubble +up to parent sections. Allows O(1) "has anything changed?" checks at the file +level, with drill-down to find exactly which sections changed. Worth considering +once the flat hash map is working, if performance demands it. + +### JSON: key-level hashes + +For JSON files, hash individual key-value pairs (or namespace groups for deeply +nested files). Structure: + +```json +{ + "src/intl/en/glossary.json": { + "fileHash": "mno345", + "keys": { + "account": "pqr678", + "address": "stu901", + ... + } + } +} +``` + +### Storage: manifest file + +**Decision**: Use a manifest file (`src/intl/translation-manifest.json`). + +- Single file, easy to query, no content file pollution +- Works for both JSON and markdown +- Can include metadata: timestamp, pipeline version, token cost, Gemini model +- Trade-off: potential merge conflicts if multiple translation PRs run + simultaneously (mitigated by per-language PRs or lock-step merging) + +--- + +## Priority 3: Baseline sweep + quality refresh + +**Decision**: "Stamp now" approach (Option B from brainstorming). + +One combined operation (~$300-500) that accomplishes two goals simultaneously: + +1. 
**Establish baseline**: Record current English source SHAs in the manifest + for every file/language pair. Going forward, drift is detectable by comparing + recorded SHA against current English SHA. + +2. **Quality refresh**: Retranslate everything using current best pipeline: + - Gemini 3.1 Pro (upgraded from 2.5 Pro used in original pass) + - Current sanitizer with all accumulated fixes + - Transliteration banks for non-Latin script languages + - Improved translation glossary (in development separately) + +After this sweep, every translation in the repo is (a) generated by the best +available pipeline and (b) tracked in the manifest with a known English source +SHA. This is the clean foundation for incremental work going forward. + +### Prerequisite: glossary and transliteration improvements + +The quality refresh is most valuable after: +- Translation glossary expansion is complete (in flight) +- Transliteration bank coverage is solid for non-Latin scripts +- All Priority 1 fixes are deployed (so zero files fail) + +### Approach alternatives considered + +**Option A (rejected): Git history bootstrap** -- Analyze commit messages +(pattern: `i18n(pl): Crowdin translations`) to determine when each file was +last truly translated. Feasible since commits are programmatic, but complicated +by cleanup commits that are more recent than actual translation timestamps. + +**Option B (selected): Stamp now, sweep forward** -- Accept that current +translations have unknown-precision freshness. Do one full sweep with current +pipeline, stamping SHAs as we go. After this, the manifest is authoritative. + +**Option C (rejected): Hybrid git + LLM spot-check** -- Use git where clear, +LLM where ambiguous. More accurate bootstrap but more complexity for marginal +benefit given we want a quality refresh anyway. + +--- + +## Priority 4: Incremental translation (branch: `gemini-v4`) + +Once the manifest exists with per-section hashes, incremental translation +becomes straightforward. 
+ +### JSON: key-level diff and translate + +1. Deep-diff current English JSON against the manifest's recorded English version +2. Collect added and changed key paths +3. Send only those key-value pairs to Gemini for translation +4. Deep-merge translated pairs into existing translation JSON +5. Update manifest with new SHAs +6. Run sanitizer on the merged file + +**Complexity**: Low. JSON key merging is deterministic and safe. + +### Markdown: section-level diff and translate + +1. Parse current English file into sections keyed by `{#header-id}` +2. Compare section hashes against manifest +3. For each changed section: + a. Extract corresponding section from existing translation + b. Send to Gemini: English section + existing translation + context + c. Receive translated section +4. Reassemble: unchanged sections from existing translation + new translations +5. Update manifest with new SHAs +6. Run sanitizer on reassembled file + +**Complexity**: Medium. The 100% header ID coverage makes this much more +feasible than initially estimated. Splicing by ID is deterministic. Edge case: +intro content before first heading (use synthetic `_intro` key). + +**Fallback**: If >50% of sections changed, fall back to full-file retranslation +(the incremental overhead isn't worth it at that point). + +### "Previous English version" question (resolved) + +The manifest's recorded SHA identifies the previous English version. When a translation +is generated, the manifest records the English source SHA. On the next +incremental run, diff current English against the content at that SHA to identify what changed.
+ +--- + +## Priority 5: Automation (branch: `gemini-v4`) + +### End-state vision + +``` +English content merged to dev + | + v +Drift detection scan (automatic or cron) + | + v +Stale file list (per language) + | + v +Batching logic (group by language, thresholds, cooldown) + | + v +Incremental translation dispatch (Gemini 3.1 Pro) + | + v +Sanitizer + transliteration + review agents + | + v +PR(s) created, ready for human merge +``` + +### Graduation plan + +**Phase 1 (near-term): Manual + tooling** +- Drift scan script runs manually or on cron, outputs report +- Human reviews report and manually dispatches translation +- Existing sanitizer + review pipeline handles quality + +**Phase 2 (mid-term): Semi-automated** +- Nightly/weekly cron runs drift scan +- When stale count exceeds threshold, auto-dispatches translation +- Human merges resulting PRs + +**Phase 3 (long-term): Full automation** +- Push to dev triggers path-filtered action (`public/content/`, `src/intl/en/`) +- Batching logic groups changes (cooldown window during active dev) +- Translation -> sanitizer -> review agents -> PR ready for human merge +- Cron job as safety net catches anything the push trigger missed +- Human stays in the loop at the merge step + +### Batching considerations + +- One PR per language per run (clearest for review) +- Skip whitespace-only or comment-only changes +- Cooldown: don't retranslate files translated in the last N hours +- Size cap: if >50 files stale, split into multiple runs or prioritize by traffic + +--- + +## Branch strategy + +- **`gemini-v3`**: Priority 1 (fix failed files). Patches to the existing + pipeline: chunking, batching, HTML placeholders, error logging. +- **`gemini-v4`**: Priorities 2-5 (new infrastructure). Manifest, drift + detection, incremental translation, automation. 
+ +--- + +## Related workstreams (tracked elsewhere) + +- **Translation glossary expansion** -- in flight, separate task +- **Transliteration bank improvements** -- ongoing per non-Latin locale +- **Full-language retroactive cleanup** -- see `src/scripts/i18n/FUTURE.md` #9 +- **Lowercase ethereum initiative** -- content standardization, tracked in + `docs/lowercase-ethereum-plan.md` diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts index 0f8e51c883a..3347173b819 100644 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ b/src/scripts/i18n/lib/ai/gemini-translate.ts @@ -26,6 +26,11 @@ import { validateTranslatedMarkdown, type ValidationResult, } from "./gemini-output-validation" +import { + mergeJsonBatches, + prepareJsonBatches, + restoreJsonBatch, +} from "./json-batcher" import { buildTranslationPrompt } from "./prompt-builder" const GEMINI_MODELS = ["gemini-3.1-pro-preview", "gemini-3.1-pro"] @@ -53,6 +58,8 @@ export interface TranslateFileOptions { fileType: "markdown" | "json" targetLanguage: string glossaryTerms: Map + /** Set by JSON batching when HTML tags have been extracted to placeholders */ + htmlExtracted?: boolean } export interface TranslateFileResult { @@ -88,9 +95,9 @@ export async function translateFile( const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } = options - // JSON files: translate directly, no extraction needed + // JSON files: batch large files, extract HTML from values if (fileType === "json") { - return callGemini({ ...options, fileContent }, { filePath, targetLanguage }) + return translateJsonFile(options) } // Markdown: extract code blocks first @@ -157,6 +164,95 @@ export async function translateFile( } } +/** + * Translate a JSON file with batching and HTML placeholder extraction. + * + * 1. Parse and split into ~100-key batches (if large) + * 2. Extract HTML tags from values into numbered placeholders + * 3. Translate each batch via Gemini + * 4. 
Restore HTML tags from placeholders + * 5. Merge batches and validate against full English source + */ +async function translateJsonFile( + options: TranslateFileOptions +): Promise { + const { filePath, fileContent, targetLanguage } = options + + const prepared = prepareJsonBatches(fileContent) + const totalTokens = { input: 0, output: 0 } + + if (prepared.batchContents.length > 1) { + console.log( + ` [json-batch] ${filePath}: ${prepared.totalKeys} keys -> ${prepared.batchContents.length} batches (${prepared.batchSizes.join(", ")})` + ) + } + if (prepared.htmlExtracted) { + console.log(` [html-extract] ${filePath}: HTML tags replaced with placeholders`) + } + + const translatedBatches: string[] = [] + + for (let i = 0; i < prepared.batchContents.length; i++) { + const batchContent = prepared.batchContents[i] + const isMultiBatch = prepared.batchContents.length > 1 + + // Translate this batch (callGemini handles retries and validation) + const result = await callGemini( + { + ...options, + fileContent: batchContent, + htmlExtracted: prepared.htmlExtracted, + }, + { + filePath, + targetLanguage, + chunkIndex: isMultiBatch ? i : undefined, + totalChunks: isMultiBatch ? prepared.batchContents.length : undefined, + label: isMultiBatch ? "json-batch" : undefined, + } + ) + + totalTokens.input += result.tokensUsed.input + totalTokens.output += result.tokensUsed.output + + // Restore HTML placeholders in translated output + const placeholderMap = prepared.placeholderMaps[i] + if (placeholderMap.size > 0) { + const { content, failures } = restoreJsonBatch( + result.translatedContent, + placeholderMap + ) + if (failures.length > 0) { + console.warn( + ` [html-restore] ${filePath}${isMultiBatch ? 
` batch ${i + 1}` : ""}: ${failures.length} placeholder(s) missing:\n` + + failures.map((f) => ` - ${f}`).join("\n") + ) + } + translatedBatches.push(content) + } else { + translatedBatches.push(result.translatedContent) + } + } + + // Merge batches into final JSON + const finalContent = mergeJsonBatches(translatedBatches) + + // Final validation: merged result against original English + if (prepared.batchContents.length > 1) { + const validation = validateTranslatedJson(finalContent, fileContent) + if (!validation.valid) { + console.warn( + ` [json-batch] ${filePath}: merged validation warning: ${validation.error}` + ) + } + } + + return { + translatedContent: finalContent, + tokensUsed: totalTokens, + } +} + /** * Extract comments from all code blocks, translate them in a single * Gemini call, and restore them into the final content. @@ -264,8 +360,14 @@ async function callGemini( options: TranslateFileOptions, metadata?: GeminiCallMetadata ): Promise { - const { filePath, fileContent, fileType, targetLanguage, glossaryTerms } = - options + const { + filePath, + fileContent, + fileType, + targetLanguage, + glossaryTerms, + htmlExtracted, + } = options const languageName = LANGUAGE_NAMES[targetLanguage] || targetLanguage const prompt = buildTranslationPrompt({ @@ -275,6 +377,7 @@ async function callGemini( targetLanguage, languageName, glossaryTerms, + htmlExtracted, }) // Retry loop for validation failures (API call retries are in callGeminiRaw) diff --git a/src/scripts/i18n/lib/ai/json-batcher.ts b/src/scripts/i18n/lib/ai/json-batcher.ts new file mode 100644 index 00000000000..cccb2b6aaa0 --- /dev/null +++ b/src/scripts/i18n/lib/ai/json-batcher.ts @@ -0,0 +1,290 @@ +/** + * JSON batching and HTML placeholder extraction for large JSON translation. + * + * For large JSON files (>120 top-level keys), splits into ~100-key batches + * to stay within Gemini's reliable output range. + * + * For JSON values containing embedded HTML (, ,
, etc.), + * extracts tags to numbered placeholders before translation and restores + * them after. This prevents Gemini from mangling or dropping HTML structure. + */ + +type JsonValue = + | string + | number + | boolean + | null + | JsonValue[] + | { [key: string]: JsonValue } + +interface PlaceholderEntry { + placeholder: string + original: string +} + +/** Map of JSON path -> placeholder entries for that value */ +export type PlaceholderMap = Map + +export interface PreparedJsonBatches { + /** JSON strings with HTML extracted, ready for Gemini */ + batchContents: string[] + /** Per-batch placeholder maps for HTML restoration */ + placeholderMaps: PlaceholderMap[] + /** Whether any HTML was actually extracted */ + htmlExtracted: boolean + /** Total top-level key count */ + totalKeys: number + /** Key count per batch (for logging) */ + batchSizes: number[] +} + +/** Keys per Gemini request */ +const BATCH_SIZE = 100 +/** Avoid tiny final batches -- absorb up to this many extra keys */ +const BATCH_BUFFER = 20 + +/** HTML tag pattern: opening, closing, and self-closing tags */ +const HTML_TAG_RE = /<\/?[a-zA-Z][^>]*\/?>/g + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/** + * Prepare a JSON file for batched translation. + * + * 1. Parses the JSON + * 2. Splits top-level keys into batches (~100 per batch) + * 3. Extracts HTML tags from string values, replacing with placeholders + * 4. 
Returns batch contents ready for Gemini + restoration maps + */ +export function prepareJsonBatches(jsonContent: string): PreparedJsonBatches { + const parsed = JSON.parse(jsonContent) as Record + const keys = Object.keys(parsed) + const keyBatches = splitIntoBatches(keys, BATCH_SIZE, BATCH_BUFFER) + + let htmlExtracted = false + const batchContents: string[] = [] + const placeholderMaps: PlaceholderMap[] = [] + const batchSizes: number[] = [] + + for (const batchKeys of keyBatches) { + // Build sub-object for this batch + const batchObj: Record = {} + for (const key of batchKeys) { + batchObj[key] = parsed[key] + } + + // Extract HTML from string values + const placeholderMap: PlaceholderMap = new Map() + const sanitized = extractHtmlFromObject(batchObj, placeholderMap) + + if (placeholderMap.size > 0) htmlExtracted = true + + batchContents.push(JSON.stringify(sanitized, null, 2)) + placeholderMaps.push(placeholderMap) + batchSizes.push(batchKeys.length) + } + + return { + batchContents, + placeholderMaps, + htmlExtracted, + totalKeys: keys.length, + batchSizes, + } +} + +/** + * Restore HTML tags in a translated JSON batch from its placeholder map. + * + * Returns the restored JSON string and a list of any placeholder failures + * (missing placeholders that could not be restored). + */ +export function restoreJsonBatch( + translatedJson: string, + placeholderMap: PlaceholderMap +): { content: string; failures: string[] } { + if (placeholderMap.size === 0) { + return { content: translatedJson, failures: [] } + } + + const parsed = JSON.parse(translatedJson) as Record + const failures: string[] = [] + const restored = restoreHtmlInObject(parsed, placeholderMap, "", failures) + return { + content: JSON.stringify(restored, null, 2), + failures, + } +} + +/** + * Merge multiple translated JSON batch strings into a single JSON string. + * Preserves key order from the original batches. 
+ */ +export function mergeJsonBatches(batchContents: string[]): string { + if (batchContents.length === 1) return batchContents[0] + + const merged: Record = {} + for (const content of batchContents) { + const parsed = JSON.parse(content) as Record + Object.assign(merged, parsed) + } + return JSON.stringify(merged, null, 2) +} + +/** + * Check whether a JSON file needs batching (has more keys than the threshold). + */ +export function needsBatching(jsonContent: string): boolean { + const parsed = JSON.parse(jsonContent) as Record + return Object.keys(parsed).length > BATCH_SIZE + BATCH_BUFFER +} + +// --------------------------------------------------------------------------- +// Batching +// --------------------------------------------------------------------------- + +function splitIntoBatches( + keys: string[], + size: number, + buffer: number +): string[][] { + if (keys.length <= size + buffer) return [keys] + + const batches: string[][] = [] + for (let i = 0; i < keys.length; i += size) { + const remaining = keys.length - i + // If remaining fits in one more batch (with buffer), take it all + if (remaining <= size + buffer) { + batches.push(keys.slice(i)) + break + } + batches.push(keys.slice(i, i + size)) + } + return batches +} + +// --------------------------------------------------------------------------- +// HTML extraction (pre-translation) +// --------------------------------------------------------------------------- + +function extractHtmlFromObject( + obj: Record, + map: PlaceholderMap, + prefix = "" +): Record { + const result: Record = {} + for (const [key, value] of Object.entries(obj)) { + const path = prefix ? 
`${prefix}.${key}` : key + result[key] = extractHtmlFromValue(value, map, path) + } + return result +} + +function extractHtmlFromValue( + value: JsonValue, + map: PlaceholderMap, + path: string +): JsonValue { + if (typeof value === "string") { + return extractHtmlFromString(value, map, path) + } + if (Array.isArray(value)) { + return value.map((item, i) => + extractHtmlFromValue(item, map, `${path}[${i}]`) + ) + } + if (value !== null && typeof value === "object") { + return extractHtmlFromObject( + value as Record, + map, + path + ) + } + return value +} + +function extractHtmlFromString( + text: string, + map: PlaceholderMap, + path: string +): string { + const entries: PlaceholderEntry[] = [] + let counter = 0 + + const result = text.replace(HTML_TAG_RE, (match) => { + const placeholder = `` + entries.push({ placeholder, original: match }) + counter++ + return placeholder + }) + + if (entries.length > 0) { + map.set(path, entries) + } + return result +} + +// --------------------------------------------------------------------------- +// HTML restoration (post-translation) +// --------------------------------------------------------------------------- + +function restoreHtmlInObject( + obj: Record, + map: PlaceholderMap, + prefix: string, + failures: string[] +): Record { + const result: Record = {} + for (const [key, value] of Object.entries(obj)) { + const path = prefix ? 
`${prefix}.${key}` : key + result[key] = restoreHtmlInValue(value, map, path, failures) + } + return result +} + +function restoreHtmlInValue( + value: JsonValue, + map: PlaceholderMap, + path: string, + failures: string[] +): JsonValue { + if (typeof value === "string") { + return restoreHtmlInString(value, map, path, failures) + } + if (Array.isArray(value)) { + return value.map((item, i) => + restoreHtmlInValue(item, map, `${path}[${i}]`, failures) + ) + } + if (value !== null && typeof value === "object") { + return restoreHtmlInObject( + value as Record, + map, + path, + failures + ) + } + return value +} + +function restoreHtmlInString( + text: string, + map: PlaceholderMap, + path: string, + failures: string[] +): string { + const entries = map.get(path) + if (!entries) return text + + let result = text + for (const { placeholder, original } of entries) { + if (!result.includes(placeholder)) { + failures.push(`${path}: missing ${placeholder} (was: ${original})`) + continue + } + result = result.replace(placeholder, original) + } + return result +} diff --git a/src/scripts/i18n/lib/ai/prompt-builder.ts b/src/scripts/i18n/lib/ai/prompt-builder.ts index 432533dc36d..153e866c98a 100644 --- a/src/scripts/i18n/lib/ai/prompt-builder.ts +++ b/src/scripts/i18n/lib/ai/prompt-builder.ts @@ -23,6 +23,8 @@ interface PromptOptions { targetLanguage: string languageName: string glossaryTerms: Map + /** When true, HTML tags have been replaced with placeholders */ + htmlExtracted?: boolean } /** @@ -36,12 +38,13 @@ export function buildTranslationPrompt(options: PromptOptions): string { targetLanguage, languageName, glossaryTerms, + htmlExtracted, } = options const group = getLanguageGroup(targetLanguage) const siteNotes = getSiteSpecificNotes(group) const glossarySection = formatGlossary(glossaryTerms) - const formatRules = getFormatRules(fileType, group, targetLanguage) + const formatRules = getFormatRules(fileType, group, targetLanguage, htmlExtracted) const sanitizerHints 
= getSanitizerHints() return `Translate this ${fileType} file from English to ${languageName} (${targetLanguage}). @@ -66,13 +69,18 @@ Output ONLY the translated file content. No explanations, no markdown wrapping, function getFormatRules( fileType: "markdown" | "json", group: LanguageGroup, - targetLanguage: string + targetLanguage: string, + htmlExtracted?: boolean ): string { if (fileType === "json") { + const htmlRule = htmlExtracted + ? `- \`\` placeholders are stand-ins for HTML tags managed by our pipeline. You MUST preserve them EXACTLY as-is -- same text, same position, same numbering. Do NOT remove, translate, modify, or renumber them. They will be restored to HTML tags automatically after translation.` + : `- Preserve HTML tags within values exactly (
, , etc.).` + return `Format rules: - Output valid JSON with identical key structure. - Translate only string values. Never translate keys. -- Preserve HTML tags within values exactly (, , etc.). +${htmlRule} - Preserve interpolation variables exactly ({count}, {{name}}, etc.). - Internal href paths (/developers/docs/...) must stay in English.` } From 574ffd7d7fca0ff8f6e222af6a91f7a2203b52f9 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Sat, 28 Mar 2026 04:25:55 +0000 Subject: [PATCH 19/23] feat(i18n): add safety settings and response diagnostics Adds BLOCK_NONE safety settings for all harm categories to prevent Gemini from silently returning empty responses for educational blockchain content (mining, attacks, etc.). Inspects response candidates, finishReason, and safetyRatings before accessing response.text, logging detailed diagnostics when non-STOP finish reasons are detected. Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/lib/ai/gemini-translate.ts | 64 ++++++++++++++++++++- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts index 3347173b819..c8eb8d5a73d 100644 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ b/src/scripts/i18n/lib/ai/gemini-translate.ts @@ -37,6 +37,19 @@ const GEMINI_MODELS = ["gemini-3.1-pro-preview", "gemini-3.1-pro"] const MAX_RETRIES = 3 const RETRY_DELAY_MS = 5000 +/** + * Disable safety filters for all categories. Translation content (educational + * blockchain docs) should never be blocked. Without this, Gemini silently + * returns empty candidates for content that triggers false positives (e.g., + * mining/attack descriptions in certain non-Latin languages). 
+ */ +const SAFETY_SETTINGS = [ + { category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" }, + { category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" }, + { category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "BLOCK_NONE" }, + { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" }, +] + const LANGUAGE_NAMES: Record = Object.fromEntries( i18nConfig.map(({ code, name }: { code: string; name: string }) => [ code, @@ -484,17 +497,62 @@ async function callGeminiRaw( const response = await client.models.generateContent({ model: modelId, contents: prompt, - config: { temperature: 0 }, + config: { temperature: 0, safetySettings: SAFETY_SETTINGS }, }) const usage = response.usageMetadata const duration = ((Date.now() - startTime) / 1000).toFixed(1) + // Inspect response for non-obvious failure modes before accessing .text + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const candidate = (response as any).candidates?.[0] + const finishReason: string | undefined = candidate?.finishReason + + // Log non-STOP finish reasons (these explain silent failures) + if (finishReason && finishReason !== "STOP") { + const safetyInfo = candidate?.safetyRatings + ?.map( + (r: { category?: string; probability?: string }) => + `${r.category}=${r.probability}` + ) + .join(", ") + console.warn( + `[${ts()}] [gemini] FINISH_REASON model=${modelId} ${ctx} ` + + `duration=${duration}s reason=${finishReason}` + + (safetyInfo ? 
` safety=[${safetyInfo}]` : "") + ) + } + + // Check prompt-level blocking + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const blockReason = (response as any).promptFeedback?.blockReason + if (blockReason) { + console.warn( + `[${ts()}] [gemini] PROMPT_BLOCKED model=${modelId} ${ctx} ` + + `duration=${duration}s reason=${blockReason}` + ) + } + console.log( - `[${ts()}] [gemini] RESPONSE model=${modelId} ${ctx} duration=${duration}s tokens_in=${usage?.promptTokenCount || 0} tokens_out=${usage?.candidatesTokenCount || 0}` + `[${ts()}] [gemini] RESPONSE model=${modelId} ${ctx} ` + + `duration=${duration}s ` + + `tokens_in=${usage?.promptTokenCount || 0} ` + + `tokens_out=${usage?.candidatesTokenCount || 0}` + + (finishReason && finishReason !== "STOP" + ? ` finishReason=${finishReason}` + : "") ) + // Access .text -- may be empty/undefined if blocked + const text = response.text ?? "" + if (!text && finishReason && finishReason !== "STOP") { + throw new Error( + `Gemini returned no content (finishReason=${finishReason}). ` + + `This file/language combination may be triggering content filters.` + ) + } + return { - text: response.text ?? "", + text, tokensUsed: { input: usage?.promptTokenCount || 0, output: usage?.candidatesTokenCount || 0, From 4e829add98e787a2e5b195a692b7c3c2a2b64495 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Sat, 28 Mar 2026 04:35:13 +0000 Subject: [PATCH 20/23] fix(i18n): use SDK enum types for safety settings Use HarmCategory and HarmBlockThreshold enums from @google/genai instead of plain strings. Fixes TS2322 type error in CI where the SDK types are available. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/lib/ai/gemini-translate.ts | 22 ++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini-translate.ts b/src/scripts/i18n/lib/ai/gemini-translate.ts index c8eb8d5a73d..ed9410d26f5 100644 --- a/src/scripts/i18n/lib/ai/gemini-translate.ts +++ b/src/scripts/i18n/lib/ai/gemini-translate.ts @@ -5,7 +5,7 @@ * Gemini handles the linguistics; we handle the guardrails. */ -import { GoogleGenAI } from "@google/genai" +import { GoogleGenAI, HarmBlockThreshold,HarmCategory } from "@google/genai" import i18nConfig from "../../../../../i18n.config.json" import { delay } from "../workflows/utils" @@ -44,10 +44,22 @@ const RETRY_DELAY_MS = 5000 * mining/attack descriptions in certain non-Latin languages). */ const SAFETY_SETTINGS = [ - { category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" }, - { category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" }, - { category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "BLOCK_NONE" }, - { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" }, + { + category: HarmCategory.HARM_CATEGORY_HARASSMENT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, ] const LANGUAGE_NAMES: Record = Object.fromEntries( From e4a9d66c6cbf379d90511dcfd867a4d3102ff3d0 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Sat, 28 Mar 2026 05:39:07 +0000 Subject: [PATCH 21/23] fix(i18n): use relative paths for sanitizer commits The sanitizer was pushing absolute filesystem 
paths into changedFiles, causing GitHub tree API to reject them with "tree.path cannot start with a slash". Uses path.relative() to convert to repo-relative paths, matching the pattern already used for logging in the same file. Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/post_import_sanitize.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 6b95805fa7e..d14908cb8d9 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -4488,7 +4488,7 @@ export async function runSanitizer( } const changedFiles = [...mdChanged, ...jsonChanged].map((f) => ({ - path: f.path, + path: path.relative(ROOT, f.path), content: f.content, })) return { From f6b9b8177626b6f5b6b702e75c1950af7c775074 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Sat, 28 Mar 2026 06:36:54 +0000 Subject: [PATCH 22/23] fix(i18n): only fail when both title and desc untranslated Technical titles like "Ethash", "JSON-RPC API", "PeerDAS" are legitimately kept in English. The previous check failed if either title or description matched English. Now only fails when BOTH are identical, catching genuinely untranslated output while allowing technical/proper-noun titles. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- .../i18n/lib/ai/gemini-output-validation.ts | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini-output-validation.ts b/src/scripts/i18n/lib/ai/gemini-output-validation.ts index 3691722b4c2..b5dd272f7e0 100644 --- a/src/scripts/i18n/lib/ai/gemini-output-validation.ts +++ b/src/scripts/i18n/lib/ai/gemini-output-validation.ts @@ -193,18 +193,29 @@ function extractFrontmatterField( /** * Check that key frontmatter fields (title, description) were actually * translated and not left identical to the English source. + * + * Only fails if BOTH title and description are identical to English. + * Technical titles (e.g., "Ethash", "JSON-RPC API", "PeerDAS") are + * legitimately kept in English, so a matching title alone is not a + * failure -- as long as the description was translated. + * * Returns an error string if untranslated, or undefined if OK. 
*/ function checkFrontmatterTranslated( translated: string, english: string ): string | undefined { - for (const field of ["title", "description"]) { - const enValue = extractFrontmatterField(english, field) - const trValue = extractFrontmatterField(translated, field) - if (enValue && trValue && enValue === trValue) { - return `Frontmatter "${field}" was not translated (identical to English)` - } + const enTitle = extractFrontmatterField(english, "title") + const trTitle = extractFrontmatterField(translated, "title") + const titleMatch = enTitle && trTitle && enTitle === trTitle + + const enDesc = extractFrontmatterField(english, "description") + const trDesc = extractFrontmatterField(translated, "description") + const descMatch = enDesc && trDesc && enDesc === trDesc + + if (titleMatch && descMatch) { + return `Frontmatter "title" and "description" were both not translated (identical to English)` } + return undefined } From a2ee40623d6efe1276fa8fdbccf2f6ec3e09aa53 Mon Sep 17 00:00:00 2001 From: myelinated-wackerow <263208946+myelinated-wackerow@users.noreply.github.com> Date: Sat, 28 Mar 2026 17:02:48 +0000 Subject: [PATCH 23/23] fix(i18n): use relative paths for JSX attribute commits Same slash bug as the sanitizer -- absolute filesystem paths passed to GitHub tree API. Applies path.relative() at the commit point in jsx-translation.ts. 
Co-Authored-By: Claude Opus 4.6 Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com> --- src/scripts/i18n/lib/workflows/jsx-translation.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/lib/workflows/jsx-translation.ts b/src/scripts/i18n/lib/workflows/jsx-translation.ts index a82ef36465e..5968f430b24 100644 --- a/src/scripts/i18n/lib/workflows/jsx-translation.ts +++ b/src/scripts/i18n/lib/workflows/jsx-translation.ts @@ -1,5 +1,7 @@ // JSX attribute translation workflow phase +import path from "path" + import { config } from "../../config" import { translateJsxAttributes } from "../../translate-jsx-attributes" import { isGeminiAvailable } from "../ai" @@ -81,7 +83,10 @@ export async function runJsxTranslation( for (const updated of jsxResult.updatedFiles) { const buf = Buffer.from(updated.updatedContent, "utf8") - filesToCommit.push({ path: updated.filePath, content: buf }) + filesToCommit.push({ + path: path.relative(process.cwd(), updated.filePath), + content: buf, + }) debugLog(`JSX-TRANSLATE: Will commit ${updated.filePath}`) // Update the committedFiles array with new content for sanitizer