From 017a8953fc743cb3c1e1472fcefd5a52993f28d9 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 21 Feb 2026 12:51:16 -0700 Subject: [PATCH 01/14] chore(deps): add franc-min as devDependency ESM-only trigram language detection library used by the post-import sanitizer to detect untranslated paragraphs in translation files. Co-Authored-By: Claude Opus 4.6 --- package.json | 1 + pnpm-lock.yaml | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/package.json b/package.json index d98152c79c8..5b956dc6818 100644 --- a/package.json +++ b/package.json @@ -143,6 +143,7 @@ "eslint-plugin-simple-import-sort": "^10.0.0", "eslint-plugin-storybook": "0.8.0", "eslint-plugin-unused-imports": "^3.2.0", + "franc-min": "^6.2.0", "husky": "^9.0.11", "image-size": "^1.0.2", "lint-staged": "^15.2.5", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f2e077b8c71..bc3ee5d9831 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -341,6 +341,9 @@ importers: eslint-plugin-unused-imports: specifier: ^3.2.0 version: 3.2.0(@typescript-eslint/eslint-plugin@7.18.0(@typescript-eslint/parser@7.18.0(eslint@8.57.1)(typescript@5.8.3))(eslint@8.57.1)(typescript@5.8.3))(eslint@8.57.1) + franc-min: + specifier: ^6.2.0 + version: 6.2.0 husky: specifier: ^9.0.11 version: 9.1.7 @@ -6745,6 +6748,9 @@ packages: react-dom: optional: true + franc-min@6.2.0: + resolution: {integrity: sha512-1uDIEUSlUZgvJa2AKYR/dmJC66v/PvGQ9mWfI9nOr/kPpMFyvswK0gPXOwpYJYiYD008PpHLkGfG58SPjQJFxw==} + fs-constants@1.0.0: resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} @@ -7823,6 +7829,9 @@ packages: mz@2.7.0: resolution: {integrity: sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==} + n-gram@2.0.2: + resolution: {integrity: sha512-S24aGsn+HLBxUGVAUFOwGpKs7LBcG4RudKU//eWzt/mQ97/NMKQxDWHyHx63UNWk/OOdihgmzoETn1tf5nQDzQ==} + nanoid@3.3.11: 
resolution: {integrity: sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==} engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} @@ -9426,6 +9435,9 @@ packages: tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + trigram-utils@2.0.1: + resolution: {integrity: sha512-nfWIXHEaB+HdyslAfMxSqWKDdmqY9I32jS7GnqpdWQnLH89r6A5sdk3fDVYqGAZ0CrT8ovAFSAo6HRiWcWNIGQ==} + trim-lines@3.0.1: resolution: {integrity: sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==} @@ -18919,6 +18931,10 @@ snapshots: react: 18.3.1 react-dom: 18.3.1(react@18.3.1) + franc-min@6.2.0: + dependencies: + trigram-utils: 2.0.1 + fs-constants@1.0.0: {} fs-extra@10.1.0: @@ -20292,6 +20308,8 @@ snapshots: object-assign: 4.1.1 thenify-all: 1.6.0 + n-gram@2.0.2: {} + nanoid@3.3.11: {} nanoid@3.3.8: {} @@ -22135,6 +22153,11 @@ snapshots: tr46@0.0.3: {} + trigram-utils@2.0.1: + dependencies: + collapse-white-space: 2.1.0 + n-gram: 2.0.2 + trim-lines@3.0.1: {} trough@2.2.0: {} From 312f14e1b2e93be30c24b2d111dc8e78a23c85e9 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 21 Feb 2026 12:53:32 -0700 Subject: [PATCH 02/14] feat: enhance i18n sanitizer with 8 new checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds ticker transposition fixes (EHT→ETH, BSL→BLS, ECDSA), frontmatter tag syncing from English source, expanded brand name list with auto-fix for tags, cross-script contamination detection for 20+ locales, MDX angle bracket escaping, orphaned closing tag removal, and franc-min-powered untranslated paragraph detection. Makes runSanitizer async to support dynamic ESM import of franc-min. 
Co-Authored-By: Claude Opus 4.6 --- src/scripts/i18n/post_import_sanitize.ts | 478 ++++++++++++++++++++++- 1 file changed, 465 insertions(+), 13 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 000d668192f..c61da262ecf 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -1,6 +1,20 @@ import * as fs from "fs" import * as path from "path" +// franc-min is ESM-only; use dynamic import +let francDetect: ((text: string) => string) | null = null +async function loadFranc(): Promise { + if (francDetect) return + try { + const francModule = await import("franc-min") + francDetect = francModule.franc + } catch { + console.warn( + "[SANITIZE] franc-min not available; skipping language detection" + ) + } +} + /** * Post-import sanitizer for Crowdin translations. * @@ -99,6 +113,10 @@ const PROTECTED_BRAND_NAMES = [ // Programming languages "Solidity", "Vyper", + "Rust", + "JavaScript", + "TypeScript", + "Python", // Companies/Products "Alchemy", "Infura", @@ -106,26 +124,151 @@ const PROTECTED_BRAND_NAMES = [ "Consensys", "Chainlink", "OpenZeppelin", + "Gnosis", + "Flashbots", + "Etherscan", + "Hardhat", + "Foundry", + "Remix", + "Truffle", + "Ganache", + "Brownie", + "Waffle", + // Protocols/Projects + "Uniswap", + "Aave", + "Compound", + "MakerDAO", + "Lido", + "Rocket Pool", + "ENS", + // Core terms that must stay English + "Ethereum", + "Bitcoin", + "Beacon Chain", + "Solana", + "Polygon", + "Arbitrum", + "Optimism", + "Base", ] /** - * Check if protected brand names from English source are preserved in translation. - * Returns warnings for any brand names that appear in English but not in translation. + * Common ticker/acronym transpositions found in translations. + * Maps wrong form → correct form. 
*/ -function checkProtectedBrandNames( +const TICKER_CORRECTIONS: Record = { + EHT: "ETH", + BSL: "BLS", + ECDAS: "ECDSA", + KECCAK: "Keccak", +} + +/** + * Fix ticker symbol transpositions. + * Only matches whole words (word boundaries) to avoid false positives. + */ +function fixTickerTranspositions(content: string): { + content: string + fixCount: number +} { + let result = content + let fixCount = 0 + + for (const [wrong, correct] of Object.entries(TICKER_CORRECTIONS)) { + const re = new RegExp(`\\b${escapeRegex(wrong)}\\b`, "g") + const matches = result.match(re) + if (matches && matches.length > 0) { + fixCount += matches.length + result = result.replace(re, correct) + } + } + + return { content: result, fixCount } +} + +/** + * Sync frontmatter tags array from English source. + * Tags like programming language names should never be translated. + * Replaces the entire tags array with the English original. + */ +function syncFrontmatterTags( translatedContent: string, englishContent: string -): string[] { +): { content: string; fixCount: number } { + const frontmatterRe = /^---\n([\s\S]*?)\n---/ + const transMatch = translatedContent.match(frontmatterRe) + const engMatch = englishContent.match(frontmatterRe) + + if (!transMatch || !engMatch) + return { content: translatedContent, fixCount: 0 } + + const transFm = transMatch[1] + const engFm = engMatch[1] + + // Extract tags line (handles both inline array and value) + const tagsRe = /^(tags:\s*)(.+)$/m + const engTagsMatch = engFm.match(tagsRe) + const transTagsMatch = transFm.match(tagsRe) + + if (!engTagsMatch || !transTagsMatch) + return { content: translatedContent, fixCount: 0 } + + const engTagsValue = engTagsMatch[2].trim() + const transTagsValue = transTagsMatch[2].trim() + + if (engTagsValue === transTagsValue) + return { content: translatedContent, fixCount: 0 } + + // Replace translated tags with English tags + const updatedFm = transFm.replace( + tagsRe, + `${transTagsMatch[1]}${engTagsValue}` + ) 
+ const content = translatedContent.replace( + frontmatterRe, + `---\n${updatedFm}\n---` + ) + + return { content, fixCount: 1 } +} + +/** + * Fix protected brand names that were mistranslated. + * For each brand found in English source, if the count drops in translation, + * attempt to restore by finding the translated variants and replacing them. + * + * Strategy: For brand names where English count > translation count, + * we can't easily know what the mistranslation IS without locale knowledge. + * So we report these as warnings for the LLM review to handle. + * + * However, for frontmatter `tags` arrays, we CAN auto-fix by syncing with English. + */ +function fixProtectedBrandNames( + translatedContent: string, + englishContent: string +): { content: string; fixCount: number; warnings: string[] } { const warnings: string[] = [] + let content = translatedContent + let fixCount = 0 + // Auto-fix: Sync frontmatter tags with English source + const tagsSyncResult = syncFrontmatterTags(content, englishContent) + content = tagsSyncResult.content + fixCount += tagsSyncResult.fixCount + if (tagsSyncResult.fixCount > 0) { + warnings.push( + `Auto-synced ${tagsSyncResult.fixCount} frontmatter tags with English source` + ) + } + + // Warn: Brand names with count mismatches in body content for (const brand of PROTECTED_BRAND_NAMES) { - // Check if brand exists in English source (case-sensitive match with word boundaries) const brandRegex = new RegExp(`\\b${escapeRegex(brand)}\\b`, "g") const inEnglish = englishContent.match(brandRegex) if (inEnglish && inEnglish.length > 0) { - // Brand is in English, check if it's preserved in translation - const inTranslation = translatedContent.match(brandRegex) + const inTranslation = content.match(brandRegex) const englishCount = inEnglish.length const translationCount = inTranslation?.length ?? 
0 @@ -137,7 +280,7 @@ function checkProtectedBrandNames( } } - return warnings + return { content, fixCount, warnings } } /** @@ -1051,6 +1194,274 @@ function quoteFrontmatterNonAscii(content: string): { return { content, fixCount } } +/** + * Expected Unicode script ranges per locale. + * Maps locale prefix to regex of UNEXPECTED characters. + * If these characters appear in a file for that locale, it's contamination. + */ +const CROSS_SCRIPT_DETECTORS: Record< + string, + { name: string; pattern: RegExp } +> = { + // Latin-script languages should not contain Devanagari, CJK, Arabic, etc. + tr: { + name: "Devanagari/CJK/Cyrillic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF]/g, + }, + fr: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + de: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + es: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + it: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + pt: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + pl: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + cs: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + id: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + sw: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + vi: { + name: "Devanagari/CJK/Cyrillic/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g, + }, + // Cyrillic languages should not contain 
Devanagari, CJK, Arabic, etc. + ru: { + name: "Devanagari/CJK/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF]/g, + }, + uk: { + name: "Devanagari/CJK/Arabic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF]/g, + }, + // Arabic should not contain Devanagari, CJK, Cyrillic, etc. + ar: { + name: "Devanagari/CJK/Cyrillic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF]/g, + }, + ur: { + name: "Devanagari/CJK/Cyrillic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF]/g, + }, + // Devanagari languages should not contain CJK, Arabic, Cyrillic + hi: { + name: "CJK/Arabic/Cyrillic", + pattern: /[\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g, + }, + mr: { + name: "CJK/Arabic/Cyrillic", + pattern: /[\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g, + }, + // CJK languages should not contain Devanagari, Arabic, Cyrillic + ja: { + name: "Devanagari/Arabic/Cyrillic", + pattern: /[\u0900-\u097F\u0600-\u06FF\u0400-\u04FF]/g, + }, + ko: { + name: "Devanagari/Arabic/Cyrillic", + pattern: /[\u0900-\u097F\u0600-\u06FF\u0400-\u04FF]/g, + }, + "zh-tw": { + name: "Devanagari/Arabic/Cyrillic", + pattern: /[\u0900-\u097F\u0600-\u06FF\u0400-\u04FF]/g, + }, + // Tamil/Telugu should not contain Devanagari, CJK, Arabic, Cyrillic + ta: { + name: "Devanagari/CJK/Arabic/Cyrillic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g, + }, + te: { + name: "Devanagari/CJK/Arabic/Cyrillic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g, + }, + // Bengali should not contain other Indic, CJK, Arabic, Cyrillic + bn: { + name: "Devanagari/CJK/Arabic/Cyrillic", + pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g, + }, +} + +/** + * Detect cross-script contamination in translated content. + * Returns warnings for unexpected Unicode characters based on the file's locale. 
+ */ +function detectCrossScriptContamination( + content: string, + locale: string +): string[] { + const warnings: string[] = [] + const detector = CROSS_SCRIPT_DETECTORS[locale] + if (!detector) return warnings + + // Skip code blocks — foreign characters in code are valid + const codeBlockRe = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g + const parts = content.split(codeBlockRe) + + for (let i = 0; i < parts.length; i++) { + if (i % 2 === 1) continue // Skip code blocks + + const matches = parts[i].match(detector.pattern) + if (matches && matches.length > 0) { + // Get unique characters found + const uniqueChars = Array.from(new Set(matches)).slice(0, 5).join(", ") + warnings.push( + `Cross-script contamination: found ${matches.length} ${detector.name} character(s) in ${locale} file (e.g., ${uniqueChars})` + ) + } + } + + return warnings +} + +/** + * Escape raw `<` before numbers in MDX content. + * Pattern: `<5GB` becomes `<5GB` to prevent MDX treating it as a JSX tag. + * Skips code blocks (fenced and inline) where `<` is valid. + */ +function escapeMdxAngleBrackets(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Split content to preserve code blocks + const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g + const parts = content.split(codeBlockPattern) + + for (let i = 0; i < parts.length; i++) { + if (i % 2 === 1) continue // Skip code blocks + + // Match < followed by a digit (not already escaped, not part of HTML tag) + parts[i] = parts[i].replace(/(? { + fixCount++ + return `<${digit}` + }) + } + + return { content: parts.join(""), fixCount } +} + +/** + * Detect and remove orphaned closing HTML tags. + * These appear when translation restructures sentences and leaves behind + * closing tags like without matching openers. + * Only removes tags that have NO corresponding opener in the same paragraph. 
+ */ +function removeOrphanedClosingTags(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + const orphanTags = ["a", "span", "em", "strong", "b", "i", "u"] + + for (const tag of orphanTags) { + // Find closing tags that don't have a matching opener on the same line + const lines = content.split("\n") + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + const closeRe = new RegExp(``, "g") + const openRe = new RegExp(`<${tag}[\\s>]`, "g") + + const closeCount = (line.match(closeRe) || []).length + const openCount = (line.match(openRe) || []).length + + // If there are more closing tags than opening tags on this line, + // remove the excess closing tags (they're orphans) + if (closeCount > openCount) { + let excess = closeCount - openCount + lines[i] = line.replace(closeRe, (match) => { + if (excess > 0) { + excess-- + fixCount++ + return "" + } + return match + }) + // Clean up any resulting double spaces + lines[i] = lines[i].replace(/ +/g, " ").trim() + } + } + content = lines.join("\n") + } + + return { content, fixCount } +} + +/** + * Detect paragraphs that appear to be untranslated (still in English). + * Uses franc-min for language detection on paragraph-sized chunks. + * Only flags paragraphs with high confidence of being English in non-English files. 
+ */ +function detectUntranslatedContent(content: string, locale: string): string[] { + if (!francDetect) return [] + // Don't check English files + if (locale === "en") return [] + + const warnings: string[] = [] + + // Split into paragraphs (skip frontmatter, code blocks) + const withoutFrontmatter = content.replace(/^---\n[\s\S]*?\n---\n?/, "") + const withoutCodeBlocks = withoutFrontmatter.replace(/```[\s\S]*?```/g, "") + + const paragraphs = withoutCodeBlocks + .split(/\n\s*\n/) + .filter((p) => p.trim().length > 100) // Only check substantial paragraphs + + let untranslatedCount = 0 + for (const para of paragraphs) { + const cleanPara = para + .replace(/\[([^\]]*)\]\([^)]*\)/g, "$1") // Remove markdown links (keep text) + .replace(/<[^>]+>/g, "") // Remove HTML/JSX tags + .replace(/`[^`]+`/g, "") // Remove inline code + .trim() + + if (cleanPara.length < 80) continue // Too short for reliable detection + + const detected = francDetect(cleanPara) + if (detected === "eng") { + untranslatedCount++ + // Only report first 3 to avoid noise + if (untranslatedCount <= 3) { + const preview = cleanPara.substring(0, 80).replace(/\n/g, " ") + warnings.push( + `Possibly untranslated paragraph (detected as English): "${preview}..."` + ) + } + } + } + + if (untranslatedCount > 3) { + warnings.push( + `...and ${untranslatedCount - 3} more potentially untranslated paragraphs` + ) + } + + return warnings +} + function processMarkdownFile( mdPath: string, providedContent?: string @@ -1067,6 +1478,7 @@ function processMarkdownFile( // Map translated path to English path: remove `/translations//` segment const parts = mdPath.split(path.sep) const idx = parts.lastIndexOf("translations") + const locale = idx !== -1 && idx + 1 < parts.length ? 
parts[idx + 1] : "" if (idx === -1 || idx + 2 >= parts.length) { issues.push("No translations segment found; skipping formatting sync") } else { @@ -1133,6 +1545,29 @@ function processMarkdownFile( issues.push(`Unescaped ${escapedBacktickCount} backslash-escaped backticks`) } + // Fix ticker symbol transpositions (EHT → ETH, etc.) + const tickerResult = fixTickerTranspositions(content) + content = tickerResult.content + if (tickerResult.fixCount > 0) { + issues.push(`Fixed ${tickerResult.fixCount} ticker symbol transpositions`) + } + + // Escape raw < before numbers in MDX content + const angleBracketResult = escapeMdxAngleBrackets(content) + content = angleBracketResult.content + if (angleBracketResult.fixCount > 0) { + issues.push( + `Escaped ${angleBracketResult.fixCount} raw angle brackets before numbers` + ) + } + + // Remove orphaned closing HTML tags + const orphanResult = removeOrphanedClosingTags(content) + content = orphanResult.content + if (orphanResult.fixCount > 0) { + issues.push(`Removed ${orphanResult.fixCount} orphaned closing HTML tags`) + } + // Fix block component line breaks (critical for MDX parser) const blockResult = fixBlockComponentLineBreaks(content) content = blockResult.content @@ -1207,9 +1642,13 @@ function processMarkdownFile( ) } - // Check for mistranslated brand names (report-only) - const brandWarnings = checkProtectedBrandNames(content, englishMd) - issues.push(...brandWarnings) + // Fix and check protected brand names + const brandResult = fixProtectedBrandNames(content, englishMd) + content = brandResult.content + if (brandResult.fixCount > 0) { + issues.push(`Fixed ${brandResult.fixCount} brand name issues`) + } + issues.push(...brandResult.warnings) // Fix translated hrefs using set comparison const hrefResult = fixTranslatedHrefs(content, englishMd) @@ -1220,6 +1659,18 @@ function processMarkdownFile( ) } issues.push(...hrefResult.warnings) + + // Detect cross-script contamination + if (locale) { + const scriptWarnings = 
detectCrossScriptContamination(content, locale) + issues.push(...scriptWarnings) + } + + // Detect untranslated content + if (locale) { + const untranslatedWarnings = detectUntranslatedContent(content, locale) + issues.push(...untranslatedWarnings) + } } const fixed = before !== content @@ -1314,11 +1765,12 @@ function languagesFromEnv(): string[] | undefined { .filter(Boolean) } -export function runSanitizer( +export async function runSanitizer( filesWithContent?: Array<{ path: string; content: string }>, langs?: string[] ) { console.log("[SANITIZE] Starting post-import sanitizer") + await loadFranc() let mdFilesToProcess: Array<{ path: string; content: string }> = [] let jsonFilesToProcess: Array<{ path: string; content: string }> = [] @@ -1430,5 +1882,5 @@ export function runSanitizer( } if (require.main === module) { - runSanitizer() + runSanitizer().catch(console.error) } From 8fe6144ebe5d79c604bc8962104ba0c2ec00756a Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 21 Feb 2026 12:53:45 -0700 Subject: [PATCH 03/14] docs: add translation review scaling strategy Compound engineering document capturing the full brainstorm, 3-phase pipeline strategy, prevention matrix, and knowledge compounding approach for scaling review of 21 translation PRs across 24 languages. 
Co-Authored-By: Claude Opus 4.6 --- .../scaling-translation-review-pipeline.md | 350 ++++++++++++++++++ 1 file changed, 350 insertions(+) create mode 100644 docs/solutions/translation-review/scaling-translation-review-pipeline.md diff --git a/docs/solutions/translation-review/scaling-translation-review-pipeline.md b/docs/solutions/translation-review/scaling-translation-review-pipeline.md new file mode 100644 index 00000000000..4fc16c62cee --- /dev/null +++ b/docs/solutions/translation-review/scaling-translation-review-pipeline.md @@ -0,0 +1,350 @@ +--- +title: "Scaling the Translation Review Pipeline for 24-Language Deployment" +category: translation-review +component: "post_import_sanitize.ts, review-translations.md, claude-review-translations.yml" +symptoms: + - "Manual review of single PR (1 of 13 parts) takes 1-2 hours per language" + - "MDX syntax errors requiring manual fixes across translated content" + - "Brand names and product names mistranslated in target languages" + - "href attributes being translated when they should remain unchanged" + - "Cross-script contamination (e.g., Devanagari characters in Turkish files)" + - "Untranslated chunks requiring back-and-forth with Gemini for re-translation" + - "Scale challenge: ~178 exploded PRs across 20+ languages blocking production deployment" +severity: high +date: 2026-02-21 +tags: + - translation-pipeline + - i18n + - crowdin-integration + - gemini-2.5-pro + - multilingual-deployment + - mdx-content + - sanitization + - glossary-management + - quality-assurance + - batch-processing +related_prs: + - 17182 + - 17176 + - 17247 + - 17242 + - 17227 + - 17224 + - 17219 + - 17218 + - 17210 + - 17209 + - 17199 + - 17198 + - 17186 + - 17166 + - 17164 + - 17132 + - 17127 + - 17126 + - 17125 + - 17122 + - 17105 + - 17101 +languages_affected: + - ar + - bn + - cs + - de + - fr + - hi + - id + - it + - ja + - ko + - mr + - pl + - pt-br + - ru + - sw + - ta + - te + - tr + - uk + - ur + - vi + - zh-tw +--- + +# Scaling 
the Translation Review Pipeline for 24-Language Deployment + +## Problem Summary + +The ethereum.org website has been translated into 24 languages using Gemini 2.5 Pro via Crowdin. The translations were imported and placed into PRs -- both "unexploded" (1 PR per language, ~21 total) and "exploded" (13 parts per language, ~178 total). Manual review of a single exploded PR takes 1-2 hours, involving back-and-forth between Claude (review) and Gemini (re-translation), plus fixing MDX syntax errors, brand name mistranslations, href translations, cross-script contamination, and more. Extrapolating to all remaining PRs yields 178-356 hours of manual work. + +This document captures the strategic brainstorm and agreed-upon approach for scaling this process. + +## Root Cause Analysis + +The review bottleneck stems from several compounding factors: + +1. **Insufficient automated pre-screening** -- Issues like brand name mistranslations, broken MDX syntax, and Unicode contamination pass through to human review unnecessarily. The sanitizer catches many patterns but misses several known categories. +2. **Exploded PR strategy** -- Breaking one language PR into 13 parts multiplied the review surface without multiplying reviewer capacity. +3. **No knowledge persistence** -- Each review session starts from scratch; patterns discovered in one language are not reused for the next. +4. **No build-level verification** -- Translation issues that cause MDX compilation failures are only discovered late in the process. +5. **No automated bridge back to Gemini** -- When untranslated chunks are found, there is no automated way to re-submit them for translation and re-import results. + +## Solution Architecture + +### Phase 1: Foundation + +#### 1a. 
Knowledge Base Setup + +Establish a persistent, local-first knowledge base to accumulate findings across review sessions: + +- `~/.claude/translation-review/known-patterns.md` -- seeded from Turkish compound doc findings; documents recurring issues by type +- `~/.claude/translation-review/per-language/` -- one file per locale capturing language-specific findings (common errors, glossary deviations, script quirks) +- `~/.claude/translation-review/fetch-translation-glossary.json` -- already exported; schema: `Array<{ string_term, translation_text, language_code, total_votes }>` + +Initially local; later candidates for merging into the repo for full team access. + +#### 1b. Enhance `post_import_sanitize.ts` + +The current sanitizer already handles: header ID sync, href fixes (block-level set comparison), broken markdown links, frontmatter dates/quoting, guillemets, escaped backticks, block component line breaks, inline component normalization, brand name warnings, and unclosed backtick repair. + +Additions required: + +| Addition | Description | +|---|---| +| Brand name auto-fix | Expand `PROTECTED_BRAND_NAMES` list; switch from warn-only to auto-revert | +| Cross-script contamination detector | Unicode range validation per locale (e.g., catch Devanagari characters in Turkish `.md` files) | +| MDX `<` before numbers | Escape to `&lt;` outside code blocks to prevent MDX parse failures | +| Orphaned HTML tag cleanup | Detect and remove `</a>` (and similar) without matching opener | +| Frontmatter `tags` array protection | Prevent translation of programming language names and technical tags | +| Ticker symbol correction dictionary | Catch and fix transpositions: `EHT`->`ETH`, `BSL`->`BLS`, etc. | +| Href translation coverage audit | Verify the existing href fix catches all variants (e.g., `/governance` -> `/gobernanza`) | +| Language detection on content segments | Flag paragraphs detected as English in a non-English file | + +#### 1c.
Update `review-translations.md` Workflow Document + +Modify the review workflow to: + +- Read `~/.claude/translation-review/known-patterns.md` before deploying sub-agents +- Load language-specific glossary entries from the JSON file at review start +- Add MDX compilation check as a built-in review phase (not an afterthought) +- Encode sub-agent architecture with clear separation of concerns: + - **MDX Syntax Agent** -- validates MDX structure, component usage, escaping + - **Brand Name Agent** -- checks protected terms against glossary and known-patterns + - **Href Validation Agent** -- verifies internal link translations are consistent with site routing + - **Semantic Review Agent** -- spot-checks translation quality against glossary votes + - **Build Verification Agent** -- runs `NEXT_PUBLIC_BUILD_LOCALES=en,{lang} pnpm build` +- Document the targeted build command: `NEXT_PUBLIC_BUILD_LOCALES=en,{lang} pnpm build` + +### Phase 2: Validate on One Language + +Czech (`cs`) is the pilot language because it has only 3 exploded parts remaining, plus unexploded PR #17247, making it the lowest-cost full-pipeline test. + +Pipeline steps: + +1. Run enhanced sanitizer with all new additions enabled +2. Run sub-agent review suite with `--fix` mode +3. Execute `NEXT_PUBLIC_BUILD_LOCALES=en,cs pnpm build` +4. Document all findings to `~/.claude/translation-review/per-language/cs.md` +5. Merge, close exploded PRs +6. Accumulate patterns back into `known-patterns.md` + +Success criteria: clean build, no brand name regressions, glossary alignment confirmed. 
+ +### Phase 3: Scale to Remaining Languages + +**Tier A -- Finish exploded PRs (3-4 parts remaining):** +- Czech (`cs`), Traditional Chinese (`zh-tw`), Ukrainian (`uk`), Telugu (`te`) +- Strategy: complete remaining exploded parts using the validated pipeline + +**Tier B -- Partially done, switch to unexploded:** +- Bengali (`bn`), German (`de`), Marathi (`mr`), Polish (`pl`), Swahili (`sw`), Tamil (`ta`), Urdu (`ur`), Turkish (`tr`) +- Strategy: use single unexploded PR per language; apply full pipeline + +**Tier C -- Full review, unexploded only:** +- Arabic (`ar`), French (`fr`), Hindi (`hi`), Indonesian (`id`), Italian (`it`), Japanese (`ja`), Korean (`ko`), Russian (`ru`), Vietnamese (`vi`), Brazilian Portuguese (`pt-br`) +- Strategy: direct unexploded pipeline with knowledge base pre-loaded + +### Key Architectural Decisions + +| Decision | Rationale | +|---|---| +| **Prefer unexploded PRs (1 per language)** | Exploded PRs multiply human review surface; 13 parts x 2 hrs = 26 hrs per language vs. ~3 hrs for unexploded | +| **Sub-agents split by concern, not file count** | Concern-based split allows each agent to specialize its detection logic; file-count split leads to uneven workloads and missed cross-file patterns | +| **Gemini translates, Claude reviews** | Keeps the pipeline conservative and avoids introducing new translation errors during review. No automated bridge for re-translation yet. 
| +| **Build verification uses locale isolation** | `NEXT_PUBLIC_BUILD_LOCALES=en,{lang}` avoids building all 60+ locales on every check | +| **Knowledge base starts local** | Avoids premature repo noise; once patterns stabilize across 3-4 languages, promote to repo for team visibility | +| **Czech as pilot** | Lowest risk (fewest remaining parts), sufficient complexity to stress-test the full pipeline before scaling to Tier C languages | +| **Ralph Loop plugin under consideration** | Would enable iterate-until-build-passes automation; deferred until pipeline is stable | + +### Key Code Changes + +**File: `src/scripts/i18n/post_import_sanitize.ts`** + +- Expand `PROTECTED_BRAND_NAMES` constant with comprehensive brand terms list +- Change brand name handling from `console.warn` to auto-revert with logging +- Add `detectCrossScriptContamination(content, locale)` -- Unicode range validation per locale +- Add `escapeMdxAngleBrackets(content)` -- targets `< N` patterns outside fenced code blocks +- Add `removeOrphanedClosingTags(content)` -- regex-based orphan HTML tag detector +- Add `protectFrontmatterTags(translatedFm, englishFm)` -- freeze tags array against English source +- Add `TICKER_CORRECTIONS: Record<string, string>` dictionary and apply in sanitize pass +- Audit and extend `fixTranslatedHrefs()` to cover all edge cases + +**File: `.claude/commands/review-translations.md`** + +- Add knowledge base load step at top of workflow +- Add glossary injection step per language +- Restructure sub-agent section with the 5-agent breakdown +- Add build verification as mandatory final step with exact command + +## Prevention Matrix + +| Issue Category | Upstream Prevention | Automated Detection | Review-Level Detection | Long-term Fix | +|---|---|---|---|---| +| **Brand name mistranslation** | Crowdin glossary with "Do Not Translate" flag; explicit list in Gemini system prompt | Token-match against protected-terms allowlist; flag phonetic/semantic variants | LLM check: "Does this
translation preserve all brand names exactly?" | Crowdin TM enforcement + MTQE threshold on brand-name segments | +| **Cross-script contamination** | Crowdin project setting: enforce target locale script; Gemini script constraint | Unicode block range check per file per locale | LLM check: "Does any portion contain characters from an incompatible script?" | Per-locale Unicode allowlist enforced at import time as a hard gate | +| **MDX syntax errors** | Crowdin HTML/MDX-aware segment protection; Gemini locked segment config | MDX AST parse post-import; regex for unmatched backtick parity, `<[0-9]`, unclosed HTML | LLM check: "Any raw `<` before numbers, unmatched backtick pairs, HTML outside code blocks?" | Mandatory `mdx-compile` step in post-import; quarantine failures | +| **Semantic inversions** | Crowdin glossary entries for antonym pairs with definitions; Gemini system prompt with mutually exclusive term list | Concordance check: if source has "proof-of-work" verify translation uses correct locale term, not antonym | LLM check: "Verify all consensus mechanism terms match source meaning. Inversion is a known failure mode." | Semantic consistency test corpus per locale | +| **Translated hrefs** | Crowdin: configure internal href paths as locked/non-translatable; Gemini system prompt: "Never translate URL paths" | Extract all `href` values, compare against source file href set; any divergence is a hard failure | LLM check: "Are all internal href values identical to the source?" | Href exact-match comparison as mandatory pre-merge CI check | +| **Translated frontmatter tags** | Crowdin: mark frontmatter `tags` as non-translatable | Parse frontmatter, compare tag arrays against source; flag any tag not in source set | LLM check: "Do frontmatter tags match the source exactly?" 
| Frontmatter schema validation with strict allowlists | +| **Ticker/acronym typos** | Crowdin glossary: ticker symbols as "Do Not Translate"; Gemini system prompt with explicit list | Levenshtein distance check: all uppercase tokens against canonical ticker list; flag distance <= 1 | LLM check: "Are all tickers and acronyms spelled exactly as in source?" | Canonical ticker allowlist validated in CI | +| **Domain typos** | Gemini system prompt: "The domain ethereum.org must never be altered"; Crowdin: lock URL segments | Regex: extract domain strings, assert exact match against `ethereum.org` | LLM check: "Any misspelling of ethereum.org?" | Regex validation in CI, zero tolerance | +| **Untranslated content chunks** | Crowdin MT coverage threshold; Gemini system prompt: "Every segment must be translated" | Paragraph-level language detection; flag English content in non-English files above threshold | LLM check: "Are there paragraphs that appear untranslated?" | Language detection as post-import gate; failed segments queued for re-translation | +| **Wrong technical term selection** | Crowdin glossary with preferred translations per locale for high-risk terms; Gemini prompt with locale-specific terminology reference | Concordance check: verify technical terms use glossary entries, not colloquial equivalents | LLM check is primary: "Check that technical terms use established Ethereum translations" | Per-locale Ethereum technical glossary maintained as versioned data file | + +## Knowledge Compounding Strategy + +### Session Memory (Per-Locale) + +After each language review, findings are written to `~/.claude/translation-review/per-language/[locale].md`: + +- Confirmed issues by category +- False positives to suppress in future reviews +- Glossary additions/corrections +- Systemic notes (e.g., "Crowdin TM appears contaminated from Hindi batch") + +### Cross-Locale Aggregation + +`~/.claude/translation-review/known-patterns.md` is maintained as a rolling aggregate that: 
+ +1. Captures patterns seen across multiple locales (e.g., brand name issues in 8+ languages = systemic upstream problem) +2. Records confirmed false-positive patterns to suppress +3. Provides the context injection block for review agents + +### Inter-Agent Context Injection + +Each review agent receives prior findings as context: + +``` +Known issues confirmed in prior reviews of this locale: +- "katillik" is a mistranslation of "Solidity" -- flag all occurrences +- Cross-script contamination from Devanagari was found -- check for recurrence + +Cross-locale patterns seen in 5+ languages: +- DeFi is being translated as "MeFi" -- check this locale +- Internal hrefs are being translated -- perform href audit +``` + +This transforms each review from a cold start into an informed continuation. + +## Pipeline Hardening Recommendations (Ordered by Impact) + +1. **Mandatory MDX compile gate** -- Every file must pass MDX AST parse before entering review queue. Files that fail are quarantined immediately. Highest-leverage check: fully deterministic, zero ambiguity. + +2. **Href exact-match validation** -- Extract all `href` attributes from source and translated files, compare sets. Any deviation is a hard failure. Zero false-positive risk. + +3. **Unicode script range validation** -- Per-locale expected Unicode block range. Catches cross-script contamination with zero ambiguity. + +4. **Canonical ticker fuzzy-match** -- Levenshtein distance <= 1 check on all uppercase tokens against canonical ticker list. Catches transpositions that human reviewers and LLMs miss under volume. + +5. **Language detection on content segments** -- Paragraph-level language ID on translated files. English content in non-English files above threshold flags for re-translation queue. + +6. **Domain string exact-match** -- Regex for `ethereum` + TLD-like pattern. Trivial to implement, catches trust/SEO issues. + +7. 
**Frontmatter schema validation** -- Parse with gray-matter, validate fixed fields against source. Prevents programming language names from being localized in tags. + +8. **Brand name token allowlist** -- Protected-terms list with auto-revert. Requires per-locale map for terms with accepted translations vs. always-English terms. + +9. **Build verification in CI** -- `NEXT_PUBLIC_BUILD_LOCALES=en,{lang} pnpm build` as required PR check. Full build catches integration failures that segment-level checks miss. + +10. **Findings persistence and context injection** -- Write structured findings after each review. Inject prior findings as context for subsequent reviews. Without this, each review starts cold. + +## Open Problems + +### Gemini Re-Translation Gap + +When untranslated chunks are detected, there is no automated round-trip back to Gemini for completion. The current workflow requires manual intervention: extract the file, submit to Gemini with glossary context, receive output, re-import into the repo branch. A proper fix requires a re-translation queue and Gemini API integration outside the Crowdin workflow. + +### Semantic Inversion Detection + +Detecting swapped consensus mechanism terminology (PoW/PoS) requires knowing the correct translation of both terms in every target language. No universal automated approach exists. Partial solution: build term maps for critical antonym pairs during first review of each locale and persist them. + +### Wrong Technical Term Selection at Scale + +Distinguishing "client (software)" from "client (customer)" requires semantic context that regex/token checks cannot provide. LLM review is the only practical detector, but at 20+ languages, LLM review cost and latency are constraints. + +### Crowdin Translation Memory Contamination + +Cross-script contamination (Devanagari in Turkish) suggests Crowdin TM is pulling from wrong-locale segments. Root cause is unclear without Crowdin admin access. 
Downstream mitigations (Unicode range gate, Gemini script constraint) are in place, but the actual fix requires auditing TM isolation per locale. + +### Ralph Loop Integration + +The [Ralph Loop](https://claude.com/plugins/ralph-loop) Claude Code plugin enables iterative loops where Claude works on a task repeatedly until completion. It uses a stop hook to re-feed the prompt while preserving file modifications between iterations. This maps well to the "sanitize -> review -> fix -> verify -> repeat" cycle. However, integration with worktree isolation and the multi-model pipeline (Gemini for translation, Claude for review) needs validation before adoption at scale. + +## Related Documentation + +### Existing Compound Docs + +| Document | Location | Status | +|---|---|---| +| Turkish (tr) Review - PR #17182 | `docs/solutions/translation-review/crowdin-import-review-turkish-pr-17182.md` | On `dev` branch | +| Vietnamese (vi) Review - PR #17176 | `docs/solutions/translation-review/crowdin-import-review-vietnamese-pr-17176.md` | On `i18n/import/2026-01-27T15-06-08-vi` branch only | + +### Key Codebase Files + +| File | Role | +|---|---| +| `src/scripts/i18n/post_import_sanitize.ts` | Deterministic post-import sanitizer | +| `.claude/commands/review-translations.md` | Claude Code review command | +| `.github/workflows/claude-review-translations.yml` | CI workflow for automated review | +| `src/scripts/i18n/main.ts` | Import pipeline orchestrator | +| `src/scripts/i18n/config.ts` | Pipeline configuration (languages, paths, API keys) | +| `.claude/commands/netlify-build-check.md` | Build status check and MDX error analysis | +| `src/intl/[locale]/glossary.json` | Per-locale glossary files | +| `src/scripts/i18n/lib/supabase/glossary.ts` | Supabase glossary client | + +### Unexploded PRs (One Per Language) + +| PR | Language | State | +|---|---|---| +| #17247 | Czech (cs) | Open | +| #17242 | Traditional Chinese (zh-tw) | Open | +| #17227 | Swahili (sw) | Open | +| #17224 | 
Marathi (mr) | Open | +| #17219 | Telugu (te) | Open | +| #17218 | Tamil (ta) | Open | +| #17210 | Ukrainian (uk) | Open | +| #17209 | Urdu (ur) | Open | +| #17199 | Polish (pl) | Open | +| #17198 | Italian (it) | Open | +| #17186 | Bengali (bn) | Open | +| #17176 | Vietnamese (vi) | Open | +| #17166 | Korean (ko) | Open | +| #17164 | German (de) | Open | +| #17132 | Japanese (ja) | Open | +| #17127 | Russian (ru) | Open | +| #17126 | Indonesian (id) | Open | +| #17125 | French (fr) | Open | +| #17122 | Brazilian Portuguese (pt-br) | Open | +| #17105 | Arabic (ar) | Open | +| #17101 | Hindi (hi) | Open | + +### Cross-References + +| Source | References | Nature | +|---|---|---| +| Turkish compound doc | `post_import_sanitize.ts` | Recommends adding brand name dictionary and cross-script detector | +| Turkish compound doc | `review-translations.md` | Command that ran the review | +| Turkish compound doc | Vietnamese PR #17176 companion doc | Same MDX error patterns | +| `review-translations.md` | `netlify-build-check.md` | Review command recommends running build check for MDX errors | +| `.github/workflows/claude-review-translations.yml` | `review-translations.md` | Workflow executes the command via `claude-code-action@v1` | +| `src/scripts/i18n/docs/v0.2.0-roadmap.md` | `lib/supabase/glossary.ts` | Roadmap plans Supabase glossary sync; file already exists | From 4cb98fb46b29aa51b98b954c0c7dca433fa9baa6 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 21 Feb 2026 15:55:59 -0700 Subject: [PATCH 04/14] feat: improve sanitizer logging and href fixing Replaces fixCount-based issue reporting with actual content comparison so transforms only log when content genuinely changes. Adds block-scoped href replacement to prevent cross-block interference when the same href appears in multiple blocks. Detects displaced hrefs that are globally valid but in the wrong block. 
Co-Authored-By: Claude Opus 4.6 --- src/scripts/i18n/post_import_sanitize.ts | 298 +++++++++++------------ 1 file changed, 138 insertions(+), 160 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index c61da262ecf..f0e1da46831 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -445,7 +445,11 @@ function fixTranslatedHrefs( // Collect all English internal hrefs as the "valid" set const allEnglishHrefs = extractHrefs(englishContent) - const allFixes: Array<[string, string]> = [] // [wrong, correct] + const blockFixes: Array<{ + blockIdx: number + wrong: string + correct: string + }> = [] const allWarnings: string[] = [] // Process block by block @@ -461,15 +465,17 @@ function fixTranslatedHrefs( // Skip blocks with no internal hrefs if (engHrefs.length === 0 && transHrefs.length === 0) continue - // Find hrefs in translation that don't exist in English (invalid) + // Compare hrefs at block level + const engHrefSet = new Set(engHrefs) const transHrefSet = new Set(transHrefs) - const invalidInTrans: string[] = [] // In translation but not in any English href - const missingFromTrans: string[] = [] // In English block but not in translation + // Hrefs in translation block but NOT in corresponding English block + const displacedInTrans: string[] = [] + const missingFromTrans: string[] = [] for (const href of transHrefs) { - if (!allEnglishHrefs.has(href)) { - invalidInTrans.push(href) + if (!engHrefSet.has(href)) { + displacedInTrans.push(href) } } @@ -480,21 +486,24 @@ function fixTranslatedHrefs( } // No issues in this block - if (invalidInTrans.length === 0 && missingFromTrans.length === 0) continue + if (displacedInTrans.length === 0 && missingFromTrans.length === 0) continue // Deduplicate for set comparison - const uniqueInvalid = [...new Set(invalidInTrans)] + const uniqueDisplaced = [...new Set(displacedInTrans)] const uniqueMissing = [...new 
Set(missingFromTrans)] - // Only auto-fix when there's exactly 1 invalid and 1 missing in block - // Multiple mismatches within same block could be reordered - don't guess - if (uniqueInvalid.length === 1 && uniqueMissing.length === 1) { - allFixes.push([uniqueInvalid[0], uniqueMissing[0]]) - } else if (uniqueInvalid.length > 0 || uniqueMissing.length > 0) { - // Count mismatch - can't safely fix, warn instead - for (const href of uniqueInvalid) { + // Auto-fix when there's exactly 1 displaced and 1 missing in the same block + if (uniqueDisplaced.length === 1 && uniqueMissing.length === 1) { + blockFixes.push({ + blockIdx: i, + wrong: uniqueDisplaced[0], + correct: uniqueMissing[0], + }) + } else if (uniqueDisplaced.length > 0 || uniqueMissing.length > 0) { + for (const href of uniqueDisplaced) { + const globallyValid = allEnglishHrefs.has(href) allWarnings.push( - `Block ${i + 1}: Invalid href "${href}" - not a valid English path` + `Block ${i + 1}: ${globallyValid ? "Displaced" : "Invalid"} href "${href}" - not in corresponding English block` ) } for (const href of uniqueMissing) { @@ -512,25 +521,28 @@ function fixTranslatedHrefs( ) } - // Apply all fixes + // Apply fixes block-by-block to avoid cross-block interference let result = translatedContent const appliedFixes: string[] = [] - for (const [wrong, correct] of allFixes) { + for (const { blockIdx, wrong, correct } of blockFixes) { + const originalBlock = translatedBlocks[blockIdx] + let fixedBlock = originalBlock + // Replace in markdown links: [text](wrong) → [text](correct) const markdownRe = new RegExp( `(\\[[^\\]]*\\]\\()${escapeRegex(wrong)}(\\))`, "g" ) - const beforeMd = result - result = result.replace(markdownRe, `$1${correct}$2`) + fixedBlock = fixedBlock.replace(markdownRe, `$1${correct}$2`) // Replace in href attributes: href="wrong" → href="correct" const hrefRe = new RegExp(`(href=["'])${escapeRegex(wrong)}(["'])`, "g") - const beforeAttr = result - result = result.replace(hrefRe, 
`$1${correct}$2`) + fixedBlock = fixedBlock.replace(hrefRe, `$1${correct}$2`) - if (result !== beforeMd || result !== beforeAttr) { + if (fixedBlock !== originalBlock) { + result = result.replace(originalBlock, fixedBlock) + translatedBlocks[blockIdx] = fixedBlock // update for subsequent fixes appliedFixes.push(`${wrong} → ${correct}`) } } @@ -1498,166 +1510,116 @@ function processMarkdownFile( const before = content - // Fix duplicated headings (e.g., ## Text? Text? {#id} → ## Text? {#id}) - const duplicatedResult = fixDuplicatedHeadings(content) - content = duplicatedResult.content - if (duplicatedResult.fixCount > 0) { - issues.push(`Fixed ${duplicatedResult.fixCount} duplicated headings`) - } - - // Fix broken markdown links (] (https:// → ](https://) - const brokenLinksResult = fixBrokenMarkdownLinks(content) - content = brokenLinksResult.content - if (brokenLinksResult.fixCount > 0) { - issues.push(`Fixed ${brokenLinksResult.fixCount} broken markdown links`) - } - - // Fix frontmatter issues (don't need English source) - const dateResult = normalizeFrontmatterDates(content) - content = dateResult.content - if (dateResult.fixCount > 0) { - issues.push( - `Normalized ${dateResult.fixCount} frontmatter dates to ISO format` - ) - } - - const quoteResult = quoteFrontmatterNonAscii(content) - content = quoteResult.content - if (quoteResult.fixCount > 0) { - issues.push( - `Quoted ${quoteResult.fixCount} frontmatter values with non-ASCII chars` - ) + // Helper: only log a fix if content actually changed + function applyFix( + fn: () => { content: string; fixCount: number }, + label: (count: number) => string + ) { + const snapshot = content + const result = fn() + content = result.content + if (content !== snapshot) { + issues.push(label(result.fixCount)) + } } - const guillemetResult = fixAsciiGuillemets(content) - content = guillemetResult.content - if (guillemetResult.fixCount > 0) { - issues.push( - `Fixed ${guillemetResult.fixCount} ASCII guillemets (<< >>) to 
Unicode (« »)` - ) - } + applyFix( + () => fixDuplicatedHeadings(content), + (n) => `Fixed ${n} duplicated headings` + ) + applyFix( + () => fixBrokenMarkdownLinks(content), + (n) => `Fixed ${n} broken markdown links` + ) + applyFix( + () => normalizeFrontmatterDates(content), + (n) => `Normalized ${n} frontmatter dates to ISO format` + ) + applyFix( + () => quoteFrontmatterNonAscii(content), + (n) => `Quoted ${n} frontmatter values with non-ASCII chars` + ) + applyFix( + () => fixAsciiGuillemets(content), + (n) => `Fixed ${n} ASCII guillemets (<< >>) to Unicode (« »)` + ) // Fix escaped backticks (\`) to regular backticks (`) - // Crowdin sometimes escapes backticks unnecessarily - const escapedBacktickCount = (content.match(/\\`/g) || []).length - if (escapedBacktickCount > 0) { + { + const snapshot = content content = content.replace(/\\`/g, "`") - issues.push(`Unescaped ${escapedBacktickCount} backslash-escaped backticks`) - } - - // Fix ticker symbol transpositions (EHT → ETH, etc.) 
- const tickerResult = fixTickerTranspositions(content) - content = tickerResult.content - if (tickerResult.fixCount > 0) { - issues.push(`Fixed ${tickerResult.fixCount} ticker symbol transpositions`) - } - - // Escape raw < before numbers in MDX content - const angleBracketResult = escapeMdxAngleBrackets(content) - content = angleBracketResult.content - if (angleBracketResult.fixCount > 0) { - issues.push( - `Escaped ${angleBracketResult.fixCount} raw angle brackets before numbers` - ) - } - - // Remove orphaned closing HTML tags - const orphanResult = removeOrphanedClosingTags(content) - content = orphanResult.content - if (orphanResult.fixCount > 0) { - issues.push(`Removed ${orphanResult.fixCount} orphaned closing HTML tags`) + if (content !== snapshot) { + const count = (snapshot.match(/\\`/g) || []).length + issues.push(`Unescaped ${count} backslash-escaped backticks`) + } } - // Fix block component line breaks (critical for MDX parser) - const blockResult = fixBlockComponentLineBreaks(content) - content = blockResult.content - if (blockResult.fixCount > 0) { - issues.push(`Fixed ${blockResult.fixCount} inline block component tags`) - } + applyFix( + () => fixTickerTranspositions(content), + (n) => `Fixed ${n} ticker symbol transpositions` + ) + applyFix( + () => escapeMdxAngleBrackets(content), + (n) => `Escaped ${n} raw angle brackets before numbers` + ) + applyFix( + () => removeOrphanedClosingTags(content), + (n) => `Removed ${n} orphaned closing HTML tags` + ) + applyFix( + () => fixBlockComponentLineBreaks(content), + (n) => `Fixed ${n} inline block component tags` + ) content = normalizeBlockHtmlLines(content) // Normalize inline components and restore blank lines from English source if (englishMd) { - // Sync protected frontmatter fields (template, sidebar, etc.) 
- const protectedResult = syncProtectedFrontmatterFields(content, englishMd) - content = protectedResult.content - if (protectedResult.fixCount > 0) { - issues.push( - `Synced ${protectedResult.fixCount} protected frontmatter fields from English` - ) - } - - // Collapse inline HTML tags to match English single-line format - const inlineHtmlResult = collapseInlineHtmlFromEnglish(content, englishMd) - content = inlineHtmlResult.content - if (inlineHtmlResult.fixCount > 0) { - issues.push( - `Collapsed ${inlineHtmlResult.fixCount} inline HTML tags to match English` - ) - } - - // Fix JSX component closing tags merged with content (split to own line) - const mergedTagResult = fixMergedClosingTags(content, englishMd) - content = mergedTagResult.content - if (mergedTagResult.fixCount > 0) { - issues.push( - `Split ${mergedTagResult.fixCount} merged closing tags to own lines` - ) - } - - // Collapse inline component line breaks to match English format - const inlineResult = normalizeInlineComponentsFromEnglish( - content, - englishMd + applyFix( + () => syncProtectedFrontmatterFields(content, englishMd!), + (n) => `Synced ${n} protected frontmatter fields from English` + ) + applyFix( + () => collapseInlineHtmlFromEnglish(content, englishMd!), + (n) => `Collapsed ${n} inline HTML tags to match English` + ) + applyFix( + () => fixMergedClosingTags(content, englishMd!), + (n) => `Split ${n} merged closing tags to own lines` + ) + applyFix( + () => normalizeInlineComponentsFromEnglish(content, englishMd!), + (n) => `Normalized ${n} inline components to match English` + ) + applyFix( + () => repairUnclosedBackticks(content, englishMd!), + (n) => `Repaired ${n} unclosed backticks` + ) + applyFix( + () => restoreBlankLinesFromEnglish(content, englishMd!), + (n) => `Restored ${n} blank lines from English` + ) + applyFix( + () => fixCollapsedComponentLineBreaks(content, englishMd!), + (n) => `Fixed ${n} collapsed component line breaks` ) - content = inlineResult.content - if 
(inlineResult.fixCount > 0) { - issues.push( - `Normalized ${inlineResult.fixCount} inline components to match English` - ) - } - - // Repair unclosed backticks in inline code - const backtickResult = repairUnclosedBackticks(content, englishMd) - content = backtickResult.content - if (backtickResult.fixCount > 0) { - issues.push(`Repaired ${backtickResult.fixCount} unclosed backticks`) - } - - const blankLineResult = restoreBlankLinesFromEnglish(content, englishMd) - content = blankLineResult.content - if (blankLineResult.fixCount > 0) { - issues.push( - `Restored ${blankLineResult.fixCount} blank lines from English` - ) - } - - // Fix collapsed line breaks between consecutive components - const collapsedResult = fixCollapsedComponentLineBreaks(content, englishMd) - content = collapsedResult.content - if (collapsedResult.fixCount > 0) { - issues.push( - `Fixed ${collapsedResult.fixCount} collapsed component line breaks` - ) - } // Fix and check protected brand names const brandResult = fixProtectedBrandNames(content, englishMd) - content = brandResult.content - if (brandResult.fixCount > 0) { + if (brandResult.content !== content) { issues.push(`Fixed ${brandResult.fixCount} brand name issues`) } + content = brandResult.content issues.push(...brandResult.warnings) // Fix translated hrefs using set comparison const hrefResult = fixTranslatedHrefs(content, englishMd) - content = hrefResult.content - if (hrefResult.fixCount > 0) { + if (hrefResult.content !== content) { issues.push( `Fixed ${hrefResult.fixCount} translated hrefs: ${hrefResult.fixes.join(", ")}` ) } + content = hrefResult.content issues.push(...hrefResult.warnings) // Detect cross-script contamination @@ -1776,7 +1738,7 @@ export async function runSanitizer( let jsonFilesToProcess: Array<{ path: string; content: string }> = [] if (filesWithContent && filesWithContent.length > 0) { - // Process only the specific files provided with their in-memory content + // Process specific files; if content is 
empty, reads from disk and writes fixes back console.log( `[SANITIZE] Target: ${filesWithContent.length} specific file(s)` ) @@ -1815,10 +1777,15 @@ export async function runSanitizer( } let mdFixed = 0 + let mdDiskWrites = 0 const mdIssues: Array<{ file: string; issues: string[] }> = [] const mdChanged: Array<{ path: string; content: string }> = [] for (const fileInfo of mdFilesToProcess) { + // Read original from disk for accurate disk-write detection + const originalOnDisk = fs.existsSync(fileInfo.path) + ? fs.readFileSync(fileInfo.path, "utf8") + : null const { fixed, issues, content } = processMarkdownFile( fileInfo.path, fileInfo.content @@ -1827,15 +1794,23 @@ export async function runSanitizer( mdFixed++ mdChanged.push({ path: fileInfo.path, content }) } + // Track actual disk changes (content differs from what's on disk) + if (originalOnDisk !== null && content !== originalOnDisk) { + mdDiskWrites++ + } if (issues.length) mdIssues.push({ file: path.relative(ROOT, fileInfo.path), issues }) } let jsonFixed = 0 + let jsonDiskWrites = 0 const jsonIssues: Array<{ file: string; issues: string[] }> = [] const jsonChanged: Array<{ path: string; content: string }> = [] for (const fileInfo of jsonFilesToProcess) { + const originalOnDisk = fs.existsSync(fileInfo.path) + ? 
fs.readFileSync(fileInfo.path, "utf8") + : null const { fixed, issues, content } = processJsonFile( fileInfo.path, fileInfo.content @@ -1844,15 +1819,18 @@ export async function runSanitizer( jsonFixed++ jsonChanged.push({ path: fileInfo.path, content }) } + if (originalOnDisk !== null && content !== originalOnDisk) { + jsonDiskWrites++ + } if (issues.length) jsonIssues.push({ file: path.relative(ROOT, fileInfo.path), issues }) } console.log( - `\n[SANITIZE] Markdown files scanned: ${mdFilesToProcess.length}, fixed: ${mdFixed}` + `\n[SANITIZE] Markdown: ${mdFilesToProcess.length} scanned, ${mdDiskWrites} written to disk` ) console.log( - `[SANITIZE] JSON files scanned: ${jsonFilesToProcess.length}, fixed: ${jsonFixed}` + `[SANITIZE] JSON: ${jsonFilesToProcess.length} scanned, ${jsonDiskWrites} written to disk` ) if (mdIssues.length || jsonIssues.length) { From e02e64d56f5a26e2916819772e5d3576b2f1182b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 21 Feb 2026 16:12:56 -0700 Subject: [PATCH 05/14] fix(types): await promise --- src/scripts/i18n/lib/workflows/sanitization.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts b/src/scripts/i18n/lib/workflows/sanitization.ts index 17f59c90ff9..548871af2dc 100644 --- a/src/scripts/i18n/lib/workflows/sanitization.ts +++ b/src/scripts/i18n/lib/workflows/sanitization.ts @@ -25,7 +25,7 @@ export async function runPostImportSanitization( console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`) - const sanitizeResult = runSanitizer(committedFiles) + const sanitizeResult = await runSanitizer(committedFiles) const changedFiles = sanitizeResult.changedFiles || [] if (changedFiles.length) { From 3797b707feb2c52b3d086102fde5c4520793327b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 21 Feb 2026 17:19:45 -0700 Subject: [PATCH 06/14] feat(i18n): 
add orphan detection to post-import sanitizer Flag translated files that have no English source at the expected path. When a single match is found by filename, suggests the correct location. Reports ambiguous cases with candidate count. Co-Authored-By: Claude Opus 4.6 --- src/scripts/i18n/post_import_sanitize.ts | 69 ++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index f0e1da46831..034e71f4a7b 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -1776,6 +1776,74 @@ export async function runSanitizer( jsonFilesToProcess = jsonFilePaths.map((p) => ({ path: p, content: "" })) } + // --- Orphan detection: flag translated files with no English counterpart --- + const orphanWarnings: Array<{ file: string; suggestion: string }> = [] + const translationsDir = path.join(CONTENT_ROOT, "translations") + + for (const fileInfo of [...mdFilesToProcess, ...jsonFilesToProcess]) { + const filePath = fileInfo.path + // Extract the relative path after translations/<lang>/ + const txIdx = filePath.indexOf(`${path.sep}translations${path.sep}`) + if (txIdx === -1) continue + + const afterTranslations = filePath.substring( + txIdx + `${path.sep}translations${path.sep}`.length + ) + // Strip the language code prefix: <lang>/rest/of/path + const slashIdx = afterTranslations.indexOf(path.sep) + if (slashIdx === -1) continue + + const langCode = afterTranslations.substring(0, slashIdx) + const relPathWithinLang = afterTranslations.substring(slashIdx + 1) + + // Derive the expected English source path + const englishPath = path.join(CONTENT_ROOT, relPathWithinLang) + + if (!fs.existsSync(englishPath)) { + const relFile = path.relative(ROOT, filePath) + // Try to find the English file by filename to suggest the correct location + const basename = path.basename(relPathWithinLang) + const parentDir = path.basename(path.dirname(relPathWithinLang)) + 
let suggestion = "No English counterpart found" + + // Search for matching parent/file pattern in English content + const englishContentFiles = listFiles(CONTENT_ROOT, (f) => { + if (f.includes(`${path.sep}translations${path.sep}`)) return false + return ( + f.endsWith(`${path.sep}${parentDir}${path.sep}${basename}`) && + !f.includes(`${path.sep}translations${path.sep}`) + ) + }) + + if (englishContentFiles.length === 1) { + const correctEnglishRel = path.relative( + CONTENT_ROOT, + englishContentFiles[0] + ) + const correctTranslationPath = path.join( + translationsDir, + langCode, + correctEnglishRel + ) + suggestion = `Likely belongs at: ${path.relative(ROOT, correctTranslationPath)}` + } else if (englishContentFiles.length > 1) { + suggestion = `Ambiguous: ${englishContentFiles.length} English candidates found (${englishContentFiles.map((f) => path.relative(CONTENT_ROOT, f)).join(", ")})` + } + + orphanWarnings.push({ file: relFile, suggestion }) + } + } + + if (orphanWarnings.length > 0) { + console.log( + `\n[SANITIZE] ⚠ Orphaned translations (no English source at expected path):` + ) + for (const w of orphanWarnings) { + console.log(` - ${w.file}`) + console.log(` ${w.suggestion}`) + } + } + let mdFixed = 0 let mdDiskWrites = 0 const mdIssues: Array<{ file: string; issues: string[] }> = [] @@ -1856,6 +1924,7 @@ export async function runSanitizer( markdown: { scanned: mdFilesToProcess.length, fixed: mdFixed }, json: { scanned: jsonFilesToProcess.length, fixed: jsonFixed }, issues: { markdown: mdIssues, json: jsonIssues }, + orphanWarnings, } } From 8de3206402270694c12b161789d8914208d70c60 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 21 Feb 2026 19:48:59 -0700 Subject: [PATCH 07/14] docs: compounding doc, path mapping and review workflow Documents root causes and fixes for misplaced translation files, the worktree-based review workflow, sanitizer enhancements, and automation permission requirements. 
Co-Authored-By: Claude Opus 4.6 --- ...n-file-path-mapping-and-review-workflow.md | 314 ++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md diff --git a/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md b/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md new file mode 100644 index 00000000000..88a869a7a30 --- /dev/null +++ b/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md @@ -0,0 +1,314 @@ +--- +title: "Crowdin File Path Mapping Bugs and Translation Review Workflow" +date: 2026-02-21 +category: integration-issues +tags: + - crowdin + - translations + - i18n + - file-path-matching + - sanitizer + - worktree + - automation + - permissions +severity: high +component: crowdin-import-pipeline +symptoms: + - "Translated files placed at incorrect paths (e.g., cs/beacon-chain/ instead of cs/roadmap/beacon-chain/)" + - "Systematic translation errors: AI replaced with UI, semantic inversions" + - "Orphaned translation files with no corresponding English source" + - "Inaccurate fix count logging in sanitizer output" + - "Cross-block href replacements affecting wrong sections" + - "Build failures in worktrees due to missing .env.local" + - "Merge conflicts discovered only at push time" +root_causes: + - "findCrowdinFile() used .endsWith() for path matching, producing false matches on similarly named paths" + - "Crowdin/Gemini translation engine confusing acronyms and inverting meaning" + - "Sanitizer tracked in-memory transforms instead of actual disk changes" + - "Href replacement applied globally instead of per-block" + - "No .env.local in worktrees (USE_MOCK_DATA not set)" + - "PR branches diverged from dev without early merge" +status: solved +related_prs: + - 17547 + - 17553 + - 17556 + - 17182 +--- + +# Crowdin File Path Mapping Bugs and Translation Review Workflow + +## 
Problem Summary + +During Phase 2 of the translation review pipeline (Czech pilot), we discovered that 12 Czech translation files were placed at incorrect filesystem paths. Investigation revealed the root cause in the Crowdin import workflow's path matching logic. Additionally, we established a reproducible worktree-based workflow for reviewing translation PRs and cataloged all permissions needed for automation. + +## Root Cause Analysis + +### Misplaced Translation Files + +**File:** `src/scripts/i18n/lib/crowdin/files.ts` + +The `findCrowdinFile()` function used `.endsWith()` to match Crowdin file paths against expected content paths: + +```ts +// BROKEN: matches too broadly +const found = crowdinFiles.find(({ path }) => + path.endsWith(targetFile.filePath) +) +``` + +When looking for `public/content/roadmap/beacon-chain/index.md`, this matched `cs/beacon-chain/index.md` because the suffix `beacon-chain/index.md` is valid for both. The `roadmap/` parent directory was silently ignored. 
+ +**Data flow of the bug:** + +``` +GitHub file path findCrowdinFile() processedFileIdToPath Download destination +public/content/roadmap/ --> .endsWith() matches --> Stores wrong Crowdin --> cs/beacon-chain/ + beacon-chain/index.md cs/beacon-chain/ path for this fileId index.md (WRONG) +``` + +**12 Czech files affected:** + +| Wrong Location | Correct Location | +|---|---| +| `cs/account-abstraction/` | `cs/roadmap/account-abstraction/` | +| `cs/beacon-chain/` | `cs/roadmap/beacon-chain/` | +| `cs/danksharding/` | `cs/roadmap/danksharding/` | +| `cs/future-proofing/` | `cs/roadmap/future-proofing/` | +| `cs/scaling/` | `cs/roadmap/scaling/` | +| `cs/statelessness/` | `cs/roadmap/statelessness/` | +| `cs/user-experience/` | `cs/roadmap/user-experience/` | +| `cs/withdrawals/` | `cs/staking/withdrawals/` | +| `cs/dvt/` | `cs/staking/dvt/` | +| `cs/support/` | `cs/community/support/` | +| `cs/code-of-conduct/` | `cs/community/code-of-conduct/` | +| `cs/developers/docs/wrapped-eth/` | `cs/wrapped-eth/` | + +### Translation Quality Issues + +Crowdin/Gemini translation engine produced two categories of critical error: + +1. **Acronym confusion**: "AI" systematically replaced with "UI" (5 instances in `cs/ai-agents/index.md`) +2. **Semantic inversion**: "malicious intent" translated as "good intentions" in `cs/bridges/index.md` + +### Sanitizer Logging Inaccuracy + +Individual fix functions returned `fixCount` based on in-memory transforms, not actual disk changes. Reported "22 files modified" when no bytes were written. + +### Cross-Block Href Interference + +`fixTranslatedHrefs()` applied replacements globally. When `/developers/docs/evm` appeared in both an EVM block and an Oracles block, the global replacement changed the correct href in the EVM block. + +## Solutions Implemented + +### Fix 1: Stricter Path Matching in findCrowdinFile() + +**File:** `src/scripts/i18n/lib/crowdin/files.ts` +**Branch:** `fix-i18n-workflow` + +```ts +// 1. 
Exact match first (after normalizing leading slashes) +const exactMatch = crowdinFiles.find( + ({ path }) => path.replace(/^\/+/, "") === normalizedTarget +) +if (exactMatch) return exactMatch + +// 2. Suffix match with "/" boundary guard +const suffixMatches = crowdinFiles.filter(({ path }) => { + const normalized = path.replace(/^\/+/, "") + if (!normalized.endsWith(normalizedTarget)) return false + const prefixLength = normalized.length - normalizedTarget.length + if (prefixLength === 0) return true + return normalized[prefixLength - 1] === "/" +}) + +// 3. Prefer longest (most specific) match +suffixMatches.sort((a, b) => b.path.length - a.path.length) +return suffixMatches[0] ?? null +``` + +### Fix 2: Orphan Detection in Sanitizer + +**File:** `src/scripts/i18n/post_import_sanitize.ts` +**Branch:** `fix-review-translations` + +For each translated file, derives the expected English source path and checks existence. If missing, searches by filename to suggest the correct location: + +```ts +const englishPath = path.join(CONTENT_ROOT, relPathWithinLang) + +if (!fs.existsSync(englishPath)) { + // Search for matching parent/file pattern in English content + const englishContentFiles = listFiles(CONTENT_ROOT, (f) => { + if (f.includes(`${path.sep}translations${path.sep}`)) return false + return f.endsWith(`${path.sep}${parentDir}${path.sep}${basename}`) + }) + + if (englishContentFiles.length === 1) { + suggestion = `Likely belongs at: ${correctTranslationPath}` + } else if (englishContentFiles.length > 1) { + suggestion = `Ambiguous: ${englishContentFiles.length} candidates` + } +} +``` + +### Fix 3: Accurate Disk-Write Tracking + +**File:** `src/scripts/i18n/post_import_sanitize.ts` + +Added `applyFix()` helper that snapshots content before/after each transform, plus `originalOnDisk` comparison: + +```ts +function applyFix( + fn: () => { content: string; fixCount: number }, + label: (count: number) => string +) { + const snapshot = content + const result = fn() + 
content = result.content + if (content !== snapshot) { + issues.push(label(result.fixCount)) + } +} +``` + +### Fix 4: Block-Scoped Href Replacement + +**File:** `src/scripts/i18n/post_import_sanitize.ts` + +Track `blockIdx` from detection phase, apply replacements only within the specific block: + +```ts +// Detection phase +blockFixes.push({ blockIdx: i, wrong: translatedHref, correct: expectedHref }) + +// Replacement phase - scoped to block +for (const { blockIdx, wrong, correct } of blockFixes) { + const originalBlock = translatedBlocks[blockIdx] + let fixedBlock = originalBlock.replace(markdownRe, `$1${correct}$2`) + if (fixedBlock !== originalBlock) { + result = result.replace(originalBlock, fixedBlock) + } +} +``` + +## Worktree Workflow for Translation Review + +Reproducible 8-step sequence for reviewing a translation PR: + +```bash +# 1. Create worktree from PR branch +git worktree add .worktrees/<name> <pr-branch> +cd .worktrees/<name> + +# 2. Provide environment variables (USE_MOCK_DATA=true avoids network calls) +cp .env.example .env.local + +# 3. Merge latest dev to catch conflicts early +git fetch origin dev && git merge origin/dev +# Resolve conflicts — typically modify/delete for misplaced files + +# 4. Copy sanitizer scripts from canonical branch (until merged to dev) +# Also add franc-min to package.json devDependencies +cp <canonical-worktree>/src/scripts/i18n/post_import_sanitize.ts ./src/scripts/i18n/ +cp <canonical-worktree>/src/scripts/i18n/lib/workflows/sanitization.ts ./src/scripts/i18n/lib/workflows/ + +# 5. Install dependencies +pnpm install + +# 6. Run sanitizer for orphan detection +TARGET_LANGUAGES=<lang> npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts + +# 7. Run review (critical issues only — no soft suggestions) +# Use /review-translations-local --pr=<pr-number> --language=<lang> + +# 8. 
Validate build +npx tsc --noEmit # TypeScript check FIRST +NEXT_PUBLIC_BUILD_LOCALES=en,<lang> pnpm build # Scoped build +``` + +### Key Notes + +- **Always run `npx tsc --noEmit` before `pnpm build`** — catches type errors cheaply +- **`.env.local` is mandatory** — without it, build attempts real API connections and fails +- **Merge dev early** — resolving conflicts before review prevents wasted work +- **Merge conflicts are expected** — misplaced files from prior imports cause modify/delete conflicts; accept the deletion +- **`franc-min` is required** — ESM-only package, needs devDependency until sanitizer changes reach dev + +## Automation Permissions Required + +All sandbox-restricted operations needed for this workflow: + +### Git Operations + +| Command | Purpose | +|---|---| +| `git worktree add/remove` | Create/destroy isolated review environments | +| `git fetch origin` | Retrieve latest upstream branches | +| `git merge origin/dev` | Integrate dev into PR branch | +| `git stash push/pop` | Temporarily shelve local edits | +| `git rm` | Remove orphaned/misplaced files | +| `git add` / `git commit` | Stage and commit fixes | +| `git push` | Push corrected branch to remote | + +### Package Management + +| Command | Purpose | +|---|---| +| `pnpm install` | Install dependencies (network + node_modules writes) | + +### Script Execution + +| Command | Purpose | +|---|---| +| `npx tsc --noEmit` | TypeScript check without emitting | +| `npx ts-node