From 017a8953fc743cb3c1e1472fcefd5a52993f28d9 Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 12:51:16 -0700
Subject: [PATCH 01/14] chore(deps): add franc-min as devDependency

ESM-only trigram language detection library used by the post-import sanitizer to detect untranslated paragraphs in translation files.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 package.json   |  1 +
 pnpm-lock.yaml | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/package.json b/package.json
index d98152c79c8..5b956dc6818 100644
--- a/package.json
+++ b/package.json
@@ -143,6 +143,7 @@
     "eslint-plugin-simple-import-sort": "^10.0.0",
     "eslint-plugin-storybook": "0.8.0",
     "eslint-plugin-unused-imports": "^3.2.0",
+    "franc-min": "^6.2.0",
     "husky": "^9.0.11",
     "image-size": "^1.0.2",
     "lint-staged": "^15.2.5",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index f2e077b8c71..bc3ee5d9831 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -341,6 +341,9 @@ importers:
       eslint-plugin-unused-imports:
         specifier: ^3.2.0
         version: 3.2.0(@typescript-eslint/eslint-plugin@7.18.0(@typescript-eslint/parser@7.18.0(eslint@8.57.1)(typescript@5.8.3))(eslint@8.57.1)(typescript@5.8.3))(eslint@8.57.1)
+      franc-min:
+        specifier: ^6.2.0
+        version: 6.2.0
       husky:
         specifier: ^9.0.11
         version: 9.1.7
@@ -6745,6 +6748,9 @@ packages:
       react-dom:
         optional: true
 
+  franc-min@6.2.0:
+    resolution: {integrity: sha512-1uDIEUSlUZgvJa2AKYR/dmJC66v/PvGQ9mWfI9nOr/kPpMFyvswK0gPXOwpYJYiYD008PpHLkGfG58SPjQJFxw==}
+
   fs-constants@1.0.0:
     resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==}
 
@@ -7823,6 +7829,9 @@ packages:
   mz@2.7.0:
     resolution: {integrity: sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==}
 
+  n-gram@2.0.2:
+    resolution: {integrity: sha512-S24aGsn+HLBxUGVAUFOwGpKs7LBcG4RudKU//eWzt/mQ97/NMKQxDWHyHx63UNWk/OOdihgmzoETn1tf5nQDzQ==}
+
   nanoid@3.3.11:
     resolution: {integrity: sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==}
     engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1}
@@ -9426,6 +9435,9 @@ packages:
   tr46@0.0.3:
     resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==}
 
+  trigram-utils@2.0.1:
+    resolution: {integrity: sha512-nfWIXHEaB+HdyslAfMxSqWKDdmqY9I32jS7GnqpdWQnLH89r6A5sdk3fDVYqGAZ0CrT8ovAFSAo6HRiWcWNIGQ==}
+
   trim-lines@3.0.1:
     resolution: {integrity: sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==}
 
@@ -18919,6 +18931,10 @@ snapshots:
       react: 18.3.1
       react-dom: 18.3.1(react@18.3.1)
 
+  franc-min@6.2.0:
+    dependencies:
+      trigram-utils: 2.0.1
+
   fs-constants@1.0.0: {}
 
   fs-extra@10.1.0:
@@ -20292,6 +20308,8 @@ snapshots:
       object-assign: 4.1.1
       thenify-all: 1.6.0
 
+  n-gram@2.0.2: {}
+
   nanoid@3.3.11: {}
 
   nanoid@3.3.8: {}
@@ -22135,6 +22153,11 @@ snapshots:
 
   tr46@0.0.3: {}
 
+  trigram-utils@2.0.1:
+    dependencies:
+      collapse-white-space: 2.1.0
+      n-gram: 2.0.2
+
   trim-lines@3.0.1: {}
 
   trough@2.2.0: {}

From 312f14e1b2e93be30c24b2d111dc8e78a23c85e9 Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 12:53:32 -0700
Subject: [PATCH 02/14] feat: enhance i18n sanitizer with 8 new checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds ticker transposition fixes (EHT→ETH, BSL→BLS, ECDAS→ECDSA), frontmatter tag syncing from English source, expanded brand name list with auto-fix for tags, cross-script contamination detection for 20+ locales, MDX angle bracket escaping, orphaned closing tag removal, and franc-min-powered untranslated paragraph detection. Makes runSanitizer async to support dynamic ESM import of franc-min.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/scripts/i18n/post_import_sanitize.ts | 478 ++++++++++++++++++++++-
 1 file changed, 465 insertions(+), 13 deletions(-)

diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts
index 000d668192f..c61da262ecf 100644
--- a/src/scripts/i18n/post_import_sanitize.ts
+++ b/src/scripts/i18n/post_import_sanitize.ts
@@ -1,6 +1,20 @@
 import * as fs from "fs"
 import * as path from "path"
 
+// franc-min is ESM-only, so it is loaded lazily through a dynamic import
+let francDetect: ((text: string) => string) | null = null // null until loaded
+async function loadFranc(): Promise<void> {
+  if (francDetect) return // already loaded by an earlier call
+  try {
+    francDetect = (await import("franc-min")).franc
+  } catch (error) {
+    console.warn(
+      "[SANITIZE] franc-min not available; skipping language detection",
+      error // keep the cause visible (e.g. a CJS/ESM loading failure)
+    )
+  }
+}
+
 /**
  * Post-import sanitizer for Crowdin translations.
  *
@@ -99,6 +113,10 @@ const PROTECTED_BRAND_NAMES = [
   // Programming languages
   "Solidity",
   "Vyper",
+  "Rust",
+  "JavaScript",
+  "TypeScript",
+  "Python",
   // Companies/Products
   "Alchemy",
   "Infura",
@@ -106,26 +124,151 @@ const PROTECTED_BRAND_NAMES = [
   "Consensys",
   "Chainlink",
   "OpenZeppelin",
+  "Gnosis",
+  "Flashbots",
+  "Etherscan",
+  "Hardhat",
+  "Foundry",
+  "Remix",
+  "Truffle",
+  "Ganache",
+  "Brownie",
+  "Waffle",
+  // Protocols/Projects
+  "Uniswap",
+  "Aave",
+  "Compound",
+  "MakerDAO",
+  "Lido",
+  "Rocket Pool",
+  "ENS",
+  // Core terms that must stay English
+  "Ethereum",
+  "Bitcoin",
+  "Beacon Chain",
+  "Solana",
+  "Polygon",
+  "Arbitrum",
+  "Optimism",
+  "Base",
 ]
 
 /**
- * Check if protected brand names from English source are preserved in translation.
- * Returns warnings for any brand names that appear in English but not in translation.
+ * Common ticker/acronym typos (transpositions, wrong casing) in translations.
+ * Maps wrong form → correct form.
  */
-function checkProtectedBrandNames(
+const TICKER_CORRECTIONS: Record<string, string> = {
+  EHT: "ETH",
+  BSL: "BLS",
+  ECDAS: "ECDSA",
+  KECCAK: "Keccak",
+}
+
+/**
+ * Fix ticker symbol transpositions on whole words only (word boundaries).
+ * Fenced and inline code is skipped so code samples are never rewritten.
+ */
+function fixTickerTranspositions(content: string): {
+  content: string
+  fixCount: number
+} {
+  let fixCount = 0
+  // split() with a capture group leaves the code spans at the odd indices
+  const parts = content.split(/(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g)
+  for (const [wrong, correct] of Object.entries(TICKER_CORRECTIONS)) {
+    const re = new RegExp(`\\b${escapeRegex(wrong)}\\b`, "g")
+    for (let i = 0; i < parts.length; i += 2) {
+      parts[i] = parts[i].replace(re, () => {
+        fixCount++
+        return correct
+      })
+    }
+  }
+  return { content: parts.join(""), fixCount }
+}
+
+/**
+ * Sync the frontmatter `tags` value with the English source.
+ * Tags like programming language names should never be translated, so a
+ * single-line `tags: [...]` value is overwritten with the English original.
+ * Block-style lists (`tags:` followed by `- item` lines) are left untouched.
+ *
+ * @returns the possibly updated content and the number of fixes (0 or 1)
+ */
+function syncFrontmatterTags(
   translatedContent: string,
   englishContent: string
-): string[] {
+): { content: string; fixCount: number } {
+  const frontmatterRe = /^---\n([\s\S]*?)\n---/
+  const transMatch = translatedContent.match(frontmatterRe)
+  const engMatch = englishContent.match(frontmatterRe)
+
+  if (!transMatch || !engMatch)
+    return { content: translatedContent, fixCount: 0 }
+
+  const transFm = transMatch[1]
+  const engFm = engMatch[1]
+
+  // Single-line tags value only: [ \t]* (unlike \s*) cannot cross a newline
+  const tagsRe = /^(tags:[ \t]*)(.+)$/m
+  const engTagsMatch = engFm.match(tagsRe)
+  const transTagsMatch = transFm.match(tagsRe)
+
+  if (!engTagsMatch || !transTagsMatch)
+    return { content: translatedContent, fixCount: 0 }
+
+  const engTagsValue = engTagsMatch[2].trim()
+  const transTagsValue = transTagsMatch[2].trim()
+
+  if (engTagsValue === transTagsValue)
+    return { content: translatedContent, fixCount: 0 }
+
+  // Replace translated tags with English tags. Function replacers are used so
+  // "$1", "$&" etc. inside the frontmatter are inserted literally.
+  const syncedTagsLine = `${transTagsMatch[1]}${engTagsValue}`
+  const updatedFm = transFm.replace(tagsRe, () => syncedTagsLine)
+  const updatedBlock = `---\n${updatedFm}\n---`
+  const content = translatedContent.replace(frontmatterRe, () => updatedBlock)
+
+  return { content, fixCount: 1 }
+}
+
+/**
+ * Fix protected brand names that were mistranslated.
+ * For each brand found in English source, if the count drops in translation,
+ * attempt to restore by finding the translated variants and replacing them.
+ *
+ * Strategy: For brand names where English count > translation count,
+ * we can't easily know what the mistranslation IS without locale knowledge.
+ * So we report these as warnings for the LLM review to handle.
+ *
+ * However, for frontmatter `tags` arrays, we CAN auto-fix by syncing with English.
+ */
+function fixProtectedBrandNames(
+  translatedContent: string,
+  englishContent: string
+): { content: string; fixCount: number; warnings: string[] } {
   const warnings: string[] = []
+  let content = translatedContent
+  let fixCount = 0
 
+  // Auto-fix: Sync frontmatter tags with English source
+  const tagsSyncResult = syncFrontmatterTags(content, englishContent)
+  content = tagsSyncResult.content
+  fixCount += tagsSyncResult.fixCount
+  if (tagsSyncResult.fixCount > 0) {
+    warnings.push(
+      `Auto-synced ${tagsSyncResult.fixCount} frontmatter tags with English source`
+    )
+  }
+
+  // Warn: Brand names with count mismatches in body content
   for (const brand of PROTECTED_BRAND_NAMES) {
-    // Check if brand exists in English source (case-sensitive match with word boundaries)
     const brandRegex = new RegExp(`\\b${escapeRegex(brand)}\\b`, "g")
     const inEnglish = englishContent.match(brandRegex)
 
     if (inEnglish && inEnglish.length > 0) {
-      // Brand is in English, check if it's preserved in translation
-      const inTranslation = translatedContent.match(brandRegex)
+      const inTranslation = content.match(brandRegex)
       const englishCount = inEnglish.length
       const translationCount = inTranslation?.length ?? 0
 
@@ -137,7 +280,7 @@ function checkProtectedBrandNames(
     }
   }
 
-  return warnings
+  return { content, fixCount, warnings }
 }
 
 /**
@@ -1051,6 +1194,274 @@ function quoteFrontmatterNonAscii(content: string): {
   return { content, fixCount }
 }
 
+/**
+ * Unexpected Unicode script ranges per locale.
+ * Maps a locale code (the directory name under `translations/`) to a regex of
+ * characters that must NOT appear in that locale's files (contamination).
+ */
+const CROSS_SCRIPT_DETECTORS: Record<
+  string,
+  { name: string; pattern: RegExp }
+> = {
+  // Latin-script languages should not contain Devanagari, CJK, Arabic, etc.
+  tr: {
+    name: "Devanagari/CJK/Cyrillic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF]/g,
+  },
+  fr: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  de: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  es: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  it: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  pt: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  pl: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  cs: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  id: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  sw: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  vi: {
+    name: "Devanagari/CJK/Cyrillic/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF\u0600-\u06FF]/g,
+  },
+  // Cyrillic languages should not contain Devanagari, CJK, Arabic, etc.
+  ru: {
+    name: "Devanagari/CJK/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF]/g,
+  },
+  uk: {
+    name: "Devanagari/CJK/Arabic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF]/g,
+  },
+  // Arabic should not contain Devanagari, CJK, Cyrillic, etc.
+  ar: {
+    name: "Devanagari/CJK/Cyrillic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF]/g,
+  },
+  ur: {
+    name: "Devanagari/CJK/Cyrillic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0400-\u04FF]/g,
+  },
+  // Devanagari languages should not contain CJK, Arabic, Cyrillic
+  hi: {
+    name: "CJK/Arabic/Cyrillic",
+    pattern: /[\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+  mr: {
+    name: "CJK/Arabic/Cyrillic",
+    pattern: /[\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+  // CJK languages should not contain Devanagari, Arabic, Cyrillic
+  ja: {
+    name: "Devanagari/Arabic/Cyrillic",
+    pattern: /[\u0900-\u097F\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+  ko: {
+    name: "Devanagari/Arabic/Cyrillic",
+    pattern: /[\u0900-\u097F\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+  "zh-tw": {
+    name: "Devanagari/Arabic/Cyrillic",
+    pattern: /[\u0900-\u097F\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+  // Tamil/Telugu should not contain Devanagari, CJK, Arabic, Cyrillic
+  ta: {
+    name: "Devanagari/CJK/Arabic/Cyrillic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+  te: {
+    name: "Devanagari/CJK/Arabic/Cyrillic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+  // Bengali should not contain other Indic, CJK, Arabic, Cyrillic
+  bn: {
+    name: "Devanagari/CJK/Arabic/Cyrillic",
+    pattern: /[\u0900-\u097F\u4E00-\u9FFF\u0600-\u06FF\u0400-\u04FF]/g,
+  },
+}
+
+/**
+ * Detect cross-script contamination in translated content.
+ * Locale is matched exactly first, then by language prefix ("pt-br" → "pt").
+ * Returns at most one warning summarizing every unexpected character found.
+ */
+function detectCrossScriptContamination(
+  content: string,
+  locale: string
+): string[] {
+  const key = locale.toLowerCase()
+  const detector =
+    CROSS_SCRIPT_DETECTORS[key] ?? CROSS_SCRIPT_DETECTORS[key.split("-")[0]]
+  if (!detector) return []
+
+  // Skip code blocks — foreign characters in code are valid.
+  // split() with a capture group puts the code spans at the odd indices.
+  const codeBlockRe = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g
+  const parts = content.split(codeBlockRe)
+
+  const found: string[] = []
+  for (let i = 0; i < parts.length; i += 2) {
+    for (const ch of parts[i].match(detector.pattern) ?? []) found.push(ch)
+  }
+  if (found.length === 0) return []
+
+  // Show up to five distinct offending characters as examples
+  const uniqueChars = Array.from(new Set(found)).slice(0, 5).join(", ")
+  return [
+    `Cross-script contamination: found ${found.length} ${detector.name} character(s) in ${locale} file (e.g., ${uniqueChars})`,
+  ]
+}
+
+/**
+ * Escape raw `<` before numbers in MDX content.
+ * Pattern: `<5GB` becomes `&lt;5GB` to prevent MDX treating it as a JSX tag.
+ * Skips code blocks (fenced and inline) where `<` is valid.
+ */
+function escapeMdxAngleBrackets(content: string): {
+  content: string
+  fixCount: number
+} {
+  let fixCount = 0
+
+  // split() with a capture group keeps the code spans at the odd indices
+  const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g
+  const parts = content.split(codeBlockPattern)
+
+  for (let i = 0; i < parts.length; i += 2) {
+    // A literal `<` is never part of an existing `&lt;` entity, so every `<`
+    // directly followed by a digit is escaped. Tag names cannot start with a
+    // digit, so real HTML/JSX tags are not affected.
+    parts[i] = parts[i].replace(/<(?=\d)/g, () => {
+      fixCount++
+      return "&lt;"
+    })
+  }
+
+  return { content: parts.join(""), fixCount }
+}
+
+/**
+ * Detect and remove orphaned closing HTML tags.
+ * These appear when translation restructures sentences and leaves behind
+ * closing tags like </a> without matching openers.
+ * A closer is an orphan when no unmatched opener precedes it in the same
+ * paragraph. Code (fenced or inline) and surrounding whitespace are untouched.
+ */
+function removeOrphanedClosingTags(content: string): {
+  content: string
+  fixCount: number
+} {
+  let fixCount = 0
+  const orphanTags = ["a", "span", "em", "strong", "b", "i", "u"]
+
+  // The capture group keeps fenced code and blank-line separators at the odd
+  // indices, so the even indices are the paragraphs to inspect.
+  const blocks = content.split(/(```[\s\S]*?```|~~~[\s\S]*?~~~|\n\s*\n)/)
+
+  for (let i = 0; i < blocks.length; i += 2) {
+    for (const tag of orphanTags) {
+      const closer = `</${tag}>`
+      // Tokens: inline code | opening tag | closing tag
+      const tokenRe = new RegExp(
+        "`[^`\\n]+`|<" + tag + "(?=[\\s/>])[^>]*>|" + closer,
+        "g"
+      )
+      let depth = 0 // openers still waiting for their closer
+      blocks[i] = blocks[i].replace(tokenRe, (token) => {
+        if (token !== closer) {
+          // Inline code and self-closing tags never open a scope
+          if (token.startsWith("<") && !token.endsWith("/>")) depth++
+          return token
+        }
+        if (depth > 0) {
+          depth--
+          return token
+        }
+        fixCount++
+        return ""
+      })
+    }
+  }
+
+  return { content: blocks.join(""), fixCount }
+}
+
+/**
+ * Detect paragraphs that appear to be untranslated (still in English).
+ * Uses franc-min on paragraphs longer than 100 characters (80 after cleanup).
+ * Flags a paragraph when franc's guess is "eng"; no confidence threshold is used.
+ */
+function detectUntranslatedContent(content: string, locale: string): string[] {
+  if (!francDetect) return []
+  // Don't check English files
+  if (locale === "en") return []
+
+  const warnings: string[] = []
+
+  // Split into paragraphs (skip frontmatter, code blocks)
+  const withoutFrontmatter = content.replace(/^---\n[\s\S]*?\n---\n?/, "")
+  const withoutCodeBlocks = withoutFrontmatter.replace(/```[\s\S]*?```/g, "")
+
+  const paragraphs = withoutCodeBlocks
+    .split(/\n\s*\n/)
+    .filter((p) => p.trim().length > 100) // Only check substantial paragraphs
+
+  let untranslatedCount = 0
+  for (const para of paragraphs) {
+    const cleanPara = para
+      .replace(/\[([^\]]*)\]\([^)]*\)/g, "$1") // Remove markdown links (keep text)
+      .replace(/<[^>]+>/g, "") // Remove HTML/JSX tags
+      .replace(/`[^`]+`/g, "") // Remove inline code
+      .trim()
+
+    if (cleanPara.length < 80) continue // Too short for reliable detection
+
+    const detected = francDetect(cleanPara)
+    if (detected === "eng") {
+      untranslatedCount++
+      // Only report first 3 to avoid noise
+      if (untranslatedCount <= 3) {
+        const preview = cleanPara.substring(0, 80).replace(/\n/g, " ")
+        warnings.push(
+          `Possibly untranslated paragraph (detected as English): "${preview}..."`
+        )
+      }
+    }
+  }
+
+  if (untranslatedCount > 3) {
+    warnings.push(
+      `...and ${untranslatedCount - 3} more potentially untranslated paragraphs`
+    )
+  }
+
+  return warnings
+}
+
 function processMarkdownFile(
   mdPath: string,
   providedContent?: string
@@ -1067,6 +1478,7 @@ function processMarkdownFile(
   // Map translated path to English path: remove `/translations/<lang>/` segment
   const parts = mdPath.split(path.sep)
   const idx = parts.lastIndexOf("translations")
+  const locale = idx !== -1 && idx + 1 < parts.length ? parts[idx + 1] : ""
   if (idx === -1 || idx + 2 >= parts.length) {
     issues.push("No translations segment found; skipping formatting sync")
   } else {
@@ -1133,6 +1545,29 @@ function processMarkdownFile(
     issues.push(`Unescaped ${escapedBacktickCount} backslash-escaped backticks`)
   }
 
+  // Fix ticker symbol transpositions (EHT → ETH, etc.)
+  const tickerResult = fixTickerTranspositions(content)
+  content = tickerResult.content
+  if (tickerResult.fixCount > 0) {
+    issues.push(`Fixed ${tickerResult.fixCount} ticker symbol transpositions`)
+  }
+
+  // Escape raw < before numbers in MDX content
+  const angleBracketResult = escapeMdxAngleBrackets(content)
+  content = angleBracketResult.content
+  if (angleBracketResult.fixCount > 0) {
+    issues.push(
+      `Escaped ${angleBracketResult.fixCount} raw angle brackets before numbers`
+    )
+  }
+
+  // Remove orphaned closing HTML tags
+  const orphanResult = removeOrphanedClosingTags(content)
+  content = orphanResult.content
+  if (orphanResult.fixCount > 0) {
+    issues.push(`Removed ${orphanResult.fixCount} orphaned closing HTML tags`)
+  }
+
   // Fix block component line breaks (critical for MDX parser)
   const blockResult = fixBlockComponentLineBreaks(content)
   content = blockResult.content
@@ -1207,9 +1642,13 @@ function processMarkdownFile(
       )
     }
 
-    // Check for mistranslated brand names (report-only)
-    const brandWarnings = checkProtectedBrandNames(content, englishMd)
-    issues.push(...brandWarnings)
+    // Fix and check protected brand names
+    const brandResult = fixProtectedBrandNames(content, englishMd)
+    content = brandResult.content
+    if (brandResult.fixCount > 0) {
+      issues.push(`Fixed ${brandResult.fixCount} brand name issues`)
+    }
+    issues.push(...brandResult.warnings)
 
     // Fix translated hrefs using set comparison
     const hrefResult = fixTranslatedHrefs(content, englishMd)
@@ -1220,6 +1659,18 @@ function processMarkdownFile(
       )
     }
     issues.push(...hrefResult.warnings)
+
+    // Detect cross-script contamination
+    if (locale) {
+      const scriptWarnings = detectCrossScriptContamination(content, locale)
+      issues.push(...scriptWarnings)
+    }
+
+    // Detect untranslated content
+    if (locale) {
+      const untranslatedWarnings = detectUntranslatedContent(content, locale)
+      issues.push(...untranslatedWarnings)
+    }
   }
 
   const fixed = before !== content
@@ -1314,11 +1765,12 @@ function languagesFromEnv(): string[] | undefined {
     .filter(Boolean)
 }
 
-export function runSanitizer(
+export async function runSanitizer(
   filesWithContent?: Array<{ path: string; content: string }>,
   langs?: string[]
 ) {
   console.log("[SANITIZE] Starting post-import sanitizer")
+  await loadFranc()
 
   let mdFilesToProcess: Array<{ path: string; content: string }> = []
   let jsonFilesToProcess: Array<{ path: string; content: string }> = []
@@ -1430,5 +1882,9 @@ export function runSanitizer(
 }
 
 if (require.main === module) {
-  runSanitizer()
+  // Report the failure and exit non-zero so CI notices a crashed run
+  runSanitizer().catch((error) => {
+    console.error(error)
+    process.exitCode = 1
+  })
 }

From 8fe6144ebe5d79c604bc8962104ba0c2ec00756a Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 12:53:45 -0700
Subject: [PATCH 03/14] docs: add translation review scaling strategy

Compound engineering document capturing the full brainstorm, 3-phase pipeline strategy, prevention matrix, and knowledge compounding approach for scaling review of 21 translation PRs across 24 languages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../scaling-translation-review-pipeline.md    | 350 ++++++++++++++++++
 1 file changed, 350 insertions(+)
 create mode 100644 docs/solutions/translation-review/scaling-translation-review-pipeline.md

diff --git a/docs/solutions/translation-review/scaling-translation-review-pipeline.md b/docs/solutions/translation-review/scaling-translation-review-pipeline.md
new file mode 100644
index 00000000000..4fc16c62cee
--- /dev/null
+++ b/docs/solutions/translation-review/scaling-translation-review-pipeline.md
@@ -0,0 +1,350 @@
+---
+title: "Scaling the Translation Review Pipeline for 24-Language Deployment"
+category: translation-review
+component: "post_import_sanitize.ts, review-translations.md, claude-review-translations.yml"
+symptoms:
+  - "Manual review of single PR (1 of 13 parts) takes 1-2 hours per language"
+  - "MDX syntax errors requiring manual fixes across translated content"
+  - "Brand names and product names mistranslated in target languages"
+  - "href attributes being translated when they should remain unchanged"
+  - "Cross-script contamination (e.g., Devanagari characters in Turkish files)"
+  - "Untranslated chunks requiring back-and-forth with Gemini for re-translation"
+  - "Scale challenge: ~178 exploded PRs across 20+ languages blocking production deployment"
+severity: high
+date: 2026-02-21
+tags:
+  - translation-pipeline
+  - i18n
+  - crowdin-integration
+  - gemini-2.5-pro
+  - multilingual-deployment
+  - mdx-content
+  - sanitization
+  - glossary-management
+  - quality-assurance
+  - batch-processing
+related_prs:
+  - 17182
+  - 17176
+  - 17247
+  - 17242
+  - 17227
+  - 17224
+  - 17219
+  - 17218
+  - 17210
+  - 17209
+  - 17199
+  - 17198
+  - 17186
+  - 17166
+  - 17164
+  - 17132
+  - 17127
+  - 17126
+  - 17125
+  - 17122
+  - 17105
+  - 17101
+languages_affected:
+  - ar
+  - bn
+  - cs
+  - de
+  - fr
+  - hi
+  - id
+  - it
+  - ja
+  - ko
+  - mr
+  - pl
+  - pt-br
+  - ru
+  - sw
+  - ta
+  - te
+  - tr
+  - uk
+  - ur
+  - vi
+  - zh-tw
+---
+
+# Scaling the Translation Review Pipeline for 24-Language Deployment
+
+## Problem Summary
+
+The ethereum.org website has been translated into 24 languages using Gemini 2.5 Pro via Crowdin. The translations were imported and placed into PRs -- both "unexploded" (1 PR per language, ~21 total) and "exploded" (13 parts per language, ~178 total). Manual review of a single exploded PR takes 1-2 hours, involving back-and-forth between Claude (review) and Gemini (re-translation), plus fixing MDX syntax errors, brand name mistranslations, href translations, cross-script contamination, and more. Extrapolating to all remaining PRs yields 178-356 hours of manual work.
+
+This document captures the strategic brainstorm and agreed-upon approach for scaling this process.
+
+## Root Cause Analysis
+
+The review bottleneck stems from several compounding factors:
+
+1. **Insufficient automated pre-screening** -- Issues like brand name mistranslations, broken MDX syntax, and Unicode contamination pass through to human review unnecessarily. The sanitizer catches many patterns but misses several known categories.
+2. **Exploded PR strategy** -- Breaking one language PR into 13 parts multiplied the review surface without multiplying reviewer capacity.
+3. **No knowledge persistence** -- Each review session starts from scratch; patterns discovered in one language are not reused for the next.
+4. **No build-level verification** -- Translation issues that cause MDX compilation failures are only discovered late in the process.
+5. **No automated bridge back to Gemini** -- When untranslated chunks are found, there is no automated way to re-submit them for translation and re-import results.
+
+## Solution Architecture
+
+### Phase 1: Foundation
+
+#### 1a. Knowledge Base Setup
+
+Establish a persistent, local-first knowledge base to accumulate findings across review sessions:
+
+- `~/.claude/translation-review/known-patterns.md` -- seeded from Turkish compound doc findings; documents recurring issues by type
+- `~/.claude/translation-review/per-language/` -- one file per locale capturing language-specific findings (common errors, glossary deviations, script quirks)
+- `~/.claude/translation-review/fetch-translation-glossary.json` -- already exported; schema: `Array<{ string_term, translation_text, language_code, total_votes }>`
+
+Initially local; later candidates for merging into the repo for full team access.
+
+#### 1b. Enhance `post_import_sanitize.ts`
+
+The current sanitizer already handles: header ID sync, href fixes (block-level set comparison), broken markdown links, frontmatter dates/quoting, guillemets, escaped backticks, block component line breaks, inline component normalization, brand name warnings, and unclosed backtick repair.
+
+Additions required:
+
+| Addition | Description |
+|---|---|
+| Brand name auto-fix | Expand `PROTECTED_BRAND_NAMES` list; switch from warn-only to auto-revert |
+| Cross-script contamination detector | Unicode range validation per locale (e.g., catch Devanagari characters in Turkish `.md` files) |
+| MDX `<` before numbers | Escape to `&lt;` outside code blocks to prevent MDX parse failures |
+| Orphaned HTML tag cleanup | Detect and remove `</a>` (and similar) without matching opener |
+| Frontmatter `tags` array protection | Prevent translation of programming language names and technical tags |
+| Ticker symbol correction dictionary | Catch and fix transpositions: `EHT`->`ETH`, `BSL`->`BLS`, etc. |
+| Href translation coverage audit | Verify the existing href fix catches all variants (e.g., `/governance` -> `/gobernanza`) |
+| Language detection on content segments | Flag paragraphs detected as English in a non-English file |
+
+#### 1c. Update `review-translations.md` Workflow Document
+
+Modify the review workflow to:
+
+- Read `~/.claude/translation-review/known-patterns.md` before deploying sub-agents
+- Load language-specific glossary entries from the JSON file at review start
+- Add MDX compilation check as a built-in review phase (not an afterthought)
+- Encode sub-agent architecture with clear separation of concerns:
+  - **MDX Syntax Agent** -- validates MDX structure, component usage, escaping
+  - **Brand Name Agent** -- checks protected terms against glossary and known-patterns
+  - **Href Validation Agent** -- verifies internal link translations are consistent with site routing
+  - **Semantic Review Agent** -- spot-checks translation quality against glossary votes
+  - **Build Verification Agent** -- runs `NEXT_PUBLIC_BUILD_LOCALES=en,{lang} pnpm build`
+- Document the targeted build command: `NEXT_PUBLIC_BUILD_LOCALES=en,{lang} pnpm build`
+
+### Phase 2: Validate on One Language
+
+Czech (`cs`) is the pilot language because it has only 3 exploded parts remaining, plus unexploded PR #17247, making it the lowest-cost full-pipeline test.
+
+Pipeline steps:
+
+1. Run enhanced sanitizer with all new additions enabled
+2. Run sub-agent review suite with `--fix` mode
+3. Execute `NEXT_PUBLIC_BUILD_LOCALES=en,cs pnpm build`
+4. Document all findings to `~/.claude/translation-review/per-language/cs.md`
+5. Merge, close exploded PRs
+6. Accumulate patterns back into `known-patterns.md`
+
+Success criteria: clean build, no brand name regressions, glossary alignment confirmed.
+
+### Phase 3: Scale to Remaining Languages
+
+**Tier A -- Finish exploded PRs (3-4 parts remaining):**
+- Czech (`cs`), Traditional Chinese (`zh-tw`), Ukrainian (`uk`), Telugu (`te`)
+- Strategy: complete remaining exploded parts using the validated pipeline
+
+**Tier B -- Partially done, switch to unexploded:**
+- Bengali (`bn`), German (`de`), Marathi (`mr`), Polish (`pl`), Swahili (`sw`), Tamil (`ta`), Urdu (`ur`), Turkish (`tr`)
+- Strategy: use single unexploded PR per language; apply full pipeline
+
+**Tier C -- Full review, unexploded only:**
+- Arabic (`ar`), French (`fr`), Hindi (`hi`), Indonesian (`id`), Italian (`it`), Japanese (`ja`), Korean (`ko`), Russian (`ru`), Vietnamese (`vi`), Brazilian Portuguese (`pt-br`)
+- Strategy: direct unexploded pipeline with knowledge base pre-loaded
+
+### Key Architectural Decisions
+
+| Decision | Rationale |
+|---|---|
+| **Prefer unexploded PRs (1 per language)** | Exploded PRs multiply human review surface; 13 parts x 2 hrs = 26 hrs per language vs. ~3 hrs for unexploded |
+| **Sub-agents split by concern, not file count** | Concern-based split allows each agent to specialize its detection logic; file-count split leads to uneven workloads and missed cross-file patterns |
+| **Gemini translates, Claude reviews** | Keeps the pipeline conservative and avoids introducing new translation errors during review. No automated bridge for re-translation yet. |
+| **Build verification uses locale isolation** | `NEXT_PUBLIC_BUILD_LOCALES=en,{lang}` avoids building all 60+ locales on every check |
+| **Knowledge base starts local** | Avoids premature repo noise; once patterns stabilize across 3-4 languages, promote to repo for team visibility |
+| **Czech as pilot** | Lowest risk (fewest remaining parts), sufficient complexity to stress-test the full pipeline before scaling to Tier C languages |
+| **Ralph Loop plugin under consideration** | Would enable iterate-until-build-passes automation; deferred until pipeline is stable |
+
+### Key Code Changes
+
+**File: `src/scripts/i18n/post_import_sanitize.ts`**
+
+- Expand `PROTECTED_BRAND_NAMES` constant with comprehensive brand terms list
+- Change brand name handling from `console.warn` to auto-revert with logging
+- Add `detectCrossScriptContamination(content, locale)` -- Unicode range validation per locale
+- Add `escapeMdxAngleBrackets(content)` -- targets `< N` patterns outside fenced code blocks
+- Add `removeOrphanedClosingTags(content)` -- regex-based orphan HTML tag detector
+- Add `protectFrontmatterTags(translatedFm, englishFm)` -- freeze tags array against English source
+- Add `TICKER_CORRECTIONS: Record<string, string>` dictionary and apply in sanitize pass
+- Audit and extend `fixTranslatedHrefs()` to cover all edge cases
+
+**File: `.claude/commands/review-translations.md`**
+
+- Add knowledge base load step at top of workflow
+- Add glossary injection step per language
+- Restructure sub-agent section with the 5-agent breakdown
+- Add build verification as mandatory final step with exact command
+
+## Prevention Matrix
+
+| Issue Category | Upstream Prevention | Automated Detection | Review-Level Detection | Long-term Fix |
+|---|---|---|---|---|
+| **Brand name mistranslation** | Crowdin glossary with "Do Not Translate" flag; explicit list in Gemini system prompt | Token-match against protected-terms allowlist; flag phonetic/semantic variants | LLM check: "Does this translation preserve all brand names exactly?" | Crowdin TM enforcement + MTQE threshold on brand-name segments |
+| **Cross-script contamination** | Crowdin project setting: enforce target locale script; Gemini script constraint | Unicode block range check per file per locale | LLM check: "Does any portion contain characters from an incompatible script?" | Per-locale Unicode allowlist enforced at import time as a hard gate |
+| **MDX syntax errors** | Crowdin HTML/MDX-aware segment protection; Gemini locked segment config | MDX AST parse post-import; regex for unmatched backtick parity, `<[0-9]`, unclosed HTML | LLM check: "Any raw `<` before numbers, unmatched backtick pairs, HTML outside code blocks?" | Mandatory `mdx-compile` step in post-import; quarantine failures |
+| **Semantic inversions** | Crowdin glossary entries for antonym pairs with definitions; Gemini system prompt with mutually exclusive term list | Concordance check: if source has "proof-of-work" verify translation uses correct locale term, not antonym | LLM check: "Verify all consensus mechanism terms match source meaning. Inversion is a known failure mode." | Semantic consistency test corpus per locale |
+| **Translated hrefs** | Crowdin: configure internal href paths as locked/non-translatable; Gemini system prompt: "Never translate URL paths" | Extract all `href` values, compare against source file href set; any divergence is a hard failure | LLM check: "Are all internal href values identical to the source?" | Href exact-match comparison as mandatory pre-merge CI check |
+| **Translated frontmatter tags** | Crowdin: mark frontmatter `tags` as non-translatable | Parse frontmatter, compare tag arrays against source; flag any tag not in source set | LLM check: "Do frontmatter tags match the source exactly?" | Frontmatter schema validation with strict allowlists |
+| **Ticker/acronym typos** | Crowdin glossary: ticker symbols as "Do Not Translate"; Gemini system prompt with explicit list | Edit-distance check: compare all uppercase tokens against the canonical ticker list; flag near-misses at Damerau-Levenshtein distance 1 (one substitution, insertion, deletion, or adjacent swap such as EHT for ETH), never exact matches | LLM check: "Are all tickers and acronyms spelled exactly as in source?" | Canonical ticker allowlist validated in CI |
+| **Domain typos** | Gemini system prompt: "The domain ethereum.org must never be altered"; Crowdin: lock URL segments | Regex: extract domain strings, assert exact match against `ethereum.org` | LLM check: "Any misspelling of ethereum.org?" | Regex validation in CI, zero tolerance |
+| **Untranslated content chunks** | Crowdin MT coverage threshold; Gemini system prompt: "Every segment must be translated" | Paragraph-level language detection; flag English content in non-English files above threshold | LLM check: "Are there paragraphs that appear untranslated?" | Language detection as post-import gate; failed segments queued for re-translation |
+| **Wrong technical term selection** | Crowdin glossary with preferred translations per locale for high-risk terms; Gemini prompt with locale-specific terminology reference | Concordance check: verify technical terms use glossary entries, not colloquial equivalents | LLM check is primary: "Check that technical terms use established Ethereum translations" | Per-locale Ethereum technical glossary maintained as versioned data file |
+
+## Knowledge Compounding Strategy
+
+### Session Memory (Per-Locale)
+
+After each language review, findings are written to `~/.claude/translation-review/per-language/[locale].md`:
+
+- Confirmed issues by category
+- False positives to suppress in future reviews
+- Glossary additions/corrections
+- Systemic notes (e.g., "Crowdin TM appears contaminated from Hindi batch")
+
+### Cross-Locale Aggregation
+
+`~/.claude/translation-review/known-patterns.md` is maintained as a rolling aggregate that:
+
+1. Captures patterns seen across multiple locales (e.g., brand name issues in 8+ languages = systemic upstream problem)
+2. Records confirmed false-positive patterns to suppress
+3. Provides the context injection block for review agents
+
+### Inter-Agent Context Injection
+
+Each review agent receives prior findings as context:
+
+```
+Known issues confirmed in prior reviews of this locale:
+- "katillik" is a mistranslation of "Solidity" -- flag all occurrences
+- Cross-script contamination from Devanagari was found -- check for recurrence
+
+Cross-locale patterns seen in 5+ languages:
+- DeFi is being translated as "MeFi" -- check this locale
+- Internal hrefs are being translated -- perform href audit
+```
+
+This transforms each review from a cold start into an informed continuation.
+
+## Pipeline Hardening Recommendations (Ordered by Impact)
+
+1. **Mandatory MDX compile gate** -- Every file must pass MDX AST parse before entering review queue. Files that fail are quarantined immediately. Highest-leverage check: fully deterministic, zero ambiguity.
+
+2. **Href exact-match validation** -- Extract all `href` attributes from source and translated files, compare sets. Any deviation is a hard failure. Zero false-positive risk.
+
+3. **Unicode script range validation** -- Per-locale expected Unicode block range. Catches cross-script contamination with zero ambiguity.
+
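+4. **Canonical ticker fuzzy-match** -- Damerau-Levenshtein distance of exactly 1 on all uppercase tokens against the canonical ticker list (plain Levenshtein scores an adjacent swap such as EHT -> ETH as 2, and distance 0 is a correct ticker). Catches transpositions that human reviewers and LLMs miss under volume.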
+
+5. **Language detection on content segments** -- Paragraph-level language ID on translated files. English content in non-English files above threshold flags for re-translation queue.
+
+6. **Domain string exact-match** -- Regex for `ethereum` + TLD-like pattern. Trivial to implement, catches trust/SEO issues.
+
+7. **Frontmatter schema validation** -- Parse with gray-matter, validate fixed fields against source. Prevents programming language names from being localized in tags.
+
+8. **Brand name token allowlist** -- Protected-terms list with auto-revert. Requires per-locale map for terms with accepted translations vs. always-English terms.
+
+9. **Build verification in CI** -- `NEXT_PUBLIC_BUILD_LOCALES=en,{lang} pnpm build` as required PR check. Full build catches integration failures that segment-level checks miss.
+
+10. **Findings persistence and context injection** -- Write structured findings after each review. Inject prior findings as context for subsequent reviews. Without this, each review starts cold.
+
+## Open Problems
+
+### Gemini Re-Translation Gap
+
+When untranslated chunks are detected, there is no automated round-trip back to Gemini for completion. The current workflow requires manual intervention: extract the file, submit to Gemini with glossary context, receive output, re-import into the repo branch. A proper fix requires a re-translation queue and Gemini API integration outside the Crowdin workflow.
+
+### Semantic Inversion Detection
+
+Detecting swapped consensus mechanism terminology (PoW/PoS) requires knowing the correct translation of both terms in every target language. No universal automated approach exists. Partial solution: build term maps for critical antonym pairs during first review of each locale and persist them.
+
+### Wrong Technical Term Selection at Scale
+
+Distinguishing "client (software)" from "client (customer)" requires semantic context that regex/token checks cannot provide. LLM review is the only practical detector, but at 20+ languages, LLM review cost and latency are constraints.
+
+### Crowdin Translation Memory Contamination
+
+Cross-script contamination (Devanagari in Turkish) suggests Crowdin TM is pulling from wrong-locale segments. Root cause is unclear without Crowdin admin access. Downstream mitigations (Unicode range gate, Gemini script constraint) are in place, but the actual fix requires auditing TM isolation per locale.
+
+### Ralph Loop Integration
+
+The [Ralph Loop](https://claude.com/plugins/ralph-loop) Claude Code plugin enables iterative loops where Claude works on a task repeatedly until completion. It uses a stop hook to re-feed the prompt while preserving file modifications between iterations. This maps well to the "sanitize -> review -> fix -> verify -> repeat" cycle. However, integration with worktree isolation and the multi-model pipeline (Gemini for translation, Claude for review) needs validation before adoption at scale.
+
+## Related Documentation
+
+### Existing Compound Docs
+
+| Document | Location | Status |
+|---|---|---|
+| Turkish (tr) Review - PR #17182 | `docs/solutions/translation-review/crowdin-import-review-turkish-pr-17182.md` | On `dev` branch |
+| Vietnamese (vi) Review - PR #17176 | `docs/solutions/translation-review/crowdin-import-review-vietnamese-pr-17176.md` | On `i18n/import/2026-01-27T15-06-08-vi` branch only |
+
+### Key Codebase Files
+
+| File | Role |
+|---|---|
+| `src/scripts/i18n/post_import_sanitize.ts` | Deterministic post-import sanitizer |
+| `.claude/commands/review-translations.md` | Claude Code review command |
+| `.github/workflows/claude-review-translations.yml` | CI workflow for automated review |
+| `src/scripts/i18n/main.ts` | Import pipeline orchestrator |
+| `src/scripts/i18n/config.ts` | Pipeline configuration (languages, paths, API keys) |
+| `.claude/commands/netlify-build-check.md` | Build status check and MDX error analysis |
+| `src/intl/[locale]/glossary.json` | Per-locale glossary files |
+| `src/scripts/i18n/lib/supabase/glossary.ts` | Supabase glossary client |
+
+### Unexploded PRs (One Per Language)
+
+| PR | Language | State |
+|---|---|---|
+| #17247 | Czech (cs) | Open |
+| #17242 | Traditional Chinese (zh-tw) | Open |
+| #17227 | Swahili (sw) | Open |
+| #17224 | Marathi (mr) | Open |
+| #17219 | Telugu (te) | Open |
+| #17218 | Tamil (ta) | Open |
+| #17210 | Ukrainian (uk) | Open |
+| #17209 | Urdu (ur) | Open |
+| #17199 | Polish (pl) | Open |
+| #17198 | Italian (it) | Open |
+| #17186 | Bengali (bn) | Open |
+| #17176 | Vietnamese (vi) | Open |
+| #17166 | Korean (ko) | Open |
+| #17164 | German (de) | Open |
+| #17132 | Japanese (ja) | Open |
+| #17127 | Russian (ru) | Open |
+| #17126 | Indonesian (id) | Open |
+| #17125 | French (fr) | Open |
+| #17122 | Brazilian Portuguese (pt-br) | Open |
+| #17105 | Arabic (ar) | Open |
+| #17101 | Hindi (hi) | Open |
+
+### Cross-References
+
+| Source | References | Nature |
+|---|---|---|
+| Turkish compound doc | `post_import_sanitize.ts` | Recommends adding brand name dictionary and cross-script detector |
+| Turkish compound doc | `review-translations.md` | Command that ran the review |
+| Turkish compound doc | Vietnamese PR #17176 companion doc | Same MDX error patterns |
+| `review-translations.md` | `netlify-build-check.md` | Review command recommends running build check for MDX errors |
+| `.github/workflows/claude-review-translations.yml` | `review-translations.md` | Workflow executes the command via `claude-code-action@v1` |
+| `src/scripts/i18n/docs/v0.2.0-roadmap.md` | `lib/supabase/glossary.ts` | Roadmap plans Supabase glossary sync; file already exists |

From 4cb98fb46b29aa51b98b954c0c7dca433fa9baa6 Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 15:55:59 -0700
Subject: [PATCH 04/14] feat: improve sanitizer logging and href fixing

Replaces fixCount-based issue reporting with actual content comparison so transforms only log when content genuinely changes. Adds block-scoped href replacement to prevent cross-block interference when the same href appears in multiple blocks. Detects displaced hrefs that are globally valid but in the wrong block.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/scripts/i18n/post_import_sanitize.ts | 298 +++++++++++------------
 1 file changed, 138 insertions(+), 160 deletions(-)

diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts
index c61da262ecf..f0e1da46831 100644
--- a/src/scripts/i18n/post_import_sanitize.ts
+++ b/src/scripts/i18n/post_import_sanitize.ts
@@ -445,7 +445,11 @@ function fixTranslatedHrefs(
   // Collect all English internal hrefs as the "valid" set
   const allEnglishHrefs = extractHrefs(englishContent)
 
-  const allFixes: Array<[string, string]> = [] // [wrong, correct]
+  const blockFixes: Array<{
+    blockIdx: number
+    wrong: string
+    correct: string
+  }> = []
   const allWarnings: string[] = []
 
   // Process block by block
@@ -461,15 +465,17 @@ function fixTranslatedHrefs(
     // Skip blocks with no internal hrefs
     if (engHrefs.length === 0 && transHrefs.length === 0) continue
 
-    // Find hrefs in translation that don't exist in English (invalid)
+    // Compare hrefs at block level
+    const engHrefSet = new Set(engHrefs)
     const transHrefSet = new Set(transHrefs)
 
-    const invalidInTrans: string[] = [] // In translation but not in any English href
-    const missingFromTrans: string[] = [] // In English block but not in translation
+    // Hrefs in translation block but NOT in corresponding English block
+    const displacedInTrans: string[] = []
+    const missingFromTrans: string[] = []
 
     for (const href of transHrefs) {
-      if (!allEnglishHrefs.has(href)) {
-        invalidInTrans.push(href)
+      if (!engHrefSet.has(href)) {
+        displacedInTrans.push(href)
       }
     }
 
@@ -480,21 +486,24 @@ function fixTranslatedHrefs(
     }
 
     // No issues in this block
-    if (invalidInTrans.length === 0 && missingFromTrans.length === 0) continue
+    if (displacedInTrans.length === 0 && missingFromTrans.length === 0) continue
 
     // Deduplicate for set comparison
-    const uniqueInvalid = [...new Set(invalidInTrans)]
+    const uniqueDisplaced = [...new Set(displacedInTrans)]
     const uniqueMissing = [...new Set(missingFromTrans)]
 
-    // Only auto-fix when there's exactly 1 invalid and 1 missing in block
-    // Multiple mismatches within same block could be reordered - don't guess
-    if (uniqueInvalid.length === 1 && uniqueMissing.length === 1) {
-      allFixes.push([uniqueInvalid[0], uniqueMissing[0]])
-    } else if (uniqueInvalid.length > 0 || uniqueMissing.length > 0) {
-      // Count mismatch - can't safely fix, warn instead
-      for (const href of uniqueInvalid) {
+    // Auto-fix when there's exactly 1 displaced and 1 missing in the same block
+    if (uniqueDisplaced.length === 1 && uniqueMissing.length === 1) {
+      blockFixes.push({
+        blockIdx: i,
+        wrong: uniqueDisplaced[0],
+        correct: uniqueMissing[0],
+      })
+    } else if (uniqueDisplaced.length > 0 || uniqueMissing.length > 0) {
+      for (const href of uniqueDisplaced) {
+        const globallyValid = allEnglishHrefs.has(href)
         allWarnings.push(
-          `Block ${i + 1}: Invalid href "${href}" - not a valid English path`
+          `Block ${i + 1}: ${globallyValid ? "Displaced" : "Invalid"} href "${href}" - not in corresponding English block`
         )
       }
       for (const href of uniqueMissing) {
@@ -512,25 +521,28 @@ function fixTranslatedHrefs(
     )
   }
 
-  // Apply all fixes
+  // Apply fixes block-by-block to avoid cross-block interference
   let result = translatedContent
   const appliedFixes: string[] = []
 
-  for (const [wrong, correct] of allFixes) {
+  for (const { blockIdx, wrong, correct } of blockFixes) {
+    const originalBlock = translatedBlocks[blockIdx]
+    let fixedBlock = originalBlock
+
     // Replace in markdown links: [text](wrong) → [text](correct)
     const markdownRe = new RegExp(
       `(\\[[^\\]]*\\]\\()${escapeRegex(wrong)}(\\))`,
       "g"
     )
-    const beforeMd = result
-    result = result.replace(markdownRe, `$1${correct}$2`)
+    fixedBlock = fixedBlock.replace(markdownRe, `$1${correct}$2`)
 
     // Replace in href attributes: href="wrong" → href="correct"
     const hrefRe = new RegExp(`(href=["'])${escapeRegex(wrong)}(["'])`, "g")
-    const beforeAttr = result
-    result = result.replace(hrefRe, `$1${correct}$2`)
+    fixedBlock = fixedBlock.replace(hrefRe, `$1${correct}$2`)
 
-    if (result !== beforeMd || result !== beforeAttr) {
+    if (fixedBlock !== originalBlock) {
+      result = result.replace(originalBlock, fixedBlock)
+      translatedBlocks[blockIdx] = fixedBlock // update for subsequent fixes
       appliedFixes.push(`${wrong} → ${correct}`)
     }
   }
@@ -1498,166 +1510,116 @@ function processMarkdownFile(
 
   const before = content
 
-  // Fix duplicated headings (e.g., ## Text? Text? {#id} → ## Text? {#id})
-  const duplicatedResult = fixDuplicatedHeadings(content)
-  content = duplicatedResult.content
-  if (duplicatedResult.fixCount > 0) {
-    issues.push(`Fixed ${duplicatedResult.fixCount} duplicated headings`)
-  }
-
-  // Fix broken markdown links (] (https:// → ](https://)
-  const brokenLinksResult = fixBrokenMarkdownLinks(content)
-  content = brokenLinksResult.content
-  if (brokenLinksResult.fixCount > 0) {
-    issues.push(`Fixed ${brokenLinksResult.fixCount} broken markdown links`)
-  }
-
-  // Fix frontmatter issues (don't need English source)
-  const dateResult = normalizeFrontmatterDates(content)
-  content = dateResult.content
-  if (dateResult.fixCount > 0) {
-    issues.push(
-      `Normalized ${dateResult.fixCount} frontmatter dates to ISO format`
-    )
-  }
-
-  const quoteResult = quoteFrontmatterNonAscii(content)
-  content = quoteResult.content
-  if (quoteResult.fixCount > 0) {
-    issues.push(
-      `Quoted ${quoteResult.fixCount} frontmatter values with non-ASCII chars`
-    )
+  // Helper: only log a fix if content actually changed
+  function applyFix(
+    fn: () => { content: string; fixCount: number },
+    label: (count: number) => string
+  ) {
+    const snapshot = content
+    const result = fn()
+    content = result.content
+    if (content !== snapshot) {
+      issues.push(label(result.fixCount))
+    }
   }
 
-  const guillemetResult = fixAsciiGuillemets(content)
-  content = guillemetResult.content
-  if (guillemetResult.fixCount > 0) {
-    issues.push(
-      `Fixed ${guillemetResult.fixCount} ASCII guillemets (<< >>) to Unicode (« »)`
-    )
-  }
+  applyFix(
+    () => fixDuplicatedHeadings(content),
+    (n) => `Fixed ${n} duplicated headings`
+  )
+  applyFix(
+    () => fixBrokenMarkdownLinks(content),
+    (n) => `Fixed ${n} broken markdown links`
+  )
+  applyFix(
+    () => normalizeFrontmatterDates(content),
+    (n) => `Normalized ${n} frontmatter dates to ISO format`
+  )
+  applyFix(
+    () => quoteFrontmatterNonAscii(content),
+    (n) => `Quoted ${n} frontmatter values with non-ASCII chars`
+  )
+  applyFix(
+    () => fixAsciiGuillemets(content),
+    (n) => `Fixed ${n} ASCII guillemets (<< >>) to Unicode (« »)`
+  )
 
   // Fix escaped backticks (\`) to regular backticks (`)
-  // Crowdin sometimes escapes backticks unnecessarily
-  const escapedBacktickCount = (content.match(/\\`/g) || []).length
-  if (escapedBacktickCount > 0) {
+  {
+    const snapshot = content
     content = content.replace(/\\`/g, "`")
-    issues.push(`Unescaped ${escapedBacktickCount} backslash-escaped backticks`)
-  }
-
-  // Fix ticker symbol transpositions (EHT → ETH, etc.)
-  const tickerResult = fixTickerTranspositions(content)
-  content = tickerResult.content
-  if (tickerResult.fixCount > 0) {
-    issues.push(`Fixed ${tickerResult.fixCount} ticker symbol transpositions`)
-  }
-
-  // Escape raw < before numbers in MDX content
-  const angleBracketResult = escapeMdxAngleBrackets(content)
-  content = angleBracketResult.content
-  if (angleBracketResult.fixCount > 0) {
-    issues.push(
-      `Escaped ${angleBracketResult.fixCount} raw angle brackets before numbers`
-    )
-  }
-
-  // Remove orphaned closing HTML tags
-  const orphanResult = removeOrphanedClosingTags(content)
-  content = orphanResult.content
-  if (orphanResult.fixCount > 0) {
-    issues.push(`Removed ${orphanResult.fixCount} orphaned closing HTML tags`)
+    if (content !== snapshot) {
+      const count = (snapshot.match(/\\`/g) || []).length
+      issues.push(`Unescaped ${count} backslash-escaped backticks`)
+    }
   }
 
-  // Fix block component line breaks (critical for MDX parser)
-  const blockResult = fixBlockComponentLineBreaks(content)
-  content = blockResult.content
-  if (blockResult.fixCount > 0) {
-    issues.push(`Fixed ${blockResult.fixCount} inline block component tags`)
-  }
+  applyFix(
+    () => fixTickerTranspositions(content),
+    (n) => `Fixed ${n} ticker symbol transpositions`
+  )
+  applyFix(
+    () => escapeMdxAngleBrackets(content),
+    (n) => `Escaped ${n} raw angle brackets before numbers`
+  )
+  applyFix(
+    () => removeOrphanedClosingTags(content),
+    (n) => `Removed ${n} orphaned closing HTML tags`
+  )
+  applyFix(
+    () => fixBlockComponentLineBreaks(content),
+    (n) => `Fixed ${n} inline block component tags`
+  )
 
   content = normalizeBlockHtmlLines(content)
 
   // Normalize inline components and restore blank lines from English source
   if (englishMd) {
-    // Sync protected frontmatter fields (template, sidebar, etc.)
-    const protectedResult = syncProtectedFrontmatterFields(content, englishMd)
-    content = protectedResult.content
-    if (protectedResult.fixCount > 0) {
-      issues.push(
-        `Synced ${protectedResult.fixCount} protected frontmatter fields from English`
-      )
-    }
-
-    // Collapse inline HTML tags to match English single-line format
-    const inlineHtmlResult = collapseInlineHtmlFromEnglish(content, englishMd)
-    content = inlineHtmlResult.content
-    if (inlineHtmlResult.fixCount > 0) {
-      issues.push(
-        `Collapsed ${inlineHtmlResult.fixCount} inline HTML tags to match English`
-      )
-    }
-
-    // Fix JSX component closing tags merged with content (split to own line)
-    const mergedTagResult = fixMergedClosingTags(content, englishMd)
-    content = mergedTagResult.content
-    if (mergedTagResult.fixCount > 0) {
-      issues.push(
-        `Split ${mergedTagResult.fixCount} merged closing tags to own lines`
-      )
-    }
-
-    // Collapse inline component line breaks to match English format
-    const inlineResult = normalizeInlineComponentsFromEnglish(
-      content,
-      englishMd
+    applyFix(
+      () => syncProtectedFrontmatterFields(content, englishMd!),
+      (n) => `Synced ${n} protected frontmatter fields from English`
+    )
+    applyFix(
+      () => collapseInlineHtmlFromEnglish(content, englishMd!),
+      (n) => `Collapsed ${n} inline HTML tags to match English`
+    )
+    applyFix(
+      () => fixMergedClosingTags(content, englishMd!),
+      (n) => `Split ${n} merged closing tags to own lines`
+    )
+    applyFix(
+      () => normalizeInlineComponentsFromEnglish(content, englishMd!),
+      (n) => `Normalized ${n} inline components to match English`
+    )
+    applyFix(
+      () => repairUnclosedBackticks(content, englishMd!),
+      (n) => `Repaired ${n} unclosed backticks`
+    )
+    applyFix(
+      () => restoreBlankLinesFromEnglish(content, englishMd!),
+      (n) => `Restored ${n} blank lines from English`
+    )
+    applyFix(
+      () => fixCollapsedComponentLineBreaks(content, englishMd!),
+      (n) => `Fixed ${n} collapsed component line breaks`
     )
-    content = inlineResult.content
-    if (inlineResult.fixCount > 0) {
-      issues.push(
-        `Normalized ${inlineResult.fixCount} inline components to match English`
-      )
-    }
-
-    // Repair unclosed backticks in inline code
-    const backtickResult = repairUnclosedBackticks(content, englishMd)
-    content = backtickResult.content
-    if (backtickResult.fixCount > 0) {
-      issues.push(`Repaired ${backtickResult.fixCount} unclosed backticks`)
-    }
-
-    const blankLineResult = restoreBlankLinesFromEnglish(content, englishMd)
-    content = blankLineResult.content
-    if (blankLineResult.fixCount > 0) {
-      issues.push(
-        `Restored ${blankLineResult.fixCount} blank lines from English`
-      )
-    }
-
-    // Fix collapsed line breaks between consecutive components
-    const collapsedResult = fixCollapsedComponentLineBreaks(content, englishMd)
-    content = collapsedResult.content
-    if (collapsedResult.fixCount > 0) {
-      issues.push(
-        `Fixed ${collapsedResult.fixCount} collapsed component line breaks`
-      )
-    }
 
     // Fix and check protected brand names
     const brandResult = fixProtectedBrandNames(content, englishMd)
-    content = brandResult.content
-    if (brandResult.fixCount > 0) {
+    if (brandResult.content !== content) {
       issues.push(`Fixed ${brandResult.fixCount} brand name issues`)
     }
+    content = brandResult.content
     issues.push(...brandResult.warnings)
 
     // Fix translated hrefs using set comparison
     const hrefResult = fixTranslatedHrefs(content, englishMd)
-    content = hrefResult.content
-    if (hrefResult.fixCount > 0) {
+    if (hrefResult.content !== content) {
       issues.push(
         `Fixed ${hrefResult.fixCount} translated hrefs: ${hrefResult.fixes.join(", ")}`
       )
     }
+    content = hrefResult.content
     issues.push(...hrefResult.warnings)
 
     // Detect cross-script contamination
@@ -1776,7 +1738,7 @@ export async function runSanitizer(
   let jsonFilesToProcess: Array<{ path: string; content: string }> = []
 
   if (filesWithContent && filesWithContent.length > 0) {
-    // Process only the specific files provided with their in-memory content
+    // Process specific files; if content is empty, reads from disk and writes fixes back
     console.log(
       `[SANITIZE] Target: ${filesWithContent.length} specific file(s)`
     )
@@ -1815,10 +1777,15 @@ export async function runSanitizer(
   }
 
   let mdFixed = 0
+  let mdDiskWrites = 0
   const mdIssues: Array<{ file: string; issues: string[] }> = []
   const mdChanged: Array<{ path: string; content: string }> = []
 
   for (const fileInfo of mdFilesToProcess) {
+    // Read original from disk for accurate disk-write detection
+    const originalOnDisk = fs.existsSync(fileInfo.path)
+      ? fs.readFileSync(fileInfo.path, "utf8")
+      : null
     const { fixed, issues, content } = processMarkdownFile(
       fileInfo.path,
       fileInfo.content
@@ -1827,15 +1794,23 @@ export async function runSanitizer(
       mdFixed++
       mdChanged.push({ path: fileInfo.path, content })
     }
+    // Track actual disk changes (content differs from what's on disk)
+    if (originalOnDisk !== null && content !== originalOnDisk) {
+      mdDiskWrites++
+    }
     if (issues.length)
       mdIssues.push({ file: path.relative(ROOT, fileInfo.path), issues })
   }
 
   let jsonFixed = 0
+  let jsonDiskWrites = 0
   const jsonIssues: Array<{ file: string; issues: string[] }> = []
   const jsonChanged: Array<{ path: string; content: string }> = []
 
   for (const fileInfo of jsonFilesToProcess) {
+    const originalOnDisk = fs.existsSync(fileInfo.path)
+      ? fs.readFileSync(fileInfo.path, "utf8")
+      : null
     const { fixed, issues, content } = processJsonFile(
       fileInfo.path,
       fileInfo.content
@@ -1844,15 +1819,18 @@ export async function runSanitizer(
       jsonFixed++
       jsonChanged.push({ path: fileInfo.path, content })
     }
+    if (originalOnDisk !== null && content !== originalOnDisk) {
+      jsonDiskWrites++
+    }
     if (issues.length)
       jsonIssues.push({ file: path.relative(ROOT, fileInfo.path), issues })
   }
 
   console.log(
-    `\n[SANITIZE] Markdown files scanned: ${mdFilesToProcess.length}, fixed: ${mdFixed}`
+    `\n[SANITIZE] Markdown: ${mdFilesToProcess.length} scanned, ${mdDiskWrites} written to disk`
   )
   console.log(
-    `[SANITIZE] JSON files scanned: ${jsonFilesToProcess.length}, fixed: ${jsonFixed}`
+    `[SANITIZE] JSON: ${jsonFilesToProcess.length} scanned, ${jsonDiskWrites} written to disk`
   )
 
   if (mdIssues.length || jsonIssues.length) {

From e02e64d56f5a26e2916819772e5d3576b2f1182b Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 16:12:56 -0700
Subject: [PATCH 05/14] fix(types): await promise

---
 src/scripts/i18n/lib/workflows/sanitization.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts b/src/scripts/i18n/lib/workflows/sanitization.ts
index 17f59c90ff9..548871af2dc 100644
--- a/src/scripts/i18n/lib/workflows/sanitization.ts
+++ b/src/scripts/i18n/lib/workflows/sanitization.ts
@@ -25,7 +25,7 @@ export async function runPostImportSanitization(
 
   console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`)
 
-  const sanitizeResult = runSanitizer(committedFiles)
+  const sanitizeResult = await runSanitizer(committedFiles)
   const changedFiles = sanitizeResult.changedFiles || []
 
   if (changedFiles.length) {

From 3797b707feb2c52b3d086102fde5c4520793327b Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 17:19:45 -0700
Subject: [PATCH 06/14] feat(i18n): add orphan detection to post-import
 sanitizer

Flag translated files that have no English source at the expected path. When a single match is found by filename, suggests the correct location. Reports ambiguous cases with candidate count.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/scripts/i18n/post_import_sanitize.ts | 69 ++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts
index f0e1da46831..034e71f4a7b 100644
--- a/src/scripts/i18n/post_import_sanitize.ts
+++ b/src/scripts/i18n/post_import_sanitize.ts
@@ -1776,6 +1776,74 @@ export async function runSanitizer(
     jsonFilesToProcess = jsonFilePaths.map((p) => ({ path: p, content: "" }))
   }
 
+  // --- Orphan detection: flag translated files with no English counterpart ---
+  const orphanWarnings: Array<{ file: string; suggestion: string }> = []
+  const translationsDir = path.join(CONTENT_ROOT, "translations")
+
+  for (const fileInfo of [...mdFilesToProcess, ...jsonFilesToProcess]) {
+    const filePath = fileInfo.path
+    // Extract the relative path after translations/<lang>/
+    const txIdx = filePath.indexOf(`${path.sep}translations${path.sep}`)
+    if (txIdx === -1) continue
+
+    const afterTranslations = filePath.substring(
+      txIdx + `${path.sep}translations${path.sep}`.length
+    )
+    // Strip the language code prefix: <lang>/rest/of/path
+    const slashIdx = afterTranslations.indexOf(path.sep)
+    if (slashIdx === -1) continue
+
+    const langCode = afterTranslations.substring(0, slashIdx)
+    const relPathWithinLang = afterTranslations.substring(slashIdx + 1)
+
+    // Derive the expected English source path
+    const englishPath = path.join(CONTENT_ROOT, relPathWithinLang)
+
+    if (!fs.existsSync(englishPath)) {
+      const relFile = path.relative(ROOT, filePath)
+      // Try to find the English file by filename to suggest the correct location
+      const basename = path.basename(relPathWithinLang)
+      const parentDir = path.basename(path.dirname(relPathWithinLang))
+      let suggestion = "No English counterpart found"
+
+      // Search for matching parent/file pattern in English content
+      const englishContentFiles = listFiles(CONTENT_ROOT, (f) => {
+        if (f.includes(`${path.sep}translations${path.sep}`)) return false
+        return (
+          f.endsWith(`${path.sep}${parentDir}${path.sep}${basename}`) &&
+          !f.includes(`${path.sep}translations${path.sep}`)
+        )
+      })
+
+      if (englishContentFiles.length === 1) {
+        const correctEnglishRel = path.relative(
+          CONTENT_ROOT,
+          englishContentFiles[0]
+        )
+        const correctTranslationPath = path.join(
+          translationsDir,
+          langCode,
+          correctEnglishRel
+        )
+        suggestion = `Likely belongs at: ${path.relative(ROOT, correctTranslationPath)}`
+      } else if (englishContentFiles.length > 1) {
+        suggestion = `Ambiguous: ${englishContentFiles.length} English candidates found (${englishContentFiles.map((f) => path.relative(CONTENT_ROOT, f)).join(", ")})`
+      }
+
+      orphanWarnings.push({ file: relFile, suggestion })
+    }
+  }
+
+  if (orphanWarnings.length > 0) {
+    console.log(
+      `\n[SANITIZE] ⚠ Orphaned translations (no English source at expected path):`
+    )
+    for (const w of orphanWarnings) {
+      console.log(`  - ${w.file}`)
+      console.log(`    ${w.suggestion}`)
+    }
+  }
+
   let mdFixed = 0
   let mdDiskWrites = 0
   const mdIssues: Array<{ file: string; issues: string[] }> = []
@@ -1856,6 +1924,7 @@ export async function runSanitizer(
     markdown: { scanned: mdFilesToProcess.length, fixed: mdFixed },
     json: { scanned: jsonFilesToProcess.length, fixed: jsonFixed },
     issues: { markdown: mdIssues, json: jsonIssues },
+    orphanWarnings,
   }
 }
 

From 8de3206402270694c12b161789d8914208d70c60 Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 19:48:59 -0700
Subject: [PATCH 07/14] docs: compounding doc, path mapping and review workflow

Documents root causes and fixes for misplaced translation files, the worktree-based review workflow, sanitizer enhancements, and automation permission requirements.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ...n-file-path-mapping-and-review-workflow.md | 314 ++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md

diff --git a/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md b/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md
new file mode 100644
index 00000000000..88a869a7a30
--- /dev/null
+++ b/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md
@@ -0,0 +1,314 @@
+---
+title: "Crowdin File Path Mapping Bugs and Translation Review Workflow"
+date: 2026-02-21
+category: integration-issues
+tags:
+  - crowdin
+  - translations
+  - i18n
+  - file-path-matching
+  - sanitizer
+  - worktree
+  - automation
+  - permissions
+severity: high
+component: crowdin-import-pipeline
+symptoms:
+  - "Translated files placed at incorrect paths (e.g., cs/beacon-chain/ instead of cs/roadmap/beacon-chain/)"
+  - "Systematic translation errors: AI replaced with UI, semantic inversions"
+  - "Orphaned translation files with no corresponding English source"
+  - "Inaccurate fix count logging in sanitizer output"
+  - "Cross-block href replacements affecting wrong sections"
+  - "Build failures in worktrees due to missing .env.local"
+  - "Merge conflicts discovered only at push time"
+root_causes:
+  - "findCrowdinFile() used .endsWith() for path matching, producing false matches on similarly named paths"
+  - "Crowdin/Gemini translation engine confusing acronyms and inverting meaning"
+  - "Sanitizer tracked in-memory transforms instead of actual disk changes"
+  - "Href replacement applied globally instead of per-block"
+  - "No .env.local in worktrees (USE_MOCK_DATA not set)"
+  - "PR branches diverged from dev without early merge"
+status: solved
+related_prs:
+  - 17547
+  - 17553
+  - 17556
+  - 17182
+---
+
+# Crowdin File Path Mapping Bugs and Translation Review Workflow
+
+## Problem Summary
+
+During Phase 2 of the translation review pipeline (Czech pilot), we discovered that 12 Czech translation files were placed at incorrect filesystem paths. Investigation revealed the root cause in the Crowdin import workflow's path matching logic. Additionally, we established a reproducible worktree-based workflow for reviewing translation PRs and cataloged all permissions needed for automation.
+
+## Root Cause Analysis
+
+### Misplaced Translation Files
+
+**File:** `src/scripts/i18n/lib/crowdin/files.ts`
+
+The `findCrowdinFile()` function used `.endsWith()` to match Crowdin file paths against expected content paths:
+
+```ts
+// BROKEN: matches too broadly
+const found = crowdinFiles.find(({ path }) =>
+  path.endsWith(targetFile.filePath)
+)
+```
+
+When looking for `public/content/roadmap/beacon-chain/index.md`, this matched `cs/beacon-chain/index.md` because the suffix `beacon-chain/index.md` is valid for both. The `roadmap/` parent directory was silently ignored.
+
+**Data flow of the bug:**
+
+```
+GitHub file path                  findCrowdinFile()           processedFileIdToPath        Download destination
+public/content/roadmap/     -->   .endsWith() matches    -->  Stores wrong Crowdin    -->  cs/beacon-chain/
+  beacon-chain/index.md           cs/beacon-chain/             path for this fileId         index.md (WRONG)
+```
+
+**12 Czech files affected:**
+
+| Wrong Location | Correct Location |
+|---|---|
+| `cs/account-abstraction/` | `cs/roadmap/account-abstraction/` |
+| `cs/beacon-chain/` | `cs/roadmap/beacon-chain/` |
+| `cs/danksharding/` | `cs/roadmap/danksharding/` |
+| `cs/future-proofing/` | `cs/roadmap/future-proofing/` |
+| `cs/scaling/` | `cs/roadmap/scaling/` |
+| `cs/statelessness/` | `cs/roadmap/statelessness/` |
+| `cs/user-experience/` | `cs/roadmap/user-experience/` |
+| `cs/withdrawals/` | `cs/staking/withdrawals/` |
+| `cs/dvt/` | `cs/staking/dvt/` |
+| `cs/support/` | `cs/community/support/` |
+| `cs/code-of-conduct/` | `cs/community/code-of-conduct/` |
+| `cs/developers/docs/wrapped-eth/` | `cs/wrapped-eth/` |
+
+### Translation Quality Issues
+
+The Crowdin/Gemini translation engine produced two categories of critical errors:
+
+1. **Acronym confusion**: "AI" systematically replaced with "UI" (5 instances in `cs/ai-agents/index.md`)
+2. **Semantic inversion**: "malicious intent" translated as "good intentions" in `cs/bridges/index.md`
+
+### Sanitizer Logging Inaccuracy
+
+Individual fix functions returned `fixCount` based on in-memory transforms, not actual disk changes. As a result, the sanitizer reported "22 files modified" when no bytes were written.
+
+### Cross-Block Href Interference
+
+`fixTranslatedHrefs()` applied replacements globally. When `/developers/docs/evm` appeared in both an EVM block and an Oracles block, the global replacement changed the correct href in the EVM block.
+
+## Solutions Implemented
+
+### Fix 1: Stricter Path Matching in findCrowdinFile()
+
+**File:** `src/scripts/i18n/lib/crowdin/files.ts`
+**Branch:** `fix-i18n-workflow`
+
+```ts
+// 1. Exact match first (after normalizing leading slashes)
+const exactMatch = crowdinFiles.find(
+  ({ path }) => path.replace(/^\/+/, "") === normalizedTarget
+)
+if (exactMatch) return exactMatch
+
+// 2. Suffix match with "/" boundary guard
+const suffixMatches = crowdinFiles.filter(({ path }) => {
+  const normalized = path.replace(/^\/+/, "")
+  if (!normalized.endsWith(normalizedTarget)) return false
+  const prefixLength = normalized.length - normalizedTarget.length
+  if (prefixLength === 0) return true
+  return normalized[prefixLength - 1] === "/"
+})
+
+// 3. Prefer longest (most specific) match
+suffixMatches.sort((a, b) => b.path.length - a.path.length)
+return suffixMatches[0] ?? null
+```
+
+### Fix 2: Orphan Detection in Sanitizer
+
+**File:** `src/scripts/i18n/post_import_sanitize.ts`
+**Branch:** `fix-review-translations`
+
+For each translated file, the sanitizer derives the expected English source path and checks that it exists. If it is missing, it searches the English content for a file with the same parent directory and filename to suggest the correct location:
+
+```ts
+const englishPath = path.join(CONTENT_ROOT, relPathWithinLang)
+
+if (!fs.existsSync(englishPath)) {
+  // Search for matching parent/file pattern in English content
+  const englishContentFiles = listFiles(CONTENT_ROOT, (f) => {
+    if (f.includes(`${path.sep}translations${path.sep}`)) return false
+    return f.endsWith(`${path.sep}${parentDir}${path.sep}${basename}`)
+  })
+
+  if (englishContentFiles.length === 1) {
+    suggestion = `Likely belongs at: ${correctTranslationPath}`
+  } else if (englishContentFiles.length > 1) {
+    suggestion = `Ambiguous: ${englishContentFiles.length} candidates`
+  }
+}
+```
+
+### Fix 3: Accurate Disk-Write Tracking
+
+**File:** `src/scripts/i18n/post_import_sanitize.ts`
+
+Added `applyFix()` helper that snapshots content before/after each transform, plus `originalOnDisk` comparison:
+
+```ts
+function applyFix(
+  fn: () => { content: string; fixCount: number },
+  label: (count: number) => string
+) {
+  const snapshot = content
+  const result = fn()
+  content = result.content
+  if (content !== snapshot) {
+    issues.push(label(result.fixCount))
+  }
+}
+```
+
+### Fix 4: Block-Scoped Href Replacement
+
+**File:** `src/scripts/i18n/post_import_sanitize.ts`
+
+Track `blockIdx` from detection phase, apply replacements only within the specific block:
+
+```ts
+// Detection phase
+blockFixes.push({ blockIdx: i, wrong: translatedHref, correct: expectedHref })
+
+// Replacement phase - scoped to block
+for (const { blockIdx, wrong, correct } of blockFixes) {
+  const originalBlock = translatedBlocks[blockIdx]
+  let fixedBlock = originalBlock.replace(markdownRe, `$1${correct}$2`)
+  if (fixedBlock !== originalBlock) {
+    result = result.replace(originalBlock, fixedBlock)
+  }
+}
+```
+
+## Worktree Workflow for Translation Review
+
+Reproducible 8-step sequence for reviewing a translation PR:
+
+```bash
+# 1. Create worktree from PR branch
+git worktree add .worktrees/<name> <pr-branch>
+cd .worktrees/<name>
+
+# 2. Provide environment variables (USE_MOCK_DATA=true avoids network calls)
+cp .env.example .env.local
+
+# 3. Merge latest dev to catch conflicts early
+git fetch origin dev && git merge origin/dev
+# Resolve conflicts — typically modify/delete for misplaced files
+
+# 4. Copy sanitizer scripts from canonical branch (until merged to dev)
+#    Also add franc-min to package.json devDependencies
+cp <canonical-branch>/src/scripts/i18n/post_import_sanitize.ts ./src/scripts/i18n/
+cp <canonical-branch>/src/scripts/i18n/lib/workflows/sanitization.ts ./src/scripts/i18n/lib/workflows/
+
+# 5. Install dependencies
+pnpm install
+
+# 6. Run sanitizer for orphan detection
+TARGET_LANGUAGES=<lang> npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts
+
+# 7. Run review (critical issues only — no soft suggestions)
+# Use /review-translations-local --pr=<NUMBER> --language=<lang>
+
+# 8. Validate build
+npx tsc --noEmit                                    # TypeScript check FIRST
+NEXT_PUBLIC_BUILD_LOCALES=en,<lang> pnpm build      # Scoped build
+```
+
+### Key Notes
+
+- **Always run `npx tsc --noEmit` before `pnpm build`** — catches type errors cheaply
+- **`.env.local` is mandatory** — without it, build attempts real API connections and fails
+- **Merge dev early** — resolving conflicts before review prevents wasted work
+- **Merge conflicts are expected** — misplaced files from prior imports cause modify/delete conflicts; accept the deletion
+- **`franc-min` is required** — ESM-only package, needs devDependency until sanitizer changes reach dev
+
+## Automation Permissions Required
+
+All sandbox-restricted operations needed for this workflow:
+
+### Git Operations
+
+| Command | Purpose |
+|---|---|
+| `git worktree add/remove` | Create/destroy isolated review environments |
+| `git fetch origin` | Retrieve latest upstream branches |
+| `git merge origin/dev` | Integrate dev into PR branch |
+| `git stash push/pop` | Temporarily shelve local edits |
+| `git rm` | Remove orphaned/misplaced files |
+| `git add` / `git commit` | Stage and commit fixes |
+| `git push` | Push corrected branch to remote |
+
+### Package Management
+
+| Command | Purpose |
+|---|---|
+| `pnpm install` | Install dependencies (network + node_modules writes) |
+
+### Script Execution
+
+| Command | Purpose |
+|---|---|
+| `npx tsc --noEmit` | TypeScript check without emitting |
+| `npx ts-node <script>` | Run sanitizer scripts directly |
+| `NEXT_PUBLIC_BUILD_LOCALES=en,<lang> pnpm build` | Scoped production build |
+
+### GitHub CLI
+
+| Command | Purpose |
+|---|---|
+| `gh pr view` | Fetch PR metadata and branch name |
+| `gh api repos/{owner}/{repo}/pulls/{PR}/files` | Get PR file list (paginated) |
+| `gh pr comment` | Post review findings to PR |
+
+### File Operations (within worktree paths)
+
+| Operation | Purpose |
+|---|---|
+| `cp` | Copy scripts, .env.example |
+| `mkdir -p` | Create correct translation directories |
+| `rm` / `rmdir` | Remove misplaced files and empty dirs |
+
+## Prevention Strategies
+
+### Automated Guards
+
+1. **Orphan detection in sanitizer** (implemented) — flags files with no English counterpart
+2. **Stricter path matching** (implemented) — exact match with "/" boundary fallback
+3. **`npx tsc --noEmit` before build** — cheap TypeScript error screen
+4. **Early dev merge** — catches conflicts before review work begins
+5. **`.env.example` copy** — ensures mock data mode for local builds
+
+### Requires Human Judgment
+
+1. **Translation semantic accuracy** — AI can flag systematic patterns but native speakers needed for nuance
+2. **Merge conflict resolution** — deciding which version to keep requires domain context
+3. **Glossary compliance** — community-voted terms are authoritative but context matters
+
+## Review Command Improvements (Noted for Future)
+
+1. **Critical issues only** — no soft suggestions, limit LLM opinion loop
+2. **Run `npx tsc --noEmit` before building**
+3. **Default `--fix` to true** (or invert to `--no-fix`)
+
+## Cross-References
+
+- [Turkish PR #17182 Review](../translation-review/crowdin-import-review-turkish-pr-17182.md)
+- [Scaling Translation Review Pipeline](../translation-review/scaling-translation-review-pipeline.md)
+- [Known Patterns Knowledge Base](~/.claude/translation-review/known-patterns.md)
+- [Translation Glossary](~/.claude/translation-review/fetch-translation-glossary.json)
+- Review command: `.claude/commands/review-translations.md`
+- Sanitizer: `src/scripts/i18n/post_import_sanitize.ts`
+- Path matching fix: `src/scripts/i18n/lib/crowdin/files.ts`

From 3fcc8d35548ad6bfd35736aaa37f5f524cf5e9c0 Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 20:12:34 -0700
Subject: [PATCH 08/14] feat(i18n): add PR-scoped sanitizer script

Add sanitize-pr.ts to run the sanitizer on only files changed in a PR diff (via gh API), replacing ad-hoc TARGET_LANGUAGES scoping. Update post_import_sanitize.ts: replace syncFrontmatterTags with brand-only tag fixing, add orphan file detection with suggested correct paths.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/scripts/i18n/post_import_sanitize.ts | 71 ++++++++++++++-------
 src/scripts/i18n/sanitize-pr.ts          | 80 ++++++++++++++++++++++++
 2 files changed, 129 insertions(+), 22 deletions(-)
 create mode 100644 src/scripts/i18n/sanitize-pr.ts

diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts
index 034e71f4a7b..63e470ef8b0 100644
--- a/src/scripts/i18n/post_import_sanitize.ts
+++ b/src/scripts/i18n/post_import_sanitize.ts
@@ -188,11 +188,12 @@ function fixTickerTranspositions(content: string): {
 }
 
 /**
- * Sync frontmatter tags array from English source.
- * Tags like programming language names should never be translated.
- * Replaces the entire tags array with the English original.
+ * Fix only brand/product/language name tags in frontmatter.
+ * Generic concept tags (e.g. "zero-knowledge" → "nulová znalost") should
+ * remain in the native language. Only tags that match a protected brand name
+ * in the English source are restored to English.
  */
-function syncFrontmatterTags(
+function fixBrandTags(
   translatedContent: string,
   englishContent: string
 ): { content: string; fixCount: number } {
@@ -206,31 +207,56 @@ function syncFrontmatterTags(
   const transFm = transMatch[1]
   const engFm = engMatch[1]
 
-  // Extract tags line (handles both inline array and value)
-  const tagsRe = /^(tags:\s*)(.+)$/m
+  // Extract tags arrays
+  const tagsRe = /^tags:\s*\[([^\]]*)\]/m
   const engTagsMatch = engFm.match(tagsRe)
   const transTagsMatch = transFm.match(tagsRe)
 
   if (!engTagsMatch || !transTagsMatch)
     return { content: translatedContent, fixCount: 0 }
 
-  const engTagsValue = engTagsMatch[2].trim()
-  const transTagsValue = transTagsMatch[2].trim()
+  // Parse tag values (handles quoted and unquoted)
+  const parseTags = (raw: string): string[] =>
+    raw
+      .split(",")
+      .map((t) => t.trim().replace(/^["']|["']$/g, ""))
+      .filter(Boolean)
 
-  if (engTagsValue === transTagsValue)
+  const engTags = parseTags(engTagsMatch[1])
+  const transTags = parseTags(transTagsMatch[1])
+
+  if (engTags.length !== transTags.length)
     return { content: translatedContent, fixCount: 0 }
 
-  // Replace translated tags with English tags
-  const updatedFm = transFm.replace(
-    tagsRe,
-    `${transTagsMatch[1]}${engTagsValue}`
-  )
+  // Build lowercase brand set for matching
+  const brandLower = new Set(PROTECTED_BRAND_NAMES.map((b) => b.toLowerCase()))
+
+  // Only replace tags where the English version is a protected brand
+  let fixCount = 0
+  const fixedTags = transTags.map((transTag, i) => {
+    const engTag = engTags[i]
+    if (brandLower.has(engTag.toLowerCase()) && transTag !== engTag) {
+      fixCount++
+      return engTag
+    }
+    return transTag
+  })
+
+  if (fixCount === 0) return { content: translatedContent, fixCount: 0 }
+
+  // Reconstruct the tags line preserving original quoting style
+  const quote = transTagsMatch[1].includes('"') ? '"' : "'"
+  const newTagsValue = fixedTags.map((t) => `${quote}${t}${quote}`).join(", ")
+  const fullTagsLine = transTagsMatch[0]
+  const newFullTagsLine = `tags: [${newTagsValue}]`
+
+  const updatedFm = transFm.replace(fullTagsLine, newFullTagsLine)
   const content = translatedContent.replace(
     frontmatterRe,
     `---\n${updatedFm}\n---`
   )
 
-  return { content, fixCount: 1 }
+  return { content, fixCount }
 }
 
 /**
@@ -242,7 +268,8 @@ function syncFrontmatterTags(
  * we can't easily know what the mistranslation IS without locale knowledge.
  * So we report these as warnings for the LLM review to handle.
  *
- * However, for frontmatter `tags` arrays, we CAN auto-fix by syncing with English.
+ * For frontmatter `tags`, only brand/product/language names are restored
+ * to English; generic concept tags remain in the native language.
  */
 function fixProtectedBrandNames(
   translatedContent: string,
@@ -252,13 +279,13 @@ function fixProtectedBrandNames(
   let content = translatedContent
   let fixCount = 0
 
-  // Auto-fix: Sync frontmatter tags with English source
-  const tagsSyncResult = syncFrontmatterTags(content, englishContent)
-  content = tagsSyncResult.content
-  fixCount += tagsSyncResult.fixCount
-  if (tagsSyncResult.fixCount > 0) {
+  // Auto-fix: Restore brand-name tags to English (leaves concept tags translated)
+  const brandTagsResult = fixBrandTags(content, englishContent)
+  content = brandTagsResult.content
+  fixCount += brandTagsResult.fixCount
+  if (brandTagsResult.fixCount > 0) {
     warnings.push(
-      `Auto-synced ${tagsSyncResult.fixCount} frontmatter tags with English source`
+      `Restored ${brandTagsResult.fixCount} brand-name tag(s) to English`
     )
   }
 
diff --git a/src/scripts/i18n/sanitize-pr.ts b/src/scripts/i18n/sanitize-pr.ts
new file mode 100644
index 00000000000..3892c5361c6
--- /dev/null
+++ b/src/scripts/i18n/sanitize-pr.ts
@@ -0,0 +1,80 @@
+/**
+ * Run the post-import sanitizer on ONLY the files changed in a specific PR.
+ *
+ * Usage:
+ *   npx ts-node -O '{"module":"commonjs"}' src/scripts/i18n/sanitize-pr.ts <PR_NUMBER>
+ *
+ * Requires: `gh` CLI authenticated and available in PATH.
+ *
+ * Fetches the file list from the GitHub API (paginated), filters to
+ * translation files (.md and .json), and passes them to runSanitizer()
+ * with empty content so the sanitizer reads from disk and writes fixes back.
+ */
+
+import { execSync } from "child_process"
+import * as path from "path"
+
+import { runSanitizer } from "./post_import_sanitize"
+
+const ROOT = process.cwd()
+
+/**
+ * Paths the PR adds or changes (paginated `gh api`); deleted files are skipped because they are no longer on disk.
+ */
+function getPRFiles(prNumber: string): string[] {
+  const cmd = `gh api repos/ethereum/ethereum-org-website/pulls/${prNumber}/files --paginate -q '.[] | select(.status != "removed") | .filename'`
+  const raw = execSync(cmd, { encoding: "utf8", maxBuffer: 10 * 1024 * 1024 })
+  return raw.trim().split("\n").filter(Boolean)
+}
+
+async function main() {
+  const prNumber = process.argv[2] // must be all digits: it is spliced into a shell command
+  if (!prNumber || !/^\d+$/.test(prNumber)) {
+    console.error("Usage: sanitize-pr.ts <PR_NUMBER>")
+    process.exit(1)
+  }
+
+  console.log(`[sanitize-pr] Fetching file list for PR #${prNumber}...`)
+  const allFiles = getPRFiles(prNumber)
+
+  // Filter to translation files only (md + json under translations/ or intl/)
+  const translationFiles = allFiles.filter(
+    (f) =>
+      (f.includes("/translations/") || f.includes("/intl/")) &&
+      (f.endsWith(".md") || f.endsWith(".json"))
+  )
+
+  if (translationFiles.length === 0) {
+    console.log("[sanitize-pr] No translation files found in PR diff.")
+    process.exit(0)
+  }
+
+  console.log(
+    `[sanitize-pr] Found ${translationFiles.length} translation files in PR #${prNumber}`
+  )
+
+  // Convert to absolute paths with empty content (sanitizer reads from disk)
+  const filesWithContent = translationFiles.map((relPath) => ({
+    path: path.join(ROOT, relPath),
+    content: "",
+  }))
+
+  const result = await runSanitizer(filesWithContent)
+
+  console.log(`\n[sanitize-pr] Done.`)
+  console.log(
+    `  Markdown: ${result.markdown.scanned} scanned, ${result.markdown.fixed} fixed`
+  )
+  console.log(
+    `  JSON: ${result.json.scanned} scanned, ${result.json.fixed} fixed`
+  )
+
+  if (result.orphanWarnings && result.orphanWarnings.length > 0) {
+    console.log(`  Orphan warnings: ${result.orphanWarnings.length}`)
+  }
+}
+
+main().catch((err) => {
+  console.error(err)
+  process.exit(1)
+})

From 185fe6a0cbb6346b04f1f6ce5ed098a729c7acd4 Mon Sep 17 00:00:00 2001
From: Paul Wackerow <54227730+wackerow@users.noreply.github.com>
Date: Sat, 21 Feb 2026 20:13:28 -0700
Subject: [PATCH 09/14] docs: add diff reliability note for automation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../crowdin-file-path-mapping-and-review-workflow.md     | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md b/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md
index 88a869a7a30..ef42edc003b 100644
--- a/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md
+++ b/docs/solutions/integration-issues/crowdin-file-path-mapping-and-review-workflow.md
@@ -235,6 +235,15 @@ NEXT_PUBLIC_BUILD_LOCALES=en,<lang> pnpm build      # Scoped build
 - **Merge conflicts are expected** — misplaced files from prior imports cause modify/delete conflicts; accept the deletion
 - **`franc-min` is required** — ESM-only package, needs devDependency until sanitizer changes reach dev
 
+### Tool Reliability: `diff` Command
+
+During cs-part-07 review, `diff` returned empty output comparing two files that were verifiably different (confirmed by reading both files and re-running `diff` with identical arguments, which then returned correct output). Root cause unknown — not conclusively a sandbox issue since the second run succeeded with the same arguments.
+
+**For automation, do not trust empty `diff` output as proof of file equality.** Mitigations:
+- Check `diff` exit code explicitly (`0` = identical, `1` = different, `2` = error)
+- Use `diff --brief` for a quick same/different check before assuming equality
+- When comparing files for migration decisions (orphan dedup), read and verify content directly if `diff` returns empty
+
 ## Automation Permissions Required
 
 All sandbox-restricted operations needed for this workflow:

From d67a75cf9d2e9875d59287a95c279d0214b4261c Mon Sep 17 00:00:00 2001
From: myelinated-wackerow
 <263208946+myelinated-wackerow@users.noreply.github.com>
Date: Tue, 24 Feb 2026 00:03:57 +0000
Subject: [PATCH 10/14] fix(i18n): fix 5 sanitizer bugs found during ja review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. fixTranslatedHrefs: convert to warn-only — block-positional alignment
   is unreliable (Crowdin adds/removes blank lines, shifting paragraph
   indices and causing incorrect href substitutions across unrelated
   paragraphs). Href fixes left to AI review agents with semantic context.

2. fixBrandTags: use canonical casing from PROTECTED_BRAND_NAMES instead
   of copying English source values (which may be lowercase). Switch to
   targeted replacement to preserve original YAML formatting (multi-line
   arrays, spacing, quoting style).

3. fixTickerTranspositions: remove KECCAK→Keccak from corrections map
   (KECCAK is a valid all-caps form in code). Add code-fence skipping so
   ticker corrections don't modify content inside code blocks.

4. removeOrphanedClosingTags: add code-block/code-span awareness using
   the same split pattern as escapeMdxAngleBrackets, so tags inside
   backticks (e.g. `</strong>`) are not stripped.

5. removeOrphanedClosingTags: fix removal order — keep first N closers
   (paired with openers) and remove trailing excess, instead of removing
   the first N matches which strips correctly-paired tags.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com>
---
 src/scripts/i18n/post_import_sanitize.ts | 262 ++++++++++-------------
 1 file changed, 109 insertions(+), 153 deletions(-)

diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts
index 63e470ef8b0..063e272e271 100644
--- a/src/scripts/i18n/post_import_sanitize.ts
+++ b/src/scripts/i18n/post_import_sanitize.ts
@@ -161,37 +161,47 @@ const TICKER_CORRECTIONS: Record<string, string> = {
   EHT: "ETH",
   BSL: "BLS",
   ECDAS: "ECDSA",
-  KECCAK: "Keccak",
 }
 
 /**
  * Fix ticker symbol transpositions.
  * Only matches whole words (word boundaries) to avoid false positives.
+ * Skips code blocks (fenced and inline) where these forms may be valid.
  */
 function fixTickerTranspositions(content: string): {
   content: string
   fixCount: number
 } {
-  let result = content
   let fixCount = 0
 
-  for (const [wrong, correct] of Object.entries(TICKER_CORRECTIONS)) {
-    const re = new RegExp(`\\b${escapeRegex(wrong)}\\b`, "g")
-    const matches = result.match(re)
-    if (matches && matches.length > 0) {
-      fixCount += matches.length
-      result = result.replace(re, correct)
+  // Split content to preserve code blocks
+  const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g
+  const parts = content.split(codeBlockPattern)
+
+  for (let i = 0; i < parts.length; i++) {
+    if (i % 2 === 1) continue // Skip code blocks
+
+    for (const [wrong, correct] of Object.entries(TICKER_CORRECTIONS)) {
+      const re = new RegExp(`\\b${escapeRegex(wrong)}\\b`, "g")
+      const matches = parts[i].match(re)
+      if (matches && matches.length > 0) {
+        fixCount += matches.length
+        parts[i] = parts[i].replace(re, correct)
+      }
     }
   }
 
-  return { content: result, fixCount }
+  return { content: parts.join(""), fixCount }
 }
 
 /**
  * Fix only brand/product/language name tags in frontmatter.
  * Generic concept tags (e.g. "zero-knowledge" → "nulová znalost") should
  * remain in the native language. Only tags that match a protected brand name
- * in the English source are restored to English.
+ * in the English source are restored to their canonical casing.
+ *
+ * Uses targeted replacement to preserve original formatting (multi-line YAML,
+ * spacing, quoting style) instead of reconstructing the entire tags line.
  */
 function fixBrandTags(
   translatedContent: string,
@@ -228,29 +238,38 @@ function fixBrandTags(
   if (engTags.length !== transTags.length)
     return { content: translatedContent, fixCount: 0 }
 
-  // Build lowercase brand set for matching
-  const brandLower = new Set(PROTECTED_BRAND_NAMES.map((b) => b.toLowerCase()))
+  // Build a map from lowercase brand name to canonical casing
+  const brandCanonical = new Map<string, string>()
+  for (const brand of PROTECTED_BRAND_NAMES) {
+    brandCanonical.set(brand.toLowerCase(), brand)
+  }
 
-  // Only replace tags where the English version is a protected brand
+  // Identify tags that need fixing: brand tags whose translation differs
+  // from the canonical casing
   let fixCount = 0
-  const fixedTags = transTags.map((transTag, i) => {
+  let updatedFm = transFm
+
+  for (let i = 0; i < transTags.length; i++) {
     const engTag = engTags[i]
-    if (brandLower.has(engTag.toLowerCase()) && transTag !== engTag) {
+    const transTag = transTags[i]
+    const canonical = brandCanonical.get(engTag.toLowerCase())
+
+    if (!canonical) continue // Not a brand tag — leave as-is
+    if (transTag === canonical) continue // Already correct
+
+    // Targeted replacement: find the exact quoted tag in frontmatter and replace
+    // Match the tag with its surrounding quotes to avoid false positives
+    const quotedTagRe = new RegExp(
+      `(["'])${escapeRegex(transTag)}\\1`
+    )
+    if (quotedTagRe.test(updatedFm)) {
+      updatedFm = updatedFm.replace(quotedTagRe, `$1${canonical}$1`)
       fixCount++
-      return engTag
     }
-    return transTag
-  })
+  }
 
   if (fixCount === 0) return { content: translatedContent, fixCount: 0 }
 
-  // Reconstruct the tags line preserving original quoting style
-  const quote = transTagsMatch[1].includes('"') ? '"' : "'"
-  const newTagsValue = fixedTags.map((t) => `${quote}${t}${quote}`).join(", ")
-  const fullTagsLine = transTagsMatch[0]
-  const newFullTagsLine = `tags: [${newTagsValue}]`
-
-  const updatedFm = transFm.replace(fullTagsLine, newFullTagsLine)
   const content = translatedContent.replace(
     frontmatterRe,
     `---\n${updatedFm}\n---`
@@ -451,16 +470,12 @@ function splitIntoBlocks(content: string): string[] {
 }
 
 /**
- * Fix translated hrefs by comparing against English source.
- * Uses paragraph-scoped set comparison for robust matching across languages.
+ * Detect translated/mismatched hrefs by comparing against English source.
+ * Warn-only — does NOT auto-fix, because block-positional alignment between
+ * English and translated documents is unreliable (Crowdin often adds/removes
+ * blank lines, shifting paragraph indices and causing incorrect substitutions).
  *
- * Strategy:
- * 1. Split both documents into blocks (paragraphs separated by blank lines)
- * 2. For each block pair, compare internal href sets
- * 3. Within a block: if invalid href count equals missing href count, we can match
- * 4. This handles grammatical reordering within sentences (common in non-English)
- *
- * Only auto-fixes unambiguous cases; warns for complex mismatches.
+ * Href fixes are left to the AI review agents which have full semantic context.
  */
 function fixTranslatedHrefs(
   translatedContent: string,
@@ -471,113 +486,41 @@ function fixTranslatedHrefs(
 
   // Collect all English internal hrefs as the "valid" set
   const allEnglishHrefs = extractHrefs(englishContent)
+  // Collect all translation internal hrefs
+  const allTransHrefs = extractHrefs(translatedContent)
 
-  const blockFixes: Array<{
-    blockIdx: number
-    wrong: string
-    correct: string
-  }> = []
   const allWarnings: string[] = []
 
-  // Process block by block
-  const blockCount = Math.min(englishBlocks.length, translatedBlocks.length)
-
-  for (let i = 0; i < blockCount; i++) {
-    const engBlock = englishBlocks[i]
-    const transBlock = translatedBlocks[i]
-
-    const engHrefs = extractHrefsFromBlock(engBlock).filter(isInternalHref)
-    const transHrefs = extractHrefsFromBlock(transBlock).filter(isInternalHref)
-
-    // Skip blocks with no internal hrefs
-    if (engHrefs.length === 0 && transHrefs.length === 0) continue
-
-    // Compare hrefs at block level
-    const engHrefSet = new Set(engHrefs)
-    const transHrefSet = new Set(transHrefs)
-
-    // Hrefs in translation block but NOT in corresponding English block
-    const displacedInTrans: string[] = []
-    const missingFromTrans: string[] = []
-
-    for (const href of transHrefs) {
-      if (!engHrefSet.has(href)) {
-        displacedInTrans.push(href)
-      }
-    }
-
-    for (const href of engHrefs) {
-      if (!transHrefSet.has(href)) {
-        missingFromTrans.push(href)
-      }
-    }
-
-    // No issues in this block
-    if (displacedInTrans.length === 0 && missingFromTrans.length === 0) continue
-
-    // Deduplicate for set comparison
-    const uniqueDisplaced = [...new Set(displacedInTrans)]
-    const uniqueMissing = [...new Set(missingFromTrans)]
-
-    // Auto-fix when there's exactly 1 displaced and 1 missing in the same block
-    if (uniqueDisplaced.length === 1 && uniqueMissing.length === 1) {
-      blockFixes.push({
-        blockIdx: i,
-        wrong: uniqueDisplaced[0],
-        correct: uniqueMissing[0],
-      })
-    } else if (uniqueDisplaced.length > 0 || uniqueMissing.length > 0) {
-      for (const href of uniqueDisplaced) {
-        const globallyValid = allEnglishHrefs.has(href)
-        allWarnings.push(
-          `Block ${i + 1}: ${globallyValid ? "Displaced" : "Invalid"} href "${href}" - not in corresponding English block`
-        )
-      }
-      for (const href of uniqueMissing) {
-        allWarnings.push(
-          `Block ${i + 1}: Missing href "${href}" - present in English but not translation`
-        )
-      }
-    }
-  }
-
-  // Warn about block count mismatch
+  // Warn about block count mismatch (indicates paragraph alignment drift)
   if (englishBlocks.length !== translatedBlocks.length) {
     allWarnings.push(
       `Block count mismatch: English has ${englishBlocks.length}, translation has ${translatedBlocks.length}`
     )
   }
 
-  // Apply fixes block-by-block to avoid cross-block interference
-  let result = translatedContent
-  const appliedFixes: string[] = []
-
-  for (const { blockIdx, wrong, correct } of blockFixes) {
-    const originalBlock = translatedBlocks[blockIdx]
-    let fixedBlock = originalBlock
-
-    // Replace in markdown links: [text](wrong) → [text](correct)
-    const markdownRe = new RegExp(
-      `(\\[[^\\]]*\\]\\()${escapeRegex(wrong)}(\\))`,
-      "g"
-    )
-    fixedBlock = fixedBlock.replace(markdownRe, `$1${correct}$2`)
-
-    // Replace in href attributes: href="wrong" → href="correct"
-    const hrefRe = new RegExp(`(href=["'])${escapeRegex(wrong)}(["'])`, "g")
-    fixedBlock = fixedBlock.replace(hrefRe, `$1${correct}$2`)
+  // Document-level href comparison: find hrefs in translation that don't
+  // exist anywhere in English (likely translated or corrupted paths)
+  for (const href of allTransHrefs) {
+    if (isInternalHref(href) && !allEnglishHrefs.has(href)) {
+      allWarnings.push(
+        `Invalid internal href "${href}" — not found in English source`
+      )
+    }
+  }
 
-    if (fixedBlock !== originalBlock) {
-      result = result.replace(originalBlock, fixedBlock)
-      translatedBlocks[blockIdx] = fixedBlock // update for subsequent fixes
-      appliedFixes.push(`${wrong} → ${correct}`)
+  // Find English hrefs missing from translation entirely
+  for (const href of allEnglishHrefs) {
+    if (isInternalHref(href) && !allTransHrefs.has(href)) {
+      allWarnings.push(
+        `Missing href "${href}" — present in English but not in translation`
+      )
     }
   }
 
   return {
-    content: result,
-    fixCount: appliedFixes.length,
-    fixes: appliedFixes,
+    content: translatedContent, // No modifications — warn only
+    fixCount: 0,
+    fixes: [],
     warnings: allWarnings,
   }
 }
@@ -1408,6 +1351,10 @@ function escapeMdxAngleBrackets(content: string): {
  * These appear when translation restructures sentences and leaves behind
  * closing tags like </a> without matching openers.
  * Only removes tags that have NO corresponding opener in the same paragraph.
+ *
+ * Skips code blocks (fenced and inline) where closing tags are valid content.
+ * Keeps the first `openCount` closers on each line and removes the trailing
+ * excess, so that correctly-paired closers near their openers are preserved.
  */
 function removeOrphanedClosingTags(content: string): {
   content: string
@@ -1416,37 +1363,46 @@ function removeOrphanedClosingTags(content: string): {
   let fixCount = 0
   const orphanTags = ["a", "span", "em", "strong", "b", "i", "u"]
 
-  for (const tag of orphanTags) {
-    // Find closing tags that don't have a matching opener on the same line
-    const lines = content.split("\n")
-    for (let i = 0; i < lines.length; i++) {
-      const line = lines[i]
-      const closeRe = new RegExp(`</${tag}>`, "g")
-      const openRe = new RegExp(`<${tag}[\\s>]`, "g")
-
-      const closeCount = (line.match(closeRe) || []).length
-      const openCount = (line.match(openRe) || []).length
-
-      // If there are more closing tags than opening tags on this line,
-      // remove the excess closing tags (they're orphans)
-      if (closeCount > openCount) {
-        let excess = closeCount - openCount
-        lines[i] = line.replace(closeRe, (match) => {
-          if (excess > 0) {
-            excess--
+  // Split content to preserve code blocks (fenced and inline)
+  const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g
+  const parts = content.split(codeBlockPattern)
+
+  for (let partIdx = 0; partIdx < parts.length; partIdx++) {
+    if (partIdx % 2 === 1) continue // Skip code blocks
+
+    for (const tag of orphanTags) {
+      const lines = parts[partIdx].split("\n")
+      for (let i = 0; i < lines.length; i++) {
+        const line = lines[i]
+        const closeRe = new RegExp(`</${tag}>`, "g")
+        const openRe = new RegExp(`<${tag}[\\s>]`, "g")
+
+        const closeCount = (line.match(closeRe) || []).length
+        const openCount = (line.match(openRe) || []).length
+
+        // If there are more closing tags than opening tags on this line,
+        // remove the excess closing tags (the trailing orphans)
+        if (closeCount > openCount) {
+          // Keep the first `openCount` closers (paired with openers),
+          // remove the rest (orphans at the end of the line)
+          let kept = 0
+          lines[i] = line.replace(closeRe, (match) => {
+            kept++
+            if (kept <= openCount) {
+              return match // Keep — paired with an opener
+            }
             fixCount++
-            return ""
-          }
-          return match
-        })
-        // Clean up any resulting double spaces
-        lines[i] = lines[i].replace(/  +/g, " ").trim()
+            return "" // Remove — orphaned
+          })
+          // Clean up any resulting double spaces
+          lines[i] = lines[i].replace(/  +/g, " ").trimEnd()
+        }
       }
+      parts[partIdx] = lines.join("\n")
     }
-    content = lines.join("\n")
   }
 
-  return { content, fixCount }
+  return { content: parts.join(""), fixCount }
 }
 
 /**

From 2ec37332c866473e0d33ab5a6eb859031624c61e Mon Sep 17 00:00:00 2001
From: myelinated-wackerow
 <263208946+myelinated-wackerow@users.noreply.github.com>
Date: Tue, 24 Feb 2026 00:14:08 +0000
Subject: [PATCH 11/14] docs: add post-import sanitizer bug analysis from ja
 review

Documents 5 correctness bugs found in post_import_sanitize.ts during
Japanese translation review of PR #17132. Covers root causes, fixes,
prevention strategies, and testing recommendations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com>
---
 ...rt-sanitizer-bugs-found-japanese-review.md | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 docs/solutions/integration-issues/post-import-sanitizer-bugs-found-japanese-review.md

diff --git a/docs/solutions/integration-issues/post-import-sanitizer-bugs-found-japanese-review.md b/docs/solutions/integration-issues/post-import-sanitizer-bugs-found-japanese-review.md
new file mode 100644
index 00000000000..702dc4a7c65
--- /dev/null
+++ b/docs/solutions/integration-issues/post-import-sanitizer-bugs-found-japanese-review.md
@@ -0,0 +1,205 @@
+---
+title: Post-import sanitizer script introduced 5 correctness bugs in Japanese translation files
+date: 2026-02-24
+category: integration-issues
+severity: high
+component: src/scripts/i18n/post_import_sanitize.ts
+tags: [i18n, sanitizer, crowdin, translation-import, href-corruption, brand-names, orphaned-tags, code-fence, ticker-transposition]
+related_prs: ["#17132"]
+related_branches: ["fix-review-translations"]
+symptoms:
+  - Internal page hrefs in translated markdown replaced with unrelated paths (e.g., /staking/ became /roadmap/)
+  - Brand name tags written with incorrect lowercase casing (e.g., "solidity" instead of "Solidity")
+  - YAML tags block multi-line formatting destroyed by full-line reconstruction
+  - KECCAK (valid all-caps form) incorrectly rewritten as "Keccak" inside code fences
+  - Closing HTML tags stripped from inside inline code spans (e.g., `</strong>` incorrectly removed)
+  - First paired closing tag removed instead of trailing orphan, worsening HTML structure
+root_causes:
+  - fixTranslatedHrefs used block positional index alignment which is unreliable due to Crowdin blank line insertion/removal shifting paragraph indices
+  - fixBrandTags read tag values from the English source rather than from PROTECTED_BRAND_NAMES canonical casing map
+  - fixBrandTags reconstructed the entire YAML tags line rather than doing targeted per-tag replacement
+  - KECCAK was erroneously listed in the ticker transpositions corrections map despite being a valid all-caps identifier
+  - fixTickerTranspositions had no code-fence awareness and modified content inside fenced code blocks
+  - removeOrphanedClosingTags operated on raw text without markdown parsing, making it blind to inline code spans
+  - removeOrphanedClosingTags removal logic targeted the first excess closer rather than the last (trailing orphan)
+---
+
+# Post-Import Sanitizer: 5 Bugs Found During Japanese Translation Review
+
+## Problem
+
+During the Japanese (ja) translation review of PR #17132 (306 files, branch `i18n/import/2026-01-21T17-37-56-ja`), the post-import sanitizer script on the `fix-review-translations` branch was found to silently corrupt translated files in 5 distinct ways. The sanitizer ran as Phase 1e of the `review-translations-local` pipeline, modifying 46 files. Manual review of the staged changes revealed that many modifications were incorrect — hrefs pointed to wrong pages, brand names were lowercased, code examples were mangled, and HTML structure was worsened.
+
+**Files affected:** ~40 of the 46 sanitizer-modified files contained at least one incorrect change.
+
+## Solution
+
+All 5 bugs were fixed in commit `d67a75cf9d` on the `fix-review-translations` branch.
+
+**File:** `src/scripts/i18n/post_import_sanitize.ts`
+**Diff:** 109 insertions, 153 deletions (net reduction via simplification)
+
+---
+
+### Bug 1: `fixTranslatedHrefs` — Block-Positional Href Substitution Corrupts Unrelated Paragraphs
+
+**Root cause:** The function split both English and translated documents into "blocks" (paragraphs separated by blank lines) and compared hrefs at the same positional index. When exactly one href was "displaced" and one was "missing" in a block pair, it auto-substituted. The fatal assumption: block N in the translation corresponds to block N in the English source. Crowdin routinely adds/removes blank lines, causing paragraph indices to drift. A substitution based on misaligned pairs replaces a valid href with one from an entirely different section.
+
+**Manifestation (ja review):**
+
+| File | Original href | Replaced with |
+|------|--------------|---------------|
+| `eth/supply/index.md` | `/staking/` | `/roadmap/` |
+| `ethereum-forks/index.md` | `/roadmap/beacon-chain` | `/glossary/#difficulty-bomb` |
+| `ethereum-forks/index.md` | `/glossary/#ice-age` | `/roadmap/beacon-chain` |
+| `roadmap/merge/index.md` | `/roadmap/` | `/energy-consumption/` |
+| `what-are-apps/index.md` | `/wallets/find-wallet` | `/get-eth` |
+
+**Fix:** Converted to **warn-only**. The function now performs a document-level set comparison (all English hrefs vs all translation hrefs) and emits warnings for mismatches without modifying any content. Href repairs are left to AI review agents with full semantic context.
+
+```typescript
+// Before: auto-fix based on block-positional matching
+if (uniqueDisplaced.length === 1 && uniqueMissing.length === 1) {
+  blockFixes.push({ blockIdx: i, wrong: uniqueDisplaced[0], correct: uniqueMissing[0] })
+}
+
+// After: warn-only, document-level set comparison
+return {
+  content: translatedContent, // No modifications
+  fixCount: 0,
+  fixes: [],
+  warnings: allWarnings,
+}
+```
+
+---
+
+### Bug 2: `fixBrandTags` — English Source Values (Lowercase) Used Instead of Canonical Casing; YAML Formatting Destroyed
+
+**Root cause (sub-bug A):** When correcting brand tags, the code replaced translated tags with `engTag` — the raw value from the English source YAML. Many English sources store tags in lowercase (e.g., `"solidity"`, `"alchemy"`). The canonical casing is defined in `PROTECTED_BRAND_NAMES` (`"Solidity"`, `"Alchemy"`).
+
+**Root cause (sub-bug B):** After computing corrected tags, the code reconstructed the entire `tags: [...]` line from scratch, destroying multi-line YAML formatting, collapsing spacing, and normalizing quoting style.
+
+**Manifestation (ja review):** ~30 tutorial files had brand tags lowercased and YAML reformatted:
+```yaml
+# Before (correct)
+tags: [ "Solidity", "hardhat", "Alchemy", "スマート契約", "デプロイ" ]
+
+# After sanitizer (wrong)
+tags: ["solidity", "hardhat", "alchemy", "スマート契約", "デプロイ"]
+```
+
+**Fix:** Uses canonical casing from a `Map<lowercase, canonical>` built from `PROTECTED_BRAND_NAMES`. Performs targeted in-place replacement of individual quoted tags instead of reconstructing the full line, preserving original YAML formatting.
+
+```typescript
+// Before: copies English source value
+return engTag  // "solidity" from English YAML
+
+// After: looks up canonical casing
+const brandCanonical = new Map<string, string>()
+for (const brand of PROTECTED_BRAND_NAMES) {
+  brandCanonical.set(brand.toLowerCase(), brand)
+}
+const canonical = brandCanonical.get(engTag.toLowerCase()) // "Solidity"
+```
+
+---
+
+### Bug 3: `fixTickerTranspositions` — `KECCAK` in Corrections Map + No Code-Fence Awareness
+
+**Root cause (sub-bug A):** `TICKER_CORRECTIONS` contained `KECCAK: "Keccak"`. `KECCAK` is a valid all-caps form of the hash algorithm name used in specifications and code.
+
+**Root cause (sub-bug B):** The function applied word-boundary replacements across the entire document string with no code-block awareness. Corrections were applied inside fenced code blocks and inline code spans.
+
+**Manifestation (ja review):** In 2 files (`web3-secret-storage-definition/index.md`, `web3-secret-storage/index.md`), `KECCAK(DK[16..31] ++ <ciphertext>)` inside a JS code block was changed to `Keccak(...)`.
+
+**Fix:** Removed `KECCAK` from corrections map. Added code-fence skipping using the same `split(codeBlockPattern)` approach used by `escapeMdxAngleBrackets`.
+
+---
+
+### Bug 4: `removeOrphanedClosingTags` — No Code-Block/Code-Span Awareness
+
+**Root cause:** The function operated on raw text line-by-line, counting opening and closing HTML tags. It had no mechanism to skip content inside fenced code blocks or inline code spans.
+
+**Manifestation (ja review):** In `translators-guide/index.md`, the line `` `</strong>` - _終了タグ_ `` (an educational code example showing an HTML tag) had `</strong>` stripped from inside the backticks, producing ``` `` - _終了タグ_ ```.
+
+**Fix:** Added code-block/code-span splitting as the outermost loop, processing only non-code parts for orphan detection.
+
+---
+
+### Bug 5: `removeOrphanedClosingTags` — First Excess Closer Removed Instead of Last
+
+**Root cause:** Given a line with more closers than openers (e.g., `<a href="...">text</a> prose </a>`), the `excess` counter decremented from the first `.replace()` match, removing the **first** `</a>` (correctly paired with its opener) and leaving the trailing orphan.
+
+**Manifestation (ja review):** In `restaking/index.md`, a line with `<a href="...">link text</a>...prose... </a>` had the correct closer stripped and the orphan preserved, breaking the anchor tag.
+
+**Fix:** Inverted the logic — keep the first N closers (paired with openers), remove subsequent excess (trailing orphans):
+
+```typescript
+// Before: removes FIRST excess (wrong — strips paired closers)
+let excess = closeCount - openCount
+lines[i] = line.replace(closeRe, (match) => {
+  if (excess > 0) { excess--; return "" }
+  return match
+})
+
+// After: keeps FIRST N (paired), removes trailing orphans
+let kept = 0
+lines[i] = line.replace(closeRe, (match) => {
+  kept++
+  if (kept <= openCount) return match  // Keep paired closer
+  return ""                            // Remove orphan
+})
+```
+
+---
+
+## Prevention Strategies
+
+### Design Principles Violated
+
+| Bug | Principle Violated |
+|-----|-------------------|
+| 1 | **No structural isomorphism assumption** — paragraph indices are not a join key between source and translation |
+| 2 | **Single source of truth** — canonical values come from constants, not document content |
+| 3a | **Correction scope** — don't list valid forms as misspellings |
+| 3b, 4 | **Context boundary respect** — code regions are off-limits for prose transformations |
+| 5 | **Semantic intent** — orphans are trailing excess, not leading excess |
+
+### Rules for Sanitizer Development
+
+1. **Warn before you fix:** Auto-fix only when the correct value is unambiguous and comes from a constant. When the fix requires assumptions about document structure, default to warn-only.
+
+2. **Constants are authoritative, documents are inputs:** Never derive canonical values from document text. `PROTECTED_BRAND_NAMES` owns the casing, not the English YAML file.
+
+3. **Code regions are off-limits:** Every transformation pass must split on code blocks/spans before processing. The recommended pattern: ``content.split(/(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g)`` — odd indices are code, skip them.
+
+4. **Structural matching requires confirmation:** Any logic pairing source and translated elements must include a confidence check. Low confidence → warn and skip.
+
+5. **Removal algorithms must define "which N":** Specify and justify selection strategy (first, last, trailing, etc.) based on semantic intent.
+
+### Testing Recommendations
+
+**Href alignment:**
+- Test with translated file having extra/fewer blank lines than English
+- Verify no href substitution occurs when block counts differ
+
+**Brand tag casing:**
+- Test with English source having lowercase brand tags
+- Verify output uses `PROTECTED_BRAND_NAMES` casing, not English source casing
+- Verify multi-line YAML tag arrays are preserved
+
+**Code-fence awareness:**
+- Test ticker corrections, orphan removal, and all text transforms with content inside fenced and inline code
+- Verify code regions are never modified
+
+**Orphan tag removal:**
+- Test `<a>text</a> prose </a>` — trailing orphan should be removed, paired closer preserved
+- Test `` `</strong>` `` — tag inside backticks should not be touched
+
+## Related Documentation
+
+- [Crowdin Import Review Agent Calibration](./crowdin-import-review-agent-calibration.md) — False positive calibration on Czech translations
+- [Crowdin File Path Mapping and Review Workflow](./crowdin-file-path-mapping-and-review-workflow.md) — Worktree workflow, automation permissions, orphan detection
+- [Scaling Translation Review Pipeline](../translation-review/scaling-translation-review-pipeline.md) — Strategic roadmap with prevention matrix
+- [Turkish PR #17182 Review](../translation-review/crowdin-import-review-turkish-pr-17182.md) — First review case study establishing baseline issue catalog

From 38ffc82e3c723f2f4bafcc056cc4b6ba2d7d9cc7 Mon Sep 17 00:00:00 2001
From: myelinated-wackerow
 <263208946+myelinated-wackerow@users.noreply.github.com>
Date: Tue, 24 Feb 2026 04:23:26 +0000
Subject: [PATCH 12/14] fix(i18n): add MDX escape handling for Crowdin
 translation artifacts

Expand escapeMdxAngleBrackets to catch bare <> and </> fragments in
prose (Crowdin drops backticks around these during translation).

Add restoreDroppedBackslashEscapes to detect \< patterns in English
source and restore missing backslash escapes in translations (Crowdin
strips these in table cells, e.g. \<= becomes <= and \<Storage becomes
<Storage, both of which break MDX compilation).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com>
---
 src/scripts/i18n/post_import_sanitize.ts | 64 +++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts
index 063e272e271..0e5f444de79 100644
--- a/src/scripts/i18n/post_import_sanitize.ts
+++ b/src/scripts/i18n/post_import_sanitize.ts
@@ -1341,11 +1341,69 @@ function escapeMdxAngleBrackets(content: string): {
       fixCount++
       return `&lt;${digit}`
     })
+
+    // Escape bare JSX fragment <> in prose (Crowdin drops backticks around `<>`)
+    parts[i] = parts[i].replace(/(?<!\\|`)<>(?!`)/g, () => {
+      fixCount++
+      return "\\<>"
+    })
+
+    // Escape bare closing JSX fragment </> in prose
+    parts[i] = parts[i].replace(/(?<!\\|`)<\/>(?!`)/g, () => {
+      fixCount++
+      return "\\</>"
+    })
   }
 
   return { content: parts.join(""), fixCount }
 }
 
+/**
+ * Restore backslash escapes before `<` that Crowdin dropped during translation.
+ * Collects every `\<X` sequence from English prose (code regions skipped), then
+ * escapes each bare `<X` in translated prose not already preceded by a backslash.
+ * @param content - Translated markdown to repair
+ * @param englishContent - English source the escape patterns are learned from
+ * @returns The repaired content and the number of `<` occurrences escaped
+ */
+function restoreDroppedBackslashEscapes(
+  content: string,
+  englishContent: string
+): { content: string; fixCount: number } {
+  let fixCount = 0
+
+  const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g
+
+  // Collect all \<X patterns from English prose (outside code blocks)
+  const enParts = englishContent.split(codeBlockPattern)
+  const escapedFollowers = new Set<string>()
+  for (let i = 0; i < enParts.length; i++) {
+    if (i % 2 === 1) continue
+    const matches = enParts[i].matchAll(/\\<([^\s>]{1,30})/g)
+    for (const m of matches) {
+      escapedFollowers.add(m[1]) // e.g., "Storage[4]", "=2^256"
+    }
+  }
+
+  if (escapedFollowers.size === 0) return { content, fixCount }
+
+  // Check translation for bare <X where English has \<X
+  const trParts = content.split(codeBlockPattern)
+  for (let i = 0; i < trParts.length; i++) {
+    if (i % 2 === 1) continue
+    for (const follower of escapedFollowers) {
+      const pieces = trParts[i].split(`<${follower}`)
+      // Re-join, escaping only occurrences not already preceded by "\"
+      trParts[i] = pieces.reduce((acc, piece) => {
+        if (!acc.endsWith("\\")) fixCount++
+        return acc + (acc.endsWith("\\") ? "<" : "\\<") + follower + piece
+      })
+    }
+  }
+
+  return { content: trParts.join(""), fixCount }
+}
+
 /**
  * Detect and remove orphaned closing HTML tags.
  * These appear when translation restructures sentences and leaves behind
@@ -1543,7 +1601,7 @@ function processMarkdownFile(
   )
   applyFix(
     () => escapeMdxAngleBrackets(content),
-    (n) => `Escaped ${n} raw angle brackets before numbers`
+    (n) => `Escaped ${n} raw angle brackets in prose`
   )
   applyFix(
     () => removeOrphanedClosingTags(content),
@@ -1574,6 +1632,10 @@ function processMarkdownFile(
       () => normalizeInlineComponentsFromEnglish(content, englishMd!),
       (n) => `Normalized ${n} inline components to match English`
     )
+    applyFix(
+      () => restoreDroppedBackslashEscapes(content, englishMd!),
+      (n) => `Restored ${n} dropped backslash escapes before <`
+    )
     applyFix(
       () => repairUnclosedBackticks(content, englishMd!),
       (n) => `Repaired ${n} unclosed backticks`

From 66a1eb6bf47d0a37ccb5a625086df1ad314133f3 Mon Sep 17 00:00:00 2001
From: myelinated-wackerow
 <263208946+myelinated-wackerow@users.noreply.github.com>
Date: Wed, 25 Feb 2026 02:11:42 +0000
Subject: [PATCH 13/14] fix(i18n): wire up sanitizer checks and expand brand
 protection

- Wire fixEscapedBoldAndItalic into pipeline (fixes \*\*text\*\* from Crowdin)
- Wire warnPunctuationOnlyHeadings into pipeline (detects dropped headings)
- Wire warnCodeFenceContentDrift into pipeline (detects translated code blocks)
- Add 9 Ethereum client names to PROTECTED_BRAND_NAMES
- Remove unused extractHrefsFromBlock (block-level href approach abandoned)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com>
---
 src/scripts/i18n/post_import_sanitize.ts | 172 ++++++++++++++++++++---
 1 file changed, 149 insertions(+), 23 deletions(-)

diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts
index 0e5f444de79..72bb32c09d7 100644
--- a/src/scripts/i18n/post_import_sanitize.ts
+++ b/src/scripts/i18n/post_import_sanitize.ts
@@ -142,6 +142,16 @@ const PROTECTED_BRAND_NAMES = [
   "Lido",
   "Rocket Pool",
   "ENS",
+  // Ethereum clients
+  "Besu",
+  "Geth",
+  "Nethermind",
+  "Erigon",
+  "Prysm",
+  "Lighthouse",
+  "Teku",
+  "Nimbus",
+  "Lodestar",
   // Core terms that must stay English
   "Ethereum",
   "Bitcoin",
@@ -354,6 +364,129 @@ function fixDuplicatedHeadings(content: string): {
   return { content: result, fixCount }
 }
 
+/**
+ * Fix escaped bold/italic markers from Crowdin.
+ * Crowdin often escapes markdown emphasis during translation:
+ *   \*\*text\*\* → **text** (bold)
+ *   \*text\*    → *text*   (italic)
+ *
+ * IMPORTANT: Skips table rows (lines starting with |) where \*\* may be
+ * intentional — e.g., `2\*\*256` for exponentiation in EVM opcode tables.
+ * The row check looks at the whole physical line, so cells that follow an
+ * inline code span in the same row are skipped too. Fenced code blocks and
+ * inline code spans are never modified. Pairs must sit on a single line.
+ *
+ * @param content - Markdown content to repair
+ * @returns The repaired content and the number of emphasis pairs unescaped
+ */
+function fixEscapedBoldAndItalic(content: string): {
+  content: string
+  fixCount: number
+} {
+  let fixCount = 0
+  const unescape = (marker: string) => (_: string, inner: string) => {
+    fixCount++
+    return `${marker}${inner}${marker}`
+  }
+
+  // Split content to preserve code blocks (odd indices are code)
+  const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g
+  const parts = content.split(codeBlockPattern)
+  // Text already seen on the current physical line (prose and code spans)
+  let lineSoFar = ""
+
+  for (let i = 0; i < parts.length; i++) {
+    const lines = parts[i].split("\n")
+    const isProse = i % 2 === 0 // Odd indices are code — never modified
+    for (let j = 0; isProse && j < lines.length; j++) {
+      // Skip table rows — \*\* may be intentional (e.g., 2\*\*256)
+      const fullLine = j === 0 ? lineSoFar + lines[j] : lines[j]
+      if (fullLine.trimStart().startsWith("|")) continue
+
+      // Bold first, so any remaining \* pairs are italic
+      lines[j] = lines[j]
+        .replace(/\\\*\\\*(.+?)\\\*\\\*/g, unescape("**"))
+        .replace(/\\\*(.+?)\\\*/g, unescape("*"))
+    }
+    parts[i] = lines.join("\n")
+    // A part without a newline continues the current physical line
+    const last = lines[lines.length - 1]
+    lineSoFar = lines.length > 1 ? last : lineSoFar + last
+  }
+
+  return { content: parts.join(""), fixCount }
+}
+
+/**
+ * Warn on headings whose text is only punctuation/symbols (no actual words),
+ * e.g. `## 。 {#who-is-involved}` — a sign Crowdin dropped the heading text.
+ * Fenced code is ignored. Returns one warning per offending heading.
+ */
+function warnPunctuationOnlyHeadings(content: string): string[] {
+  const warnings: string[] = []
+  // Blank out fenced code: a "# ---" comment line there is not a heading
+  const prose = content.replace(/```[\s\S]*?```|~~~[\s\S]*?~~~/g, "")
+  const headingRe = /^(#{1,6})\s+(.+?)\s*(\{#[^}]+\})?\s*$/gm
+  for (const match of prose.matchAll(headingRe)) {
+    // Remove the custom ID if it got captured in the text
+    const cleanText = match[2].replace(/\{#[^}]+\}/, "").trim()
+    // Check if remaining text is only punctuation/whitespace
+    if (cleanText.length > 0 && /^[\p{P}\p{S}\s]+$/u.test(cleanText)) {
+      warnings.push(
+        `Heading text is only punctuation: "${match[0].trim()}" — likely missing translation`
+      )
+    }
+  }
+  return warnings
+}
+
+/**
+ * Warn when fenced code block content differs between English and translation.
+ * Code inside fences should never be translated (variable names, keywords, etc.).
+ * Catches issues like `or` → `または` inside code fences.
+ *
+ * Fences are paired by position; if the counts differ, a single count-mismatch
+ * warning is returned and no bodies are compared.
+ *
+ * @param translatedContent - Translated markdown to check
+ * @param englishContent - English source used as the reference
+ * @returns Warning messages (empty when every fence body matches)
+ */
+function warnCodeFenceContentDrift(
+  translatedContent: string,
+  englishContent: string
+): string[] {
+  // Accept any info string after the opening fence (e.g. "c++", `js title="x"`,
+  // or a trailing "\r"); a \w-only pattern would miss such openers and then
+  // mistake the closing fence for an opener, comparing prose instead of code.
+  const fenceRe = /```([^\n`]*)\n([\s\S]*?)```/g
+  const extractCodeFences = (content: string): string[] =>
+    Array.from(content.matchAll(fenceRe), (m) => m[2].trim())
+
+  const engFences = extractCodeFences(englishContent)
+  const transFences = extractCodeFences(translatedContent)
+
+  if (engFences.length !== transFences.length) {
+    return [
+      `Code fence count mismatch: English has ${engFences.length}, translation has ${transFences.length}`,
+    ]
+  }
+
+  const warnings: string[] = []
+  for (let i = 0; i < engFences.length; i++) {
+    if (engFences[i] !== transFences[i]) {
+      const preview = transFences[i]
+        .substring(0, 60)
+        .replace(/\n/g, "\\n")
+      warnings.push(
+        `Code fence #${i + 1} content differs from English: "${preview}..."`
+      )
+    }
+  }
+
+  return warnings
+}
+
 /**
  * Fix broken markdown links where there's a space between ] and (.
  * Pattern: ] (https://... → ](https://...
@@ -437,29 +570,6 @@ function extractHrefs(content: string): Set<string> {
   return hrefs
 }
 
-/**
- * Extract hrefs from a single text block (paragraph/section).
- * Returns array to preserve duplicates within the block.
- */
-function extractHrefsFromBlock(block: string): string[] {
-  const hrefs: string[] = []
-
-  // Markdown links: [text](href)
-  const markdownLinkRe = /\[[^\]]*\]\(([^)]+)\)/g
-  let match
-  while ((match = markdownLinkRe.exec(block))) {
-    hrefs.push(match[1])
-  }
-
-  // JSX/HTML href attributes: href="..." or href='...'
-  const hrefAttrRe = /href=["']([^"']+)["']/g
-  while ((match = hrefAttrRe.exec(block))) {
-    hrefs.push(match[1])
-  }
-
-  return hrefs
-}
-
 /**
  * Split markdown content into logical blocks (paragraphs/sections).
  * Blocks are separated by blank lines.
@@ -475,6 +585,10 @@ function splitIntoBlocks(content: string): string[] {
  * English and translated documents is unreliable (Crowdin often adds/removes
  * blank lines, shifting paragraph indices and causing incorrect substitutions).
  *
+ * Earlier approach (since removed): block-level href comparison, extracting hrefs
+ * per paragraph (extractHrefsFromBlock) and matching them positionally. Abandoned
+ * because Crowdin paragraph drift caused incorrect substitutions; see docs/solutions/.
+ *
  * Href fixes are left to the AI review agents which have full semantic context.
  */
 function fixTranslatedHrefs(
@@ -1611,6 +1725,10 @@ function processMarkdownFile(
     () => fixBlockComponentLineBreaks(content),
     (n) => `Fixed ${n} inline block component tags`
   )
+  applyFix(
+    () => fixEscapedBoldAndItalic(content),
+    (n) => `Unescaped ${n} bold/italic markers from Crowdin`
+  )
 
   content = normalizeBlockHtmlLines(content)
 
@@ -1667,6 +1785,14 @@ function processMarkdownFile(
     content = hrefResult.content
     issues.push(...hrefResult.warnings)
 
+    // Warn on punctuation-only headings (dropped translation text)
+    const punctuationHeadingWarnings = warnPunctuationOnlyHeadings(content)
+    issues.push(...punctuationHeadingWarnings)
+
+    // Warn on code fence content drift (translated code blocks)
+    const codeFenceWarnings = warnCodeFenceContentDrift(content, englishMd)
+    issues.push(...codeFenceWarnings)
+
     // Detect cross-script contamination
     if (locale) {
       const scriptWarnings = detectCrossScriptContamination(content, locale)

From 5c7d32477b6cdd0776f26af3c19e8984eb2bb82d Mon Sep 17 00:00:00 2001
From: myelinated-wackerow
 <263208946+myelinated-wackerow@users.noreply.github.com>
Date: Wed, 25 Feb 2026 02:18:14 +0000
Subject: [PATCH 14/14] docs: fix language count and add canonical config
 reference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update "60+ languages" to "25 languages" (actual count from i18n.config.json)
- Add reference to i18n.config.json as canonical language list
- Fix RTL language list (Arabic, Urdu — no Hebrew in active config)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: wackerow <54227730+wackerow@users.noreply.github.com>
---
 AGENTS.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index e9ecc6edf0c..fd03e8a0c94 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -15,7 +15,7 @@ This is the official Ethereum.org website - a Next.js application that serves as
 
 ### Key Dependencies
 
-- **next-intl 3.26+** - Internationalization (i18n) with 60+ languages
+- **next-intl 3.26+** - Internationalization (i18n) with 25 languages
 - **next-mdx-remote 5.0+** - MDX content processing
 - **Framer Motion 10.13+** - Animations and transitions
 - **Radix UI** - Accessible component primitives
@@ -43,7 +43,7 @@ This is the official Ethereum.org website - a Next.js application that serves as
   - **data/** - Static data and configurations
   - **hooks/** - Custom React hooks
   - **i18n/** - Internationalization config
-  - **intl/** - Translation files (60+ languages)
+  - **intl/** - Translation files (25 languages)
   - **layouts/** - Page layout components
   - **lib/** - Utility functions and types
     - **constants.ts** - App constants
@@ -116,8 +116,8 @@ pnpm events-import         # Import community events
 
 ### Internationalization
 
-- **60+ languages** supported via Crowdin
-- **RTL support** for Arabic, Hebrew, etc.
+- **25 languages** supported via Crowdin (canonical list: `i18n.config.json`)
+- **RTL support** for Arabic and Urdu
 - Translation files (JSON format) in `src/intl/[locale]/`
 - Content translations managed through Crowdin platform