From 25c345227b2d7af3128c6dac761dd01e2580ccaf Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 26 Nov 2025 16:02:45 -0300 Subject: [PATCH 01/99] feat: add start offset and post_import_sanitize --- .github/workflows/crowdin-ai-import.yml | 11 + src/scripts/i18n/main.ts | 294 +++++++++++++++++-- src/scripts/i18n/post_import_sanitize.ts | 345 +++++++++++++++++++++++ 3 files changed, 621 insertions(+), 29 deletions(-) create mode 100644 src/scripts/i18n/post_import_sanitize.ts diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 0f09c6a552e..c84f0ce6c3c 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -3,11 +3,20 @@ name: Import Crowdin AI Translations on: workflow_dispatch: inputs: + pretranslation_id: + description: "Pre-translation ID to resume from (optional - leave empty to start new)" + required: false + type: string file_limit: description: "Number of files to process (default: 100, use 1-10 for testing)" required: false default: "100" type: string + start_offset: + description: "Starting offset for files (skip first N files; default: 0)" + required: false + default: "0" + type: string target_languages: description: "Comma-separated Crowdin language codes (default: es-EM)" required: false @@ -43,7 +52,9 @@ jobs: env: I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} + PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} FILE_LIMIT: ${{ github.event.inputs.file_limit }} + START_OFFSET: ${{ github.event.inputs.start_offset }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 27cfab83e25..6721537c698 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ 
-1,7 +1,11 @@ +/* eslint-disable import/order */ +import fs from "fs" + import dotenv from "dotenv" import i18nConfig from "../../../i18n.config.json" +import { runSanitizer } from "./post_import_sanitize" import type { BranchDetailsResponse, BranchObject, @@ -60,6 +64,12 @@ const fileLimit = process.env.FILE_LIMIT ? parseInt(process.env.FILE_LIMIT, 10) : 100 +const startOffset = process.env.START_OFFSET + ? parseInt(process.env.START_OFFSET, 10) + : 0 + +const existingPreTranslationId = process.env.PRETRANSLATION_ID || "" + // Parse GitHub repository from env (format: "owner/repo") const githubRepo = process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website" @@ -69,7 +79,13 @@ console.log("[DEBUG] Configuration:") console.log(`[DEBUG] - Target languages: ${targetLanguages.join(", ")}`) console.log(`[DEBUG] - Base branch: ${baseBranch}`) console.log(`[DEBUG] - File limit: ${fileLimit}`) +console.log(`[DEBUG] - Start offset: ${startOffset}`) console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) +if (existingPreTranslationId) { + console.log( + `[DEBUG] - Resuming from pre-translation ID: ${existingPreTranslationId}` + ) +} const env = { projectId: 834930, @@ -149,44 +165,75 @@ const fetchWithRetry = async ( } /** - * Get all files, using perPage to limit amount fetched + * Get English files with pagination, allowing limit + offset. + * GitHub Search API caps `per_page` at 100; we fetch pages until + * we accumulate `offset + limit` items, then return the slice. 
*/ const getAllEnglishFiles = async ( - perPage = 100 + limit = 100, + offset = 0 ): Promise => { const ghSearchEndpointBase = "https://api.github.com/search/code" const query = `repo:${env.ghOrganization}/${env.ghRepo} extension:md path:"${env.mdRoot}" -path:"${env.mdRoot}/translations" OR repo:${env.ghOrganization}/${env.ghRepo} extension:json path:"${env.jsonRoot}"` - const url = new URL(ghSearchEndpointBase) - url.searchParams.set("q", query) - url.searchParams.set("per_page", perPage.toString()) - url.searchParams.set("page", "1") - console.log(`[DEBUG] GitHub search query: ${query}`) - console.log(`[DEBUG] GitHub search URL: ${url.toString()}`) - try { - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) + const perPage = 100 + const needed = offset + limit + const collected: GitHubQueryResponseItem[] = [] - if (!res.ok) { - console.warn(`[ERROR] GitHub API response not OK: ${res.status}`) - const body = await res.text().catch(() => "") - console.error(`[ERROR] Response body:`, body) - throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) - } + let page = 1 + while (collected.length < needed) { + const url = new URL(ghSearchEndpointBase) + url.searchParams.set("q", query) + url.searchParams.set("per_page", perPage.toString()) + url.searchParams.set("page", page.toString()) - type JsonResponse = { items: GitHubQueryResponseItem[] } - const json: JsonResponse = await res.json() + console.log(`[DEBUG] Fetching search page ${page} ...`) - console.log(`[DEBUG] Found ${json.items.length} files from GitHub`) - console.log(`[DEBUG] First GitHub file:`, json.items[0]) - return json.items - } catch (error) { - console.error(`[ERROR] Failed to get English files from GitHub:`, error) - process.exit(1) + try { + const res = await fetchWithRetry(url.toString(), { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + console.warn(`[ERROR] GitHub API response not OK: ${res.status}`) + const body = await 
res.text().catch(() => "") + console.error(`[ERROR] Response body:`, body) + throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) + } + + type JsonResponse = { items: GitHubQueryResponseItem[] } + const json: JsonResponse = await res.json() + + if (!json.items.length) { + console.log(`[DEBUG] No more results at page ${page}.`) + break + } + + collected.push(...json.items) + console.log(`[DEBUG] Collected ${collected.length} items so far.`) + + page += 1 + if (page > 10) { + // Safety cap: avoid excessive paging; typical search caps ~1000 results + console.warn( + `[WARN] Reached pagination safety cap at page ${page - 1}.` + ) + break + } + } catch (error) { + console.error(`[ERROR] Failed to get English files from GitHub:`, error) + process.exit(1) + } } + + const sliced = collected.slice(offset, offset + limit) + console.log( + `[DEBUG] Returning ${sliced.length} files (offset=${offset}, limit=${limit})` + ) + if (sliced.length) console.log(`[DEBUG] First GitHub file:`, sliced[0]) + return sliced } const getFileMetadata = async ( @@ -1079,6 +1126,134 @@ const postPullRequest = async (head: string, base = env.baseBranch) => { return json } +async function buildAndCommitTranslations( + preTranslateJobCompletedResponse: CrowdinPreTranslateResponse +) { + if (preTranslateJobCompletedResponse.status !== "finished") { + console.error( + "[BUILD] ❌ Pre-translation did not finish successfully. 
Full response:", + preTranslateJobCompletedResponse + ) + throw new Error( + `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` + ) + } + + console.log(`[BUILD] ✓ Pre-translation completed successfully!`) + console.log(`[BUILD] Progress: ${preTranslateJobCompletedResponse.progress}%`) + console.log( + `[BUILD] Full response:`, + JSON.stringify(preTranslateJobCompletedResponse, null, 2) + ) + + const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes + + // Get Crowdin project files for path mapping + const crowdinProjectFiles = await getCrowdinProjectFiles() + + // Build mapping for commit phase using existing Crowdin files + const fileIdToPathMapping: Record = {} + for (const fid of fileIds) { + const existing = crowdinProjectFiles.find((f) => f.id === fid) + if (existing) fileIdToPathMapping[fid] = existing.path + + if (!fileIdToPathMapping[fid]) { + console.warn( + `[WARN] Missing path mapping for fileId=${fid} (may impact destination path calculation)` + ) + } + } + + // Build mapping between Crowdin IDs (e.g. "es-EM") and internal codes (e.g. 
"es") + const languagePairs = languageIds.map((crowdinId) => ({ + crowdinId, + internalLanguageCode: crowdinToInternalCodeMapping[crowdinId], + })) + + const { branch } = await postCreateBranchFrom(env.baseBranch) + console.log(`\n[BRANCH] ✓ Created branch: ${branch}`) + + // For each language + for (const { crowdinId, internalLanguageCode } of languagePairs) { + console.log( + `\n[BUILD] ========== Building translations for language: ${crowdinId} (internal: ${internalLanguageCode}) ==========` + ) + + // Build, download and commit each file + for (const fileId of fileIds) { + console.log(`\n[BUILD] --- Processing fileId: ${fileId} ---`) + const crowdinPath = fileIdToPathMapping[fileId] + console.log(`[BUILD] Crowdin path: ${crowdinPath}`) + + // 1- Build + console.log( + `[BUILD] Requesting build for fileId=${fileId}, language=${crowdinId}` + ) + const { url: downloadUrl } = await postBuildProjectFileTranslation( + fileId, + crowdinId, + env.projectId + ) + console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) + + // 2- Download + console.log(`[BUILD] Downloading translated file...`) + const { buffer } = await getBuiltFile(downloadUrl) + console.log(`[BUILD] Downloaded ${buffer.length} bytes`) + + // 3a- Get destination path + const destinationPath = getDestinationFromPath( + crowdinPath, + internalLanguageCode + ) + console.log(`[BUILD] Destination path: ${destinationPath}`) + + // 3b- Commit + console.log(`[BUILD] Committing to branch: ${branch}`) + await putCommitFile(buffer, destinationPath, branch) + console.log(`[BUILD] ✓ Committed successfully`) + } + } + + // Run post-import sanitizer BEFORE creating PR (may produce additional commits) + console.log( + `\n[SANITIZE] ========== Running post-import sanitizer before PR ==========` + ) + const sanitizeResult = runSanitizer(env.allCrowdinCodes) + const changedFiles = sanitizeResult.changedFiles || [] + if (changedFiles.length) { + console.log(`[SANITIZE] Files changed by sanitizer: 
${changedFiles.length}`) + for (const abs of changedFiles) { + const relPath = abs.startsWith(process.cwd()) + ? abs.slice(process.cwd().length + 1) + : abs + try { + const buf = fs.readFileSync(abs) + await putCommitFile(buf, relPath, branch) + console.log(`[SANITIZE] ✓ Committed sanitized file: ${relPath}`) + } catch (e) { + console.warn( + `[SANITIZE] Failed to commit sanitized file ${relPath}:`, + e + ) + } + } + } else { + console.log("[SANITIZE] No sanitation changes to commit") + } + + console.log(`\n[PR] ========== Creating Pull Request ==========`) + console.log(`[PR] Head branch: ${branch}`) + console.log(`[PR] Base branch: ${env.baseBranch}`) + + const pr = await postPullRequest(branch, env.baseBranch) + + console.log(`\n[SUCCESS] ========== Translation import complete! ==========`) + console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) + console.log(`[SUCCESS] PR Number: #${pr.number}`) + console.log(pr) +} + async function main(options?: { allLangs: boolean }) { console.log(`[DEBUG] Starting main function with options:`, options) console.log(`[DEBUG] Environment config:`, { @@ -1089,10 +1264,44 @@ async function main(options?: { allLangs: boolean }) { allCrowdinCodes: env.allCrowdinCodes, }) - // Fetch English files with the configured file limit - const allEnglishFiles = await getAllEnglishFiles(fileLimit) + // Check if resuming from existing pre-translation + if (existingPreTranslationId) { + console.log( + `\n[RESUME] ========== Resuming from pre-translation ID: ${existingPreTranslationId} ==========` + ) + console.log(`[RESUME] Checking status of existing pre-translation...`) + + const preTranslateJobCompletedResponse = await getPreTranslationStatus( + existingPreTranslationId + ) + + if (preTranslateJobCompletedResponse.status === "in_progress") { + console.log( + `[RESUME] Pre-translation still in progress (${preTranslateJobCompletedResponse.progress}%). 
Waiting for completion...` + ) + const completedResponse = await awaitPreTranslationCompleted( + existingPreTranslationId + ) + return await buildAndCommitTranslations(completedResponse) + } else if (preTranslateJobCompletedResponse.status === "finished") { + console.log( + `[RESUME] Pre-translation already finished. Building translations...` + ) + return await buildAndCommitTranslations(preTranslateJobCompletedResponse) + } else { + throw new Error( + `Pre-translation ${existingPreTranslationId} has unexpected status: ${preTranslateJobCompletedResponse.status}` + ) + } + } + + // Normal flow: Start new pre-translation + console.log(`\n[START] ========== Starting new pre-translation ==========`) + + // Fetch English files with limit + start offset + const allEnglishFiles = await getAllEnglishFiles(fileLimit, startOffset) console.log( - `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub` + `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub (offset=${startOffset}, limit=${fileLimit})` ) // TODO: Add filter here to select specific files @@ -1331,6 +1540,33 @@ async function main(options?: { allLangs: boolean }) { } } + // Run post-import sanitizer BEFORE creating PR (may produce additional commits) + console.log( + `\n[SANITIZE] ========== Running post-import sanitizer before PR ==========` + ) + const sanitizeResult = runSanitizer(env.allCrowdinCodes) + const changedFiles = sanitizeResult.changedFiles || [] + if (changedFiles.length) { + console.log(`[SANITIZE] Files changed by sanitizer: ${changedFiles.length}`) + for (const abs of changedFiles) { + const relPath = abs.startsWith(process.cwd()) + ? 
abs.slice(process.cwd().length + 1) + : abs + try { + const buf = fs.readFileSync(abs) + await putCommitFile(buf, relPath, branch) + console.log(`[SANITIZE] ✓ Committed sanitized file: ${relPath}`) + } catch (e) { + console.warn( + `[SANITIZE] Failed to commit sanitized file ${relPath}:`, + e + ) + } + } + } else { + console.log("[SANITIZE] No sanitation changes to commit") + } + console.log(`\n[PR] ========== Creating Pull Request ==========`) console.log(`[PR] Head branch: ${branch}`) console.log(`[PR] Base branch: ${env.baseBranch}`) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts new file mode 100644 index 00000000000..97fbe5ace84 --- /dev/null +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -0,0 +1,345 @@ +import fs from "fs" +import path from "path" + +/** + * Post-import sanitizer for Crowdin translations. + * + * - Synchronize custom Markdown header IDs `{#...}` with English source (ASCII-only) + * - Normalize block HTML tag line breaks (opening and closing tags on their own lines) + * - Protect known brand/team names from inadvertent translation + * - Validate JSON files; report issues + * + * Usage: + * npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts + * + * Env: + * TARGET_LANGUAGES (comma-separated, e.g. "es-EM") optional; defaults to scanning all `translations/*` folders + */ + +const ROOT = process.cwd() +const CONTENT_ROOT = path.join(ROOT, "public", "content") +const INTL_ROOT = path.join(ROOT, "src", "intl") + +const _protectedNames = [ + "Ethereum", + "ETH", + "Solidity", + "MetaMask", + "GitHub", + "Crowdin", + "EIP", + "NFT", + "HTML", + "PoW", + "PoS", +] + +const BLOCK_HTML_TAGS = [ + "section", + "div", + "article", + "aside", + "header", + "footer", +] + +function listFiles( + dir: string, + predicate: (file: string) => boolean +): string[] { + const out: string[] = [] + const stack: string[] = [dir] + while (stack.length) { + const d = stack.pop()! 
+ const entries = fs.readdirSync(d, { withFileTypes: true }) + for (const e of entries) { + const full = path.join(d, e.name) + if (e.isDirectory()) stack.push(full) + else if (predicate(full)) out.push(full) + } + } + return out +} + +function toAsciiId(id: string): string { + // keep only ASCII letters, numbers, hyphens and underscores; strip accents + const normalized = id.normalize("NFD").replace(/[\u0300-\u036f]/g, "") + return normalized.replace(/[^A-Za-z0-9_-]/g, "-") +} + +// Critical regex checks adapted from legacy markdownChecker +const BROKEN_LINK_REGEX = /\[[^\]]+\]\([^)\s]+\s[^)]+\)/g +const INVALID_LINK_REGEX = + /(? { + // Map of heading text -> custom id found in English source + const map = new Map() + const headingRe = /^(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/gm + let m: RegExpExecArray | null + while ((m = headingRe.exec(md))) { + const text = m[2].trim() + const id = m[3].trim() + map.set(text, id) + } + return map +} + +function syncHeaderIdsWithEnglish( + translatedMd: string, + englishMd: string +): string { + const englishIds = extractHeadingIds(englishMd) + const headingRe = /^(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/gm + return translatedMd.replace(headingRe, (full, hashes, text) => { + const englishId = englishIds.get(text.trim()) + if (!englishId) return full // no corresponding English heading; leave as is + const asciiId = toAsciiId(englishId) + return `${hashes} ${text} {#${asciiId}}` + }) +} + +function normalizeBlockHtmlLines(md: string): string { + for (const tag of BLOCK_HTML_TAGS) { + const inlineCloseRe = new RegExp(`([^\\n])\\s*`, "g") + md = md.replace(inlineCloseRe, (_, before) => `${before}\n`) + } + return md +} + +function protectNames(text: string): string { + // Replace common incorrectly localized variants back to protected names. + // This is heuristic; extend as needed per locale QA. 
+ const replacements: Array<[RegExp, string]> = [ + [/\bEtéreo\b/gi, "Ethereum"], + [/\bEtéreum\b/gi, "Ethereum"], + [/\bMetamask\b/gi, "MetaMask"], + [/\bGithub\b/gi, "GitHub"], + [/\bNft\b/g, "NFT"], + ] + let out = text + for (const [re, val] of replacements) out = out.replace(re, val) + // Normalize canonical capitalization of protected names + for (const name of _protectedNames) { + const re = new RegExp(`\\b${name}\\b`, "gi") + out = out.replace(re, name) + } + return out +} + +function processMarkdownFile(mdPath: string): { + fixed: boolean + issues: string[] +} { + const issues: string[] = [] + let content = fs.readFileSync(mdPath, "utf8") + + // Map translated path to English path: remove `/translations//` segment + const parts = mdPath.split(path.sep) + const idx = parts.lastIndexOf("translations") + if (idx === -1 || idx + 2 >= parts.length) { + issues.push("No translations segment found; skipping header ID sync") + } else { + const englishPath = path.join( + ...parts.slice(0, idx), + ...parts.slice(idx + 2) // drop translations/ + ) + if (fs.existsSync(englishPath)) { + const englishMd = fs.readFileSync(englishPath, "utf8") + content = syncHeaderIdsWithEnglish(content, englishMd) + } else { + issues.push(`English source missing: ${path.relative(ROOT, englishPath)}`) + } + } + + const before = content + content = normalizeBlockHtmlLines(content) + content = protectNames(content) + + const fixed = before !== content + if (fixed) fs.writeFileSync(mdPath, content, "utf8") + // Run critical checks (report-only) + let m: RegExpExecArray | null + // Broken links containing spaces inside URL + while ((m = BROKEN_LINK_REGEX.exec(content))) { + issues.push(`Broken link format at ${mdPath}:${lineAt(content, m.index)}`) + } + // Invalid links (exclude images/internal/hash/http/mailto/pdf/<...>) + while ((m = INVALID_LINK_REGEX.exec(content))) { + issues.push(`Invalid link at ${mdPath}:${lineAt(content, m.index)}`) + } + // Empty link text + while ((m = 
LINK_TEXT_MISSING_REGEX.exec(content))) { + issues.push(`Link text missing at ${mdPath}:${lineAt(content, m.index)}`) + } + // Incorrect image path in translated markdown + if (mdPath.includes(`${path.sep}translations${path.sep}`)) { + while ((m = INCORRECT_PATH_IN_TRANSLATED_MARKDOWN.exec(content))) { + issues.push( + `Incorrect image path at ${mdPath}:${lineAt(content, m.index)}` + ) + } + } + // Spelling mistakes (case-insensitive) + for (const mistake of COMMON_SPELLING_MISTAKES) { + const re = new RegExp(mistake, "gi") + while ((m = re.exec(content))) { + issues.push( + `Spelling mistake "${mistake}" at ${mdPath}:${lineAt(content, m.index)}` + ) + } + } + // Case-sensitive mistakes for brands + for (const mistake of CASE_SENSITIVE_SPELLING_MISTAKES) { + const re = new RegExp(mistake, "g") + while ((m = re.exec(content))) { + issues.push( + `Brand capitalization issue "${mistake}" at ${mdPath}:${lineAt(content, m.index)}` + ) + } + } + return { fixed, issues } +} + +function processJsonFile(jsonPath: string): { + fixed: boolean + issues: string[] +} { + const issues: string[] = [] + let content = fs.readFileSync(jsonPath, "utf8") + let fixed = false + // Normalize BOM and smart quotes + const cleaned = content + .replace(/^\uFEFF/, "") + .replace(/[“”]/g, '"') + .replace(/[‘’]/g, "'") + if (cleaned !== content) { + content = cleaned + fixed = true + } + try { + JSON.parse(content) + } catch (e) { + issues.push(`JSON parse error: ${(e as Error).message}`) + } + if (fixed) fs.writeFileSync(jsonPath, content, "utf8") + return { fixed, issues } +} + +function languagesFromEnv(): string[] | undefined { + const env = process.env.TARGET_LANGUAGES?.trim() + if (!env) return undefined + return env + .split(",") + .map((s) => s.trim()) + .filter(Boolean) +} + +export function runSanitizer(langs?: string[]) { + const effectiveLangs = langs || languagesFromEnv() + console.log("[SANITIZE] Starting post-import sanitizer") + console.log( + "[SANITIZE] Target languages:", + 
effectiveLangs ?? "ALL detected in translations/" + ) + + const mdFiles = listFiles(CONTENT_ROOT, (f) => { + if (!f.endsWith(".md")) return false + if (!f.includes(`${path.sep}translations${path.sep}`)) return false + if (effectiveLangs) + return effectiveLangs.some((l) => + f.includes(`${path.sep}translations${path.sep}${l}${path.sep}`) + ) + return true + }) + + let mdFixed = 0 + const mdIssues: Array<{ file: string; issues: string[] }> = [] + const mdChanged: string[] = [] + for (const f of mdFiles) { + const { fixed, issues } = processMarkdownFile(f) + if (fixed) { + mdFixed++ + mdChanged.push(f) + } + if (issues.length) mdIssues.push({ file: path.relative(ROOT, f), issues }) + } + + const jsonFiles = listFiles(INTL_ROOT, (f) => { + if (!f.endsWith(".json")) return false + const p = path.relative(INTL_ROOT, f).split(path.sep) + const langDir = p[0] + if (!langDir) return false + if (effectiveLangs) return effectiveLangs.some((l) => l.startsWith(langDir)) + return true + }) + + let jsonFixed = 0 + const jsonIssues: Array<{ file: string; issues: string[] }> = [] + const jsonChanged: string[] = [] + for (const f of jsonFiles) { + const { fixed, issues } = processJsonFile(f) + if (fixed) { + jsonFixed++ + jsonChanged.push(f) + } + if (issues.length) jsonIssues.push({ file: path.relative(ROOT, f), issues }) + } + + console.log( + `\n[SANITIZE] Markdown files scanned: ${mdFiles.length}, fixed: ${mdFixed}` + ) + console.log( + `[SANITIZE] JSON files scanned: ${jsonFiles.length}, fixed: ${jsonFixed}` + ) + + if (mdIssues.length || jsonIssues.length) { + console.log("\n[SANITIZE] Issues detected:") + for (const i of mdIssues) { + console.log(` - MD ${i.file}`) + for (const msg of i.issues) console.log(` • ${msg}`) + } + for (const i of jsonIssues) { + console.log(` - JSON ${i.file}`) + for (const msg of i.issues) console.log(` • ${msg}`) + } + } else { + console.log("\n[SANITIZE] No issues detected.") + } + + const changedFiles = [...mdChanged, ...jsonChanged] + return 
{ + changedFiles, + markdown: { scanned: mdFiles.length, fixed: mdFixed }, + json: { scanned: jsonFiles.length, fixed: jsonFixed }, + issues: { markdown: mdIssues, json: jsonIssues }, + } +} + +if (require.main === module) { + runSanitizer() +} From 85a67842a50e2688c053cc1aa7624936f9d61e4b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 26 Nov 2025 16:16:12 -0300 Subject: [PATCH 02/99] feat: update to using gemini prompt --- src/scripts/i18n/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 6721537c698..d29902ff5bf 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -93,7 +93,7 @@ const env = { ghRepo, jsonRoot: "src/intl/en", mdRoot: "public/content", - preTranslatePromptId: 168584, + preTranslatePromptId: 326942, allCrowdinCodes: targetLanguages, baseBranch, } From 888eb379684851b9819e70d54f5b05826c65b938 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 26 Nov 2025 18:25:21 -0300 Subject: [PATCH 03/99] feat: adaptive pre-translate polling + timeouts --- .github/workflows/crowdin-ai-import.yml | 12 ++++ src/scripts/i18n/main.ts | 96 +++++++++++++++++-------- 2 files changed, 77 insertions(+), 31 deletions(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index c84f0ce6c3c..18e343b7850 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -27,6 +27,16 @@ on: required: false default: "dev" type: string + pretranslate_timeout_ms: + description: "Max ms to wait for pre-translate (default: 21600000 ~6h)" + required: false + default: "21600000" + type: string + pretranslate_poll_base_ms: + description: "Base poll interval ms (default: 30000)" + required: false + default: "30000" + type: string jobs: import_translations: @@ -57,4 +67,6 @@ jobs: START_OFFSET: ${{ 
github.event.inputs.start_offset }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} + PRETRANSLATE_TIMEOUT_MS: ${{ github.event.inputs.pretranslate_timeout_ms }} + PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index d29902ff5bf..881c2fbbc56 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -68,6 +68,14 @@ const startOffset = process.env.START_OFFSET ? parseInt(process.env.START_OFFSET, 10) : 0 +// Adaptive polling / timeout configuration (milliseconds) +const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS + ? parseInt(process.env.PRETRANSLATE_TIMEOUT_MS, 10) + : 6 * 60 * 60 * 1000 // default 6h +const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS + ? Math.max(5000, parseInt(process.env.PRETRANSLATE_POLL_BASE_MS, 10)) + : 30_000 // default 30s base (min clamped to 5s) + const existingPreTranslationId = process.env.PRETRANSLATION_ID || "" // Parse GitHub repository from env (format: "owner/repo") @@ -81,6 +89,8 @@ console.log(`[DEBUG] - Base branch: ${baseBranch}`) console.log(`[DEBUG] - File limit: ${fileLimit}`) console.log(`[DEBUG] - Start offset: ${startOffset}`) console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) +console.log(`[DEBUG] - Pretranslate timeout ms: ${pretranslateTimeoutMs}`) +console.log(`[DEBUG] - Pretranslate poll base ms: ${pretranslatePollBaseMs}`) if (existingPreTranslationId) { console.log( `[DEBUG] - Resuming from pre-translation ID: ${existingPreTranslationId}` @@ -753,41 +763,65 @@ const getPreTranslationStatus = async ( */ const awaitPreTranslationCompleted = async ( preTranslationId: string, - options?: { intervalMs?: number; timeoutMs?: number } + opts?: { timeoutMs?: number; baseIntervalMs?: number } ): Promise => { - const intervalMs = options?.intervalMs ?? 
10_000 - const timeoutMs = options?.timeoutMs ?? 30 /* min */ * 60 * 1000 - - return await new Promise((resolve, reject) => { - const timeout = setTimeout(() => { - reject(new Error("Timed out waiting for pre-translation to finish")) - }, timeoutMs) + const timeoutMs = opts?.timeoutMs ?? pretranslateTimeoutMs + const baseInterval = opts?.baseIntervalMs ?? pretranslatePollBaseMs + const start = Date.now() + let attempt = 0 + + const computeInterval = (elapsedMs: number): number => { + const minutes = elapsedMs / 60000 + if (minutes < 10) return baseInterval + if (minutes < 30) return Math.max(baseInterval * 2, 60_000) + if (minutes < 60) return Math.max(baseInterval * 4, 180_000) + return Math.max(baseInterval * 10, 300_000) // cap at 5 min + } - const poll = async () => { - try { - const res = await getPreTranslationStatus(preTranslationId) - if (res.status !== "in_progress") { - clearTimeout(timeout) - if (res.status === "finished") { - resolve(res) - } else { - reject( - new Error( - `Pre-translation ended with unexpected status: ${res.status}` - ) - ) - } - } else { - setTimeout(poll, intervalMs) - } - } catch (err) { - clearTimeout(timeout) - reject(err) + // Bounded loop: terminates once elapsed exceeds timeoutMs + while (Date.now() - start <= timeoutMs) { + const elapsed = Date.now() - start + attempt++ + let res: CrowdinPreTranslateResponse + try { + res = await getPreTranslationStatus(preTranslationId) + } catch (e) { + // transient fetch errors: log + continue within timeout window + const nextWait = computeInterval(elapsed) + console.warn( + `[PRE-TRANSLATE][POLL] Error on attempt ${attempt}: ${(e as Error).message}. 
Retrying in ${nextWait}ms.` + ) + await delay(nextWait) + continue + } + if (res.status !== "in_progress") { + if (res.status === "finished") { + console.log( + `[PRE-TRANSLATE][POLL] Completed after ${attempt} attempts; elapsed ${Math.round( + (Date.now() - start) / 60000 + )}m.` + ) + return res } + throw new Error( + `Pre-translation ended with unexpected status: ${res.status}` + ) } - - void poll() - }) + const nextWait = computeInterval(elapsed) + const progressPct = res.progress ?? 0 + console.log( + `[PRE-TRANSLATE][POLL] attempt=${attempt} progress=${progressPct}% elapsed=${Math.round( + elapsed / 60000 + )}m nextWait=${nextWait}ms` + ) + await delay(nextWait) + } + const finalElapsed = Date.now() - start + throw new Error( + `Timed out waiting for pre-translation (elapsed ${Math.round( + finalElapsed / 60000 + )}m)` + ) } /** From 7918f6ba59ca9a1841b46109842cae5c998cb572 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 26 Nov 2025 18:45:45 -0300 Subject: [PATCH 04/99] feat(i18n): add qa_check, workflow inputs add QA polish pass via Crowdin ai_prompt override; workflow inputs for pre_translate and qa_check prompts --- .github/workflows/crowdin-ai-import.yml | 12 +++++++ src/scripts/i18n/main.ts | 42 +++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 18e343b7850..0f4c8d77062 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -37,6 +37,16 @@ on: required: false default: "30000" type: string + pre_translate_prompt_id: + description: "AI prompt ID for pre_translate (default: 326942)" + required: false + default: "326942" + type: string + qa_prompt_id: + description: "AI prompt ID for qa_check (default: 168592)" + required: false + default: "168592" + type: string jobs: import_translations: @@ -69,4 +79,6 @@ jobs: BASE_BRANCH: ${{ 
github.event.inputs.base_branch }} PRETRANSLATE_TIMEOUT_MS: ${{ github.event.inputs.pretranslate_timeout_ms }} PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} + PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} + QA_PROMPT_ID: ${{ github.event.inputs.qa_prompt_id }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 881c2fbbc56..ff6d87d6587 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -104,6 +104,7 @@ const env = { jsonRoot: "src/intl/en", mdRoot: "public/content", preTranslatePromptId: 326942, + qaPromptId: Number.parseInt(process.env.QA_PROMPT_ID || "168592"), allCrowdinCodes: targetLanguages, baseBranch, } @@ -667,7 +668,8 @@ const postFileToStorage = async (fileBuffer: Buffer, fileName: string) => { const postApplyPreTranslation = async ( fileIds: number[], - languageIds?: string[] + languageIds?: string[], + aiPromptIdOverride?: number ): Promise => { const url = new URL( `https://api.crowdin.com/api/v2/projects/${env.projectId}/pre-translations` @@ -683,7 +685,10 @@ const postApplyPreTranslation = async ( languageIds: languageIds || env.allCrowdinCodes, // ["es-EM"], // TODO: All languages fileIds, method: "ai", - aiPromptId: env.preTranslatePromptId, + aiPromptId: + typeof aiPromptIdOverride === "number" + ? 
aiPromptIdOverride + : env.preTranslatePromptId, }), }) @@ -1490,6 +1495,39 @@ async function main(options?: { allLangs: boolean }) { JSON.stringify(preTranslateJobCompletedResponse, null, 2) ) + // Optional QA polish pass using Crowdin AI with qa_check prompt + console.log(`\n[QA-CHECK] ========== Requesting AI QA-Polish ==========`) + console.log(`[QA-CHECK] Using AI Prompt ID:`, env.qaPromptId) + const qaApplyResponse = await postApplyPreTranslation( + preTranslateJobCompletedResponse.attributes.fileIds, + preTranslateJobCompletedResponse.attributes.languageIds, + env.qaPromptId + ) + console.log( + `[QA-CHECK] ✓ QA job created with ID: ${qaApplyResponse.identifier}` + ) + console.log(`[QA-CHECK] Initial status:`, qaApplyResponse.status) + + console.log(`\n[QA-CHECK] Waiting for QA job to complete...`) + const qaCompletedResponse = await awaitPreTranslationCompleted( + qaApplyResponse.identifier + ) + if (qaCompletedResponse.status !== "finished") { + console.error( + "[QA-CHECK] ❌ QA check did not finish successfully. Full response:", + qaCompletedResponse + ) + throw new Error( + `QA check ended with unexpected status: ${qaCompletedResponse.status}` + ) + } + console.log(`[QA-CHECK] ✓ QA job completed successfully!`) + console.log(`[QA-CHECK] Progress: ${qaCompletedResponse.progress}%`) + console.log( + `[QA-CHECK] Full response:`, + JSON.stringify(qaCompletedResponse, null, 2) + ) + const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes // Build mapping for commit phase. Prefer processed mapping (includes newly added files); fall back to existing Crowdin snapshot for any missed IDs. 
From ed59f5444e4008d365238f4382d9373ded51a524 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:20:30 -0300 Subject: [PATCH 05/99] patch: revert QA prompt to pre_translate endpoint --- src/scripts/i18n/main.ts | 46 ++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index ff6d87d6587..5d173bf46e9 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -103,7 +103,9 @@ const env = { ghRepo, jsonRoot: "src/intl/en", mdRoot: "public/content", - preTranslatePromptId: 326942, + preTranslatePromptId: Number.parseInt( + process.env.PRE_TRANSLATE_PROMPT_ID || "326942" + ), qaPromptId: Number.parseInt(process.env.QA_PROMPT_ID || "168592"), allCrowdinCodes: targetLanguages, baseBranch, @@ -1495,38 +1497,22 @@ async function main(options?: { allLangs: boolean }) { JSON.stringify(preTranslateJobCompletedResponse, null, 2) ) - // Optional QA polish pass using Crowdin AI with qa_check prompt - console.log(`\n[QA-CHECK] ========== Requesting AI QA-Polish ==========`) - console.log(`[QA-CHECK] Using AI Prompt ID:`, env.qaPromptId) - const qaApplyResponse = await postApplyPreTranslation( - preTranslateJobCompletedResponse.attributes.fileIds, - preTranslateJobCompletedResponse.attributes.languageIds, - env.qaPromptId - ) - console.log( - `[QA-CHECK] ✓ QA job created with ID: ${qaApplyResponse.identifier}` - ) - console.log(`[QA-CHECK] Initial status:`, qaApplyResponse.status) - - console.log(`\n[QA-CHECK] Waiting for QA job to complete...`) - const qaCompletedResponse = await awaitPreTranslationCompleted( - qaApplyResponse.identifier - ) - if (qaCompletedResponse.status !== "finished") { - console.error( - "[QA-CHECK] ❌ QA check did not finish successfully. 
Full response:", - qaCompletedResponse + // Optional QA: Crowdin AI Prompt Completions (qa_check) — placeholder until completions wiring + console.log(`\n[QA-CHECK] ========== AI QA via Prompt Completions ==========`) + const crowdinUserId = process.env.CROWDIN_USER_ID + if (!crowdinUserId) { + console.log( + `[QA-CHECK] Skipping QA: missing env CROWDIN_USER_ID required for completions API` ) - throw new Error( - `QA check ended with unexpected status: ${qaCompletedResponse.status}` + } else { + console.log( + `[QA-CHECK] Ready to request completions with qa_prompt_id=${env.qaPromptId} for files:`, + preTranslateJobCompletedResponse.attributes.fileIds + ) + console.log( + `[QA-CHECK] TODO: Implement completions POST /users/{userId}/ai/prompts/{aiPromptId}/completions with stringIds per file/language.` ) } - console.log(`[QA-CHECK] ✓ QA job completed successfully!`) - console.log(`[QA-CHECK] Progress: ${qaCompletedResponse.progress}%`) - console.log( - `[QA-CHECK] Full response:`, - JSON.stringify(qaCompletedResponse, null, 2) - ) const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes From aaf41e20f97517816757f6f946b61028748a8827 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:37:48 -0300 Subject: [PATCH 06/99] feat: add Crowdin QA completions + env cleanup Switch QA to AI Prompt Completions (qa_check) Resolve user id via GET /api/v2/user (no secret needed) Remove CROWDIN_USER_ID from workflow env Read PRE_TRANSLATE_PROMPT_ID from env Add QA summary to PR body Tidy main.ts by removing unused env references --- src/scripts/i18n/main.ts | 243 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 228 insertions(+), 15 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 5d173bf46e9..63df1dd5889 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -177,6 +177,182 @@ const fetchWithRetry = async ( throw new Error("fetchWithRetry: 
exhausted retries") } +// --- Crowdin AI Completions (qa_check) helpers --- +type QaCompletionRequest = { + projectId: number + sourceLanguageId: string + targetLanguageId: string + stringIds: number[] +} +type QaCompletionJob = { + id: string + status: "in_progress" | "finished" | string + progress?: number +} +type QaIssue = { + fileId: number + stringId: number + severity: "error" | "warning" | "info" + title: string + details?: string +} + +const resolveCrowdinUserId = async (): Promise => { + const url = new URL("https://api.crowdin.com/api/v2/user") + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`resolveCrowdinUserId (${res.status}): ${text}`) + } + const json = await res.json() + const id = String(json.data?.id || json.id) + if (!id) throw new Error("Failed to resolve Crowdin user id from /users/me") + return id +} + +const listStringIdsForFile = async (fileId: number): Promise => { + const url = new URL( + `https://api.crowdin.com/api/v2/projects/${env.projectId}/strings` + ) + url.searchParams.set("fileId", String(fileId)) + url.searchParams.set("limit", "500") + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`listStringIdsForFile (${res.status}): ${text}`) + } + const json = await res.json() + type StringItem = { data: { id: number } } + const items: StringItem[] = json.data || [] + const ids: number[] = items.map((d) => d.data.id) + return ids +} + +const postQaCompletions = async ( + qaPromptId: number, + payload: QaCompletionRequest +): Promise => { + const userId = await resolveCrowdinUserId() + if (!userId) throw new Error("CROWDIN_USER_ID env missing for completions") + const url = new URL( + `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${qaPromptId}/completions` + ) + const res = await fetch(url.toString(), { + 
method: "POST", + headers: { ...crowdinBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ resources: payload }), + }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`postQaCompletions (${res.status}): ${text}`) + } + const json = await res.json() + return json.data as QaCompletionJob +} + +const getQaCompletion = async ( + completionId: string +): Promise => { + const userId = await resolveCrowdinUserId() + const url = new URL( + `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/completions/${completionId}` + ) + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`getQaCompletion (${res.status}): ${text}`) + } + const json = await res.json() + return json.data as QaCompletionJob +} + +const awaitQaCompletion = async ( + completionId: string, + timeoutMs = pretranslateTimeoutMs, + baseIntervalMs = pretranslatePollBaseMs +): Promise => { + const start = Date.now() + let attempt = 0 + const computeInterval = (elapsedMs: number): number => { + const minutes = elapsedMs / 60000 + if (minutes < 10) return baseIntervalMs + if (minutes < 30) return Math.max(baseIntervalMs * 2, 60_000) + if (minutes < 60) return Math.max(baseIntervalMs * 4, 180_000) + return Math.max(baseIntervalMs * 10, 300_000) + } + while (Date.now() - start <= timeoutMs) { + attempt++ + const elapsed = Date.now() - start + let job: QaCompletionJob + try { + job = await getQaCompletion(completionId) + } catch (e) { + const wait = computeInterval(elapsed) + console.warn( + `[QA-CHECK][POLL] Error on attempt ${attempt}: ${(e as Error).message}. Waiting ${wait}ms.` + ) + await delay(wait) + continue + } + if (job.status !== "in_progress") return job + const wait = computeInterval(elapsed) + console.log( + `[QA-CHECK][POLL] attempt=${attempt} progress=${job.progress ?? 
0}% nextWait=${wait}ms` + ) + await delay(wait) + } + throw new Error("Timed out awaiting QA completion") +} + +const downloadQaCompletionResult = async ( + completionId: string +): Promise => { + const userId = await resolveCrowdinUserId() + const url = new URL( + `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/completions/${completionId}/download` + ) + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`downloadQaCompletionResult (${res.status}): ${text}`) + } + // Assume JSON structure containing issues; adjust as per actual response + const arrayBuffer = await res.arrayBuffer() + const text = Buffer.from(arrayBuffer).toString("utf-8") + try { + const parsed = JSON.parse(text) + const issues: QaIssue[] = parsed.issues || parsed.data || [] + return issues + } catch { + // If plain text, return empty and attach raw for summary + return [] + } +} + +const summarizeQaIssues = ( + issues: QaIssue[], + fileIdToPath: Record, + lang: string +): string => { + if (!issues.length) return `No QA issues detected for ${lang}.` + const counts = { error: 0, warning: 0, info: 0 } + for (const i of issues) { + const sev = i.severity + if (sev === "error" || sev === "warning" || sev === "info") { + counts[sev]++ + } + } + const top = issues.slice(0, 10) + const lines = [ + `QA for ${lang}: ${counts.error} errors, ${counts.warning} warnings, ${counts.info} info`, + ] + for (const i of top) { + const path = fileIdToPath[i.fileId] || `fileId=${i.fileId}` + lines.push(`- [${i.severity}] ${path} string=${i.stringId} — ${i.title}`) + } + return lines.join("\n") +} + /** * Get English files with pagination, allowing limit + offset. 
* GitHub Search API caps `per_page` at 100; we fetch pages until @@ -1136,7 +1312,11 @@ const putCommitFile = async ( } } -const postPullRequest = async (head: string, base = env.baseBranch) => { +const postPullRequest = async ( + head: string, + base = env.baseBranch, + bodyText?: string +) => { const url = new URL( `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/pulls` ) @@ -1145,7 +1325,7 @@ const postPullRequest = async (head: string, base = env.baseBranch) => { title: "i18n: automated Crowdin translation import", head, base, - body: "Automated Crowdin translation import", + body: bodyText || "Automated Crowdin translation import", } const res = await fetchWithRetry(url.toString(), { @@ -1305,6 +1485,8 @@ async function main(options?: { allLangs: boolean }) { allCrowdinCodes: env.allCrowdinCodes, }) + // Crowdin user id is fetched on-demand when calling completions API + // Check if resuming from existing pre-translation if (existingPreTranslationId) { console.log( @@ -1497,21 +1679,49 @@ async function main(options?: { allLangs: boolean }) { JSON.stringify(preTranslateJobCompletedResponse, null, 2) ) - // Optional QA: Crowdin AI Prompt Completions (qa_check) — placeholder until completions wiring + // QA via Crowdin AI Prompt Completions (qa_check) console.log(`\n[QA-CHECK] ========== AI QA via Prompt Completions ==========`) - const crowdinUserId = process.env.CROWDIN_USER_ID - if (!crowdinUserId) { - console.log( - `[QA-CHECK] Skipping QA: missing env CROWDIN_USER_ID required for completions API` - ) - } else { - console.log( - `[QA-CHECK] Ready to request completions with qa_prompt_id=${env.qaPromptId} for files:`, - preTranslateJobCompletedResponse.attributes.fileIds - ) + const qaSummaries: string[] = [] + const { languageIds: qaLanguageIds, fileIds: qaFileIds } = + preTranslateJobCompletedResponse.attributes + // Build stringId lists per file + const fileStringMap: Record = {} + for (const fid of qaFileIds) { + try { + 
fileStringMap[fid] = await listStringIdsForFile(fid) + } catch (e) { + console.warn(`[QA-CHECK] Failed listing strings for fileId=${fid}:`, e) + fileStringMap[fid] = [] + } + } + // Use project source language from repo (assume en-US or en) — map from i18n config + const sourceLanguageId = "en" + // For each language, request a completion over all strings of the selected files + for (const lang of qaLanguageIds) { + const allStringIds = Object.values(fileStringMap).flat() + if (!allStringIds.length) { + console.log(`[QA-CHECK] No strings found to QA for ${lang}`) + continue + } console.log( - `[QA-CHECK] TODO: Implement completions POST /users/{userId}/ai/prompts/{aiPromptId}/completions with stringIds per file/language.` + `[QA-CHECK] Posting completions for ${lang} with ${allStringIds.length} strings` ) + const job = await postQaCompletions(env.qaPromptId, { + projectId: env.projectId, + sourceLanguageId, + targetLanguageId: lang, + stringIds: allStringIds, + }) + const finished = await awaitQaCompletion(job.id) + if (finished.status !== "finished") { + console.warn( + `[QA-CHECK] Completion status=${finished.status} for ${lang}` + ) + continue + } + const issues = await downloadQaCompletionResult(job.id) + const summary = summarizeQaIssues(issues, processedFileIdToPath, lang) + qaSummaries.push(summary) } const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes @@ -1629,7 +1839,10 @@ async function main(options?: { allLangs: boolean }) { console.log(`[PR] Head branch: ${branch}`) console.log(`[PR] Base branch: ${env.baseBranch}`) - const pr = await postPullRequest(branch, env.baseBranch) + const prBody = qaSummaries.length + ? `Automated Crowdin translation import\n\nQA Summary:\n\n${qaSummaries.join("\n\n")}` + : "Automated Crowdin translation import" + const pr = await postPullRequest(branch, env.baseBranch, prBody) console.log(`\n[SUCCESS] ========== Translation import complete! 
==========`) console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) From be02aa7f337c32cb9ba3868f74ee11c626093ada Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:50:35 -0300 Subject: [PATCH 07/99] debug: QA check endpoint --- src/scripts/i18n/main.ts | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 63df1dd5889..2c6280a8520 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -237,6 +237,8 @@ const postQaCompletions = async ( const url = new URL( `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${qaPromptId}/completions` ) + console.log(`[QA-CHECK][DEBUG] POST ${url.toString()}`) + console.log(`[QA-CHECK][DEBUG] Payload:`, JSON.stringify(payload, null, 2)) const res = await fetch(url.toString(), { method: "POST", headers: { ...crowdinBearerHeaders, "Content-Type": "application/json" }, @@ -244,6 +246,12 @@ const postQaCompletions = async ( }) if (!res.ok) { const text = await res.text().catch(() => "") + if (res.status === 403) { + throw new Error( + `QA completions endpoint not accessible (403). ` + + `This may require Crowdin Enterprise or AI credits. 
URL: ${url.toString()} Raw: ${text}` + ) + } throw new Error(`postQaCompletions (${res.status}): ${text}`) } const json = await res.json() @@ -1706,12 +1714,22 @@ async function main(options?: { allLangs: boolean }) { console.log( `[QA-CHECK] Posting completions for ${lang} with ${allStringIds.length} strings` ) - const job = await postQaCompletions(env.qaPromptId, { - projectId: env.projectId, - sourceLanguageId, - targetLanguageId: lang, - stringIds: allStringIds, - }) + let job: QaCompletionJob | undefined + try { + job = await postQaCompletions(env.qaPromptId, { + projectId: env.projectId, + sourceLanguageId, + targetLanguageId: lang, + stringIds: allStringIds, + }) + } catch (e) { + const msg = String((e as Error).message || e) + console.warn(`[QA-CHECK] Skipping QA for ${lang}: ${msg}`) + qaSummaries.push( + `QA for ${lang}: skipped (token lacks AI completions scope).` + ) + continue + } const finished = await awaitQaCompletion(job.id) if (finished.status !== "finished") { console.warn( From 837d6fd8b6a15f5c7a0c743c68c9e619c1f38445 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 27 Nov 2025 15:15:17 -0300 Subject: [PATCH 08/99] debug: completions check --- src/scripts/i18n/main.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 2c6280a8520..6258d6bb03e 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -237,15 +237,18 @@ const postQaCompletions = async ( const url = new URL( `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${qaPromptId}/completions` ) + const bodyPayload = { resources: payload } console.log(`[QA-CHECK][DEBUG] POST ${url.toString()}`) - console.log(`[QA-CHECK][DEBUG] Payload:`, JSON.stringify(payload, null, 2)) + console.log(`[QA-CHECK][DEBUG] Body:`, JSON.stringify(bodyPayload, null, 2)) const res = await fetch(url.toString(), { method: "POST", headers: { 
...crowdinBearerHeaders, "Content-Type": "application/json" }, - body: JSON.stringify({ resources: payload }), + body: JSON.stringify(bodyPayload), }) + console.log(`[QA-CHECK][DEBUG] Response status: ${res.status}`) if (!res.ok) { const text = await res.text().catch(() => "") + console.log(`[QA-CHECK][DEBUG] Error response:`, text) if (res.status === 403) { throw new Error( `QA completions endpoint not accessible (403). ` + @@ -255,6 +258,10 @@ const postQaCompletions = async ( throw new Error(`postQaCompletions (${res.status}): ${text}`) } const json = await res.json() + console.log( + `[QA-CHECK][DEBUG] Success response:`, + JSON.stringify(json, null, 2) + ) return json.data as QaCompletionJob } From f54fc5476950e92e2fdf480fb8bfc9ec6ce4c466 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 27 Nov 2025 15:28:54 -0300 Subject: [PATCH 09/99] fix: chunk requests by file, 500 at a time --- src/scripts/i18n/main.ts | 110 ++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 30 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 6258d6bb03e..be81e5f4178 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1711,42 +1711,92 @@ async function main(options?: { allLangs: boolean }) { } // Use project source language from repo (assume en-US or en) — map from i18n config const sourceLanguageId = "en" - // For each language, request a completion over all strings of the selected files + const MAX_STRINGS_PER_REQUEST = 500 + // For each language, run QA per file (naturally batches and ties issues to specific files) for (const lang of qaLanguageIds) { - const allStringIds = Object.values(fileStringMap).flat() - if (!allStringIds.length) { - console.log(`[QA-CHECK] No strings found to QA for ${lang}`) - continue - } console.log( - `[QA-CHECK] Posting completions for ${lang} with ${allStringIds.length} strings` + `[QA-CHECK] Running QA for ${lang} across 
${qaFileIds.length} files` ) - let job: QaCompletionJob | undefined - try { - job = await postQaCompletions(env.qaPromptId, { - projectId: env.projectId, - sourceLanguageId, - targetLanguageId: lang, - stringIds: allStringIds, - }) - } catch (e) { - const msg = String((e as Error).message || e) - console.warn(`[QA-CHECK] Skipping QA for ${lang}: ${msg}`) - qaSummaries.push( - `QA for ${lang}: skipped (token lacks AI completions scope).` + const allIssues: QaIssue[] = [] + let skipped = false + + for (const fid of qaFileIds) { + const stringIds = fileStringMap[fid] || [] + if (!stringIds.length) { + console.log(`[QA-CHECK] Skipping fileId=${fid} (no strings)`) + continue + } + + console.log( + `[QA-CHECK] QA for ${lang} fileId=${fid} (${stringIds.length} strings)` ) - continue + + // Chunk large files to stay within API limits + const chunks = + stringIds.length > MAX_STRINGS_PER_REQUEST + ? Array.from( + { length: Math.ceil(stringIds.length / MAX_STRINGS_PER_REQUEST) }, + (_, i) => + stringIds.slice( + i * MAX_STRINGS_PER_REQUEST, + (i + 1) * MAX_STRINGS_PER_REQUEST + ) + ) + : [stringIds] + + for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) { + const chunk = chunks[chunkIdx] + if (chunks.length > 1) { + console.log( + `[QA-CHECK] Chunk ${chunkIdx + 1}/${chunks.length} (${chunk.length} strings)` + ) + } + + let job: QaCompletionJob | undefined + try { + job = await postQaCompletions(env.qaPromptId, { + projectId: env.projectId, + sourceLanguageId, + targetLanguageId: lang, + stringIds: chunk, + }) + } catch (e) { + const msg = String((e as Error).message || e) + console.warn( + `[QA-CHECK] Failed for fileId=${fid} chunk ${chunkIdx + 1}: ${msg}` + ) + if (msg.includes("403")) { + // If 403, skip entire language (endpoint not accessible) + qaSummaries.push( + `QA for ${lang}: skipped (endpoint not accessible - may require Enterprise or AI credits).` + ) + skipped = true + break + } + continue + } + + const finished = await awaitQaCompletion(job.id) + if 
(finished.status !== "finished") { + console.warn( + `[QA-CHECK] Completion for fileId=${fid} chunk ${chunkIdx + 1} status=${finished.status}` + ) + continue + } + const issues = await downloadQaCompletionResult(job.id) + allIssues.push(...issues) + } + + if (skipped) break } - const finished = await awaitQaCompletion(job.id) - if (finished.status !== "finished") { - console.warn( - `[QA-CHECK] Completion status=${finished.status} for ${lang}` - ) - continue + + if ( + !skipped && + (allIssues.length > 0 || Object.keys(fileStringMap).length > 0) + ) { + const summary = summarizeQaIssues(allIssues, processedFileIdToPath, lang) + qaSummaries.push(summary) } - const issues = await downloadQaCompletionResult(job.id) - const summary = summarizeQaIssues(issues, processedFileIdToPath, lang) - qaSummaries.push(summary) } const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes From e6b3fa046b462be7efda9bd9bb131adf74e4fb73 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:18:19 -0300 Subject: [PATCH 10/99] refactor: modularize script --- src/scripts/i18n/config.ts | 113 ++ src/scripts/i18n/lib/crowdin/build.ts | 76 + src/scripts/i18n/lib/crowdin/files.ts | 404 +++++ src/scripts/i18n/lib/crowdin/pre-translate.ts | 170 ++ .../i18n/lib/crowdin/qa-completions.ts | 225 +++ src/scripts/i18n/lib/github/branches.ts | 84 + src/scripts/i18n/lib/github/commits.ts | 173 ++ src/scripts/i18n/lib/github/files.ts | 132 ++ src/scripts/i18n/lib/github/pull-requests.ts | 47 + src/scripts/i18n/{ => lib}/types.ts | 0 src/scripts/i18n/lib/utils/fetch.ts | 66 + src/scripts/i18n/lib/utils/mapping.ts | 18 + src/scripts/i18n/main.ts | 1500 ++--------------- src/scripts/i18n/post_import_sanitize.ts | 4 +- 14 files changed, 1605 insertions(+), 1407 deletions(-) create mode 100644 src/scripts/i18n/config.ts create mode 100644 src/scripts/i18n/lib/crowdin/build.ts create mode 100644 
src/scripts/i18n/lib/crowdin/files.ts create mode 100644 src/scripts/i18n/lib/crowdin/pre-translate.ts create mode 100644 src/scripts/i18n/lib/crowdin/qa-completions.ts create mode 100644 src/scripts/i18n/lib/github/branches.ts create mode 100644 src/scripts/i18n/lib/github/commits.ts create mode 100644 src/scripts/i18n/lib/github/files.ts create mode 100644 src/scripts/i18n/lib/github/pull-requests.ts rename src/scripts/i18n/{ => lib}/types.ts (100%) create mode 100644 src/scripts/i18n/lib/utils/fetch.ts create mode 100644 src/scripts/i18n/lib/utils/mapping.ts diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts new file mode 100644 index 00000000000..5af77ba4621 --- /dev/null +++ b/src/scripts/i18n/config.ts @@ -0,0 +1,113 @@ +import * as dotenv from "dotenv" + +import i18nConfig from "../../../i18n.config.json" + +dotenv.config({ path: ".env.local" }) + +// Language code mapping +export const crowdinToInternalCodeMapping: Record = + i18nConfig.reduce( + (acc, { crowdinCode, code }) => { + acc[crowdinCode] = code + return acc + }, + {} as Record + ) + +// GitHub API configuration +const gitHubApiKey = process.env.I18N_GITHUB_API_KEY || "" +if (!gitHubApiKey) { + console.error("[ERROR] Missing I18N_GITHUB_API_KEY environment variable") + console.error( + "[ERROR] Please set I18N_GITHUB_API_KEY in your .env.local file" + ) + throw new Error("No GitHub API Key found (I18N_GITHUB_API_KEY)") +} +console.log("[DEBUG] GitHub API key found ✓") + +export const gitHubBearerHeaders = { + Authorization: `Bearer ${gitHubApiKey}`, + Accept: "application/vnd.github.v3+json", +} + +// Crowdin API configuration +const crowdinApiKey = process.env.I18N_CROWDIN_API_KEY || "" +if (!crowdinApiKey) { + console.error("[ERROR] Missing I18N_CROWDIN_API_KEY environment variable") + console.error( + "[ERROR] Please set I18N_CROWDIN_API_KEY in your .env.local file" + ) + throw new Error("No Crowdin API Key found (I18N_CROWDIN_API_KEY)") +} +console.log("[DEBUG] Crowdin API 
key found ✓") + +export const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } + +// Parse environment variables with defaults +const targetLanguages = process.env.TARGET_LANGUAGES + ? process.env.TARGET_LANGUAGES.split(",").map((lang) => lang.trim()) + : ["es-EM"] + +const baseBranch = process.env.BASE_BRANCH || "dev" + +const fileLimit = process.env.FILE_LIMIT + ? parseInt(process.env.FILE_LIMIT, 10) + : 100 + +const startOffset = process.env.START_OFFSET + ? parseInt(process.env.START_OFFSET, 10) + : 0 + +// Adaptive polling / timeout configuration (milliseconds) +const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS + ? parseInt(process.env.PRETRANSLATE_TIMEOUT_MS, 10) + : 6 * 60 * 60 * 1000 // default 6h + +const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS + ? Math.max(5000, parseInt(process.env.PRETRANSLATE_POLL_BASE_MS, 10)) + : 30_000 // default 30s base (min clamped to 5s) + +const existingPreTranslationId = process.env.PRETRANSLATION_ID || "" + +// Parse GitHub repository from env (format: "owner/repo") +const githubRepo = + process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website" +const [ghOrganization, ghRepo] = githubRepo.split("/") + +console.log("[DEBUG] Configuration:") +console.log(`[DEBUG] - Target languages: ${targetLanguages.join(", ")}`) +console.log(`[DEBUG] - Base branch: ${baseBranch}`) +console.log(`[DEBUG] - File limit: ${fileLimit}`) +console.log(`[DEBUG] - Start offset: ${startOffset}`) +console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) +console.log(`[DEBUG] - Pretranslate timeout ms: ${pretranslateTimeoutMs}`) +console.log(`[DEBUG] - Pretranslate poll base ms: ${pretranslatePollBaseMs}`) +if (existingPreTranslationId) { + console.log( + `[DEBUG] - Resuming from pre-translation ID: ${existingPreTranslationId}` + ) +} + +// Main configuration object +export const config = { + projectId: 834930, + ghOrganization, + ghRepo, + jsonRoot: "src/intl/en", + mdRoot: 
"public/content", + preTranslatePromptId: Number.parseInt( + process.env.PRE_TRANSLATE_PROMPT_ID || "326942" + ), + qaPromptId: Number.parseInt(process.env.QA_PROMPT_ID || "168592"), + allCrowdinCodes: targetLanguages, + baseBranch, + fileLimit, + startOffset, + pretranslateTimeoutMs, + pretranslatePollBaseMs, + existingPreTranslationId, +} + +// Constants +export const CROWDIN_API_BASE_URL = "https://api.crowdin.com/api/v2" +export const MAX_STRINGS_PER_REQUEST = 500 diff --git a/src/scripts/i18n/lib/crowdin/build.ts b/src/scripts/i18n/lib/crowdin/build.ts new file mode 100644 index 00000000000..cdd49d35fff --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/build.ts @@ -0,0 +1,76 @@ +// Crowdin build and download operations + +import { + config, + CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" +import type { BuildProjectFileTranslationResponse } from "../types" + +/** + * Build a project file translation for a specific language + * + * @param fileId - The Crowdin file ID + * @param targetLanguageId - The target language ID + * @param projectId - The Crowdin project ID (defaults to config) + * @returns Build response with download URL + */ +export const postBuildProjectFileTranslation = async ( + fileId: number, + targetLanguageId: string, + projectId = config.projectId +): Promise => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${projectId}/translations/builds/files/${fileId}` + ) + + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify({ targetLanguageId }), + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error( + `Crowdin postBuildProjectFileTranslation failed (${res.status}): ${body}` + ) + } + + type JsonResponse = { data: BuildProjectFileTranslationResponse } + const json: JsonResponse = await res.json() + 
console.log("Built file:", json.data) + return json.data +} + +/** + * Download a built file from Crowdin + * + * @param downloadUrl - The download URL from the build response + * @returns Buffer containing the file contents + */ +export const getBuiltFile = async ( + downloadUrl: string +): Promise<{ buffer: Buffer }> => { + try { + const res = await fetch(downloadUrl) + + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`Failed to download built file (${res.status}): ${body}`) + } + + const arrayBuffer = await res.arrayBuffer() + const buffer = Buffer.from(arrayBuffer) + + return { buffer } + } catch (error) { + console.error("getBuiltFile error:", error) + throw error + } +} diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts new file mode 100644 index 00000000000..9c93b2d0fe5 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -0,0 +1,404 @@ +// Crowdin file operations + +import { + config, + CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" +import type { + CrowdinAddFileResponse, + CrowdinFileData, + GitHubCrowdinFileMetadata, +} from "../types" + +/** + * Get all files in the Crowdin project + */ +export const getCrowdinProjectFiles = async (): Promise => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` + ) + url.searchParams.set("limit", "500") + + console.log(`[DEBUG] Fetching Crowdin project files from: ${url.toString()}`) + + try { + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + + if (!res.ok) { + console.warn(`[ERROR] Crowdin API response not OK: ${res.status}`) + const body = await res.text().catch(() => "") + console.error(`[ERROR] Response body:`, body) + throw new Error( + `Crowdin getCrowdinProjectFiles failed (${res.status}): ${body}` + ) + } + + type JsonResponse = { data: { data: CrowdinFileData }[] } + const json: JsonResponse = await res.json() + + const mappedData = 
json.data.map(({ data }) => data) + + console.log( + `[DEBUG] Successfully fetched ${mappedData.length} Crowdin files` + ) + console.log(`[DEBUG] First Crowdin file:`, mappedData[0]) + return mappedData + } catch (error) { + console.error(`[ERROR] Failed to fetch Crowdin project files:`, error) + process.exit(1) + } +} + +/** + * Find a Crowdin file matching a GitHub file + */ +export const findCrowdinFile = ( + targetFile: GitHubCrowdinFileMetadata, + crowdinFiles: CrowdinFileData[] +): CrowdinFileData => { + console.log( + `[DEBUG] Looking for Crowdin file matching: ${targetFile.filePath}` + ) + console.log(`[DEBUG] Target file name: ${targetFile["Crowdin-API-FileName"]}`) + + // Log first few Crowdin files for comparison + console.log(`[DEBUG] Total Crowdin files found: ${crowdinFiles.length}`) + console.log( + `[DEBUG] First 3 Crowdin file paths:`, + crowdinFiles.slice(0, 3).map((f) => f.path) + ) + + const found = crowdinFiles.find(({ path }) => + path.endsWith(targetFile.filePath) + ) + + if (!found) { + console.error( + `[ERROR] No matching Crowdin project file found for: ${targetFile.filePath}` + ) + console.error( + `[ERROR] Available Crowdin file paths:`, + crowdinFiles.map((f) => f.path) + ) + throw new Error( + `No matching Crowdin project file found for: ${targetFile.filePath}` + ) + } + + console.log( + `[DEBUG] Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` + ) + return found +} + +/** + * Unhides all hidden strings in a Crowdin file. + * Hidden strings (often marked as duplicates) cannot be translated. + * This function makes them visible so they can be processed by pre-translation. 
+ */ +export const unhideStringsInFile = async (fileId: number): Promise => { + console.log(`[UNHIDE] Checking for hidden strings in fileId=${fileId}`) + + // Get all strings from the file + const listUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings?fileId=${fileId}&limit=500` + + try { + const listRes = await fetch(listUrl, { headers: crowdinBearerHeaders }) + if (!listRes.ok) { + const text = await listRes.text().catch(() => "") + console.warn( + `[UNHIDE] Failed to list strings for fileId=${fileId}: ${text}` + ) + return 0 + } + + const listJson = await listRes.json() + const strings = listJson.data || [] + + let unhiddenCount = 0 + + for (const item of strings) { + const stringId = item.data.id + const isHidden = item.data.isHidden + + if (!isHidden) continue + + // Unhide the string using PATCH + const patchUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings/${stringId}` + + try { + const patchRes = await fetch(patchUrl, { + method: "PATCH", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify([ + { + op: "replace", + path: "/isHidden", + value: false, + }, + ]), + }) + + if (patchRes.ok) { + unhiddenCount++ + } else { + const text = await patchRes.text().catch(() => "") + console.warn(`[UNHIDE] Failed to unhide string ${stringId}: ${text}`) + } + } catch (err) { + console.warn(`[UNHIDE] Error unhiding string ${stringId}:`, err) + } + } + + if (unhiddenCount > 0) { + console.log( + `[UNHIDE] ✓ Unhidden ${unhiddenCount} strings in fileId=${fileId}` + ) + } else { + console.log(`[UNHIDE] No hidden strings found in fileId=${fileId}`) + } + + return unhiddenCount + } catch (error) { + console.error(`[UNHIDE] Error processing fileId=${fileId}:`, error) + return 0 + } +} + +/** + * Lists all Crowdin directories in the project. 
+ */ +export const getCrowdinProjectDirectories = async (): Promise< + { id: number; name: string; directoryId?: number }[] +> => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/directories` + ) + url.searchParams.set("limit", "500") + + console.log(`[DEBUG] Fetching Crowdin directories: ${url.toString()}`) + + try { + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error( + `Crowdin getCrowdinProjectDirectories failed (${res.status}): ${body}` + ) + } + type DirJson = { + data: { data: { id: number; name: string; directoryId?: number } }[] + } + const json: DirJson = await res.json() + const dirs = json.data.map(({ data }) => data) + console.log(`[DEBUG] Loaded ${dirs.length} directories`) + return dirs + } catch (error) { + console.error("[ERROR] getCrowdinProjectDirectories:", error) + throw error + } +} + +/** + * Creates a single Crowdin directory (one segment). Parent may be undefined for root. + */ +export const postCrowdinDirectory = async ( + name: string, + parentDirectoryId?: number +): Promise => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/directories` + ) + + const body: Record = { name } + if (parentDirectoryId) body.directoryId = parentDirectoryId + + console.log( + `[DEBUG] Creating directory segment "${name}" parent=${parentDirectoryId ?? 
"ROOT"}` + ) + + try { + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify(body), + }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + // 409 = already exists race condition + throw new Error( + `Crowdin postCrowdinDirectory failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { data: { id: number } } + const json: JsonResponse = await res.json() + console.log(`[DEBUG] Created directory id=${json.data.id} name="${name}"`) + return json.data.id + } catch (error) { + console.error("[ERROR] postCrowdinDirectory:", error) + throw error + } +} + +/** + * Ensures a nested path of directories exists. + * Example path: "public/content/community/events/organizing" + * Returns the final (deepest) directory id. + * + * - Splits path on "/" ignoring empty segments. + * - Reuses existing segments (matched by name + parent). + * - Creates missing segments sequentially. 
+ */ +export const createCrowdinDirectory = async ( + fullPath: string +): Promise => { + if (!fullPath || typeof fullPath !== "string") { + throw new Error("createCrowdinDirectory: path must be a non-empty string") + } + console.log(`[DEBUG] Ensuring Crowdin directory path: "${fullPath}"`) + + const segments = fullPath + .split("/") + .map((s) => s.trim()) + .filter(Boolean) + if (!segments.length) throw new Error("No valid path segments") + + const invalidChars = /[\\:*?"<>|]/ // Disallowed per Crowdin docs for directory name (exclude forward slash which is path separator) + for (const segment of segments) { + if (invalidChars.test(segment)) { + throw new Error( + `createCrowdinDirectory: segment "${segment}" contains invalid characters in path "${fullPath}"` + ) + } + } + + // Load existing directories once + const existing = await getCrowdinProjectDirectories() + + // Build quick lookup: parentId|name -> id (root parentId = 0 sentinel) + const key = (parentId: number | undefined, name: string) => + `${parentId || 0}|${name}` + + const directoryIndex = new Map() + for (const dir of existing) { + directoryIndex.set(key(dir.directoryId, dir.name), dir.id) + } + + let currentParentId: number | undefined + for (const segment of segments) { + const k = key(currentParentId, segment) + let dirId = directoryIndex.get(k) + if (dirId) { + console.log( + `[DEBUG] Reusing existing directory "${segment}" id=${dirId} parent=${currentParentId ?? 
"ROOT"}` + ) + currentParentId = dirId + continue + } + // Create + dirId = await postCrowdinDirectory(segment, currentParentId) + directoryIndex.set(k, dirId) + currentParentId = dirId + } + + if (!currentParentId) + throw new Error("Failed to resolve final directory id (unexpected)") + + console.log( + `[DEBUG] Final directory id for path "${fullPath}" = ${currentParentId}` + ) + return currentParentId +} + +/** + * Upload a file to Crowdin storage + */ +export const postFileToStorage = async ( + fileBuffer: Buffer, + fileName: string +) => { + const url = new URL(`${CROWDIN_API_BASE_URL}/storages`) + + try { + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + // Crowdin expects raw bytes for storages endpoint; use octet-stream. + "Content-Type": "application/octet-stream", + "Crowdin-API-FileName": fileName, + }, + body: fileBuffer, + }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error( + `Crowdin postFileToStorage failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { + data: { + id: number + fileName: string + } + } + const json: JsonResponse = await res.json() + console.log("Uploaded storage:", json.data) + return json.data + } catch (error) { + console.error("postFileToStorage error:", error) + throw error + } +} + +/** + * Add a file to Crowdin project + */ +export const postCrowdinFile = async ( + storageId: number, + name: string, + dir: string +): Promise => { + const directoryId = await createCrowdinDirectory(dir) + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` + ) + + try { + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify({ storageId, name, directoryId }), + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new 
Error(`Crowdin postCrowdinFile failed (${res.status}): ${body}`) + } + + type JsonResponse = { data: CrowdinAddFileResponse } + const json: JsonResponse = await res.json() + console.log("Updated file:", json.data) + return json.data + } catch (error) { + console.error(error) + process.exit(1) + } +} diff --git a/src/scripts/i18n/lib/crowdin/pre-translate.ts b/src/scripts/i18n/lib/crowdin/pre-translate.ts new file mode 100644 index 00000000000..62b312b6217 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/pre-translate.ts @@ -0,0 +1,170 @@ +// Crowdin pre-translation operations + +import { + config, + CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" +import type { CrowdinPreTranslateResponse } from "../types" + +const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) + +/** + * Apply pre-translation to files + */ +export const postApplyPreTranslation = async ( + fileIds: number[], + languageIds?: string[], + aiPromptIdOverride?: number +): Promise => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/pre-translations` + ) + try { + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + languageIds: languageIds || config.allCrowdinCodes, + fileIds, + method: "ai", + aiPromptId: + typeof aiPromptIdOverride === "number" + ? 
aiPromptIdOverride + : config.preTranslatePromptId, + }), + }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error( + `Crowdin postApplyPreTranslation failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { + data: CrowdinPreTranslateResponse + } + const json: JsonResponse = await res.json() + + return json.data + } catch (error) { + console.error("postApplyPreTranslation error:", error) + throw error + } +} + +/** + * Get pre-translation status + */ +export const getPreTranslationStatus = async ( + preTranslationId: string +): Promise => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/pre-translations/${preTranslationId}` + ) + try { + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error( + `Crowdin getPreTranslationStatus failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { + data: CrowdinPreTranslateResponse + } + const json: JsonResponse = await res.json() + + return json.data + } catch (error) { + console.error("getPreTranslationStatus error:", error) + throw error + } +} + +/** + * Polls Crowdin for the status of a pre-translation job and resolves when it finishes. + * + * This function repeatedly calls `getPreTranslationStatus` for the given + * pre-translation ID until the job is no longer in progress. It uses adaptive + * polling intervals based on elapsed time and will abort with an error if the operation + * does not complete within the configured timeout. + * + * @param preTranslationId - The identifier of the Crowdin pre-translation job to monitor. + * @param opts - Optional configuration for timeout and base polling interval + * + * @returns A promise that resolves with the final CrowdinPreTranslateResponse when the + * job status becomes "finished". 
+ * + * @throws {Error} If the wait times out + * @throws {Error} If the pre-translation completes with an unexpected status + * @throws {Error} If an error is thrown while fetching the pre-translation status + */ +export const awaitPreTranslationCompleted = async ( + preTranslationId: string, + opts?: { timeoutMs?: number; baseIntervalMs?: number } +): Promise => { + const timeoutMs = opts?.timeoutMs ?? config.pretranslateTimeoutMs + const baseInterval = opts?.baseIntervalMs ?? config.pretranslatePollBaseMs + const start = Date.now() + let attempt = 0 + + const computeInterval = (elapsedMs: number): number => { + const minutes = elapsedMs / 60000 + if (minutes < 10) return baseInterval + if (minutes < 30) return Math.max(baseInterval * 2, 60_000) + if (minutes < 60) return Math.max(baseInterval * 4, 180_000) + return Math.max(baseInterval * 10, 300_000) // cap at 5 min + } + + // Bounded loop: terminates once elapsed exceeds timeoutMs + while (Date.now() - start <= timeoutMs) { + const elapsed = Date.now() - start + attempt++ + let res: CrowdinPreTranslateResponse + try { + res = await getPreTranslationStatus(preTranslationId) + } catch (e) { + // transient fetch errors: log + continue within timeout window + const nextWait = computeInterval(elapsed) + console.warn( + `[PRE-TRANSLATE][POLL] Error on attempt ${attempt}: ${(e as Error).message}. Retrying in ${nextWait}ms.` + ) + await delay(nextWait) + continue + } + if (res.status !== "in_progress") { + if (res.status === "finished") { + console.log( + `[PRE-TRANSLATE][POLL] Completed after ${attempt} attempts; elapsed ${Math.round( + (Date.now() - start) / 60000 + )}m.` + ) + return res + } + throw new Error( + `Pre-translation ended with unexpected status: ${res.status}` + ) + } + const nextWait = computeInterval(elapsed) + const progressPct = res.progress ?? 
0 + console.log( + `[PRE-TRANSLATE][POLL] attempt=${attempt} progress=${progressPct}% elapsed=${Math.round( + elapsed / 60000 + )}m nextWait=${nextWait}ms` + ) + await delay(nextWait) + } + const finalElapsed = Date.now() - start + throw new Error( + `Timed out waiting for pre-translation (elapsed ${Math.round( + finalElapsed / 60000 + )}m)` + ) +} diff --git a/src/scripts/i18n/lib/crowdin/qa-completions.ts b/src/scripts/i18n/lib/crowdin/qa-completions.ts new file mode 100644 index 00000000000..0f1d9206f6f --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/qa-completions.ts @@ -0,0 +1,225 @@ +// Crowdin AI Completions (QA check) helpers + +import { + config, + CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" + +export type QaCompletionRequest = { + projectId: number + sourceLanguageId: string + targetLanguageId: string + stringIds: number[] +} + +export type QaCompletionJob = { + id: string + status: "in_progress" | "finished" | string + progress?: number +} + +export type QaIssue = { + fileId: number + stringId: number + severity: "error" | "warning" | "info" + title: string + details?: string +} + +const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) + +/** + * Resolve the Crowdin user ID from the API + */ +export const resolveCrowdinUserId = async (): Promise => { + const url = new URL(`${CROWDIN_API_BASE_URL}/user`) + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`resolveCrowdinUserId (${res.status}): ${text}`) + } + const json = await res.json() + const id = String(json.data?.id || json.id) + if (!id) throw new Error("Failed to resolve Crowdin user id from /user") + return id +} + +/** + * List all string IDs for a given file + */ +export const listStringIdsForFile = async ( + fileId: number +): Promise => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings` + ) + 
url.searchParams.set("fileId", String(fileId)) + url.searchParams.set("limit", "500") + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`listStringIdsForFile (${res.status}): ${text}`) + } + const json = await res.json() + type StringItem = { data: { id: number } } + const items: StringItem[] = json.data || [] + const ids: number[] = items.map((d) => d.data.id) + return ids +} + +/** + * Post QA completions request + */ +export const postQaCompletions = async ( + qaPromptId: number, + payload: QaCompletionRequest +): Promise => { + const userId = await resolveCrowdinUserId() + if (!userId) + throw new Error("Failed to resolve Crowdin user ID for completions") + const url = new URL( + `${CROWDIN_API_BASE_URL}/users/${userId}/ai/prompts/${qaPromptId}/completions` + ) + const bodyPayload = { resources: payload } + console.log(`[QA-CHECK][DEBUG] POST ${url.toString()}`) + console.log(`[QA-CHECK][DEBUG] Body:`, JSON.stringify(bodyPayload, null, 2)) + const res = await fetch(url.toString(), { + method: "POST", + headers: { ...crowdinBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify(bodyPayload), + }) + console.log(`[QA-CHECK][DEBUG] Response status: ${res.status}`) + if (!res.ok) { + const text = await res.text().catch(() => "") + console.log(`[QA-CHECK][DEBUG] Error response:`, text) + if (res.status === 403) { + throw new Error( + `QA completions endpoint not accessible (403). ` + + `This may require Crowdin Enterprise or AI credits. 
URL: ${url.toString()} Raw: ${text}` + ) + } + throw new Error(`postQaCompletions (${res.status}): ${text}`) + } + const json = await res.json() + console.log( + `[QA-CHECK][DEBUG] Success response:`, + JSON.stringify(json, null, 2) + ) + return json.data as QaCompletionJob +} + +/** + * Get QA completion status + */ +export const getQaCompletion = async ( + completionId: string +): Promise => { + const userId = await resolveCrowdinUserId() + const url = new URL( + `${CROWDIN_API_BASE_URL}/users/${userId}/ai/prompts/completions/${completionId}` + ) + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`getQaCompletion (${res.status}): ${text}`) + } + const json = await res.json() + return json.data as QaCompletionJob +} + +/** + * Poll QA completion until finished with adaptive intervals + */ +export const awaitQaCompletion = async ( + completionId: string, + timeoutMs = config.pretranslateTimeoutMs, + baseIntervalMs = config.pretranslatePollBaseMs +): Promise => { + const start = Date.now() + let attempt = 0 + const computeInterval = (elapsedMs: number): number => { + const minutes = elapsedMs / 60000 + if (minutes < 10) return baseIntervalMs + if (minutes < 30) return Math.max(baseIntervalMs * 2, 60_000) + if (minutes < 60) return Math.max(baseIntervalMs * 4, 180_000) + return Math.max(baseIntervalMs * 10, 300_000) + } + while (Date.now() - start <= timeoutMs) { + attempt++ + const elapsed = Date.now() - start + let job: QaCompletionJob + try { + job = await getQaCompletion(completionId) + } catch (e) { + const wait = computeInterval(elapsed) + console.warn( + `[QA-CHECK][POLL] Error on attempt ${attempt}: ${(e as Error).message}. Waiting ${wait}ms.` + ) + await delay(wait) + continue + } + if (job.status !== "in_progress") return job + const wait = computeInterval(elapsed) + console.log( + `[QA-CHECK][POLL] attempt=${attempt} progress=${job.progress ?? 
0}% nextWait=${wait}ms` + ) + await delay(wait) + } + throw new Error("Timed out awaiting QA completion") +} + +/** + * Download QA completion results + */ +export const downloadQaCompletionResult = async ( + completionId: string +): Promise => { + const userId = await resolveCrowdinUserId() + const url = new URL( + `${CROWDIN_API_BASE_URL}/users/${userId}/ai/prompts/completions/${completionId}/download` + ) + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`downloadQaCompletionResult (${res.status}): ${text}`) + } + // Assume JSON structure containing issues; adjust as per actual response + const arrayBuffer = await res.arrayBuffer() + const text = Buffer.from(arrayBuffer).toString("utf-8") + try { + const parsed = JSON.parse(text) + const issues: QaIssue[] = parsed.issues || parsed.data || [] + return issues + } catch { + // If plain text, return empty and attach raw for summary + return [] + } +} + +/** + * Summarize QA issues for PR body + */ +export const summarizeQaIssues = ( + issues: QaIssue[], + fileIdToPath: Record, + lang: string +): string => { + if (!issues.length) return `No QA issues detected for ${lang}.` + const counts = { error: 0, warning: 0, info: 0 } + for (const i of issues) { + const sev = i.severity + if (sev === "error" || sev === "warning" || sev === "info") { + counts[sev]++ + } + } + const top = issues.slice(0, 10) + const lines = [ + `QA for ${lang}: ${counts.error} errors, ${counts.warning} warnings, ${counts.info} info`, + ] + for (const i of top) { + const path = fileIdToPath[i.fileId] || `fileId=${i.fileId}` + lines.push(`- [${i.severity}] ${path} string=${i.stringId} — ${i.title}`) + } + return lines.join("\n") +} diff --git a/src/scripts/i18n/lib/github/branches.ts b/src/scripts/i18n/lib/github/branches.ts new file mode 100644 index 00000000000..905174fd409 --- /dev/null +++ b/src/scripts/i18n/lib/github/branches.ts @@ 
-0,0 +1,84 @@ +// GitHub branch operations + +import { config, gitHubBearerHeaders } from "../../config" +import type { BranchDetailsResponse, BranchObject } from "../types" +import { fetchWithRetry } from "../utils/fetch" + +/** + * Retrieves the Git object for a branch from the GitHub API + * + * @param branch - The branch name to look up (e.g., "main" or "dev") + * @returns A promise that resolves to the BranchObject containing sha, type, and url + */ +export const getBranchObject = async ( + branch: string +): Promise => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/git/ref/heads/${branch}` + ) + + const res = await fetchWithRetry(url.toString(), { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error(`GitHub getBranchObject (${res.status}): ${body}`) + } + + type JsonResponse = BranchDetailsResponse + const json: JsonResponse = await res.json() + return json.object +} + +/** + * Generate a branch name based on current timestamp + */ +export const createBranchName = () => { + const ts = new Date().toISOString().replace(/\..*$/, "").replace(/[:]/g, "-") + return "i18n/import/" + ts +} + +/** + * Create a new branch from a base branch + * + * @param ref - The base branch reference (defaults to config.baseBranch) + * @returns Object containing the new branch name and SHA + */ +export const postCreateBranchFrom = async (ref = config.baseBranch) => { + const { sha } = await getBranchObject(ref) + const branch = createBranchName() + + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/git/refs` + ) + + try { + console.log( + `[DEBUG] Creating branch from base="${ref}" sha=${sha} -> new branch="${branch}"` + ) + const res = await fetchWithRetry(url.toString(), { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: 
JSON.stringify({ ref: `refs/heads/${branch}`, sha }), + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + console.error( + `[ERROR] Failed to create branch. URL=${url.toString()} status=${res.status}` + ) + throw new Error(`GitHub createBranchFrom (${res.status}): ${body}`) + } + + return { branch, sha } + } catch (error) { + console.error(error) + process.exit(1) + } +} diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/i18n/lib/github/commits.ts new file mode 100644 index 00000000000..f4aca2e8ef7 --- /dev/null +++ b/src/scripts/i18n/lib/github/commits.ts @@ -0,0 +1,173 @@ +// GitHub commit operations + +import { config, gitHubBearerHeaders } from "../../config" +import { fetchWithRetry } from "../utils/fetch" + +const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) + +/** + * Get the destination path for a translated file + * + * @param crowdinFilePath - The Crowdin file path (e.g., src/intl/en/page-foo.json) + * @param internalLanguageCode - The internal language code + * @returns The destination path in the repository + */ +export const getDestinationFromPath = ( + crowdinFilePath: string, + internalLanguageCode: string +) => { + const normalized = crowdinFilePath.replace(/^\//, "") + const isJson = normalized.toLowerCase().endsWith(".json") + const isMarkdown = normalized.toLowerCase().endsWith(".md") + + let destinationPath = normalized + + if (isJson) { + // JSON: src/intl/en/*.json -> src/intl//*.json + if (normalized.startsWith("src/intl/en/")) { + destinationPath = normalized.replace( + /^src\/intl\/en\//, + `src/intl/${internalLanguageCode}/` + ) + } else if (normalized.startsWith("src/intl/")) { + // Fallback: if for some reason "en" segment is missing, inject lang after src/intl/ + const parts = normalized.split("/") + // parts: [src, intl, ...] 
+ parts.splice(2, 0, internalLanguageCode) + destinationPath = parts.join("/") + } + } else if (isMarkdown) { + // Markdown: public/content//index.md -> public/content/translations///index.md + if (normalized.startsWith("public/content/")) { + const rel = normalized.replace(/^public\/content\//, "") + // If already inside translations/, avoid duplicating; rewrite to current lang + const relParts = rel.split("/").filter(Boolean) + if (relParts[0] === "translations") { + // Drop existing translations// + const rest = relParts.slice(2).join("/") + destinationPath = `public/content/translations/${internalLanguageCode}/${rest}` + } else { + destinationPath = `public/content/translations/${internalLanguageCode}/${rel}` + } + } + } + + console.log( + `[DEBUG] Destination mapping: ${crowdinFilePath} -> ${destinationPath} (lang=${internalLanguageCode})` + ) + return destinationPath +} + +/** + * Get the SHA of a file at a specific path + * + * @param path - The file path in the repository + * @param branch - The branch name + * @returns Object containing the file SHA + */ +export const getPathSha = async (path: string, branch: string) => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${path}?ref=${branch}` + ) + + const res = await fetchWithRetry(url.toString(), { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error(`GitHub getPathSha (${res.status}): ${body}`) + } + + type JsonResponse = { sha: string } + const { sha }: JsonResponse = await res.json() + + return { sha } +} + +/** + * Commit a file to a GitHub branch with retry logic for conflicts + * + * @param buffer - The file contents as a Buffer + * @param destinationPath - The path in the repository + * @param branch - The branch name + * @param sha - Optional SHA for updating existing files + * @param attempt - Current retry attempt number + */ +export const 
putCommitFile = async ( + buffer: Buffer, + destinationPath: string, + branch: string, + sha?: string, + attempt = 0 +): Promise => { + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${destinationPath}` + + try { + // Use the buffer contents as base64-encoded content for the commit + const contentBase64 = buffer.toString("base64") + + const body = { + message: `update(i18n): ${destinationPath}`, + content: contentBase64, + branch, + } + + if (sha) body["sha"] = sha + + const res = await fetchWithRetry(url.toString(), { + method: "PUT", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }) + + if (res.status === 422) { + const { sha: fileSha } = await getPathSha(destinationPath, branch) + console.warn( + `[RETRY] 422 Unprocessable for ${destinationPath}. Retrying with existing SHA ${fileSha}` + ) + return await putCommitFile( + buffer, + destinationPath, + branch, + fileSha, + attempt + ) + } + + if (res.status === 409) { + if (attempt >= 5) { + const bodyText = await res.text().catch(() => "") + throw new Error( + `GitHub putCommitFile conflict persists after ${attempt} retries (${res.status}): ${bodyText}` + ) + } + const backoff = 500 * Math.pow(2, attempt) // 500ms, 1s, 2s, 4s, 8s + console.warn( + `[RETRY] 409 Conflict for ${destinationPath}. Attempt ${attempt + 1}. 
Waiting ${backoff}ms before retry.` + ) + await delay(backoff) + const { sha: latestSha } = await getPathSha(destinationPath, branch) + return await putCommitFile( + buffer, + destinationPath, + branch, + latestSha, + attempt + 1 + ) + } + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error(`GitHub putCommitFile (${res.status}): ${body}`) + } + } catch (error) { + console.error(error) + process.exit(1) + } +} diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts new file mode 100644 index 00000000000..1314df0b67c --- /dev/null +++ b/src/scripts/i18n/lib/github/files.ts @@ -0,0 +1,132 @@ +// GitHub file operations + +import { config, gitHubBearerHeaders } from "../../config" +import type { + ContentType, + GitHubCrowdinFileMetadata, + GitHubQueryResponseItem, +} from "../types" +import { fetchWithRetry } from "../utils/fetch" + +/** + * Get English files with pagination, allowing limit + offset. + * GitHub Search API caps `per_page` at 100; we fetch pages until + * we accumulate `offset + limit` items, then return the slice. 
+ */ +export const getAllEnglishFiles = async ( + limit = 100, + offset = 0 +): Promise => { + const ghSearchEndpointBase = "https://api.github.com/search/code" + const query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${config.mdRoot}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${config.jsonRoot}"` + + console.log(`[DEBUG] GitHub search query: ${query}`) + + const perPage = 100 + const needed = offset + limit + const collected: GitHubQueryResponseItem[] = [] + + let page = 1 + while (collected.length < needed) { + const url = new URL(ghSearchEndpointBase) + url.searchParams.set("q", query) + url.searchParams.set("per_page", perPage.toString()) + url.searchParams.set("page", page.toString()) + + console.log(`[DEBUG] Fetching search page ${page} ...`) + + try { + const res = await fetchWithRetry(url.toString(), { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + console.warn(`[ERROR] GitHub API response not OK: ${res.status}`) + const body = await res.text().catch(() => "") + console.error(`[ERROR] Response body:`, body) + throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) + } + + type JsonResponse = { items: GitHubQueryResponseItem[] } + const json: JsonResponse = await res.json() + + if (!json.items.length) { + console.log(`[DEBUG] No more results at page ${page}.`) + break + } + + collected.push(...json.items) + console.log(`[DEBUG] Collected ${collected.length} items so far.`) + + page += 1 + if (page > 10) { + // Safety cap: avoid excessive paging; typical search caps ~1000 results + console.warn( + `[WARN] Reached pagination safety cap at page ${page - 1}.` + ) + break + } + } catch (error) { + console.error(`[ERROR] Failed to get English files from GitHub:`, error) + process.exit(1) + } + } + + const sliced = collected.slice(offset, offset + limit) + console.log( + `[DEBUG] Returning ${sliced.length} files (offset=${offset}, limit=${limit})` + 
) + if (sliced.length) console.log(`[DEBUG] First GitHub file:`, sliced[0]) + return sliced +} + +/** + * Convert GitHub items to Crowdin file metadata + */ +export const getFileMetadata = async ( + items: GitHubQueryResponseItem[] +): Promise => { + if (!items.length) return [] + + const owner = items[0].repository.owner.login + const repo = items[0].repository.name + + const englishFileMetadata = items.map((item) => { + // https://raw.githubusercontent.com/:owner/:repo/:ref/:path + const download_url = `https://raw.githubusercontent.com/${owner}/${repo}/${config.baseBranch}/${item.path}` + const filePath = item.path + const filePathSplit = filePath.split("/") + const fileName = filePathSplit[filePathSplit.length - 1] + const contentType: ContentType = fileName?.endsWith(".json") + ? "application/json" + : "text/markdown" + + return { + "Crowdin-API-FileName": fileName, + filePath: filePath, + download_url: download_url, + "Content-Type": contentType, + } + }) + return englishFileMetadata +} + +/** + * Download a file from GitHub + */ +export const downloadGitHubFile = async ( + download_url: string +): Promise => { + try { + const res = await fetch(download_url) + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`Failed to download from GitHub (${res.status}): ${body}`) + } + const arrayBuffer = await res.arrayBuffer() + return Buffer.from(arrayBuffer) + } catch (error) { + console.error("downloadGitHubFile error:", error) + throw error + } +} diff --git a/src/scripts/i18n/lib/github/pull-requests.ts b/src/scripts/i18n/lib/github/pull-requests.ts new file mode 100644 index 00000000000..d62723db136 --- /dev/null +++ b/src/scripts/i18n/lib/github/pull-requests.ts @@ -0,0 +1,47 @@ +// GitHub pull request operations + +import { config, gitHubBearerHeaders } from "../../config" +import { fetchWithRetry } from "../utils/fetch" + +/** + * Create a pull request + * + * @param head - The head branch (source of changes) + * @param base - The 
base branch (target for merge, defaults to config.baseBranch) + * @param bodyText - Optional PR description text + * @returns The created pull request object + */ +export const postPullRequest = async ( + head: string, + base = config.baseBranch, + bodyText?: string +) => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/pulls` + ) + + const body = { + title: "i18n: automated Crowdin translation import", + head, + base, + body: bodyText || "Automated Crowdin translation import", + } + + const res = await fetchWithRetry(url.toString(), { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error(`Crowdin postPullRequest failed (${res.status}): ${body}`) + } + + const json = await res.json() + return json +} diff --git a/src/scripts/i18n/types.ts b/src/scripts/i18n/lib/types.ts similarity index 100% rename from src/scripts/i18n/types.ts rename to src/scripts/i18n/lib/types.ts diff --git a/src/scripts/i18n/lib/utils/fetch.ts b/src/scripts/i18n/lib/utils/fetch.ts new file mode 100644 index 00000000000..7191a531527 --- /dev/null +++ b/src/scripts/i18n/lib/utils/fetch.ts @@ -0,0 +1,66 @@ +// Utilities: resilient fetch with retry logic + +const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) + +export type RetryOptions = { + retries?: number + timeoutMs?: number + backoffMs?: number + retryOnStatuses?: number[] +} + +export const fetchWithRetry = async ( + url: string, + init?: RequestInit, + options?: RetryOptions +) => { + const retries = options?.retries ?? 3 + const timeoutMs = options?.timeoutMs ?? 30000 + const backoffMs = options?.backoffMs ?? 1000 + const retryOnStatuses = options?.retryOnStatuses ?? 
[ + 408, 429, 500, 502, 503, 504, + ] + + for (let attempt = 0; attempt <= retries; attempt++) { + const controller = new AbortController() + const id = setTimeout(() => controller.abort(), timeoutMs) + try { + const res = await fetch(url, { + ...(init || {}), + signal: controller.signal, + }) + clearTimeout(id) + if ( + !res.ok && + retryOnStatuses.includes(res.status) && + attempt < retries + ) { + const wait = backoffMs * Math.pow(2, attempt) + console.warn( + `[RETRY] ${url} -> ${res.status}. Attempt ${attempt + 1}/${retries}. Waiting ${wait}ms.` + ) + await delay(wait) + continue + } + return res + } catch (err: unknown) { + clearTimeout(id) + const errObj = err as { name?: string; code?: string } + const isAbort = errObj?.name === "AbortError" + const isConnectTimeout = errObj?.code === "UND_ERR_CONNECT_TIMEOUT" + if ((isAbort || isConnectTimeout) && attempt < retries) { + const wait = backoffMs * Math.pow(2, attempt) + console.warn( + `[RETRY] ${url} -> ${isAbort ? "AbortError" : errObj?.code}. Attempt ${ + attempt + 1 + }/${retries}. 
Waiting ${wait}ms.` + ) + await delay(wait) + continue + } + throw err + } + } + // Unreachable, but TS wants a return + throw new Error("fetchWithRetry: exhausted retries") +} diff --git a/src/scripts/i18n/lib/utils/mapping.ts b/src/scripts/i18n/lib/utils/mapping.ts new file mode 100644 index 00000000000..e4f543e942d --- /dev/null +++ b/src/scripts/i18n/lib/utils/mapping.ts @@ -0,0 +1,18 @@ +import { crowdinToInternalCodeMapping } from "../../config" + +/** + * Convert Crowdin language code to internal language code + */ +export function mapCrowdinCodeToInternal(crowdinCode: string): string { + return crowdinToInternalCodeMapping[crowdinCode] || crowdinCode +} + +/** + * Convert internal language code to Crowdin language code + */ +export function mapInternalCodeToCrowdin(internalCode: string): string { + const entry = Object.entries(crowdinToInternalCodeMapping).find( + ([, internal]) => internal === internalCode + ) + return entry ? entry[0] : internalCode +} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index be81e5f4178..fc5f96aa7bc 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1,1367 +1,56 @@ -/* eslint-disable import/order */ -import fs from "fs" - -import dotenv from "dotenv" - -import i18nConfig from "../../../i18n.config.json" - -import { runSanitizer } from "./post_import_sanitize" +import * as fs from "fs" + +import { + getBuiltFile, + postBuildProjectFileTranslation, +} from "./lib/crowdin/build" +// Crowdin operations +import { + findCrowdinFile, + getCrowdinProjectFiles, + postCrowdinFile, + postFileToStorage, + unhideStringsInFile, +} from "./lib/crowdin/files" +import { + awaitPreTranslationCompleted, + getPreTranslationStatus, + postApplyPreTranslation, +} from "./lib/crowdin/pre-translate" +import { + awaitQaCompletion, + downloadQaCompletionResult, + listStringIdsForFile, + postQaCompletions, + type QaCompletionJob, + type QaIssue, + summarizeQaIssues, +} from "./lib/crowdin/qa-completions" +import { 
postCreateBranchFrom } from "./lib/github/branches" +import { getDestinationFromPath, putCommitFile } from "./lib/github/commits" +// GitHub operations +import { + downloadGitHubFile, + getAllEnglishFiles, + getFileMetadata, +} from "./lib/github/files" +import { postPullRequest } from "./lib/github/pull-requests" import type { - BranchDetailsResponse, - BranchObject, - BuildProjectFileTranslationResponse, - ContentType, CrowdinAddFileResponse, CrowdinFileData, CrowdinPreTranslateResponse, - GitHubCrowdinFileMetadata, - GitHubQueryResponseItem, -} from "./types" - -dotenv.config({ path: ".env.local" }) - -const crowdinToInternalCodeMapping: Record = i18nConfig.reduce( - (acc, { crowdinCode, code }) => { - acc[crowdinCode] = code - return acc - }, - {} as Record -) - -const gitHubApiKey = process.env.I18N_GITHUB_API_KEY || "" -if (!gitHubApiKey) { - console.error("[ERROR] Missing I18N_GITHUB_API_KEY environment variable") - console.error( - "[ERROR] Please set I18N_GITHUB_API_KEY in your .env.local file" - ) - throw new Error("No GitHub API Key found (I18N_GITHUB_API_KEY)") -} -console.log("[DEBUG] GitHub API key found ✓") -const gitHubBearerHeaders = { - Authorization: `Bearer ${gitHubApiKey}`, - Accept: "application/vnd.github.v3+json", -} - -const crowdinApiKey = process.env.I18N_CROWDIN_API_KEY || "" -if (!crowdinApiKey) { - console.error("[ERROR] Missing I18N_CROWDIN_API_KEY environment variable") - console.error( - "[ERROR] Please set I18N_CROWDIN_API_KEY in your .env.local file" - ) - throw new Error("No Crowdin API Key found (I18N_CROWDIN_API_KEY)") -} -console.log("[DEBUG] Crowdin API key found ✓") -const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } - -// Parse environment variables with defaults -const targetLanguages = process.env.TARGET_LANGUAGES - ? process.env.TARGET_LANGUAGES.split(",").map((lang) => lang.trim()) - : ["es-EM"] - -const baseBranch = process.env.BASE_BRANCH || "dev" - -const fileLimit = process.env.FILE_LIMIT - ? 
parseInt(process.env.FILE_LIMIT, 10) - : 100 - -const startOffset = process.env.START_OFFSET - ? parseInt(process.env.START_OFFSET, 10) - : 0 - -// Adaptive polling / timeout configuration (milliseconds) -const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS - ? parseInt(process.env.PRETRANSLATE_TIMEOUT_MS, 10) - : 6 * 60 * 60 * 1000 // default 6h -const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS - ? Math.max(5000, parseInt(process.env.PRETRANSLATE_POLL_BASE_MS, 10)) - : 30_000 // default 30s base (min clamped to 5s) - -const existingPreTranslationId = process.env.PRETRANSLATION_ID || "" - -// Parse GitHub repository from env (format: "owner/repo") -const githubRepo = - process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website" -const [ghOrganization, ghRepo] = githubRepo.split("/") - -console.log("[DEBUG] Configuration:") -console.log(`[DEBUG] - Target languages: ${targetLanguages.join(", ")}`) -console.log(`[DEBUG] - Base branch: ${baseBranch}`) -console.log(`[DEBUG] - File limit: ${fileLimit}`) -console.log(`[DEBUG] - Start offset: ${startOffset}`) -console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) -console.log(`[DEBUG] - Pretranslate timeout ms: ${pretranslateTimeoutMs}`) -console.log(`[DEBUG] - Pretranslate poll base ms: ${pretranslatePollBaseMs}`) -if (existingPreTranslationId) { - console.log( - `[DEBUG] - Resuming from pre-translation ID: ${existingPreTranslationId}` - ) -} - -const env = { - projectId: 834930, - ghOrganization, - ghRepo, - jsonRoot: "src/intl/en", - mdRoot: "public/content", - preTranslatePromptId: Number.parseInt( - process.env.PRE_TRANSLATE_PROMPT_ID || "326942" - ), - qaPromptId: Number.parseInt(process.env.QA_PROMPT_ID || "168592"), - allCrowdinCodes: targetLanguages, - baseBranch, -} +} from "./lib/types" +// Utilities +import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" +import { config } from "./config" +import { MAX_STRINGS_PER_REQUEST } from "./config" +import { 
runSanitizer } from "./post_import_sanitize" -// --- Utilities: resilient fetch for GitHub calls --- const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) -type RetryOptions = { - retries?: number - timeoutMs?: number - backoffMs?: number - retryOnStatuses?: number[] -} - -const fetchWithRetry = async ( - url: string, - init?: RequestInit, - options?: RetryOptions -) => { - const retries = options?.retries ?? 3 - const timeoutMs = options?.timeoutMs ?? 30000 - const backoffMs = options?.backoffMs ?? 1000 - const retryOnStatuses = options?.retryOnStatuses ?? [ - 408, 429, 500, 502, 503, 504, - ] - - for (let attempt = 0; attempt <= retries; attempt++) { - const controller = new AbortController() - const id = setTimeout(() => controller.abort(), timeoutMs) - try { - const res = await fetch(url, { - ...(init || {}), - signal: controller.signal, - }) - clearTimeout(id) - if ( - !res.ok && - retryOnStatuses.includes(res.status) && - attempt < retries - ) { - const wait = backoffMs * Math.pow(2, attempt) - console.warn( - `[RETRY] ${url} -> ${res.status}. Attempt ${attempt + 1}/${retries}. Waiting ${wait}ms.` - ) - await delay(wait) - continue - } - return res - } catch (err: unknown) { - clearTimeout(id) - const errObj = err as { name?: string; code?: string } - const isAbort = errObj?.name === "AbortError" - const isConnectTimeout = errObj?.code === "UND_ERR_CONNECT_TIMEOUT" - if ((isAbort || isConnectTimeout) && attempt < retries) { - const wait = backoffMs * Math.pow(2, attempt) - console.warn( - `[RETRY] ${url} -> ${isAbort ? "AbortError" : errObj?.code}. Attempt ${ - attempt + 1 - }/${retries}. 
Waiting ${wait}ms.` - ) - await delay(wait) - continue - } - throw err - } - } - // Unreachable, but TS wants a return - throw new Error("fetchWithRetry: exhausted retries") -} - -// --- Crowdin AI Completions (qa_check) helpers --- -type QaCompletionRequest = { - projectId: number - sourceLanguageId: string - targetLanguageId: string - stringIds: number[] -} -type QaCompletionJob = { - id: string - status: "in_progress" | "finished" | string - progress?: number -} -type QaIssue = { - fileId: number - stringId: number - severity: "error" | "warning" | "info" - title: string - details?: string -} - -const resolveCrowdinUserId = async (): Promise => { - const url = new URL("https://api.crowdin.com/api/v2/user") - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`resolveCrowdinUserId (${res.status}): ${text}`) - } - const json = await res.json() - const id = String(json.data?.id || json.id) - if (!id) throw new Error("Failed to resolve Crowdin user id from /users/me") - return id -} - -const listStringIdsForFile = async (fileId: number): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/strings` - ) - url.searchParams.set("fileId", String(fileId)) - url.searchParams.set("limit", "500") - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`listStringIdsForFile (${res.status}): ${text}`) - } - const json = await res.json() - type StringItem = { data: { id: number } } - const items: StringItem[] = json.data || [] - const ids: number[] = items.map((d) => d.data.id) - return ids -} - -const postQaCompletions = async ( - qaPromptId: number, - payload: QaCompletionRequest -): Promise => { - const userId = await resolveCrowdinUserId() - if (!userId) throw new Error("CROWDIN_USER_ID env missing for completions") - const 
url = new URL( - `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${qaPromptId}/completions` - ) - const bodyPayload = { resources: payload } - console.log(`[QA-CHECK][DEBUG] POST ${url.toString()}`) - console.log(`[QA-CHECK][DEBUG] Body:`, JSON.stringify(bodyPayload, null, 2)) - const res = await fetch(url.toString(), { - method: "POST", - headers: { ...crowdinBearerHeaders, "Content-Type": "application/json" }, - body: JSON.stringify(bodyPayload), - }) - console.log(`[QA-CHECK][DEBUG] Response status: ${res.status}`) - if (!res.ok) { - const text = await res.text().catch(() => "") - console.log(`[QA-CHECK][DEBUG] Error response:`, text) - if (res.status === 403) { - throw new Error( - `QA completions endpoint not accessible (403). ` + - `This may require Crowdin Enterprise or AI credits. URL: ${url.toString()} Raw: ${text}` - ) - } - throw new Error(`postQaCompletions (${res.status}): ${text}`) - } - const json = await res.json() - console.log( - `[QA-CHECK][DEBUG] Success response:`, - JSON.stringify(json, null, 2) - ) - return json.data as QaCompletionJob -} - -const getQaCompletion = async ( - completionId: string -): Promise => { - const userId = await resolveCrowdinUserId() - const url = new URL( - `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/completions/${completionId}` - ) - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`getQaCompletion (${res.status}): ${text}`) - } - const json = await res.json() - return json.data as QaCompletionJob -} - -const awaitQaCompletion = async ( - completionId: string, - timeoutMs = pretranslateTimeoutMs, - baseIntervalMs = pretranslatePollBaseMs -): Promise => { - const start = Date.now() - let attempt = 0 - const computeInterval = (elapsedMs: number): number => { - const minutes = elapsedMs / 60000 - if (minutes < 10) return baseIntervalMs - if (minutes < 30) return Math.max(baseIntervalMs * 
2, 60_000) - if (minutes < 60) return Math.max(baseIntervalMs * 4, 180_000) - return Math.max(baseIntervalMs * 10, 300_000) - } - while (Date.now() - start <= timeoutMs) { - attempt++ - const elapsed = Date.now() - start - let job: QaCompletionJob - try { - job = await getQaCompletion(completionId) - } catch (e) { - const wait = computeInterval(elapsed) - console.warn( - `[QA-CHECK][POLL] Error on attempt ${attempt}: ${(e as Error).message}. Waiting ${wait}ms.` - ) - await delay(wait) - continue - } - if (job.status !== "in_progress") return job - const wait = computeInterval(elapsed) - console.log( - `[QA-CHECK][POLL] attempt=${attempt} progress=${job.progress ?? 0}% nextWait=${wait}ms` - ) - await delay(wait) - } - throw new Error("Timed out awaiting QA completion") -} - -const downloadQaCompletionResult = async ( - completionId: string -): Promise => { - const userId = await resolveCrowdinUserId() - const url = new URL( - `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/completions/${completionId}/download` - ) - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`downloadQaCompletionResult (${res.status}): ${text}`) - } - // Assume JSON structure containing issues; adjust as per actual response - const arrayBuffer = await res.arrayBuffer() - const text = Buffer.from(arrayBuffer).toString("utf-8") - try { - const parsed = JSON.parse(text) - const issues: QaIssue[] = parsed.issues || parsed.data || [] - return issues - } catch { - // If plain text, return empty and attach raw for summary - return [] - } -} - -const summarizeQaIssues = ( - issues: QaIssue[], - fileIdToPath: Record, - lang: string -): string => { - if (!issues.length) return `No QA issues detected for ${lang}.` - const counts = { error: 0, warning: 0, info: 0 } - for (const i of issues) { - const sev = i.severity - if (sev === "error" || sev === "warning" || sev === "info") { - 
counts[sev]++ - } - } - const top = issues.slice(0, 10) - const lines = [ - `QA for ${lang}: ${counts.error} errors, ${counts.warning} warnings, ${counts.info} info`, - ] - for (const i of top) { - const path = fileIdToPath[i.fileId] || `fileId=${i.fileId}` - lines.push(`- [${i.severity}] ${path} string=${i.stringId} — ${i.title}`) - } - return lines.join("\n") -} - -/** - * Get English files with pagination, allowing limit + offset. - * GitHub Search API caps `per_page` at 100; we fetch pages until - * we accumulate `offset + limit` items, then return the slice. - */ -const getAllEnglishFiles = async ( - limit = 100, - offset = 0 -): Promise => { - const ghSearchEndpointBase = "https://api.github.com/search/code" - const query = `repo:${env.ghOrganization}/${env.ghRepo} extension:md path:"${env.mdRoot}" -path:"${env.mdRoot}/translations" OR repo:${env.ghOrganization}/${env.ghRepo} extension:json path:"${env.jsonRoot}"` - - console.log(`[DEBUG] GitHub search query: ${query}`) - - const perPage = 100 - const needed = offset + limit - const collected: GitHubQueryResponseItem[] = [] - - let page = 1 - while (collected.length < needed) { - const url = new URL(ghSearchEndpointBase) - url.searchParams.set("q", query) - url.searchParams.set("per_page", perPage.toString()) - url.searchParams.set("page", page.toString()) - - console.log(`[DEBUG] Fetching search page ${page} ...`) - - try { - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - console.warn(`[ERROR] GitHub API response not OK: ${res.status}`) - const body = await res.text().catch(() => "") - console.error(`[ERROR] Response body:`, body) - throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) - } - - type JsonResponse = { items: GitHubQueryResponseItem[] } - const json: JsonResponse = await res.json() - - if (!json.items.length) { - console.log(`[DEBUG] No more results at page ${page}.`) - break - } - - collected.push(...json.items) - 
console.log(`[DEBUG] Collected ${collected.length} items so far.`) - - page += 1 - if (page > 10) { - // Safety cap: avoid excessive paging; typical search caps ~1000 results - console.warn( - `[WARN] Reached pagination safety cap at page ${page - 1}.` - ) - break - } - } catch (error) { - console.error(`[ERROR] Failed to get English files from GitHub:`, error) - process.exit(1) - } - } - - const sliced = collected.slice(offset, offset + limit) - console.log( - `[DEBUG] Returning ${sliced.length} files (offset=${offset}, limit=${limit})` - ) - if (sliced.length) console.log(`[DEBUG] First GitHub file:`, sliced[0]) - return sliced -} - -const getFileMetadata = async ( - items: GitHubQueryResponseItem[] -): Promise => { - if (!items.length) return [] - - const owner = items[0].repository.owner.login - const repo = items[0].repository.name - - const englishFileMetadata = items.map((item) => { - // https://raw.githubusercontent.com/:owner/:repo/:ref/:path - const download_url = `https://raw.githubusercontent.com/${owner}/${repo}/${env.baseBranch}/${item.path}` - const filePath = item.path - const filePathSplit = filePath.split("/") - const fileName = filePathSplit[filePathSplit.length - 1] - const contentType: ContentType = fileName?.endsWith(".json") - ? 
"application/json" - : "text/markdown" - - return { - "Crowdin-API-FileName": fileName, - filePath: filePath, - download_url: download_url, - "Content-Type": contentType, - } - }) - return englishFileMetadata -} - -const getCrowdinProjectFiles = async (): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/files` - ) - url.searchParams.set("limit", "500") - - console.log(`[DEBUG] Fetching Crowdin project files from: ${url.toString()}`) - - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - - if (!res.ok) { - console.warn(`[ERROR] Crowdin API response not OK: ${res.status}`) - const body = await res.text().catch(() => "") - console.error(`[ERROR] Response body:`, body) - throw new Error( - `Crowdin getCrowdinProjectFiles failed (${res.status}): ${body}` - ) - } - - type JsonResponse = { data: { data: CrowdinFileData }[] } - const json: JsonResponse = await res.json() - - const mappedData = json.data.map(({ data }) => data) - - console.log( - `[DEBUG] Successfully fetched ${mappedData.length} Crowdin files` - ) - console.log(`[DEBUG] First Crowdin file:`, mappedData[0]) - return mappedData - } catch (error) { - console.error(`[ERROR] Failed to fetch Crowdin project files:`, error) - process.exit(1) - } -} - -const findCrowdinFile = ( - targetFile: GitHubCrowdinFileMetadata, - crowdinFiles: CrowdinFileData[] -): CrowdinFileData => { - console.log( - `[DEBUG] Looking for Crowdin file matching: ${targetFile.filePath}` - ) - console.log(`[DEBUG] Target file name: ${targetFile["Crowdin-API-FileName"]}`) - - // Log first few Crowdin files for comparison - console.log(`[DEBUG] Total Crowdin files found: ${crowdinFiles.length}`) - console.log( - `[DEBUG] First 3 Crowdin file paths:`, - crowdinFiles.slice(0, 3).map((f) => f.path) - ) - - const found = crowdinFiles.find(({ path }) => - path.endsWith(targetFile.filePath) - ) - - if (!found) { - console.error( - `[ERROR] No matching Crowdin project 
file found for: ${targetFile.filePath}` - ) - console.error( - `[ERROR] Available Crowdin file paths:`, - crowdinFiles.map((f) => f.path) - ) - throw new Error( - `No matching Crowdin project file found for: ${targetFile.filePath}` - ) - } - - console.log( - `[DEBUG] Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` - ) - return found -} - -/** - * Unhides all hidden strings in a Crowdin file. - * Hidden strings (often marked as duplicates) cannot be translated. - * This function makes them visible so they can be processed by pre-translation. - */ -const unhideStringsInFile = async (fileId: number): Promise => { - console.log(`[UNHIDE] Checking for hidden strings in fileId=${fileId}`) - - // Get all strings from the file - const listUrl = `https://api.crowdin.com/api/v2/projects/${env.projectId}/strings?fileId=${fileId}&limit=500` - - try { - const listRes = await fetch(listUrl, { headers: crowdinBearerHeaders }) - if (!listRes.ok) { - const text = await listRes.text().catch(() => "") - console.warn( - `[UNHIDE] Failed to list strings for fileId=${fileId}: ${text}` - ) - return 0 - } - - const listJson = await listRes.json() - const strings = listJson.data || [] - - let unhiddenCount = 0 - - for (const item of strings) { - const stringId = item.data.id - const isHidden = item.data.isHidden - - if (!isHidden) continue - - // Unhide the string using PATCH - const patchUrl = `https://api.crowdin.com/api/v2/projects/${env.projectId}/strings/${stringId}` - - try { - const patchRes = await fetch(patchUrl, { - method: "PATCH", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify([ - { - op: "replace", - path: "/isHidden", - value: false, - }, - ]), - }) - - if (patchRes.ok) { - unhiddenCount++ - } else { - const text = await patchRes.text().catch(() => "") - console.warn(`[UNHIDE] Failed to unhide string ${stringId}: ${text}`) - } - } catch (err) { - console.warn(`[UNHIDE] Error unhiding string 
${stringId}:`, err) - } - } - - if (unhiddenCount > 0) { - console.log( - `[UNHIDE] ✓ Unhidden ${unhiddenCount} strings in fileId=${fileId}` - ) - } else { - console.log(`[UNHIDE] No hidden strings found in fileId=${fileId}`) - } - - return unhiddenCount - } catch (error) { - console.error(`[UNHIDE] Error processing fileId=${fileId}:`, error) - return 0 - } -} - -/** - * Lists all Crowdin directories in the project. - */ -const getCrowdinProjectDirectories = async (): Promise< - { id: number; name: string; directoryId?: number }[] -> => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/directories` - ) - url.searchParams.set("limit", "500") - - console.log(`[DEBUG] Fetching Crowdin directories: ${url.toString()}`) - - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error( - `Crowdin getCrowdinProjectDirectories failed (${res.status}): ${body}` - ) - } - type DirJson = { - data: { data: { id: number; name: string; directoryId?: number } }[] - } - const json: DirJson = await res.json() - const dirs = json.data.map(({ data }) => data) - console.log(`[DEBUG] Loaded ${dirs.length} directories`) - return dirs - } catch (error) { - console.error("[ERROR] getCrowdinProjectDirectories:", error) - throw error - } -} - -/** - * Creates a single Crowdin directory (one segment). Parent may be undefined for root. - */ -const postCrowdinDirectory = async ( - name: string, - parentDirectoryId?: number -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/directories` - ) - - const body: Record = { name } - if (parentDirectoryId) body.directoryId = parentDirectoryId - - console.log( - `[DEBUG] Creating directory segment "${name}" parent=${parentDirectoryId ?? 
"ROOT"}` - ) - - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify(body), - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - // 409 = already exists race condition - throw new Error( - `Crowdin postCrowdinDirectory failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { data: { id: number } } - const json: JsonResponse = await res.json() - console.log(`[DEBUG] Created directory id=${json.data.id} name="${name}"`) - return json.data.id - } catch (error) { - console.error("[ERROR] postCrowdinDirectory:", error) - throw error - } -} - /** - * Ensures a nested path of directories exists. - * Example path: "public/content/community/events/organizing" - * Returns the final (deepest) directory id. - * - * - Splits path on "/" ignoring empty segments. - * - Reuses existing segments (matched by name + parent). - * - Creates missing segments sequentially. 
+ * Build and commit translations after pre-translation completes */ -const createCrowdinDirectory = async (fullPath: string): Promise => { - if (!fullPath || typeof fullPath !== "string") { - throw new Error("createCrowdinDirectory: path must be a non-empty string") - } - console.log(`[DEBUG] Ensuring Crowdin directory path: "${fullPath}"`) - - const segments = fullPath - .split("/") - .map((s) => s.trim()) - .filter(Boolean) - if (!segments.length) throw new Error("No valid path segments") - - const invalidChars = /[\\:*?"<>|]/ // Disallowed per Crowdin docs for directory name (exclude forward slash which is path separator) - for (const segment of segments) { - if (invalidChars.test(segment)) { - throw new Error( - `createCrowdinDirectory: segment "${segment}" contains invalid characters in path "${fullPath}"` - ) - } - } - - // Load existing directories once - const existing = await getCrowdinProjectDirectories() - - // Build quick lookup: parentId|name -> id (root parentId = 0 sentinel) - const key = (parentId: number | undefined, name: string) => - `${parentId || 0}|${name}` - - const directoryIndex = new Map() - for (const dir of existing) { - directoryIndex.set(key(dir.directoryId, dir.name), dir.id) - } - - let currentParentId: number | undefined - for (const segment of segments) { - const k = key(currentParentId, segment) - let dirId = directoryIndex.get(k) - if (dirId) { - console.log( - `[DEBUG] Reusing existing directory "${segment}" id=${dirId} parent=${currentParentId ?? 
"ROOT"}` - ) - currentParentId = dirId - continue - } - // Create - dirId = await postCrowdinDirectory(segment, currentParentId) - directoryIndex.set(k, dirId) - currentParentId = dirId - } - - if (!currentParentId) - throw new Error("Failed to resolve final directory id (unexpected)") - - console.log( - `[DEBUG] Final directory id for path "${fullPath}" = ${currentParentId}` - ) - return currentParentId -} - -const postCrowdinFile = async ( - storageId: number, - name: string, - dir: string -): Promise => { - const directoryId = await createCrowdinDirectory(dir) - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/files` - ) - - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify({ storageId, name, directoryId }), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`Crowdin postCrowdinFile failed (${res.status}): ${body}`) - } - - type JsonResponse = { data: CrowdinAddFileResponse } - const json: JsonResponse = await res.json() - console.log("Updated file:", json.data) - return json.data - } catch (error) { - console.error(error) - process.exit(1) - } -} - -const downloadGitHubFile = async (download_url: string): Promise => { - try { - // const res = await fetch(download_url, { headers: gitHubBearerHeaders }) - const res = await fetch(download_url) - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error(`Failed to download from GitHub (${res.status}): ${body}`) - } - const arrayBuffer = await res.arrayBuffer() - return Buffer.from(arrayBuffer) - } catch (error) { - console.error("downloadGitHubFile error:", error) - throw error - } -} - -const postFileToStorage = async (fileBuffer: Buffer, fileName: string) => { - const url = new URL("https://api.crowdin.com/api/v2/storages") - - try { - const 
res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - // Crowdin expects raw bytes for storages endpoint; use octet-stream. - "Content-Type": "application/octet-stream", - "Crowdin-API-FileName": fileName, - }, - body: fileBuffer, - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin postFileToStorage failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: { - id: number - fileName: string - } - } - const json: JsonResponse = await res.json() - console.log("Uploaded storage:", json.data) - return json.data - } catch (error) { - console.error("postFileToStorage error:", error) - throw error - } -} - -const postApplyPreTranslation = async ( - fileIds: number[], - languageIds?: string[], - aiPromptIdOverride?: number -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/pre-translations` - ) - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - languageIds: languageIds || env.allCrowdinCodes, // ["es-EM"], // TODO: All languages - fileIds, - method: "ai", - aiPromptId: - typeof aiPromptIdOverride === "number" - ? 
aiPromptIdOverride - : env.preTranslatePromptId, - }), - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin postApplyPreTranslation failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: CrowdinPreTranslateResponse - } - const json: JsonResponse = await res.json() - - return json.data - } catch (error) { - console.error("postApplyPreTranslation error:", error) - throw error - } -} - -const getPreTranslationStatus = async ( - preTranslationId: string -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/pre-translations/${preTranslationId}` - ) - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin getPreTranslationStatus failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: CrowdinPreTranslateResponse - } - const json: JsonResponse = await res.json() - - return json.data - } catch (error) { - console.error("postApplyPreTranslation error:", error) - throw error - } -} - -/** - * Polls Crowdin for the status of a pre-translation job and resolves when it finishes. - * - * This function repeatedly calls `getPreTranslationStatus` for the given - * pre-translation ID until the job is no longer in progress. It polls at a - * fixed interval (10 seconds) and will abort with an error if the operation - * does not complete within the configured timeout (30 minutes). - * - * @param preTranslationId - The identifier of the Crowdin pre-translation job to monitor. - * - * @returns A promise that resolves with the final CrowdinPreTranslateResponse when the - * job status becomes "finished". - * - * @throws {Error} If the wait times out (after 30 minutes). - * @throws {Error} If the pre-translation completes with an unexpected status - * (i.e., any status other than "finished"). 
- * @throws {Error} If an error is thrown while fetching the pre-translation status - * (errors from `getPreTranslationStatus` are propagated). - * - * @remarks - * - Polling interval: 10,000 ms (10 seconds). - * - Timeout: 30 minutes. - * - * @example - * // Wait for a pre-translation to complete - * const result = await awaitPreTranslationCompleted("abc123") - */ -const awaitPreTranslationCompleted = async ( - preTranslationId: string, - opts?: { timeoutMs?: number; baseIntervalMs?: number } -): Promise => { - const timeoutMs = opts?.timeoutMs ?? pretranslateTimeoutMs - const baseInterval = opts?.baseIntervalMs ?? pretranslatePollBaseMs - const start = Date.now() - let attempt = 0 - - const computeInterval = (elapsedMs: number): number => { - const minutes = elapsedMs / 60000 - if (minutes < 10) return baseInterval - if (minutes < 30) return Math.max(baseInterval * 2, 60_000) - if (minutes < 60) return Math.max(baseInterval * 4, 180_000) - return Math.max(baseInterval * 10, 300_000) // cap at 5 min - } - - // Bounded loop: terminates once elapsed exceeds timeoutMs - while (Date.now() - start <= timeoutMs) { - const elapsed = Date.now() - start - attempt++ - let res: CrowdinPreTranslateResponse - try { - res = await getPreTranslationStatus(preTranslationId) - } catch (e) { - // transient fetch errors: log + continue within timeout window - const nextWait = computeInterval(elapsed) - console.warn( - `[PRE-TRANSLATE][POLL] Error on attempt ${attempt}: ${(e as Error).message}. Retrying in ${nextWait}ms.` - ) - await delay(nextWait) - continue - } - if (res.status !== "in_progress") { - if (res.status === "finished") { - console.log( - `[PRE-TRANSLATE][POLL] Completed after ${attempt} attempts; elapsed ${Math.round( - (Date.now() - start) / 60000 - )}m.` - ) - return res - } - throw new Error( - `Pre-translation ended with unexpected status: ${res.status}` - ) - } - const nextWait = computeInterval(elapsed) - const progressPct = res.progress ?? 
0 - console.log( - `[PRE-TRANSLATE][POLL] attempt=${attempt} progress=${progressPct}% elapsed=${Math.round( - elapsed / 60000 - )}m nextWait=${nextWait}ms` - ) - await delay(nextWait) - } - const finalElapsed = Date.now() - start - throw new Error( - `Timed out waiting for pre-translation (elapsed ${Math.round( - finalElapsed / 60000 - )}m)` - ) -} - -/** - * Method: POST - * https://support.crowdin.com/developer/api/v2/#tag/Translations/operation/api.projects.translations.builds.directories.post - * @param fileId - * @param targetLanguageId - * @param projectId - * @returns { url: string; expireIn: string; etag: string; } - */ -const postBuildProjectFileTranslation = async ( - fileId: number, - targetLanguageId: string, - projectId = env.projectId -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${projectId}/translations/builds/files/${fileId}` - ) - - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify({ targetLanguageId }), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error( - `Crowdin postBuildProjectFileTranslation failed (${res.status}): ${body}` - ) - } - - type JsonResponse = { data: BuildProjectFileTranslationResponse } - const json: JsonResponse = await res.json() - console.log("Built file:", json.data) - return json.data -} - -/** - * method: GET - * @param downloadUrl - * @returns { buffer: Buffer } - */ -const getBuiltFile = async ( - downloadUrl: string - // ): Promise<{ buffer: Buffer; fileName: string; contentType: string }> => { -): Promise<{ buffer: Buffer }> => { - try { - const res = await fetch(downloadUrl) - - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error(`Failed to download built file (${res.status}): ${body}`) - } - - const arrayBuffer = await res.arrayBuffer() - 
const buffer = Buffer.from(arrayBuffer) - - return { buffer } - } catch (error) { - console.error("getBuiltFile error:", error) - throw error - } -} - -/** - * Retrieves the Git object for a branch from the GitHub API and returns its underlying BranchObject. - * - * Fetches the ref for the given branch name from: - * https://api.github.com/repos/{env.ghOrganization}/{env.ghRepo}/git/ref/heads/{branch} - * using the preconfigured `gitHubBearerHeaders`. - * - * @param branch - The branch name to look up (for example "main" or "dev"). - * @returns A promise that resolves to the BranchObject extracted from the GitHub API response. - * - * @throws {Error} If the HTTP response is not OK (non-2xx). The thrown error includes the HTTP status - * and the response body text (when available). - * @throws {SyntaxError} If the response body cannot be parsed as JSON. - * - * @remarks - * - This function expects `env.ghOrganization`, `env.ghRepo`, and `gitHubBearerHeaders` to be available - * in the enclosing scope and correctly configured. - * - The function returns the `.object` property of the BranchDetailsResponse returned by GitHub. - * - Network errors (e.g. connectivity issues) will propagate as rejected promises from `fetch`. 
- * - * @example - * ```ts - * // resolves to the branch's object (sha, type, url) - * const obj = await getBranchObject("dev"); - * ``` - */ -const getBranchObject = async (branch: string): Promise => { - // https://api.github.com/repos/{{ $('env').item.json.ghOrganization }}/{{ $('env').item.json.ghRepo }}/git/ref/heads/dev - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/git/ref/heads/${branch}` - ) - - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub getBranchObject (${res.status}): ${body}`) - } - - type JsonResponse = BranchDetailsResponse - const json: JsonResponse = await res.json() - // console.log("getBranchDetails results", json) - return json.object -} - -const createBranchName = () => { - const ts = new Date().toISOString().replace(/\..*$/, "").replace(/[:]/g, "-") // e.g., 2025-11-10T04-20-13 - return "i18n/import/" + ts -} - -const getDestinationFromPath = ( - crowdinFilePath: string, // e.g. src/intl/en/page-foo.json OR public/content/.../index.md - internalLanguageCode: string -) => { - const normalized = crowdinFilePath.replace(/^\//, "") - const isJson = normalized.toLowerCase().endsWith(".json") - const isMarkdown = normalized.toLowerCase().endsWith(".md") - - let destinationPath = normalized - - if (isJson) { - // JSON: src/intl/en/*.json -> src/intl//*.json - if (normalized.startsWith("src/intl/en/")) { - destinationPath = normalized.replace( - /^src\/intl\/en\//, - `src/intl/${internalLanguageCode}/` - ) - } else if (normalized.startsWith("src/intl/")) { - // Fallback: if for some reason "en" segment is missing, inject lang after src/intl/ - const parts = normalized.split("/") - // parts: [src, intl, ...] 
- parts.splice(2, 0, internalLanguageCode) - destinationPath = parts.join("/") - } - } else if (isMarkdown) { - // Markdown: public/content//index.md -> public/content/translations///index.md - if (normalized.startsWith("public/content/")) { - const rel = normalized.replace(/^public\/content\//, "") - // If already inside translations/, avoid duplicating; rewrite to current lang - const relParts = rel.split("/").filter(Boolean) - if (relParts[0] === "translations") { - // Drop existing translations// - const rest = relParts.slice(2).join("/") - destinationPath = `public/content/translations/${internalLanguageCode}/${rest}` - } else { - destinationPath = `public/content/translations/${internalLanguageCode}/${rel}` - } - } - } - - console.log( - `[DEBUG] Destination mapping: ${crowdinFilePath} -> ${destinationPath} (lang=${internalLanguageCode})` - ) - return destinationPath -} - -/** - * method: PUT - */ -const postCreateBranchFrom = async (ref = env.baseBranch) => { - const { sha } = await getBranchObject(ref) - - const branch = createBranchName() - - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/git/refs` - ) - - try { - console.log( - `[DEBUG] Creating branch from base="${ref}" sha=${sha} -> new branch="${branch}"` - ) - const res = await fetchWithRetry(url.toString(), { - method: "POST", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ ref: `refs/heads/${branch}`, sha }), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - console.error( - `[ERROR] Failed to create branch. 
URL=${url.toString()} status=${res.status}` - ) - throw new Error(`GitHub createBranchFrom (${res.status}): ${body}`) - } - - return { branch, sha } - } catch (error) { - console.error(error) - process.exit(1) - } -} - -const getPathSha = async (path: string, branch: string) => { - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/contents/${path}?ref=${branch}` - ) - - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub getPathSha (${res.status}): ${body}`) - } - - type JsonResponse = { sha: string } - const { sha }: JsonResponse = await res.json() - - return { sha } -} -const putCommitFile = async ( - buffer: Buffer, - destinationPath: string, - branch: string, - sha?: string, - attempt = 0 -): Promise => { - const url = `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/contents/${destinationPath}` - - try { - // Use the buffer contents as base64-encoded content for the commit - const contentBase64 = buffer.toString("base64") - - const body = { - message: `update(i18n): ${destinationPath}`, - content: contentBase64, - branch, - } - - if (sha) body["sha"] = sha - - const res = await fetchWithRetry(url.toString(), { - method: "PUT", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }) - - if (res.status === 422) { - const { sha: fileSha } = await getPathSha(destinationPath, branch) - console.warn( - `[RETRY] 422 Unprocessable for ${destinationPath}. 
Retrying with existing SHA ${fileSha}` - ) - return await putCommitFile( - buffer, - destinationPath, - branch, - fileSha, - attempt - ) - } - - if (res.status === 409) { - if (attempt >= 5) { - const bodyText = await res.text().catch(() => "") - throw new Error( - `GitHub putCommitFile conflict persists after ${attempt} retries (${res.status}): ${bodyText}` - ) - } - const backoff = 500 * Math.pow(2, attempt) // 500ms, 1s, 2s, 4s, 8s - console.warn( - `[RETRY] 409 Conflict for ${destinationPath}. Attempt ${attempt + 1}. Waiting ${backoff}ms before retry.` - ) - await delay(backoff) - const { sha: latestSha } = await getPathSha(destinationPath, branch) - return await putCommitFile( - buffer, - destinationPath, - branch, - latestSha, - attempt + 1 - ) - } - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub putCommitFile (${res.status}): ${body}`) - } - } catch (error) { - console.error(error) - process.exit(1) - } -} - -const postPullRequest = async ( - head: string, - base = env.baseBranch, - bodyText?: string -) => { - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/pulls` - ) - - const body = { - title: "i18n: automated Crowdin translation import", - head, - base, - body: bodyText || "Automated Crowdin translation import", - } - - const res = await fetchWithRetry(url.toString(), { - method: "POST", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`Crowdin postPullRequest failed (${res.status}): ${body}`) - } - - const json = await res.json() - return json -} - async function buildAndCommitTranslations( preTranslateJobCompletedResponse: CrowdinPreTranslateResponse ) { @@ -1400,13 +89,13 @@ async function buildAndCommitTranslations( } } - // Build mapping between Crowdin IDs 
(e.g. "es-EM") and internal codes (e.g. "es") + // Build mapping between Crowdin IDs and internal codes const languagePairs = languageIds.map((crowdinId) => ({ crowdinId, - internalLanguageCode: crowdinToInternalCodeMapping[crowdinId], + internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), })) - const { branch } = await postCreateBranchFrom(env.baseBranch) + const { branch } = await postCreateBranchFrom(config.baseBranch) console.log(`\n[BRANCH] ✓ Created branch: ${branch}`) // For each language @@ -1428,7 +117,7 @@ async function buildAndCommitTranslations( const { url: downloadUrl } = await postBuildProjectFileTranslation( fileId, crowdinId, - env.projectId + config.projectId ) console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) @@ -1455,7 +144,7 @@ async function buildAndCommitTranslations( console.log( `\n[SANITIZE] ========== Running post-import sanitizer before PR ==========` ) - const sanitizeResult = runSanitizer(env.allCrowdinCodes) + const sanitizeResult = runSanitizer(config.allCrowdinCodes) const changedFiles = sanitizeResult.changedFiles || [] if (changedFiles.length) { console.log(`[SANITIZE] Files changed by sanitizer: ${changedFiles.length}`) @@ -1480,9 +169,9 @@ async function buildAndCommitTranslations( console.log(`\n[PR] ========== Creating Pull Request ==========`) console.log(`[PR] Head branch: ${branch}`) - console.log(`[PR] Base branch: ${env.baseBranch}`) + console.log(`[PR] Base branch: ${config.baseBranch}`) - const pr = await postPullRequest(branch, env.baseBranch) + const pr = await postPullRequest(branch, config.baseBranch) console.log(`\n[SUCCESS] ========== Translation import complete! 
==========`) console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) @@ -1490,27 +179,28 @@ async function buildAndCommitTranslations( console.log(pr) } +/** + * Main orchestration function + */ async function main(options?: { allLangs: boolean }) { console.log(`[DEBUG] Starting main function with options:`, options) console.log(`[DEBUG] Environment config:`, { - projectId: env.projectId, - baseBranch: env.baseBranch, - jsonRoot: env.jsonRoot, - mdRoot: env.mdRoot, - allCrowdinCodes: env.allCrowdinCodes, + projectId: config.projectId, + baseBranch: config.baseBranch, + jsonRoot: config.jsonRoot, + mdRoot: config.mdRoot, + allCrowdinCodes: config.allCrowdinCodes, }) - // Crowdin user id is fetched on-demand when calling completions API - // Check if resuming from existing pre-translation - if (existingPreTranslationId) { + if (config.existingPreTranslationId) { console.log( - `\n[RESUME] ========== Resuming from pre-translation ID: ${existingPreTranslationId} ==========` + `\n[RESUME] ========== Resuming from pre-translation ID: ${config.existingPreTranslationId} ==========` ) console.log(`[RESUME] Checking status of existing pre-translation...`) const preTranslateJobCompletedResponse = await getPreTranslationStatus( - existingPreTranslationId + config.existingPreTranslationId ) if (preTranslateJobCompletedResponse.status === "in_progress") { @@ -1518,7 +208,7 @@ async function main(options?: { allLangs: boolean }) { `[RESUME] Pre-translation still in progress (${preTranslateJobCompletedResponse.progress}%). 
Waiting for completion...` ) const completedResponse = await awaitPreTranslationCompleted( - existingPreTranslationId + config.existingPreTranslationId ) return await buildAndCommitTranslations(completedResponse) } else if (preTranslateJobCompletedResponse.status === "finished") { @@ -1528,7 +218,7 @@ async function main(options?: { allLangs: boolean }) { return await buildAndCommitTranslations(preTranslateJobCompletedResponse) } else { throw new Error( - `Pre-translation ${existingPreTranslationId} has unexpected status: ${preTranslateJobCompletedResponse.status}` + `Pre-translation ${config.existingPreTranslationId} has unexpected status: ${preTranslateJobCompletedResponse.status}` ) } } @@ -1537,29 +227,28 @@ async function main(options?: { allLangs: boolean }) { console.log(`\n[START] ========== Starting new pre-translation ==========`) // Fetch English files with limit + start offset - const allEnglishFiles = await getAllEnglishFiles(fileLimit, startOffset) + const allEnglishFiles = await getAllEnglishFiles( + config.fileLimit, + config.startOffset + ) console.log( - `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub (offset=${startOffset}, limit=${fileLimit})` + `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub (offset=${config.startOffset}, limit=${config.fileLimit})` ) - // TODO: Add filter here to select specific files const fileMetadata = await getFileMetadata(allEnglishFiles) console.log(`[DEBUG] Generated metadata for ${fileMetadata.length} files`) console.log(`[DEBUG] First file metadata:`, fileMetadata[0]) - const crowdinProjectFiles = await getCrowdinProjectFiles() // *** + const crowdinProjectFiles = await getCrowdinProjectFiles() console.log( `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` ) - /** - * Iterate through each file and upload - */ + // Iterate through each file and upload const fileIdsSet = new Set() - // Maintain authoritative mapping of processed Crowdin fileId -> path (including 
newly added files this run) const processedFileIdToPath: Record = {} - // Keep original English buffers to detect untranslated outputs const englishBuffers: Record = {} + for (const file of fileMetadata) { console.log(`[DEBUG] Processing file: ${file.filePath}`) await (async () => { @@ -1638,7 +327,6 @@ async function main(options?: { allLangs: boolean }) { } fileIdsSet.add(effectiveFileId) - // Record path for destination mapping later (Crowdin returns leading slash paths) if (effectivePath) processedFileIdToPath[effectiveFileId] = effectivePath })() } @@ -1647,7 +335,7 @@ async function main(options?: { allLangs: boolean }) { console.log( `\n[UNHIDE] ========== Unhiding strings in ${fileIdsSet.size} files ==========` ) - for (const fileId of fileIdsSet) { + for (const fileId of Array.from(fileIdsSet)) { await unhideStringsInFile(fileId) } @@ -1655,12 +343,12 @@ async function main(options?: { allLangs: boolean }) { `\n[PRE-TRANSLATE] ========== Requesting AI Pre-Translation ==========` ) console.log(`[PRE-TRANSLATE] FileIds to translate:`, Array.from(fileIdsSet)) - console.log(`[PRE-TRANSLATE] Target languages:`, env.allCrowdinCodes) - console.log(`[PRE-TRANSLATE] AI Prompt ID:`, env.preTranslatePromptId) + console.log(`[PRE-TRANSLATE] Target languages:`, config.allCrowdinCodes) + console.log(`[PRE-TRANSLATE] AI Prompt ID:`, config.preTranslatePromptId) const applyPreTranslationResponse = await postApplyPreTranslation( Array.from(fileIdsSet), - options?.allLangs ? env.allCrowdinCodes : env.allCrowdinCodes + options?.allLangs ? 
config.allCrowdinCodes : config.allCrowdinCodes ) console.log( `[PRE-TRANSLATE] ✓ Pre-translation job created with ID: ${applyPreTranslationResponse.identifier}` @@ -1694,11 +382,12 @@ async function main(options?: { allLangs: boolean }) { JSON.stringify(preTranslateJobCompletedResponse, null, 2) ) - // QA via Crowdin AI Prompt Completions (qa_check) + // QA via Crowdin AI Prompt Completions console.log(`\n[QA-CHECK] ========== AI QA via Prompt Completions ==========`) const qaSummaries: string[] = [] const { languageIds: qaLanguageIds, fileIds: qaFileIds } = preTranslateJobCompletedResponse.attributes + // Build stringId lists per file const fileStringMap: Record = {} for (const fid of qaFileIds) { @@ -1709,9 +398,9 @@ async function main(options?: { allLangs: boolean }) { fileStringMap[fid] = [] } } - // Use project source language from repo (assume en-US or en) — map from i18n config + const sourceLanguageId = "en" - const MAX_STRINGS_PER_REQUEST = 500 + // For each language, run QA per file (naturally batches and ties issues to specific files) for (const lang of qaLanguageIds) { console.log( @@ -1754,8 +443,8 @@ async function main(options?: { allLangs: boolean }) { let job: QaCompletionJob | undefined try { - job = await postQaCompletions(env.qaPromptId, { - projectId: env.projectId, + job = await postQaCompletions(config.qaPromptId, { + projectId: config.projectId, sourceLanguageId, targetLanguageId: lang, stringIds: chunk, @@ -1801,7 +490,7 @@ async function main(options?: { allLangs: boolean }) { const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes - // Build mapping for commit phase. Prefer processed mapping (includes newly added files); fall back to existing Crowdin snapshot for any missed IDs. 
+ // Build mapping for commit phase const fileIdToPathMapping: Record = {} for (const fid of fileIds) { if (processedFileIdToPath[fid]) { @@ -1816,13 +505,14 @@ async function main(options?: { allLangs: boolean }) { ) } } - // Build mapping between Crowdin IDs (e.g. "es-EM") and internal codes (e.g. "es") + + // Build mapping between Crowdin IDs and internal codes const languagePairs = languageIds.map((crowdinId) => ({ crowdinId, - internalLanguageCode: crowdinToInternalCodeMapping[crowdinId], + internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), })) - const { branch } = await postCreateBranchFrom(env.baseBranch) + const { branch } = await postCreateBranchFrom(config.baseBranch) console.log(`\n[BRANCH] ✓ Created branch: ${branch}`) // For each language @@ -1831,7 +521,7 @@ async function main(options?: { allLangs: boolean }) { `\n[BUILD] ========== Building translations for language: ${crowdinId} (internal: ${internalLanguageCode}) ==========` ) - // Build, download and commit each file updated + // Build, download and commit each file for (const fileId of fileIds) { console.log(`\n[BUILD] --- Processing fileId: ${fileId} ---`) const crowdinPath = fileIdToPathMapping[fileId] @@ -1843,8 +533,8 @@ async function main(options?: { allLangs: boolean }) { ) const { url: downloadUrl } = await postBuildProjectFileTranslation( fileId, - crowdinId, // Crowdin expects the Crowdin language ID here (e.g., "es-EM") - env.projectId + crowdinId, + config.projectId ) console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) @@ -1872,7 +562,7 @@ async function main(options?: { allLangs: boolean }) { // 3a- Get destination path const destinationPath = getDestinationFromPath( crowdinPath, - internalLanguageCode // Use internal code (e.g., "es") for repo path replacement + internalLanguageCode ) console.log(`[BUILD] Destination path: ${destinationPath}`) @@ -1883,11 +573,11 @@ async function main(options?: { allLangs: boolean }) { } } - // Run post-import sanitizer 
BEFORE creating PR (may produce additional commits) + // Run post-import sanitizer BEFORE creating PR console.log( `\n[SANITIZE] ========== Running post-import sanitizer before PR ==========` ) - const sanitizeResult = runSanitizer(env.allCrowdinCodes) + const sanitizeResult = runSanitizer(config.allCrowdinCodes) const changedFiles = sanitizeResult.changedFiles || [] if (changedFiles.length) { console.log(`[SANITIZE] Files changed by sanitizer: ${changedFiles.length}`) @@ -1912,12 +602,12 @@ async function main(options?: { allLangs: boolean }) { console.log(`\n[PR] ========== Creating Pull Request ==========`) console.log(`[PR] Head branch: ${branch}`) - console.log(`[PR] Base branch: ${env.baseBranch}`) + console.log(`[PR] Base branch: ${config.baseBranch}`) const prBody = qaSummaries.length ? `Automated Crowdin translation import\n\nQA Summary:\n\n${qaSummaries.join("\n\n")}` : "Automated Crowdin translation import" - const pr = await postPullRequest(branch, env.baseBranch, prBody) + const pr = await postPullRequest(branch, config.baseBranch, prBody) console.log(`\n[SUCCESS] ========== Translation import complete! ==========`) console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 97fbe5ace84..7cb0e777145 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -1,5 +1,5 @@ -import fs from "fs" -import path from "path" +import * as fs from "fs" +import * as path from "path" /** * Post-import sanitizer for Crowdin translations. 
From 623560109154db51da9581c9c6a604c597d798bd Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 29 Nov 2025 11:20:35 -0300 Subject: [PATCH 11/99] feat: initialize trust tiers for LLM-language quality --- .github/workflows/crowdin-ai-import.yml | 2 + src/scripts/i18n/config/language-trust.json | 44 ++ src/scripts/i18n/gen_trust_matrix.ts | 57 ++ src/scripts/i18n/lib/crowdin/prompt-model.ts | 29 + src/scripts/i18n/lib/github/branches.ts | 11 +- .../i18n/lib/github/pr-review-comments.ts | 81 +++ .../i18n/lib/openai/trust-matrix-generator.ts | 149 +++++ src/scripts/i18n/lib/qa-routing.ts | 101 ++++ src/scripts/i18n/lib/types.ts | 6 + src/scripts/i18n/main.ts | 515 ++++++++---------- 10 files changed, 705 insertions(+), 290 deletions(-) create mode 100644 src/scripts/i18n/config/language-trust.json create mode 100644 src/scripts/i18n/gen_trust_matrix.ts create mode 100644 src/scripts/i18n/lib/crowdin/prompt-model.ts create mode 100644 src/scripts/i18n/lib/github/pr-review-comments.ts create mode 100644 src/scripts/i18n/lib/openai/trust-matrix-generator.ts create mode 100644 src/scripts/i18n/lib/qa-routing.ts diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 0f4c8d77062..95665719346 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -72,6 +72,8 @@ jobs: env: I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + I18N_CROWDIN_USER_ID: ${{ secrets.I18N_CROWDIN_USER_ID }} PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} FILE_LIMIT: ${{ github.event.inputs.file_limit }} START_OFFSET: ${{ github.event.inputs.start_offset }} diff --git a/src/scripts/i18n/config/language-trust.json b/src/scripts/i18n/config/language-trust.json new file mode 100644 index 00000000000..8fd94e78241 --- /dev/null +++ 
b/src/scripts/i18n/config/language-trust.json @@ -0,0 +1,44 @@ +{ + "default": { + "lastUpdated": "2025-11-28T00:00:00Z", + "Aplus": ["es", "fr", "de", "zh"], + "A": ["ar", "it", "ja", "pt-br", "ru", "zh-tw"], + "Aminus": ["ko", "pt", "nl", "se", "uk"], + "Bplus": ["tr", "pl", "cs", "hi", "vi", "el", "he", "id", "da", "nb", "ca"], + "B": [ + "bn", + "th", + "fa", + "ro", + "hu", + "fi", + "fil", + "tl", + "ur", + "ms", + "bg", + "hr", + "sr", + "bs", + "sk", + "sl", + "gl" + ], + "Bminus": ["ta", "te", "kn", "gu", "mr", "ml", "lt"], + "Cplus": [ + "hy-am", + "ka", + "kk", + "uz", + "az", + "be", + "ga", + "sw", + "am", + "ne-np", + "km" + ], + "C": ["ha", "ig", "yo", "sn", "tw", "tk"], + "Dplus": ["pcm"] + } +} diff --git a/src/scripts/i18n/gen_trust_matrix.ts b/src/scripts/i18n/gen_trust_matrix.ts new file mode 100644 index 00000000000..bc4cd594bcf --- /dev/null +++ b/src/scripts/i18n/gen_trust_matrix.ts @@ -0,0 +1,57 @@ +import fs from "fs" +import path from "path" + +import i18nConfig from "../../../i18n.config.json" + +import { getPromptModelKey } from "./lib/crowdin/prompt-model" +import type { I18nConfigItem } from "./lib/types" + +// Helper to get all internal language codes +function getInternalLanguageCodes(): string[] { + return i18nConfig.map((lang: I18nConfigItem) => lang.code) +} + +// Helper to call Copilot/GPT for trust matrix generation +async function generateTrustMatrix( + modelKey: string, + internalCodes: string[] +): Promise { + // Compose prompt for Copilot/GPT + const prompt = `You are an expert in language quality assessment for AI translation models. Given the model ${modelKey} and the following internal language codes: ${internalCodes.join(", ")}, group these codes into buckets by expected translation quality (Aplus, A, Aminus, Bplus, B, Bminus, Cplus, C, Dplus). Output a JSON object with these groups as keys and arrays of codes as values. 
Only use the provided codes.` + + // Call Copilot (GPT-4.1) via API (pseudo-code, replace with actual API call) + // const response = await copilotApi.generate({ prompt }) + // return JSON.parse(response) + + // For now, just log the prompt and return an empty object + console.log("Prompt for Copilot/GPT:", prompt) + return {} +} + +async function main() { + const userId = process.env.I18N_CROWDIN_USER_ID + const promptId = process.env.I18N_CROWDIN_PROMPT_ID + if (!userId || !promptId) { + throw new Error( + "Set I18N_CROWDIN_USER_ID and I18N_CROWDIN_PROMPT_ID in your .env.local" + ) + } + const modelKey = await getPromptModelKey(Number(userId), Number(promptId)) + const internalCodes = getInternalLanguageCodes() + const matrix = await generateTrustMatrix(modelKey, internalCodes) + + // Write to language-trust.json + const outPath = path.join( + process.cwd(), + "src/scripts/i18n/config/language-trust.json" + ) + fs.writeFileSync(outPath, JSON.stringify({ [modelKey]: matrix }, null, 2)) + console.log( + `Trust matrix for model ${modelKey} written to language-trust.json` + ) +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/src/scripts/i18n/lib/crowdin/prompt-model.ts b/src/scripts/i18n/lib/crowdin/prompt-model.ts new file mode 100644 index 00000000000..d3a43c751a0 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/prompt-model.ts @@ -0,0 +1,29 @@ +import { crowdinBearerHeaders } from "../../config" + +type PromptResource = { + id: number + name: string + action: string + aiProviderId?: number | null + model?: string | null + version?: string | null +} + +export async function getPromptModelKey( + userId: number, + promptId: number +): Promise { + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` + const resp = await fetch(url, { headers: crowdinBearerHeaders }) + if (!resp.ok) { + throw new Error( + `Failed to fetch prompt metadata: ${resp.status} ${await resp.text()}` + ) + } + const json: { 
data?: PromptResource } = await resp.json() + const data: PromptResource = json.data ?? ({} as PromptResource) + const provider = data.aiProviderId ?? "provider" + const model = data.model ?? "model" + const version = data.version ?? "version" + return `${provider}:${model}:${version}` +} diff --git a/src/scripts/i18n/lib/github/branches.ts b/src/scripts/i18n/lib/github/branches.ts index 905174fd409..2e05a97c077 100644 --- a/src/scripts/i18n/lib/github/branches.ts +++ b/src/scripts/i18n/lib/github/branches.ts @@ -35,9 +35,9 @@ export const getBranchObject = async ( /** * Generate a branch name based on current timestamp */ -export const createBranchName = () => { +export const createBranchName = (suffix?: string) => { const ts = new Date().toISOString().replace(/\..*$/, "").replace(/[:]/g, "-") - return "i18n/import/" + ts + return "i18n/import/" + ts + (suffix ? `-${suffix}` : "") } /** @@ -46,9 +46,12 @@ export const createBranchName = () => { * @param ref - The base branch reference (defaults to config.baseBranch) * @returns Object containing the new branch name and SHA */ -export const postCreateBranchFrom = async (ref = config.baseBranch) => { +export const postCreateBranchFrom = async ( + ref = config.baseBranch, + suffix?: string +) => { const { sha } = await getBranchObject(ref) - const branch = createBranchName() + const branch = createBranchName(suffix) const url = new URL( `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/git/refs` diff --git a/src/scripts/i18n/lib/github/pr-review-comments.ts b/src/scripts/i18n/lib/github/pr-review-comments.ts new file mode 100644 index 00000000000..91715074be3 --- /dev/null +++ b/src/scripts/i18n/lib/github/pr-review-comments.ts @@ -0,0 +1,81 @@ +// GitHub PR review comment helper with scoped @mentions +import { config, gitHubBearerHeaders } from "../../config" +import type { QaLevel } from "../qa-routing" +import { fetchWithRetry } from "../utils/fetch" + +/** + * Post a follow-up comment on a PR 
with AI reviewer mentions and clear scope + * @param prNumber The PR number + * @param qaPlan The QA plan mapping languages to review levels + */ +export async function postPrReviewComment( + prNumber: number, + qaPlan: Record +): Promise { + const copilotLangs: string[] = [] + const claudeLangs: string[] = [] + + for (const [lang, level] of Object.entries(qaPlan)) { + if (level === "copilot") { + copilotLangs.push(lang) + } else if (level === "copilot+claude") { + copilotLangs.push(lang) + claudeLangs.push(lang) + } + } + + if (copilotLangs.length === 0 && claudeLangs.length === 0) { + console.log("[PR-COMMENT] No AI review needed, skipping comment") + return + } + + let comment = "## AI Translation Review Request\n\n" + comment += + "This PR contains automated translations that need quality review.\n\n" + + if (copilotLangs.length > 0) { + comment += "### @copilot\n\n" + comment += + "@copilot Please review the translations for the following languages and check for:\n" + comment += "- Accuracy and natural phrasing\n" + comment += "- Consistent use of technical terminology\n" + comment += "- Proper handling of Markdown/code syntax\n" + comment += "- Appropriate tone and formality\n\n" + comment += `**Languages:** ${copilotLangs.join(", ")}\n\n` + } + + if (claudeLangs.length > 0) { + comment += "### @claude\n\n" + comment += + "@claude Please provide a thorough review of translations for the following languages, focusing on:\n" + comment += "- Semantic accuracy and cultural appropriateness\n" + comment += "- Technical term consistency\n" + comment += "- Grammar and idiomatic expressions\n" + comment += "- Any potential ambiguities or mistranslations\n\n" + comment += `**Languages:** ${claudeLangs.join(", ")}\n\n` + } + + comment += + "---\n*This review request was automatically generated based on language quality trust scores.*" + + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/issues/${prNumber}/comments` + + const response = 
await fetchWithRetry(url, { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ body: comment }), + }) + + if (!response.ok) { + const errorText = await response.text() + console.warn( + `[PR-COMMENT] Failed to post review comment (${response.status}): ${errorText}` + ) + return + } + + console.log(`[PR-COMMENT] Posted AI review comment on PR #${prNumber}`) +} diff --git a/src/scripts/i18n/lib/openai/trust-matrix-generator.ts b/src/scripts/i18n/lib/openai/trust-matrix-generator.ts new file mode 100644 index 00000000000..e692c45bfd9 --- /dev/null +++ b/src/scripts/i18n/lib/openai/trust-matrix-generator.ts @@ -0,0 +1,149 @@ +// OpenAI integration for generating language trust matrices +import fs from "fs" +import path from "path" + +import i18nConfig from "../../../../../i18n.config.json" + +type TrustBucket = { + lastUpdated?: string + Aplus?: string[] + A?: string[] + Aminus?: string[] + Bplus?: string[] + B?: string[] + Bminus?: string[] + Cplus?: string[] + C?: string[] + Dplus?: string[] +} + +/** + * Generate a trust matrix using OpenAI GPT-4 + * @param modelKey The Crowdin AI model identifier (provider:model:version) + * @returns The generated trust bucket with quality grades for each language + */ +export async function generateTrustMatrixWithOpenAI( + modelKey: string +): Promise { + const apiKey = process.env.OPENAI_API_KEY + if (!apiKey) { + throw new Error( + "OPENAI_API_KEY not found. Cannot generate trust matrix without API access." + ) + } + + const languageList = i18nConfig + .map((lang) => `${lang.code} (${lang.name})`) + .join(", ") + + const prompt = `You are an expert in evaluating AI translation model quality across different languages. 
+ +Given the Crowdin AI translation model identifier: "${modelKey}" + +Please assess the expected translation quality for each of the following languages: ${languageList} + +Group the language codes into these quality buckets: +- Aplus: Exceptional quality, native-level fluency expected +- A: High quality, minimal post-editing needed +- Aminus: Good quality, occasional review needed +- Bplus: Above-average quality, regular review recommended +- B: Average quality, consistent review needed +- Bminus: Below-average quality, careful review required +- Cplus: Fair quality, significant review needed +- C: Poor quality, extensive review required +- Dplus: Very poor quality, requires thorough human translation review + +Respond ONLY with a valid JSON object in this exact format: +{ + "Aplus": ["code1", "code2"], + "A": ["code3"], + "Aminus": ["code4", "code5"], + "Bplus": ["code6"], + "B": ["code7", "code8"], + "Bminus": ["code9"], + "Cplus": ["code10"], + "C": ["code11"], + "Dplus": ["code12"] +} + +Use ONLY the internal codes provided (e.g., "es", "fr", "zh", "pt-br"). Do not include any explanatory text, only the JSON object.` + + const response = await fetch("https://api.openai.com/v1/chat/completions", { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify({ + model: "gpt-4-turbo-preview", + messages: [ + { + role: "system", + content: + "You are a language quality assessment expert. 
Respond only with valid JSON.", + }, + { role: "user", content: prompt }, + ], + temperature: 0.3, + max_tokens: 2000, + }), + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`OpenAI API error (${response.status}): ${errorText}`) + } + + const data = await response.json() + const content = data.choices?.[0]?.message?.content?.trim() + if (!content) { + throw new Error("OpenAI returned empty response") + } + + // Parse the JSON response + let trustBucket: TrustBucket + try { + trustBucket = JSON.parse(content) + } catch (err) { + console.error("[OPENAI] Failed to parse response:", content) + throw new Error(`OpenAI response was not valid JSON: ${err}`) + } + + // Add timestamp + trustBucket.lastUpdated = new Date().toISOString() + + console.log(`[OPENAI] Generated trust matrix for model: ${modelKey}`) + return trustBucket +} + +/** + * Update the language-trust.json file with a new model's trust matrix + * @param modelKey The model identifier + * @param trustBucket The trust bucket to add + */ +export function saveTrustMatrixToFile( + modelKey: string, + trustBucket: TrustBucket +): void { + const filePath = path.join( + process.cwd(), + "src/scripts/i18n/config/language-trust.json" + ) + + let matrix: Record = {} + try { + const raw = fs.readFileSync(filePath, "utf8") + matrix = JSON.parse(raw) + } catch { + console.warn( + "[TRUST-MATRIX] Could not read existing matrix, creating new file" + ) + } + + matrix[modelKey] = trustBucket + + fs.writeFileSync(filePath, JSON.stringify(matrix, null, 2) + "\n") + console.log( + `[TRUST-MATRIX] Saved trust matrix for model "${modelKey}" to language-trust.json` + ) +} diff --git a/src/scripts/i18n/lib/qa-routing.ts b/src/scripts/i18n/lib/qa-routing.ts new file mode 100644 index 00000000000..7d7eb68de70 --- /dev/null +++ b/src/scripts/i18n/lib/qa-routing.ts @@ -0,0 +1,101 @@ +// NOTE: language-trust.json now uses ONLY internal codes (see i18n.config.json 'code' field) +import fs from "fs" 
+import path from "path" + +type TrustBucket = { + lastUpdated?: string + Aplus?: string[] + A?: string[] + Aminus?: string[] + Bplus?: string[] + B?: string[] + Bminus?: string[] + Cplus?: string[] + C?: string[] + Dplus?: string[] +} + +type TrustMatrix = Record + +export type QaLevel = "skip" | "copilot" | "copilot+claude" + +export function loadTrustMatrix(): TrustMatrix { + const p = path.join( + process.cwd(), + "src/scripts/i18n/config/language-trust.json" + ) + try { + const raw = fs.readFileSync(p, "utf8") + return JSON.parse(raw) + } catch { + return { default: {} } + } +} + +/** + * Find the most recent model key in the trust matrix by lastUpdated timestamp + */ +export function getMostRecentModelKey(matrix: TrustMatrix): string | null { + let mostRecent: string | null = null + let latestTime = 0 + for (const [key, bucket] of Object.entries(matrix)) { + if (bucket.lastUpdated) { + const timestamp = new Date(bucket.lastUpdated).getTime() + if (timestamp > latestTime) { + latestTime = timestamp + mostRecent = key + } + } + } + return mostRecent +} + +export function planQaForLanguages( + languageIds: string[], + modelKey?: string +): Record { + const matrix = loadTrustMatrix() + // Try to use the specified model, fallback to most recent, then default + let bucket: TrustBucket = {} + if (modelKey && matrix[modelKey]) { + bucket = matrix[modelKey] + console.log(`[QA-ROUTING] Using trust matrix for model: ${modelKey}`) + } else { + const fallbackKey = getMostRecentModelKey(matrix) || "default" + bucket = matrix[fallbackKey] || {} + if (modelKey) { + console.log( + `[QA-ROUTING] Model "${modelKey}" not found, using fallback: ${fallbackKey}` + ) + } else { + console.log(`[QA-ROUTING] Using fallback trust matrix: ${fallbackKey}`) + } + } + + const groupIndex = new Map([ + ["Aplus", "skip"], + ["A", "skip"], + ["Aminus", "skip"], + ["Bplus", "copilot"], + ["B", "copilot"], + ["Bminus", "copilot"], + ["Cplus", "copilot+claude"], + ["C", "copilot+claude"], + 
["Dplus", "copilot+claude"], + ]) + + const index = new Map() + for (const [group, list] of Object.entries(bucket)) { + if (group === "lastUpdated") continue // skip metadata + const level = groupIndex.get(group as string) + if (!level) continue + for (const code of list || []) index.set(code, level) + } + + const plan: Record = {} + // All languageIds should be internal codes (not Crowdin codes) + for (const lang of languageIds) { + plan[lang] = index.get(lang) || "copilot" // conservative default + } + return plan +} diff --git a/src/scripts/i18n/lib/types.ts b/src/scripts/i18n/lib/types.ts index 8c81a295668..995f70dec36 100644 --- a/src/scripts/i18n/lib/types.ts +++ b/src/scripts/i18n/lib/types.ts @@ -234,3 +234,9 @@ export type CrowdinAddFileResponse = { createdAt: string | null updatedAt: string | null } + +export type I18nConfigItem = { + code: string + crowdinCode: string + name: string +} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index fc5f96aa7bc..367c5d04d88 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -17,6 +17,7 @@ import { getPreTranslationStatus, postApplyPreTranslation, } from "./lib/crowdin/pre-translate" +import { getPromptModelKey } from "./lib/crowdin/prompt-model" import { awaitQaCompletion, downloadQaCompletionResult, @@ -34,7 +35,13 @@ import { getAllEnglishFiles, getFileMetadata, } from "./lib/github/files" +import { postPrReviewComment } from "./lib/github/pr-review-comments" import { postPullRequest } from "./lib/github/pull-requests" +import { + generateTrustMatrixWithOpenAI, + saveTrustMatrixToFile, +} from "./lib/openai/trust-matrix-generator" +import { loadTrustMatrix, planQaForLanguages } from "./lib/qa-routing" import type { CrowdinAddFileResponse, CrowdinFileData, @@ -42,142 +49,11 @@ import type { } from "./lib/types" // Utilities import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" -import { config } from "./config" -import { MAX_STRINGS_PER_REQUEST } from "./config" 
+import { config, MAX_STRINGS_PER_REQUEST } from "./config" import { runSanitizer } from "./post_import_sanitize" - -const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) - -/** - * Build and commit translations after pre-translation completes - */ -async function buildAndCommitTranslations( - preTranslateJobCompletedResponse: CrowdinPreTranslateResponse -) { - if (preTranslateJobCompletedResponse.status !== "finished") { - console.error( - "[BUILD] ❌ Pre-translation did not finish successfully. Full response:", - preTranslateJobCompletedResponse - ) - throw new Error( - `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` - ) - } - - console.log(`[BUILD] ✓ Pre-translation completed successfully!`) - console.log(`[BUILD] Progress: ${preTranslateJobCompletedResponse.progress}%`) - console.log( - `[BUILD] Full response:`, - JSON.stringify(preTranslateJobCompletedResponse, null, 2) - ) - - const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes - - // Get Crowdin project files for path mapping - const crowdinProjectFiles = await getCrowdinProjectFiles() - - // Build mapping for commit phase using existing Crowdin files - const fileIdToPathMapping: Record = {} - for (const fid of fileIds) { - const existing = crowdinProjectFiles.find((f) => f.id === fid) - if (existing) fileIdToPathMapping[fid] = existing.path - - if (!fileIdToPathMapping[fid]) { - console.warn( - `[WARN] Missing path mapping for fileId=${fid} (may impact destination path calculation)` - ) - } - } - - // Build mapping between Crowdin IDs and internal codes - const languagePairs = languageIds.map((crowdinId) => ({ - crowdinId, - internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), - })) - - const { branch } = await postCreateBranchFrom(config.baseBranch) - console.log(`\n[BRANCH] ✓ Created branch: ${branch}`) - - // For each language - for (const { crowdinId, internalLanguageCode } of languagePairs) { - console.log( - 
`\n[BUILD] ========== Building translations for language: ${crowdinId} (internal: ${internalLanguageCode}) ==========` - ) - - // Build, download and commit each file - for (const fileId of fileIds) { - console.log(`\n[BUILD] --- Processing fileId: ${fileId} ---`) - const crowdinPath = fileIdToPathMapping[fileId] - console.log(`[BUILD] Crowdin path: ${crowdinPath}`) - - // 1- Build - console.log( - `[BUILD] Requesting build for fileId=${fileId}, language=${crowdinId}` - ) - const { url: downloadUrl } = await postBuildProjectFileTranslation( - fileId, - crowdinId, - config.projectId - ) - console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) - - // 2- Download - console.log(`[BUILD] Downloading translated file...`) - const { buffer } = await getBuiltFile(downloadUrl) - console.log(`[BUILD] Downloaded ${buffer.length} bytes`) - - // 3a- Get destination path - const destinationPath = getDestinationFromPath( - crowdinPath, - internalLanguageCode - ) - console.log(`[BUILD] Destination path: ${destinationPath}`) - - // 3b- Commit - console.log(`[BUILD] Committing to branch: ${branch}`) - await putCommitFile(buffer, destinationPath, branch) - console.log(`[BUILD] ✓ Committed successfully`) - } - } - - // Run post-import sanitizer BEFORE creating PR (may produce additional commits) - console.log( - `\n[SANITIZE] ========== Running post-import sanitizer before PR ==========` - ) - const sanitizeResult = runSanitizer(config.allCrowdinCodes) - const changedFiles = sanitizeResult.changedFiles || [] - if (changedFiles.length) { - console.log(`[SANITIZE] Files changed by sanitizer: ${changedFiles.length}`) - for (const abs of changedFiles) { - const relPath = abs.startsWith(process.cwd()) - ? 
abs.slice(process.cwd().length + 1) - : abs - try { - const buf = fs.readFileSync(abs) - await putCommitFile(buf, relPath, branch) - console.log(`[SANITIZE] ✓ Committed sanitized file: ${relPath}`) - } catch (e) { - console.warn( - `[SANITIZE] Failed to commit sanitized file ${relPath}:`, - e - ) - } - } - } else { - console.log("[SANITIZE] No sanitation changes to commit") - } - - console.log(`\n[PR] ========== Creating Pull Request ==========`) - console.log(`[PR] Head branch: ${branch}`) - console.log(`[PR] Base branch: ${config.baseBranch}`) - - const pr = await postPullRequest(branch, config.baseBranch) - - console.log(`\n[SUCCESS] ========== Translation import complete! ==========`) - console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) - console.log(`[SUCCESS] PR Number: #${pr.number}`) - console.log(pr) -} +// Small helper for async waits +const delay = (ms: number) => + new Promise((resolve) => setTimeout(resolve, ms)) /** * Main orchestration function @@ -192,196 +68,196 @@ async function main(options?: { allLangs: boolean }) { allCrowdinCodes: config.allCrowdinCodes, }) - // Check if resuming from existing pre-translation + // Shared state used in both resume and new flows + const crowdinProjectFiles = await getCrowdinProjectFiles() + const fileIdsSet = new Set() + const processedFileIdToPath: Record = {} + const englishBuffers: Record = {} + + // If resuming, determine completed pre-translation response; otherwise start new + let preTranslateJobCompletedResponse: CrowdinPreTranslateResponse if (config.existingPreTranslationId) { console.log( `\n[RESUME] ========== Resuming from pre-translation ID: ${config.existingPreTranslationId} ==========` ) console.log(`[RESUME] Checking status of existing pre-translation...`) - - const preTranslateJobCompletedResponse = await getPreTranslationStatus( + const statusResp = await getPreTranslationStatus( config.existingPreTranslationId ) - - if (preTranslateJobCompletedResponse.status === "in_progress") { + if 
(statusResp.status === "in_progress") { console.log( - `[RESUME] Pre-translation still in progress (${preTranslateJobCompletedResponse.progress}%). Waiting for completion...` + `[RESUME] Pre-translation still in progress (${statusResp.progress}%). Waiting for completion...` ) - const completedResponse = await awaitPreTranslationCompleted( + preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( config.existingPreTranslationId ) - return await buildAndCommitTranslations(completedResponse) - } else if (preTranslateJobCompletedResponse.status === "finished") { + } else if (statusResp.status === "finished") { console.log( `[RESUME] Pre-translation already finished. Building translations...` ) - return await buildAndCommitTranslations(preTranslateJobCompletedResponse) + preTranslateJobCompletedResponse = statusResp } else { throw new Error( - `Pre-translation ${config.existingPreTranslationId} has unexpected status: ${preTranslateJobCompletedResponse.status}` + `Pre-translation ${config.existingPreTranslationId} has unexpected status: ${statusResp.status}` ) } - } - - // Normal flow: Start new pre-translation - console.log(`\n[START] ========== Starting new pre-translation ==========`) + } else { + // Normal flow: Start new pre-translation + console.log(`\n[START] ========== Starting new pre-translation ==========`) - // Fetch English files with limit + start offset - const allEnglishFiles = await getAllEnglishFiles( - config.fileLimit, - config.startOffset - ) - console.log( - `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub (offset=${config.startOffset}, limit=${config.fileLimit})` - ) + // Fetch English files with limit + start offset + const allEnglishFiles = await getAllEnglishFiles( + config.fileLimit, + config.startOffset + ) + console.log( + `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub (offset=${config.startOffset}, limit=${config.fileLimit})` + ) - const fileMetadata = await 
getFileMetadata(allEnglishFiles) - console.log(`[DEBUG] Generated metadata for ${fileMetadata.length} files`) - console.log(`[DEBUG] First file metadata:`, fileMetadata[0]) + const fileMetadata = await getFileMetadata(allEnglishFiles) + console.log(`[DEBUG] Generated metadata for ${fileMetadata.length} files`) + console.log(`[DEBUG] First file metadata:`, fileMetadata[0]) - const crowdinProjectFiles = await getCrowdinProjectFiles() - console.log( - `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` - ) + console.log( + `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` + ) - // Iterate through each file and upload - const fileIdsSet = new Set() - const processedFileIdToPath: Record = {} - const englishBuffers: Record = {} + // Iterate through each file and upload + for (const file of fileMetadata) { + console.log(`[DEBUG] Processing file: ${file.filePath}`) + await (async () => { + let foundFile: CrowdinFileData | undefined + try { + foundFile = findCrowdinFile(file, crowdinProjectFiles) + } catch { + console.log("File not found in Crowdin, attempting to add new file") + } - for (const file of fileMetadata) { - console.log(`[DEBUG] Processing file: ${file.filePath}`) - await (async () => { - let foundFile: CrowdinFileData | undefined - try { - foundFile = findCrowdinFile(file, crowdinProjectFiles) - } catch { - console.log("File not found in Crowdin, attempting to add new file") - } + let crowdinFileResponse: CrowdinAddFileResponse | undefined + let effectiveFileId: number + let effectivePath: string - let crowdinFileResponse: CrowdinAddFileResponse | undefined - let effectiveFileId: number - let effectivePath: string + if (foundFile) { + // File exists - DO NOT update to preserve parsed string structure + console.log( + `[SKIP-UPDATE] File already exists in Crowdin with ID: ${foundFile.id}, using existing structure` + ) + console.log( + `[SKIP-UPDATE] Skipping upload/update to preserve existing parsed strings` + ) + 
effectiveFileId = foundFile.id + effectivePath = foundFile.path - if (foundFile) { - // File exists - DO NOT update to preserve parsed string structure - console.log( - `[SKIP-UPDATE] File already exists in Crowdin with ID: ${foundFile.id}, using existing structure` - ) - console.log( - `[SKIP-UPDATE] Skipping upload/update to preserve existing parsed strings` - ) - effectiveFileId = foundFile.id - effectivePath = foundFile.path + // Still download English for buffer comparison later + console.log( + `[DOWNLOAD] Downloading English source for buffer comparison: ${file.download_url}` + ) + const fileBuffer = await downloadGitHubFile(file.download_url) + englishBuffers[effectiveFileId] = fileBuffer + } else { + // File doesn't exist - create it + console.log(`[UPLOAD] File NOT found in Crowdin, creating new file`) + console.log( + `[UPLOAD] Downloading English source from: ${file.download_url}` + ) + const fileBuffer = await downloadGitHubFile(file.download_url) + console.log(`[UPLOAD] Downloaded ${fileBuffer.length} bytes`) - // Still download English for buffer comparison later - console.log( - `[DOWNLOAD] Downloading English source for buffer comparison: ${file.download_url}` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) - englishBuffers[effectiveFileId] = fileBuffer - } else { - // File doesn't exist - create it - console.log(`[UPLOAD] File NOT found in Crowdin, creating new file`) - console.log( - `[UPLOAD] Downloading English source from: ${file.download_url}` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) - console.log(`[UPLOAD] Downloaded ${fileBuffer.length} bytes`) + const storageInfo = await postFileToStorage( + fileBuffer, + file["Crowdin-API-FileName"] + ) + console.log( + `[UPLOAD] Uploaded to Crowdin storage with ID: ${storageInfo.id}` + ) - const storageInfo = await postFileToStorage( - fileBuffer, - file["Crowdin-API-FileName"] - ) - console.log( - `[UPLOAD] Uploaded to Crowdin storage with ID: 
${storageInfo.id}` - ) + // Derive full parent directory path (exclude filename) + const parts = file.filePath.split("/").filter(Boolean) + parts.pop() // remove filename + const parentDirPath = parts.join("/") || "/" + console.log( + `[UPLOAD] Creating new Crowdin file in directory path: ${parentDirPath}` + ) + crowdinFileResponse = await postCrowdinFile( + storageInfo.id, + file["Crowdin-API-FileName"], + parentDirPath + ) + console.log( + `[UPLOAD] ✓ Created new Crowdin file with ID: ${crowdinFileResponse.id}` + ) - // Derive full parent directory path (exclude filename) - const parts = file.filePath.split("/").filter(Boolean) - parts.pop() // remove filename - const parentDirPath = parts.join("/") || "/" - console.log( - `[UPLOAD] Creating new Crowdin file in directory path: ${parentDirPath}` - ) - crowdinFileResponse = await postCrowdinFile( - storageInfo.id, - file["Crowdin-API-FileName"], - parentDirPath - ) - console.log( - `[UPLOAD] ✓ Created new Crowdin file with ID: ${crowdinFileResponse.id}` - ) + effectiveFileId = crowdinFileResponse.id + effectivePath = crowdinFileResponse.path + englishBuffers[effectiveFileId] = fileBuffer - effectiveFileId = crowdinFileResponse.id - effectivePath = crowdinFileResponse.path - englishBuffers[effectiveFileId] = fileBuffer + // Wait for new file parsing + const delayMs = 10000 + console.log( + `[UPLOAD] ⏱️ Waiting ${delayMs / 1000}s for Crowdin to parse new file...` + ) + await delay(delayMs) + console.log(`[UPLOAD] ✓ Parsing delay complete`) + } - // Wait for new file parsing - const delayMs = 10000 - console.log( - `[UPLOAD] ⏱️ Waiting ${delayMs / 1000}s for Crowdin to parse new file...` - ) - await delay(delayMs) - console.log(`[UPLOAD] ✓ Parsing delay complete`) - } + fileIdsSet.add(effectiveFileId) + if (effectivePath) + processedFileIdToPath[effectiveFileId] = effectivePath + })() + } - fileIdsSet.add(effectiveFileId) - if (effectivePath) processedFileIdToPath[effectiveFileId] = effectivePath - })() - } + // 
Unhide any hidden/duplicate strings before pre-translation + console.log( + `\n[UNHIDE] ========== Unhiding strings in ${fileIdsSet.size} files ==========` + ) + for (const fileId of Array.from(fileIdsSet)) { + await unhideStringsInFile(fileId) + } - // Unhide any hidden/duplicate strings before pre-translation - console.log( - `\n[UNHIDE] ========== Unhiding strings in ${fileIdsSet.size} files ==========` - ) - for (const fileId of Array.from(fileIdsSet)) { - await unhideStringsInFile(fileId) - } + console.log( + `\n[PRE-TRANSLATE] ========== Requesting AI Pre-Translation ==========` + ) + console.log(`[PRE-TRANSLATE] FileIds to translate:`, Array.from(fileIdsSet)) + console.log(`[PRE-TRANSLATE] Target languages:`, config.allCrowdinCodes) + console.log(`[PRE-TRANSLATE] AI Prompt ID:`, config.preTranslatePromptId) - console.log( - `\n[PRE-TRANSLATE] ========== Requesting AI Pre-Translation ==========` - ) - console.log(`[PRE-TRANSLATE] FileIds to translate:`, Array.from(fileIdsSet)) - console.log(`[PRE-TRANSLATE] Target languages:`, config.allCrowdinCodes) - console.log(`[PRE-TRANSLATE] AI Prompt ID:`, config.preTranslatePromptId) + const applyPreTranslationResponse = await postApplyPreTranslation( + Array.from(fileIdsSet), + options?.allLangs ? config.allCrowdinCodes : config.allCrowdinCodes + ) + console.log( + `[PRE-TRANSLATE] ✓ Pre-translation job created with ID: ${applyPreTranslationResponse.identifier}` + ) + console.log( + `[PRE-TRANSLATE] Initial status:`, + applyPreTranslationResponse.status + ) - const applyPreTranslationResponse = await postApplyPreTranslation( - Array.from(fileIdsSet), - options?.allLangs ? 
config.allCrowdinCodes : config.allCrowdinCodes - ) - console.log( - `[PRE-TRANSLATE] ✓ Pre-translation job created with ID: ${applyPreTranslationResponse.identifier}` - ) - console.log( - `[PRE-TRANSLATE] Initial status:`, - applyPreTranslationResponse.status - ) + console.log(`\n[PRE-TRANSLATE] Waiting for job to complete...`) + preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( + applyPreTranslationResponse.identifier + ) - console.log(`\n[PRE-TRANSLATE] Waiting for job to complete...`) - const preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( - applyPreTranslationResponse.identifier - ) + if (preTranslateJobCompletedResponse.status !== "finished") { + console.error( + "[PRE-TRANSLATE] ❌ Pre-translation did not finish successfully. Full response:", + preTranslateJobCompletedResponse + ) + throw new Error( + `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` + ) + } - if (preTranslateJobCompletedResponse.status !== "finished") { - console.error( - "[PRE-TRANSLATE] ❌ Pre-translation did not finish successfully. 
Full response:", - preTranslateJobCompletedResponse + console.log(`[PRE-TRANSLATE] ✓ Job completed successfully!`) + console.log( + `[PRE-TRANSLATE] Progress: ${preTranslateJobCompletedResponse.progress}%` ) - throw new Error( - `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` + console.log( + `[PRE-TRANSLATE] Full response:`, + JSON.stringify(preTranslateJobCompletedResponse, null, 2) ) } - console.log(`[PRE-TRANSLATE] ✓ Job completed successfully!`) - console.log( - `[PRE-TRANSLATE] Progress: ${preTranslateJobCompletedResponse.progress}%` - ) - console.log( - `[PRE-TRANSLATE] Full response:`, - JSON.stringify(preTranslateJobCompletedResponse, null, 2) - ) - // QA via Crowdin AI Prompt Completions console.log(`\n[QA-CHECK] ========== AI QA via Prompt Completions ==========`) const qaSummaries: string[] = [] @@ -604,11 +480,78 @@ async function main(options?: { allLangs: boolean }) { console.log(`[PR] Head branch: ${branch}`) console.log(`[PR] Base branch: ${config.baseBranch}`) + // Step 1: Detect the current model for pre-translation prompt + console.log( + `\n[MODEL-DETECTION] Fetching model for promptId: ${config.preTranslatePromptId}` + ) + let modelKey: string | undefined + try { + // Fetch userId from Crowdin API (we'll need to add this to config or fetch dynamically) + const userId = process.env.I18N_CROWDIN_USER_ID + if (userId) { + modelKey = await getPromptModelKey( + Number(userId), + config.preTranslatePromptId + ) + console.log(`[MODEL-DETECTION] Current model: ${modelKey}`) + } else { + console.log( + `[MODEL-DETECTION] I18N_CROWDIN_USER_ID not set, skipping model detection` + ) + } + } catch (err) { + console.warn(`[MODEL-DETECTION] Failed to detect model:`, err) + } + + // Step 2: Check if trust matrix exists for this model + const matrix = loadTrustMatrix() + const needsNewMatrix = modelKey && !matrix[modelKey] + + if (needsNewMatrix) { + console.log( + `\n[TRUST-MATRIX] Model "${modelKey}" not found in 
trust matrix` + ) + const openAiKey = process.env.OPENAI_API_KEY + if (openAiKey) { + console.log( + `[TRUST-MATRIX] OpenAI key available, generating new trust matrix...` + ) + try { + const newBucket = await generateTrustMatrixWithOpenAI(modelKey!) + saveTrustMatrixToFile(modelKey!, newBucket) + console.log( + `[TRUST-MATRIX] ✓ Generated and saved trust matrix for ${modelKey}` + ) + } catch (err) { + console.warn(`[TRUST-MATRIX] Failed to generate matrix:`, err) + console.log(`[TRUST-MATRIX] Will use most recent fallback`) + } + } else { + console.log( + `[TRUST-MATRIX] No OpenAI key available, using most recent model fallback` + ) + } + } + + // Step 3: QA routing based on trust matrix (with model-aware lookup) + const internalCodes = languagePairs.map((p) => p.internalLanguageCode) + const qaPlan = planQaForLanguages(internalCodes, modelKey) const prBody = qaSummaries.length ? `Automated Crowdin translation import\n\nQA Summary:\n\n${qaSummaries.join("\n\n")}` - : "Automated Crowdin translation import" + : `Automated Crowdin translation import` const pr = await postPullRequest(branch, config.baseBranch, prBody) + console.log(`\n[SUCCESS] Pull Request created: ${pr.html_url}`) + console.log(`[SUCCESS] PR Number: #${pr.number}`) + + // Step 4: Post follow-up comment with scoped AI review mentions + console.log(`\n[PR-COMMENT] Posting AI review comment...`) + try { + await postPrReviewComment(pr.number, qaPlan) + } catch (err) { + console.warn(`[PR-COMMENT] Failed to post review comment:`, err) + } + console.log(`\n[SUCCESS] ========== Translation import complete! 
==========`) console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) console.log(`[SUCCESS] PR Number: #${pr.number}`) From c859ca54673a017213f6a9006486eec7a67cfd07 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 29 Nov 2025 11:22:26 -0300 Subject: [PATCH 12/99] feat: chunk trust-tiers into separate PRs --- src/scripts/i18n/main.ts | 288 ++++++++++++++++++++++++--------------- 1 file changed, 179 insertions(+), 109 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 367c5d04d88..f89c5751c25 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -388,105 +388,12 @@ async function main(options?: { allLangs: boolean }) { internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), })) - const { branch } = await postCreateBranchFrom(config.baseBranch) - console.log(`\n[BRANCH] ✓ Created branch: ${branch}`) - - // For each language - for (const { crowdinId, internalLanguageCode } of languagePairs) { - console.log( - `\n[BUILD] ========== Building translations for language: ${crowdinId} (internal: ${internalLanguageCode}) ==========` - ) - - // Build, download and commit each file - for (const fileId of fileIds) { - console.log(`\n[BUILD] --- Processing fileId: ${fileId} ---`) - const crowdinPath = fileIdToPathMapping[fileId] - console.log(`[BUILD] Crowdin path: ${crowdinPath}`) - - // 1- Build - console.log( - `[BUILD] Requesting build for fileId=${fileId}, language=${crowdinId}` - ) - const { url: downloadUrl } = await postBuildProjectFileTranslation( - fileId, - crowdinId, - config.projectId - ) - console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) - - // 2- Download - console.log(`[BUILD] Downloading translated file...`) - const { buffer } = await getBuiltFile(downloadUrl) - console.log(`[BUILD] Downloaded ${buffer.length} bytes`) - - // Check if translation differs from English - const originalEnglish = englishBuffers[fileId] - if (originalEnglish) 
{ - console.log( - `[BUILD] Original English size: ${originalEnglish.length} bytes` - ) - if (originalEnglish.compare(buffer) === 0) { - console.warn( - `[BUILD] ⚠️ Skipping commit - content identical to English (no translation occurred)` - ) - continue - } else { - console.log(`[BUILD] ✓ Translation differs from English, will commit`) - } - } - - // 3a- Get destination path - const destinationPath = getDestinationFromPath( - crowdinPath, - internalLanguageCode - ) - console.log(`[BUILD] Destination path: ${destinationPath}`) - - // 3b- Commit - console.log(`[BUILD] Committing to branch: ${branch}`) - await putCommitFile(buffer, destinationPath, branch) - console.log(`[BUILD] ✓ Committed successfully`) - } - } - - // Run post-import sanitizer BEFORE creating PR - console.log( - `\n[SANITIZE] ========== Running post-import sanitizer before PR ==========` - ) - const sanitizeResult = runSanitizer(config.allCrowdinCodes) - const changedFiles = sanitizeResult.changedFiles || [] - if (changedFiles.length) { - console.log(`[SANITIZE] Files changed by sanitizer: ${changedFiles.length}`) - for (const abs of changedFiles) { - const relPath = abs.startsWith(process.cwd()) - ? 
abs.slice(process.cwd().length + 1) - : abs - try { - const buf = fs.readFileSync(abs) - await putCommitFile(buf, relPath, branch) - console.log(`[SANITIZE] ✓ Committed sanitized file: ${relPath}`) - } catch (e) { - console.warn( - `[SANITIZE] Failed to commit sanitized file ${relPath}:`, - e - ) - } - } - } else { - console.log("[SANITIZE] No sanitation changes to commit") - } - - console.log(`\n[PR] ========== Creating Pull Request ==========`) - console.log(`[PR] Head branch: ${branch}`) - console.log(`[PR] Base branch: ${config.baseBranch}`) - // Step 1: Detect the current model for pre-translation prompt console.log( `\n[MODEL-DETECTION] Fetching model for promptId: ${config.preTranslatePromptId}` ) let modelKey: string | undefined try { - // Fetch userId from Crowdin API (we'll need to add this to config or fetch dynamically) const userId = process.env.I18N_CROWDIN_USER_ID if (userId) { modelKey = await getPromptModelKey( @@ -536,26 +443,189 @@ async function main(options?: { allLangs: boolean }) { // Step 3: QA routing based on trust matrix (with model-aware lookup) const internalCodes = languagePairs.map((p) => p.internalLanguageCode) const qaPlan = planQaForLanguages(internalCodes, modelKey) - const prBody = qaSummaries.length - ? 
`Automated Crowdin translation import\n\nQA Summary:\n\n${qaSummaries.join("\n\n")}` - : `Automated Crowdin translation import` - const pr = await postPullRequest(branch, config.baseBranch, prBody) - console.log(`\n[SUCCESS] Pull Request created: ${pr.html_url}`) - console.log(`[SUCCESS] PR Number: #${pr.number}`) + // Step 4: Group languages by trust tier + const highTrustLangs = languagePairs.filter( + (p) => qaPlan[p.internalLanguageCode] === "skip" + ) + const mediumTrustLangs = languagePairs.filter( + (p) => qaPlan[p.internalLanguageCode] === "copilot" + ) + const lowTrustLangs = languagePairs.filter( + (p) => qaPlan[p.internalLanguageCode] === "copilot+claude" + ) - // Step 4: Post follow-up comment with scoped AI review mentions - console.log(`\n[PR-COMMENT] Posting AI review comment...`) - try { - await postPrReviewComment(pr.number, qaPlan) - } catch (err) { - console.warn(`[PR-COMMENT] Failed to post review comment:`, err) + console.log( + `\n[TIER-GROUPING] High trust (no review): ${highTrustLangs.length} languages` + ) + console.log( + `[TIER-GROUPING] Medium trust (@copilot): ${mediumTrustLangs.length} languages` + ) + console.log( + `[TIER-GROUPING] Low trust (@copilot + @claude): ${lowTrustLangs.length} languages` + ) + + // Helper function to process one tier + const processTierPr = async ( + tierLabel: "high-trust" | "medium-trust" | "low-trust", + tierName: string, + langs: typeof languagePairs + ) => { + if (langs.length === 0) { + console.log(`\n[TIER-${tierLabel.toUpperCase()}] No languages, skipping`) + return + } + + console.log( + `\n[TIER-${tierLabel.toUpperCase()}] ========== Processing ${langs.length} languages ==========` + ) + + const { branch } = await postCreateBranchFrom(config.baseBranch, tierLabel) + console.log(`[BRANCH] ✓ Created branch: ${branch}`) + + // For each language in this tier + for (const { crowdinId, internalLanguageCode } of langs) { + console.log( + `\n[BUILD] ========== Building translations for language: 
${crowdinId} (internal: ${internalLanguageCode}) ==========` + ) + + // Build, download and commit each file + for (const fileId of fileIds) { + console.log(`\n[BUILD] --- Processing fileId: ${fileId} ---`) + const crowdinPath = fileIdToPathMapping[fileId] + console.log(`[BUILD] Crowdin path: ${crowdinPath}`) + + // 1- Build + console.log( + `[BUILD] Requesting build for fileId=${fileId}, language=${crowdinId}` + ) + const { url: downloadUrl } = await postBuildProjectFileTranslation( + fileId, + crowdinId, + config.projectId + ) + console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) + + // 2- Download + console.log(`[BUILD] Downloading translated file...`) + const { buffer } = await getBuiltFile(downloadUrl) + console.log(`[BUILD] Downloaded ${buffer.length} bytes`) + + // Check if translation differs from English + const originalEnglish = englishBuffers[fileId] + if (originalEnglish) { + console.log( + `[BUILD] Original English size: ${originalEnglish.length} bytes` + ) + if (originalEnglish.compare(buffer) === 0) { + console.warn( + `[BUILD] ⚠️ Skipping commit - content identical to English (no translation occurred)` + ) + continue + } else { + console.log( + `[BUILD] ✓ Translation differs from English, will commit` + ) + } + } + + // 3a- Get destination path + const destinationPath = getDestinationFromPath( + crowdinPath, + internalLanguageCode + ) + console.log(`[BUILD] Destination path: ${destinationPath}`) + + // 3b- Commit + console.log(`[BUILD] Committing to branch: ${branch}`) + await putCommitFile(buffer, destinationPath, branch) + console.log(`[BUILD] ✓ Committed successfully`) + } + } + + // Run post-import sanitizer for this tier's languages only + console.log( + `\n[SANITIZE] ========== Running sanitizer for ${tierLabel} languages ==========` + ) + const tierCrowdinCodes = langs.map((p) => p.crowdinId) + const sanitizeResult = runSanitizer(tierCrowdinCodes) + const changedFiles = sanitizeResult.changedFiles || [] + if 
(changedFiles.length) { + console.log( + `[SANITIZE] Files changed by sanitizer: ${changedFiles.length}` + ) + for (const abs of changedFiles) { + const relPath = abs.startsWith(process.cwd()) + ? abs.slice(process.cwd().length + 1) + : abs + try { + const buf = fs.readFileSync(abs) + await putCommitFile(buf, relPath, branch) + console.log(`[SANITIZE] ✓ Committed sanitized file: ${relPath}`) + } catch (e) { + console.warn( + `[SANITIZE] Failed to commit sanitized file ${relPath}:`, + e + ) + } + } + } else { + console.log("[SANITIZE] No sanitation changes to commit") + } + + // Create PR with tier-appropriate title and body + console.log( + `\n[PR] ========== Creating ${tierName} Pull Request ==========` + ) + console.log(`[PR] Head branch: ${branch}`) + console.log(`[PR] Base branch: ${config.baseBranch}`) + + const langCodes = langs.map((p) => p.internalLanguageCode).join(", ") + let prTitle = `[${tierName}] Automated Crowdin translations (${langCodes})` + if (tierLabel !== "high-trust") { + const reviewers = + tierLabel === "medium-trust" ? "@copilot" : "@copilot @claude" + prTitle += ` - ${reviewers} review requested` + } + + // Filter QA summaries to this tier's languages if available + const tierQaSummaries = qaSummaries.filter((s) => + langs.some((p) => s.includes(p.crowdinId)) + ) + const prBody = tierQaSummaries.length + ? 
`${prTitle}\n\nQA Summary:\n\n${tierQaSummaries.join("\n\n")}` + : prTitle + + const pr = await postPullRequest(branch, config.baseBranch, prBody) + + console.log(`\n[SUCCESS] Pull Request created: ${pr.html_url}`) + console.log(`[SUCCESS] PR Number: #${pr.number}`) + + // Post follow-up comment with scoped AI review mentions + console.log(`\n[PR-COMMENT] Posting AI review comment...`) + const tierQaPlan: Record = {} + for (const { internalLanguageCode } of langs) { + tierQaPlan[internalLanguageCode] = qaPlan[internalLanguageCode] + } + try { + await postPrReviewComment(pr.number, tierQaPlan) + } catch (err) { + console.warn(`[PR-COMMENT] Failed to post review comment:`, err) + } + + console.log( + `\n[SUCCESS] ========== ${tierName} PR complete: ${pr.html_url} ==========` + ) } - console.log(`\n[SUCCESS] ========== Translation import complete! ==========`) - console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) - console.log(`[SUCCESS] PR Number: #${pr.number}`) - console.log(pr) + // Process each tier + await processTierPr("high-trust", "High Trust", highTrustLangs) + await processTierPr("medium-trust", "Medium Trust", mediumTrustLangs) + await processTierPr("low-trust", "Low Trust", lowTrustLangs) + + console.log( + `\n[SUCCESS] ========== All translation imports complete! 
==========` + ) } main().catch((err) => { From 754c3640fa8f2176c47aa7e56860bc21ada72902 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 29 Nov 2025 12:54:07 -0300 Subject: [PATCH 13/99] refactor: accept internal lang codes --- .github/workflows/crowdin-ai-import.yml | 4 ++-- src/scripts/i18n/config.ts | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 95665719346..a889301d1a1 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -18,9 +18,9 @@ on: default: "0" type: string target_languages: - description: "Comma-separated Crowdin language codes (default: es-EM)" + description: "Comma-separated internal language codes (default: es)" required: false - default: "es-EM" + default: "es" type: string base_branch: description: "Base branch to create PR against (default: dev)" diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 5af77ba4621..984049d1f86 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -2,6 +2,8 @@ import * as dotenv from "dotenv" import i18nConfig from "../../../i18n.config.json" +import { mapInternalCodeToCrowdin } from "./lib/utils/mapping" + dotenv.config({ path: ".env.local" }) // Language code mapping @@ -44,9 +46,14 @@ console.log("[DEBUG] Crowdin API key found ✓") export const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } // Parse environment variables with defaults -const targetLanguages = process.env.TARGET_LANGUAGES +// Accept internal codes (e.g., "es") and convert to Crowdin codes (e.g., "es-EM") +const targetLanguagesInput = process.env.TARGET_LANGUAGES ? 
process.env.TARGET_LANGUAGES.split(",").map((lang) => lang.trim()) - : ["es-EM"] + : ["es"] + +const targetLanguages = targetLanguagesInput.map((code) => + mapInternalCodeToCrowdin(code) +) const baseBranch = process.env.BASE_BRANCH || "dev" @@ -75,7 +82,12 @@ const githubRepo = const [ghOrganization, ghRepo] = githubRepo.split("/") console.log("[DEBUG] Configuration:") -console.log(`[DEBUG] - Target languages: ${targetLanguages.join(", ")}`) +console.log( + `[DEBUG] - Target languages (internal): ${targetLanguagesInput.join(", ")}` +) +console.log( + `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}` +) console.log(`[DEBUG] - Base branch: ${baseBranch}`) console.log(`[DEBUG] - File limit: ${fileLimit}`) console.log(`[DEBUG] - Start offset: ${startOffset}`) From 5aba5dcca7d6ee03ea0264cb92d16c6d76bbcd2e Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 29 Nov 2025 15:20:51 -0300 Subject: [PATCH 14/99] fix: json punctuation sanitization --- src/scripts/i18n/post_import_sanitize.ts | 70 +++++++++++++++++++++++- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 7cb0e777145..54a9c7cfeee 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -235,17 +235,81 @@ function processJsonFile(jsonPath: string): { // Normalize BOM and smart quotes const cleaned = content .replace(/^\uFEFF/, "") - .replace(/[“”]/g, '"') - .replace(/[‘’]/g, "'") + .replace(/[""]/g, '"') + .replace(/['']/g, "'") if (cleaned !== content) { content = cleaned fixed = true } + + // Try parsing; if it fails, attempt to fix unescaped quotes + let parseError: Error | null = null try { JSON.parse(content) } catch (e) { - issues.push(`JSON parse error: ${(e as Error).message}`) + parseError = e as Error + issues.push(`Initial JSON parse error: ${parseError.message}`) + + // Attempt to fix 
unescaped quotes in JSON string values + // Strategy: scan for patterns like "text "word" text" and escape the internal quotes + try { + let fixedContent = content + + // Find all string values that might have unescaped internal quotes + // Pattern: ": "...content..." - we look for quotes after a colon + let modified = false + const lines = fixedContent.split("\n") + const fixedLines = lines.map((line) => { + // Match JSON key-value pairs with string values + // Look for pattern: "key": "value potentially with "quotes"" + const match = line.match(/^(\s*"[^"]+"\s*:\s*")(.*)("\s*,?\s*)$/) + if (!match) return line + + const prefix = match[1] // ' "key": "' + const value = match[2] // 'text with "quotes" inside' + const suffix = match[3] // '",\n' or '"\n' + + // Check if value contains unescaped quotes + if (!value.includes('"')) return line + + // Escape unescaped quotes in the value + let fixedValue = "" + for (let i = 0; i < value.length; i++) { + const char = value[i] + if (char === '"') { + // Count preceding backslashes + let backslashCount = 0 + for (let j = i - 1; j >= 0 && value[j] === "\\"; j--) { + backslashCount++ + } + // If not escaped (even number of backslashes), escape it + if (backslashCount % 2 === 0) { + fixedValue += '\\"' + modified = true + } else { + fixedValue += char + } + } else { + fixedValue += char + } + } + + return prefix + fixedValue + suffix + }) + + if (modified) { + fixedContent = fixedLines.join("\n") + content = fixedContent + fixed = true + // Re-validate after fix + JSON.parse(content) + issues.push("Auto-fixed unescaped quotes in JSON string values") + } + } catch (fixError) { + issues.push(`Failed to auto-fix JSON: ${(fixError as Error).message}`) + } } + if (fixed) fs.writeFileSync(jsonPath, content, "utf8") return { fixed, issues } } From 60717647ff836541aee3da9acef6aabed6f5f50b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sun, 30 Nov 2025 10:51:53 -0300 Subject: [PATCH 
15/99] feat: implement supabase glossary/TM sync Adds backup for existing Crowdin glossary/TM; syncs updates from EthGlossary supabase db with Crowdin --- .github/workflows/crowdin-ai-import.yml | 14 + .../i18n/GLOSSARY_SYNC_IMPLEMENTATION.md | 239 ++++++++++++ src/scripts/i18n/glossary-sync.md | 149 ++++++++ src/scripts/i18n/lib/crowdin/glossary.ts | 354 ++++++++++++++++++ src/scripts/i18n/lib/glossary/backup.ts | 245 ++++++++++++ src/scripts/i18n/lib/glossary/supabase.ts | 211 +++++++++++ src/scripts/i18n/main.ts | 20 + src/scripts/i18n/sync-glossary.ts | 278 ++++++++++++++ 8 files changed, 1510 insertions(+) create mode 100644 src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md create mode 100644 src/scripts/i18n/glossary-sync.md create mode 100644 src/scripts/i18n/lib/crowdin/glossary.ts create mode 100644 src/scripts/i18n/lib/glossary/backup.ts create mode 100644 src/scripts/i18n/lib/glossary/supabase.ts create mode 100644 src/scripts/i18n/sync-glossary.ts diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index a889301d1a1..3006b3f710b 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -47,6 +47,16 @@ on: required: false default: "168592" type: string + glossary_min_votes: + description: "Minimum votes for glossary terms (default: 2)" + required: false + default: "2" + type: string + skip_glossary_backup_pr: + description: "Skip creating a PR for glossary backups (default: false)" + required: false + default: "false" + type: string jobs: import_translations: @@ -74,6 +84,8 @@ jobs: I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} I18N_CROWDIN_USER_ID: ${{ secrets.I18N_CROWDIN_USER_ID }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} FILE_LIMIT: ${{ github.event.inputs.file_limit }} 
START_OFFSET: ${{ github.event.inputs.start_offset }} @@ -83,4 +95,6 @@ jobs: PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} QA_PROMPT_ID: ${{ github.event.inputs.qa_prompt_id }} + GLOSSARY_MIN_VOTES: ${{ github.event.inputs.glossary_min_votes }} + SKIP_GLOSSARY_BACKUP_PR: ${{ github.event.inputs.skip_glossary_backup_pr }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md b/src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md new file mode 100644 index 00000000000..5b982588383 --- /dev/null +++ b/src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md @@ -0,0 +1,239 @@ +# Glossary Sync Implementation Summary + +## Overview + +Implemented automated synchronization of community-approved translations from Supabase to Crowdin glossaries, with automatic backup and PR creation for glossary version control. + +## Architecture + +### Data Flow + +``` +Supabase (top_translations table) + ↓ Fetch glossary entries (min votes filter) + ↓ Format as TBX (Term Base eXchange) + ↓ +Crowdin Glossaries + ↓ Before import: export existing + ↓ Calculate hash, detect changes + ↓ +.crowdin-backups/ (timestamped backups) + ↓ Commit to separate branch + ↓ +GitHub PR (backup for reversion) +``` + +### Key Features + +1. **Content-based change detection** - Only backs up when glossary content changes (SHA-256 hash comparison) +2. **Timestamped backups** - Each file named with `{timestamp}_{hash}_{name}.{ext}` for easy sorting and identification +3. **Separate backup PRs** - Glossary backups don't clutter translation PRs +4. **Language mapping** - Automatic conversion between internal codes (`es`) and Crowdin codes (`es-EM`) +5. **Vote filtering** - Only imports terms with minimum community consensus +6. 
**Fail-safe execution** - Glossary sync failures don't block translation workflow + +## Files Created + +### Core Modules + +- **src/scripts/i18n/sync-glossary.ts** (278 lines) + + - Main orchestrator + - Coordinates backup → fetch → import → PR flow + - CLI executable: `npx ts-node src/scripts/i18n/sync-glossary.ts` + +- **src/scripts/i18n/lib/glossary/supabase.ts** (211 lines) + + - REST API client (no dependencies, uses native `fetch`) + - Functions: `fetchGlossaryForLanguage`, `fetchGlossaryForAllLanguages` + - Formatters: `formatGlossaryAsCSV`, `formatGlossaryAsTBX` + +- **src/scripts/i18n/lib/crowdin/glossary.ts** (354 lines) + + - Crowdin API wrappers for glossary and TM operations + - Functions: `listGlossaries`, `exportGlossary`, `importGlossary`, `listTranslationMemories`, `exportTranslationMemory` + - Handles storage upload, import polling, and glossary creation + +- **src/scripts/i18n/lib/glossary/backup.ts** (245 lines) + - Hash calculation and change detection + - Timestamped backup file creation + - Backup history management (cleanup old backups) + - Git integration helpers + +### Configuration + +- **src/scripts/i18n/main.ts** (modified) + + - Added glossary sync step at workflow start + - Runs before pre-translation + - Skipped if resuming existing job + +- **.github/workflows/crowdin-ai-import.yml** (modified) + + - Added `SUPABASE_URL` and `SUPABASE_SERVICE_ROLE_KEY` secrets + - Added workflow inputs: `glossary_min_votes`, `skip_glossary_backup_pr` + - Environment variables passed to script + +- **.gitignore** (modified) + - Added `.crowdin-backups/` to ignore backups locally (they go to separate PR only) + +### Documentation + +- **src/scripts/i18n/glossary-sync.md** (149 lines) + - Complete usage guide + - Configuration reference + - Troubleshooting + - Backup restoration instructions + +## Environment Variables + +### Required + +- `SUPABASE_URL` - Supabase project URL +- `SUPABASE_SERVICE_ROLE_KEY` - Service role API key +- `I18N_CROWDIN_API_KEY` - 
Crowdin API key (existing) +- `I18N_GITHUB_API_KEY` - GitHub token (existing) +- `TARGET_LANGUAGES` - Comma-separated internal codes + +### Optional + +- `GLOSSARY_MIN_VOTES` (default: `2`) - Min upvotes for inclusion +- `SKIP_GLOSSARY_BACKUP_PR` (default: `false`) - Skip PR creation + +## Workflow Integration + +### Normal Run + +1. User triggers `crowdin-ai-import.yml` workflow +2. **Glossary Sync Phase**: + - Export existing Crowdin glossaries + - Compare hashes, backup if changed + - Fetch from Supabase `top_translations` table + - Import to Crowdin per language + - Create backup PR (if changes detected) +3. Pre-translate phase (uses updated glossaries) +4. Build, commit, create translation PR + +### Resume Run + +- Glossary sync skipped (already completed in initial run) +- Proceeds directly to build/commit from existing pre-translation ID + +## Supabase Schema + +The script expects this table/view structure: + +```sql +CREATE OR REPLACE VIEW top_translations AS +SELECT + string_term, + translation_text, + total_votes, + language_code +FROM glossary_entries +WHERE status = 'approved' +ORDER BY total_votes DESC; +``` + +## Example Workflow Run + +``` +[GLOSSARY-SYNC] ========== Starting Glossary Sync ========== +[GLOSSARY-SYNC] Supabase URL: https://cppthnnwfvkfwgoqmhjl.supabase.co +[GLOSSARY-SYNC] Min votes: 2 + +[GLOSSARY-SYNC] Step 1: Backing up existing Crowdin glossaries +[GLOSSARY-SYNC] Found 2 existing glossaries +[GLOSSARY-SYNC] Exporting glossary: Ethereum.org Community (es-EM) +[BACKUP] Content changed for glossary:Ethereum.org Community (es-EM) +[BACKUP] Saved glossary backup: 1733011200_abc12345_ethereum_org_community_es.tbx + +[GLOSSARY-SYNC] Step 3: Fetching glossary from Supabase +[GLOSSARY] Fetching from Supabase for language: es +[GLOSSARY] Fetched 47 glossary entries for es +[GLOSSARY-SYNC] Importing glossary: Ethereum.org Community (es-EM) +[CROWDIN-GLOSSARY] Using existing glossary ID: 123456 +[CROWDIN-GLOSSARY] Import started: abc-def-123 
+[CROWDIN-GLOSSARY] Import status: finished +[GLOSSARY-SYNC] ✓ Successfully updated glossary for es-EM + +[GLOSSARY-SYNC] Step 5: Creating backup PR +[GLOSSARY-SYNC] Creating branch: i18n-glossary-backup-2024-12-01 +[GLOSSARY-SYNC] Committing 3 backup files +[GLOSSARY-SYNC] Creating pull request +[GLOSSARY-SYNC] ✓ Created PR: https://github.com/ethereum/ethereum-org-website/pull/12345 + +[GLOSSARY-SYNC] ========== Sync Complete ========== +[GLOSSARY-SYNC] Updated glossaries: 2 +[GLOSSARY-SYNC] Languages: es-EM, pt-BR +[GLOSSARY-SYNC] Backup PR: https://github.com/ethereum/ethereum-org-website/pull/12345 + +[MAIN] Proceeding with pre-translation... +``` + +## Testing Checklist + +### Unit Testing + +- [ ] Supabase API connection (test with real endpoint) +- [ ] Hash calculation and change detection +- [ ] TBX formatting (validate XML) +- [ ] Language code mapping (internal ↔ Crowdin) + +### Integration Testing + +- [ ] Export existing glossary from Crowdin +- [ ] Fetch glossary from Supabase (test with 1-2 languages) +- [ ] Import to Crowdin (use test glossary) +- [ ] Backup file creation and Git operations +- [ ] PR creation with backup files + +### End-to-End Testing + +- [ ] Run full workflow with `GLOSSARY_MIN_VOTES=10` (limited entries) +- [ ] Verify backup PR created +- [ ] Verify glossaries updated in Crowdin +- [ ] Run again without changes (should skip backup) +- [ ] Test resume mode (glossary sync should be skipped) + +## Security Considerations + +1. **Service Role Key** - Never commit or log; use GitHub Secrets only +2. **Backup Files** - Stored in Git (ensure no sensitive data in glossary terms) +3. **API Rate Limits** - Sequential per-language imports (no parallel to avoid 429s) +4. 
**Error Handling** - Glossary sync failures don't break translation workflow + +## Performance Impact + +- **Additional time**: ~1-3 minutes per workflow run +- **Skipped when**: No glossary changes detected (most runs) +- **Network calls**: + - 1 Supabase query per language + - 2 Crowdin API calls per glossary (export + import) + - 1 GitHub API call per backup file + +## Future Enhancements + +1. **Incremental sync** - Only import changed terms (requires term-level tracking) +2. **TM population** - Push high-confidence translations to TM, not just glossary +3. **Bidirectional sync** - Pull Crowdin translator feedback back to Supabase +4. **Scheduled sync** - Daily cron job independent of translation workflow +5. **Term categories** - Support Crowdin glossary term tags/categories +6. **Conflict resolution** - Handle term collisions between languages + +## Dependencies + +**Zero new npm packages!** All implemented using: + +- Native `fetch` API (Node 18+) +- Built-in `crypto`, `fs`, `path` modules +- Existing project dependencies + +## Rollback Plan + +If issues arise: + +1. Set `SKIP_GLOSSARY_BACKUP_PR=true` to stop backup PRs from being created (backups are still saved locally; this does not disable the sync itself) +2. Manually revert glossaries in Crowdin UI +3. Or restore from backup PR files +4. Comment out glossary sync call in `main.ts` if needed diff --git a/src/scripts/i18n/glossary-sync.md b/src/scripts/i18n/glossary-sync.md new file mode 100644 index 00000000000..edfaad7b0f2 --- /dev/null +++ b/src/scripts/i18n/glossary-sync.md @@ -0,0 +1,149 @@ +# Glossary Synchronization + +Automatically syncs community-approved translations from Supabase to Crowdin glossaries before each translation workflow run. + +## How It Works + +1. **Backup Existing**: Exports current Crowdin glossaries and Translation Memories, calculates content hashes, and saves timestamped backups if content changed +2. **Fetch from Supabase**: Queries the `top_translations` table for terms with minimum vote threshold per language +3. 
**Import to Crowdin**: Formats as TBX and imports into Crowdin glossaries (creates new glossary per language if doesn't exist) +4. **Create Backup PR**: Commits backup files to a separate branch (`i18n-glossary-backup-YYYY-MM-DD`) and creates a PR + +## Backup Structure + +``` +.crowdin-backups/ +├── glossary/ +│ ├── 1733011200_abc12345_ethereum_org_community_es.tbx +│ └── 1733011200_def67890_ethereum_org_community_pt.tbx +├── tm/ +│ ├── 1733011200_xyz98765_main_translation_memory.tmx +│ └── ... +└── hashes.json # Tracks content hashes to detect changes +``` + +Each backup filename includes: + +- Unix timestamp (for sorting/chronology) +- Short hash (first 8 chars of SHA-256, for quick verification) +- Sanitized resource name + +## Configuration + +### Required Secrets (GitHub Actions) + +Add these to your repository secrets: + +- `SUPABASE_URL`: Your Supabase project URL (default: `https://cppthnnwfvkfwgoqmhjl.supabase.co`) +- `SUPABASE_SERVICE_ROLE_KEY`: Supabase service role key for API access + +### Environment Variables + +- `GLOSSARY_MIN_VOTES` (default: `2`): Minimum upvotes required for a term to be included +- `SKIP_GLOSSARY_BACKUP_PR` (default: `false`): Set to `true` to save backups locally without creating a PR + +### Supabase Schema + +The script expects a `top_translations` table (or view) with these columns: + +```sql +CREATE TABLE top_translations ( + string_term TEXT NOT NULL, -- The English term + translation_text TEXT NOT NULL, -- The translated term + total_votes INTEGER NOT NULL, -- Number of upvotes + language_code TEXT NOT NULL -- Internal language code (e.g., 'es', 'pt', 'fr') +); +``` + +**Note**: Use internal language codes (`es`, `pt`, `zh`, etc.) in Supabase. The script automatically maps them to Crowdin codes (`es-EM`, `pt-BR`, `zh-CN`, etc.). 
+ +## Workflow Integration + +The glossary sync runs automatically at the start of each `crowdin-ai-import.yml` workflow before pre-translation: + +```yaml +- name: Run Crowdin AI translation import + run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main.ts + env: + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} + GLOSSARY_MIN_VOTES: ${{ github.event.inputs.glossary_min_votes }} + SKIP_GLOSSARY_BACKUP_PR: ${{ github.event.inputs.skip_glossary_backup_pr }} + # ... other env vars +``` + +## Manual Sync + +You can also run the glossary sync manually: + +```bash +# Set required environment variables +export SUPABASE_URL="https://your-project.supabase.co" +export SUPABASE_SERVICE_ROLE_KEY="your-service-role-key" +export I18N_CROWDIN_API_KEY="your-crowdin-api-key" +export I18N_GITHUB_API_KEY="your-github-token" +export TARGET_LANGUAGES="es,pt,fr" # Internal codes +export GLOSSARY_MIN_VOTES="2" +export SKIP_GLOSSARY_BACKUP_PR="false" + +# Run the sync +npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/sync-glossary.ts +``` + +## Backup Restoration + +If you need to revert to a previous glossary version: + +1. Find the backup file in `.crowdin-backups/glossary/` (sorted by timestamp, most recent first) +2. Download the TBX file +3. Manually import it to Crowdin via UI or API: + ```bash + # Using the Crowdin API + curl -X POST "https://api.crowdin.com/api/v2/glossaries/{glossaryId}/imports" \ + -H "Authorization: Bearer $CROWDIN_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"storageId": "...", "scheme": {...}}' + ``` + +Or merge the backup PR to preserve it in Git history, then cherry-pick specific versions as needed. 
+ +## How Crowdin Uses Glossaries + +- **For AI/MT**: Crowdin AI and Machine Translation engines prefer glossary terms when translating +- **For Human Translators**: Terms are highlighted in the editor with suggested translations +- **For QA**: Crowdin can flag inconsistent terminology (if QA checks are enabled) +- **For TM Matching**: Glossary terms boost Translation Memory match confidence + +## Troubleshooting + +### "No glossary entries found for language X" + +- Check that `top_translations` has rows for that language code +- Verify `GLOSSARY_MIN_VOTES` isn't too high +- Confirm you're using internal codes (`es`) not Crowdin codes (`es-EM`) + +### "Failed to import glossary" + +- Check Crowdin API rate limits +- Verify TBX format is valid (UTF-8, well-formed XML) +- Ensure glossary name doesn't conflict with existing non-project glossaries + +### "Backup PR creation failed" + +- Verify `I18N_GITHUB_API_KEY` has `repo` scope +- Check branch doesn't already exist (delete old backup branches if needed) +- Ensure `.crowdin-backups/` is in `.gitignore` so backups go to PR only + +## Performance Notes + +- Glossary export/import for 10 languages: ~30-60 seconds +- TM export (large): ~2-5 minutes +- Backup PR creation: ~10-30 seconds (depends on file count) +- Total overhead: ~1-3 minutes per workflow run (skipped if no changes) + +## Files + +- `src/scripts/i18n/sync-glossary.ts` - Main orchestrator +- `src/scripts/i18n/lib/glossary/supabase.ts` - Supabase REST API client +- `src/scripts/i18n/lib/crowdin/glossary.ts` - Crowdin glossary/TM API wrappers +- `src/scripts/i18n/lib/glossary/backup.ts` - Hash calculation, file I/O, Git operations diff --git a/src/scripts/i18n/lib/crowdin/glossary.ts b/src/scripts/i18n/lib/crowdin/glossary.ts new file mode 100644 index 00000000000..26012dc2b41 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/glossary.ts @@ -0,0 +1,354 @@ +/** + * Crowdin Glossary and Translation Memory API operations + */ + +import { + config, + 
CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" + +export interface CrowdinGlossary { + id: number + name: string + languageId: string + terms: number + createdAt: string +} + +export interface CrowdinTMSegment { + id: number + text: string + translation: string + createdAt: string +} + +/** + * List all glossaries in the project + */ +export async function listGlossaries(): Promise { + const url = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/glossaries` + console.log(`[CROWDIN-GLOSSARY] Fetching glossaries from: ${url}`) + + try { + const response = await fetch(url, { headers: crowdinBearerHeaders }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error( + `Crowdin glossaries list failed (${response.status}): ${errorText}` + ) + } + + const json: { data: { data: CrowdinGlossary }[] } = await response.json() + const glossaries = json.data.map(({ data }) => data) + + console.log(`[CROWDIN-GLOSSARY] Found ${glossaries.length} glossaries`) + return glossaries + } catch (error) { + console.error(`[CROWDIN-GLOSSARY] Failed to list glossaries:`, error) + throw error + } +} + +/** + * Export a glossary to TBX format + */ +export async function exportGlossary(glossaryId: number): Promise { + const url = `${CROWDIN_API_BASE_URL}/glossaries/${glossaryId}/exports` + console.log(`[CROWDIN-GLOSSARY] Exporting glossary ${glossaryId}`) + + try { + // Start export + const exportResponse = await fetch(url, { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ format: "tbx" }), + }) + + if (!exportResponse.ok) { + const errorText = await exportResponse.text() + throw new Error( + `Crowdin glossary export failed (${exportResponse.status}): ${errorText}` + ) + } + + const exportJson: { data: { url: string; identifier: string } } = + await exportResponse.json() + const downloadUrl = exportJson.data.url + + console.log( + `[CROWDIN-GLOSSARY] Export ready, 
downloading from: ${downloadUrl}` + ) + + // Download the exported file + const downloadResponse = await fetch(downloadUrl) + if (!downloadResponse.ok) { + throw new Error( + `Failed to download glossary export (${downloadResponse.status})` + ) + } + + const content = await downloadResponse.text() + console.log( + `[CROWDIN-GLOSSARY] Downloaded glossary (${content.length} bytes)` + ) + + return content + } catch (error) { + console.error(`[CROWDIN-GLOSSARY] Failed to export glossary:`, error) + throw error + } +} + +/** + * Import a glossary from TBX content (creates or updates glossary) + */ +export async function importGlossary( + name: string, + languageId: string, + tbxContent: string +): Promise<{ glossaryId: number; imported: number }> { + console.log(`[CROWDIN-GLOSSARY] Importing glossary: ${name} (${languageId})`) + + try { + // Check if glossary exists + const existingGlossaries = await listGlossaries() + const existing = existingGlossaries.find((g) => g.name === name) + + let glossaryId: number + + if (existing) { + console.log( + `[CROWDIN-GLOSSARY] Using existing glossary ID: ${existing.id}` + ) + glossaryId = existing.id + } else { + // Create new glossary + console.log(`[CROWDIN-GLOSSARY] Creating new glossary: ${name}`) + const createUrl = `${CROWDIN_API_BASE_URL}/glossaries` + const createResponse = await fetch(createUrl, { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ name, languageId }), + }) + + if (!createResponse.ok) { + const errorText = await createResponse.text() + throw new Error( + `Failed to create glossary (${createResponse.status}): ${errorText}` + ) + } + + const createJson: { data: { id: number } } = await createResponse.json() + glossaryId = createJson.data.id + console.log(`[CROWDIN-GLOSSARY] Created glossary with ID: ${glossaryId}`) + } + + // Upload TBX file to storage first + const storageId = await uploadToStorage(tbxContent, "glossary.tbx") + + // 
Import the glossary + const importUrl = `${CROWDIN_API_BASE_URL}/glossaries/${glossaryId}/imports` + const importResponse = await fetch(importUrl, { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + storageId, + scheme: { + sourceLanguageId: "en", + targetLanguageId: languageId, + }, + }), + }) + + if (!importResponse.ok) { + const errorText = await importResponse.text() + throw new Error( + `Failed to import glossary (${importResponse.status}): ${errorText}` + ) + } + + const importJson: { data: { identifier: string } } = + await importResponse.json() + console.log( + `[CROWDIN-GLOSSARY] Import started: ${importJson.data.identifier}` + ) + + // Wait for import to complete (simple polling) + await waitForImport(glossaryId, importJson.data.identifier) + + console.log( + `[CROWDIN-GLOSSARY] Successfully imported glossary ${glossaryId}` + ) + return { glossaryId, imported: 0 } // Crowdin doesn't return count immediately + } catch (error) { + console.error(`[CROWDIN-GLOSSARY] Failed to import glossary:`, error) + throw error + } +} + +/** + * Upload content to Crowdin storage + */ +async function uploadToStorage( + content: string, + filename: string +): Promise { + const url = `${CROWDIN_API_BASE_URL}/storages` + console.log(`[CROWDIN-GLOSSARY] Uploading to storage: ${filename}`) + + const formData = new FormData() + const blob = new Blob([content], { type: "application/xml" }) + formData.append("file", blob, filename) + + try { + const response = await fetch(url, { + method: "POST", + headers: { + Authorization: crowdinBearerHeaders.Authorization, + }, + body: formData, + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error( + `Storage upload failed (${response.status}): ${errorText}` + ) + } + + const json: { data: { id: number } } = await response.json() + console.log(`[CROWDIN-GLOSSARY] Uploaded to storage ID: ${json.data.id}`) + return json.data.id + } 
catch (error) { + console.error(`[CROWDIN-GLOSSARY] Storage upload failed:`, error) + throw error + } +} + +/** + * Wait for glossary import to complete + */ +async function waitForImport( + glossaryId: number, + identifier: string +): Promise { + const maxAttempts = 30 + const delayMs = 2000 + + for (let i = 0; i < maxAttempts; i++) { + await new Promise((resolve) => setTimeout(resolve, delayMs)) + + const url = `${CROWDIN_API_BASE_URL}/glossaries/${glossaryId}/imports/${identifier}` + const response = await fetch(url, { headers: crowdinBearerHeaders }) + + if (!response.ok) continue + + const json: { data: { status: string } } = await response.json() + console.log(`[CROWDIN-GLOSSARY] Import status: ${json.data.status}`) + + if (json.data.status === "finished") { + return + } + + if (json.data.status === "failed") { + throw new Error("Glossary import failed") + } + } + + throw new Error("Glossary import timeout") +} + +/** + * List Translation Memory (TM) resources + */ +export async function listTranslationMemories(): Promise< + Array<{ id: number; name: string; languageId: string }> +> { + const url = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/tms` + console.log(`[CROWDIN-TM] Fetching TMs from: ${url}`) + + try { + const response = await fetch(url, { headers: crowdinBearerHeaders }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error( + `Crowdin TM list failed (${response.status}): ${errorText}` + ) + } + + const json: { + data: { data: { id: number; name: string; languageId: string } }[] + } = await response.json() + const tms = json.data.map(({ data }) => data) + + console.log(`[CROWDIN-TM] Found ${tms.length} TMs`) + return tms + } catch (error) { + console.error(`[CROWDIN-TM] Failed to list TMs:`, error) + throw error + } +} + +/** + * Export Translation Memory to TMX format + */ +export async function exportTranslationMemory(tmId: number): Promise { + const url = `${CROWDIN_API_BASE_URL}/tms/${tmId}/exports` + 
console.log(`[CROWDIN-TM] Exporting TM ${tmId}`) + + try { + // Start export + const exportResponse = await fetch(url, { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + sourceLanguageId: "en", + targetLanguageId: "all", + format: "tmx", + }), + }) + + if (!exportResponse.ok) { + const errorText = await exportResponse.text() + throw new Error( + `Crowdin TM export failed (${exportResponse.status}): ${errorText}` + ) + } + + const exportJson: { data: { url: string } } = await exportResponse.json() + const downloadUrl = exportJson.data.url + + console.log(`[CROWDIN-TM] Export ready, downloading from: ${downloadUrl}`) + + // Download the exported file + const downloadResponse = await fetch(downloadUrl) + if (!downloadResponse.ok) { + throw new Error( + `Failed to download TM export (${downloadResponse.status})` + ) + } + + const content = await downloadResponse.text() + console.log(`[CROWDIN-TM] Downloaded TM (${content.length} bytes)`) + + return content + } catch (error) { + console.error(`[CROWDIN-TM] Failed to export TM:`, error) + throw error + } +} diff --git a/src/scripts/i18n/lib/glossary/backup.ts b/src/scripts/i18n/lib/glossary/backup.ts new file mode 100644 index 00000000000..c57dd24a1ec --- /dev/null +++ b/src/scripts/i18n/lib/glossary/backup.ts @@ -0,0 +1,245 @@ +/** + * Glossary and TM backup utilities + * Handles hashing, Git operations, and timestamped backups + */ + +import * as crypto from "crypto" +import * as fs from "fs" +import * as path from "path" + +const ROOT = process.cwd() +const BACKUP_ROOT = path.join(ROOT, "src/scripts/i18n/backups") +const GLOSSARY_BACKUP_DIR = path.join(BACKUP_ROOT, "glossary") +const TM_BACKUP_DIR = path.join(BACKUP_ROOT, "tm") +const HASH_FILE = path.join(BACKUP_ROOT, "hashes.json") + +export interface BackupHashes { + glossary?: Record // glossaryName -> hash + tm?: Record // tmName -> hash + lastUpdated?: string +} + +/** + * Calculate 
SHA-256 hash of content + */ +export function calculateHash(content: string): string { + return crypto.createHash("sha256").update(content, "utf8").digest("hex") +} + +/** + * Get short hash (first 8 characters) + */ +export function getShortHash(content: string): string { + return calculateHash(content).substring(0, 8) +} + +/** + * Ensure backup directories exist + */ +export function ensureBackupDirs(): void { + if (!fs.existsSync(BACKUP_ROOT)) { + fs.mkdirSync(BACKUP_ROOT, { recursive: true }) + } + if (!fs.existsSync(GLOSSARY_BACKUP_DIR)) { + fs.mkdirSync(GLOSSARY_BACKUP_DIR, { recursive: true }) + } + if (!fs.existsSync(TM_BACKUP_DIR)) { + fs.mkdirSync(TM_BACKUP_DIR, { recursive: true }) + } +} + +/** + * Load existing backup hashes + */ +export function loadBackupHashes(): BackupHashes { + if (!fs.existsSync(HASH_FILE)) { + return { glossary: {}, tm: {} } + } + + try { + const content = fs.readFileSync(HASH_FILE, "utf8") + return JSON.parse(content) + } catch (error) { + console.warn(`[BACKUP] Failed to load hashes, using empty:`, error) + return { glossary: {}, tm: {} } + } +} + +/** + * Save backup hashes + */ +export function saveBackupHashes(hashes: BackupHashes): void { + ensureBackupDirs() + hashes.lastUpdated = new Date().toISOString() + fs.writeFileSync(HASH_FILE, JSON.stringify(hashes, null, 2), "utf8") +} + +/** + * Check if content has changed (compare with stored hash) + */ +export function hasContentChanged( + name: string, + content: string, + type: "glossary" | "tm" +): boolean { + const hashes = loadBackupHashes() + const storedHash = (type === "glossary" ? 
hashes.glossary : hashes.tm)?.[name] + + if (!storedHash) { + console.log(`[BACKUP] No previous hash found for ${type}:${name}`) + return true + } + + const currentHash = calculateHash(content) + const changed = currentHash !== storedHash + + if (changed) { + console.log(`[BACKUP] Content changed for ${type}:${name}`) + console.log(`[BACKUP] - Old hash: ${storedHash}`) + console.log(`[BACKUP] - New hash: ${currentHash}`) + } else { + console.log(`[BACKUP] No changes detected for ${type}:${name}`) + } + + return changed +} + +/** + * Save backup file with timestamp and hash + */ +export function saveBackup( + name: string, + content: string, + type: "glossary" | "tm", + extension: string = "tbx" +): string { + ensureBackupDirs() + + const timestamp = Date.now() + const shortHash = getShortHash(content) + const sanitizedName = name.replace(/[^a-z0-9-_]/gi, "_").toLowerCase() + const filename = `${timestamp}_${shortHash}_${sanitizedName}.${extension}` + + const dir = type === "glossary" ? GLOSSARY_BACKUP_DIR : TM_BACKUP_DIR + const filepath = path.join(dir, filename) + + fs.writeFileSync(filepath, content, "utf8") + console.log(`[BACKUP] Saved ${type} backup: ${filename}`) + + // Update hash record + const hashes = loadBackupHashes() + if (type === "glossary") { + if (!hashes.glossary) hashes.glossary = {} + hashes.glossary[name] = calculateHash(content) + } else { + if (!hashes.tm) hashes.tm = {} + hashes.tm[name] = calculateHash(content) + } + saveBackupHashes(hashes) + + return filepath +} + +/** + * Get list of backup files for a resource + */ +export function listBackups( + name: string, + type: "glossary" | "tm" +): Array<{ path: string; timestamp: number; hash: string }> { + const dir = type === "glossary" ? 
GLOSSARY_BACKUP_DIR : TM_BACKUP_DIR + + if (!fs.existsSync(dir)) { + return [] + } + + const sanitizedName = name.replace(/[^a-z0-9-_]/gi, "_").toLowerCase() + const files = fs.readdirSync(dir) + + return files + .filter((file) => file.includes(sanitizedName)) + .map((file) => { + const [timestampStr, hash] = file.split("_") + return { + path: path.join(dir, file), + timestamp: parseInt(timestampStr, 10), + hash, + } + }) + .sort((a, b) => b.timestamp - a.timestamp) // Most recent first +} + +/** + * Get most recent backup for a resource + */ +export function getMostRecentBackup( + name: string, + type: "glossary" | "tm" +): string | null { + const backups = listBackups(name, type) + if (backups.length === 0) return null + + const mostRecent = backups[0] + return fs.readFileSync(mostRecent.path, "utf8") +} + +/** + * Clean up old backups, keeping only the most recent N + */ +export function cleanupOldBackups( + name: string, + type: "glossary" | "tm", + keepCount: number = 10 +): void { + const backups = listBackups(name, type) + + if (backups.length <= keepCount) { + console.log( + `[BACKUP] Only ${backups.length} backups for ${type}:${name}, no cleanup needed` + ) + return + } + + const toDelete = backups.slice(keepCount) + console.log( + `[BACKUP] Cleaning up ${toDelete.length} old backups for ${type}:${name}` + ) + + for (const backup of toDelete) { + try { + fs.unlinkSync(backup.path) + console.log(`[BACKUP] Deleted: ${path.basename(backup.path)}`) + } catch (error) { + console.warn(`[BACKUP] Failed to delete ${backup.path}:`, error) + } + } +} + +/** + * Get relative paths for all backup files (for Git commit) + */ +export function getAllBackupPaths(): string[] { + const paths: string[] = [] + + if (fs.existsSync(GLOSSARY_BACKUP_DIR)) { + const glossaryFiles = fs.readdirSync(GLOSSARY_BACKUP_DIR) + paths.push( + ...glossaryFiles.map((f) => + path.relative(ROOT, path.join(GLOSSARY_BACKUP_DIR, f)) + ) + ) + } + + if (fs.existsSync(TM_BACKUP_DIR)) { + const 
tmFiles = fs.readdirSync(TM_BACKUP_DIR) + paths.push( + ...tmFiles.map((f) => path.relative(ROOT, path.join(TM_BACKUP_DIR, f))) + ) + } + + if (fs.existsSync(HASH_FILE)) { + paths.push(path.relative(ROOT, HASH_FILE)) + } + + return paths +} diff --git a/src/scripts/i18n/lib/glossary/supabase.ts b/src/scripts/i18n/lib/glossary/supabase.ts new file mode 100644 index 00000000000..440b2ac0c2b --- /dev/null +++ b/src/scripts/i18n/lib/glossary/supabase.ts @@ -0,0 +1,211 @@ +/** + * Supabase glossary fetcher using REST API + * Fetches community-approved translations from the top_translations table + */ + +export interface GlossaryEntry { + term: string + translation: string + votes: number + languageCode: string +} + +export interface SupabaseRow { + string_term: string + translation_text: string + total_votes: number + language_code: string +} + +/** + * Fetch top-voted glossary terms for a specific language from Supabase + */ +export async function fetchGlossaryForLanguage( + supabaseUrl: string, + serviceRoleKey: string, + languageCode: string, + minVotes: number = 1 +): Promise { + const url = new URL(`${supabaseUrl}/rest/v1/top_translations`) + url.searchParams.set("language_code", `eq.${languageCode}`) + url.searchParams.set("total_votes", `gte.${minVotes}`) + url.searchParams.set("order", "total_votes.desc,string_term.asc") + url.searchParams.set( + "select", + "string_term,translation_text,total_votes,language_code" + ) + + console.log(`[GLOSSARY] Fetching from Supabase for language: ${languageCode}`) + console.log(`[GLOSSARY] URL: ${url.toString()}`) + + try { + const response = await fetch(url.toString(), { + headers: { + apikey: serviceRoleKey, + Authorization: `Bearer ${serviceRoleKey}`, + "Content-Type": "application/json", + }, + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Supabase API error (${response.status}): ${errorText}`) + } + + const rows: SupabaseRow[] = await response.json() + console.log( + `[GLOSSARY] 
Fetched ${rows.length} glossary entries for ${languageCode}` + ) + + return rows.map((row) => ({ + term: row.string_term, + translation: row.translation_text, + votes: row.total_votes, + languageCode: row.language_code, + })) + } catch (error) { + console.error( + `[GLOSSARY] Failed to fetch glossary for ${languageCode}:`, + error + ) + throw error + } +} + +/** + * Fetch glossary entries for all specified languages + */ +export async function fetchGlossaryForAllLanguages( + supabaseUrl: string, + serviceRoleKey: string, + languageCodes: string[], + minVotes: number = 1 +): Promise> { + console.log( + `[GLOSSARY] Fetching glossary for ${languageCodes.length} languages` + ) + + const results: Record = {} + + for (const langCode of languageCodes) { + try { + const entries = await fetchGlossaryForLanguage( + supabaseUrl, + serviceRoleKey, + langCode, + minVotes + ) + results[langCode] = entries + } catch (error) { + console.warn(`[GLOSSARY] Skipping ${langCode} due to error:`, error) + results[langCode] = [] + } + } + + const totalEntries = Object.values(results).reduce( + (sum, entries) => sum + entries.length, + 0 + ) + console.log( + `[GLOSSARY] Fetched ${totalEntries} total entries across all languages` + ) + + return results +} + +/** + * Format glossary entries as CSV for Crowdin import + * Format: term,translation,description,note + */ +export function formatGlossaryAsCSV(entries: GlossaryEntry[]): string { + const header = "term,translation,description,note\n" + const rows = entries.map((entry) => { + const term = escapeCSV(entry.term) + const translation = escapeCSV(entry.translation) + const description = escapeCSV(`Community-voted (${entry.votes} votes)`) + const note = escapeCSV("") + return `${term},${translation},${description},${note}` + }) + + return header + rows.join("\n") +} + +/** + * Escape CSV values (quote if contains comma, quote, or newline) + */ +function escapeCSV(value: string): string { + if (value.includes(",") || value.includes('"') || 
value.includes("\n")) { + return `"${value.replace(/"/g, '""')}"` + } + return value +} + +/** + * Format glossary entries as TBX (Term Base eXchange) for Crowdin import + */ +export function formatGlossaryAsTBX( + entries: GlossaryEntry[], + sourceLanguage: string, + targetLanguage: string +): string { + const now = new Date().toISOString() + + const termEntries = entries + .map((entry) => { + const escapedTerm = escapeXML(entry.term) + const escapedTranslation = escapeXML(entry.translation) + const escapedNote = escapeXML(`Community-voted: ${entry.votes} votes`) + + return ` + + ${escapedNote} + + + + ${escapedTerm} + + + + + ${escapedTranslation} + + + ` + }) + .join("\n") + + return ` + + + + + + Ethereum.org Community Glossary + + +

Generated from Supabase community glossary on ${now}

+
+
+ +

http://www.lisa.org/fileadmin/standards/tbx/TBXXCSV02.xcs

+
+
+ + +${termEntries} + + +
` +} + +/** + * Escape XML special characters + */ +function escapeXML(value: string): string { + return value + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'") +} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index f89c5751c25..b63767f5342 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -68,6 +68,26 @@ async function main(options?: { allLangs: boolean }) { allCrowdinCodes: config.allCrowdinCodes, }) + // Step 0: Sync glossary from Supabase to Crowdin + if (!config.existingPreTranslationId) { + console.log("\n[GLOSSARY] ========== Syncing Glossary ==========") + try { + const { syncGlossary } = await import("./sync-glossary") + const glossaryResult = await syncGlossary() + console.log( + `[GLOSSARY] ✓ Updated ${glossaryResult.updatedGlossaries.length} languages` + ) + if (glossaryResult.backupPrUrl) { + console.log(`[GLOSSARY] ✓ Backup PR: ${glossaryResult.backupPrUrl}`) + } + } catch (error) { + console.error("[GLOSSARY] Failed to sync glossary:", error) + console.error("[GLOSSARY] Continuing with workflow anyway...") + } + } else { + console.log("\n[GLOSSARY] Skipping glossary sync (resuming existing job)") + } + // Shared state used in both resume and new flows const crowdinProjectFiles = await getCrowdinProjectFiles() const fileIdsSet = new Set() diff --git a/src/scripts/i18n/sync-glossary.ts b/src/scripts/i18n/sync-glossary.ts new file mode 100644 index 00000000000..e7140cf395e --- /dev/null +++ b/src/scripts/i18n/sync-glossary.ts @@ -0,0 +1,278 @@ +/** + * Glossary synchronization orchestrator + * + * Workflow: + * 1. Export existing Crowdin glossaries/TMs + * 2. Check if content has changed (compare hashes) + * 3. If changed, save timestamped backup to .crowdin-backups/ + * 4. Fetch latest glossary from Supabase + * 5. Import updated glossary to Crowdin + * 6. 
Create Git branch and PR with backup files + */ + +import { + exportGlossary, + exportTranslationMemory, + importGlossary, + listGlossaries, + listTranslationMemories, +} from "./lib/crowdin/glossary" +import { postCreateBranchFrom } from "./lib/github/branches" +import { putCommitFile } from "./lib/github/commits" +import { postPullRequest } from "./lib/github/pull-requests" +import { + getAllBackupPaths, + hasContentChanged, + saveBackup, +} from "./lib/glossary/backup" +import { + fetchGlossaryForLanguage, + formatGlossaryAsTBX, +} from "./lib/glossary/supabase" +import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" +import { config } from "./config" + +const SUPABASE_URL = process.env.SUPABASE_URL || "" +const SUPABASE_SERVICE_ROLE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY || "" + +if (!SUPABASE_SERVICE_ROLE_KEY) { + console.error("[GLOSSARY-SYNC] Missing SUPABASE_SERVICE_ROLE_KEY") + throw new Error("SUPABASE_SERVICE_ROLE_KEY environment variable is required") +} + +const MIN_VOTES = parseInt(process.env.GLOSSARY_MIN_VOTES || "2", 10) +const SKIP_BACKUP_PR = process.env.SKIP_GLOSSARY_BACKUP_PR === "true" + +/** + * Main sync function + */ +export async function syncGlossary(): Promise<{ + backupBranch?: string + backupPrUrl?: string + updatedGlossaries: string[] +}> { + console.log("\n[GLOSSARY-SYNC] ========== Starting Glossary Sync ==========") + console.log(`[GLOSSARY-SYNC] Supabase URL: ${SUPABASE_URL}`) + console.log(`[GLOSSARY-SYNC] Min votes: ${MIN_VOTES}`) + console.log(`[GLOSSARY-SYNC] Skip backup PR: ${SKIP_BACKUP_PR}`) + + const backupPaths: string[] = [] + const updatedGlossaries: string[] = [] + let backupNeeded = false + + // Step 1: Export and backup existing Crowdin glossaries + console.log( + "\n[GLOSSARY-SYNC] Step 1: Backing up existing Crowdin glossaries" + ) + try { + const glossaries = await listGlossaries() + console.log( + `[GLOSSARY-SYNC] Found ${glossaries.length} existing glossaries` + ) + + for (const glossary of 
glossaries) { + console.log( + `[GLOSSARY-SYNC] Exporting glossary: ${glossary.name} (ID: ${glossary.id})` + ) + const content = await exportGlossary(glossary.id) + + if (hasContentChanged(glossary.name, content, "glossary")) { + const backupPath = saveBackup(glossary.name, content, "glossary", "tbx") + backupPaths.push(backupPath) + backupNeeded = true + } + } + } catch (error) { + console.warn( + "[GLOSSARY-SYNC] Failed to backup glossaries (continuing anyway):", + error + ) + } + + // Step 2: Export and backup Translation Memories (optional) + console.log("\n[GLOSSARY-SYNC] Step 2: Backing up Translation Memories") + try { + const tms = await listTranslationMemories() + console.log(`[GLOSSARY-SYNC] Found ${tms.length} TMs`) + + for (const tm of tms) { + console.log(`[GLOSSARY-SYNC] Exporting TM: ${tm.name} (ID: ${tm.id})`) + const content = await exportTranslationMemory(tm.id) + + if (hasContentChanged(tm.name, content, "tm")) { + const backupPath = saveBackup(tm.name, content, "tm", "tmx") + backupPaths.push(backupPath) + backupNeeded = true + } + } + } catch (error) { + console.warn( + "[GLOSSARY-SYNC] Failed to backup TMs (continuing anyway):", + error + ) + } + + // Step 3: Fetch latest glossary from Supabase for each language + console.log("\n[GLOSSARY-SYNC] Step 3: Fetching glossary from Supabase") + const languageCodes = config.allCrowdinCodes + console.log(`[GLOSSARY-SYNC] Target languages: ${languageCodes.join(", ")}`) + + for (const crowdinCode of languageCodes) { + try { + // Map Crowdin code to internal code for Supabase query + const internalCode = mapCrowdinCodeToInternal(crowdinCode) + console.log( + `\n[GLOSSARY-SYNC] Processing language: ${crowdinCode} (internal: ${internalCode})` + ) + + const entries = await fetchGlossaryForLanguage( + SUPABASE_URL, + SUPABASE_SERVICE_ROLE_KEY, + internalCode, + MIN_VOTES + ) + + if (entries.length === 0) { + console.log( + `[GLOSSARY-SYNC] No glossary entries found for ${crowdinCode}` + ) + continue + } + + 
console.log( + `[GLOSSARY-SYNC] Found ${entries.length} glossary entries for ${crowdinCode}` + ) + + // Step 4: Import to Crowdin + const tbxContent = formatGlossaryAsTBX(entries, "en", crowdinCode) + const glossaryName = `Ethereum.org Community (${crowdinCode})` + + console.log(`[GLOSSARY-SYNC] Importing glossary: ${glossaryName}`) + await importGlossary(glossaryName, crowdinCode, tbxContent) + + updatedGlossaries.push(crowdinCode) + console.log( + `[GLOSSARY-SYNC] ✓ Successfully updated glossary for ${crowdinCode}` + ) + } catch (error) { + console.error( + `[GLOSSARY-SYNC] Failed to update glossary for ${crowdinCode}:`, + error + ) + // Continue with other languages + } + } + + // Step 5: Create backup PR if needed + let backupBranch: string | undefined + let backupPrUrl: string | undefined + + if (backupNeeded && !SKIP_BACKUP_PR) { + console.log("\n[GLOSSARY-SYNC] Step 5: Creating backup PR") + try { + const result = await createBackupPR() + backupBranch = result.branch + backupPrUrl = result.prUrl + } catch (error) { + console.error("[GLOSSARY-SYNC] Failed to create backup PR:", error) + console.error( + "[GLOSSARY-SYNC] Backups are saved locally but not committed" + ) + } + } else if (backupNeeded) { + console.log( + "\n[GLOSSARY-SYNC] Backups saved locally (PR creation skipped via SKIP_GLOSSARY_BACKUP_PR)" + ) + } else { + console.log("\n[GLOSSARY-SYNC] No backups needed (no changes detected)") + } + + console.log("\n[GLOSSARY-SYNC] ========== Sync Complete ==========") + console.log(`[GLOSSARY-SYNC] Updated glossaries: ${updatedGlossaries.length}`) + console.log(`[GLOSSARY-SYNC] Languages: ${updatedGlossaries.join(", ")}`) + if (backupBranch) { + console.log(`[GLOSSARY-SYNC] Backup branch: ${backupBranch}`) + } + if (backupPrUrl) { + console.log(`[GLOSSARY-SYNC] Backup PR: ${backupPrUrl}`) + } + + return { + backupBranch, + backupPrUrl, + updatedGlossaries, + } +} + +/** + * Create a Git branch and PR with backup files + */ +async function createBackupPR(): 
Promise<{ branch: string; prUrl: string }> { + const timestamp = new Date().toISOString().split("T")[0] + const branchName = `i18n-glossary-backup-${timestamp}` + + console.log(`[GLOSSARY-SYNC] Creating branch: ${branchName}`) + await postCreateBranchFrom(config.baseBranch, branchName) + + // Get all backup files (including newly created ones) + const allBackupPaths = getAllBackupPaths() + console.log( + `[GLOSSARY-SYNC] Committing ${allBackupPaths.length} backup files` + ) + + // Commit each backup file + const fs = await import("fs") + const path = await import("path") + + for (const relativePath of allBackupPaths) { + console.log(`[GLOSSARY-SYNC] Committing: ${relativePath}`) + const absolutePath = path.join(process.cwd(), relativePath) + const buffer = fs.readFileSync(absolutePath) + await putCommitFile(buffer, relativePath, branchName) + } + + // Create PR + console.log("[GLOSSARY-SYNC] Creating pull request") + const prTitle = `🗂️ Crowdin Glossary/TM Backup - ${timestamp}` + const prBody = `# Crowdin Glossary and Translation Memory Backup + +This automated PR backs up Crowdin glossary and translation memory exports before syncing with the Supabase community glossary. + +## Backup Details +- **Date**: ${new Date().toISOString()} +- **Files**: ${allBackupPaths.length} total backups +- **Glossary backups**: ${allBackupPaths.filter((p) => p.includes("glossary")).length} +- **TM backups**: ${allBackupPaths.filter((p) => p.includes("tm")).length} + +## Purpose +These backups enable easy reversion if the Supabase glossary sync introduces issues. Each backup is timestamped and content-hashed for traceability. 
+ +## Next Steps +- Review the backup files +- Merge to preserve the backup history +- Monitor the main translation workflow for any glossary-related issues + +**Auto-generated by the i18n glossary sync workflow** +` + + const prUrl = await postPullRequest(branchName, prTitle, prBody) + console.log(`[GLOSSARY-SYNC] ✓ Created PR: ${prUrl}`) + + return { branch: branchName, prUrl } +} + +// CLI execution +if (require.main === module) { + syncGlossary() + .then((result) => { + console.log("\n[GLOSSARY-SYNC] Success!") + if (result.backupPrUrl) { + console.log(`[GLOSSARY-SYNC] Backup PR: ${result.backupPrUrl}`) + } + process.exit(0) + }) + .catch((error) => { + console.error("\n[GLOSSARY-SYNC] Fatal error:", error) + process.exit(1) + }) +} From 09a370ca0100de0becb97f370d8c8dcd1fba343d Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 1 Dec 2025 10:09:15 -0300 Subject: [PATCH 16/99] feat: commit initial pre-translate prompt --- .../i18n/lib/crowdin/pre-translate-prompt.txt | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt diff --git a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt new file mode 100644 index 00000000000..dc9050b65b3 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt @@ -0,0 +1,64 @@ +You are a professional translator with native-level fluency in both English and all the target languages %targetLanguages% in the project, and expertise in Ethereum, blockchain, cryptocurrency, and decentralized technologies. +You have deep familiarity with open-source communities and technical documentation, enabling you to handle domain-specific terminology accurately. Your task is to produce high-quality translations of ethereum.org content from English into the target language, following the guidelines below. 
+The source content comes from the ethereum.org website, segmented in the source files as %strings%. +Translate content from English into the target language %targetLanguage% specified for each individual pre-translation project run. +The target language is automatically defined in the project configuration and the pre-translation process — never guess or switch languages. +Always output translations in the target language only. + +CRITICAL DO-NOT-BREAK RULES (must follow exactly): +- Custom header IDs: If a Markdown heading includes a custom anchor like `{#custom-id}`, the ID MUST remain identical to the English source, ASCII-only (no accents or special characters). Do NOT alter, translate, add, or remove braces. Keep the exact ID string. +- HTML/MDX tag line placement: If an opening HTML tag appears on its own line, the matching closing tag MUST also be on its own line. Preserve line breaks around paired block-level tags. +- JSX/MDX attributes: Translate human-readable text found inside attribute values (e.g., `title="..."`, `aria-label="..."`, `alt="..."`) while preserving placeholders, variables, and code. Do NOT translate attribute names or change quoting/escaping. +- Protected names: Do NOT translate obvious proper names, brands, or team names (e.g., "Ethereum", "ETH", "Solidity", "MetaMask", "GitHub", "Crowdin"). Leave these as in the source unless a community-approved localized form exists. + +Maintain Clarity and Professionalism: Ensure the translated text is clear, accurate, and professional in tone, just like the source. Match the tone and register of the English content – if the source is explanatory and formal, the translation should mirror that style. Remember that Ethereum’s content serves both experts and complete beginners, so the translation should be accessible to technical and non-technical readers alike. +Consistency with Source Tone: Use a tone that is neither too casual nor overly stiff, unless the source text itself has a specific tone. 
For example, if the English text uses a friendly and encouraging tone, reflect that in the translation while maintaining professionalism. +Formal Address: In languages that have formal and informal address forms, use the formal form to address the reader. This ensures the content remains respectful and appropriate for all users, and often helps maintain gender-neutrality. Only use an informal tone if the English source explicitly does so. +Idioms and Cultural Nuances: If the source uses idiomatic expressions or culturally specific references, preserve their intent. Replace an idiom only with an equivalent well-understood expression; otherwise keep a direct translation that preserves meaning. + +Certain elements of the source text must be handled with special care during translation: + +Technical Terms: Do not translate highly specific blockchain terms such as "smart contract", "gas", "dapp", or other Ethereum jargon unless there is a widely accepted equivalent. When in doubt, leave the term in English. +Code, Commands, and Output: Retain code snippets, configuration commands, outputs, function names, and anything in backticks or code blocks exactly. Do not translate placeholders (e.g. {value}, %s, <0>...), variables, or braces. Translate English comments inside code (e.g., lines or blocks starting with //, #, or /* ... */) while leaving all code tokens unchanged. +URLs, File Paths, and Domain Names: Never translate or alter these. Preserve exactly, including case and slashes. +Markdown, HTML, and JSX/MDX Syntax: Preserve all formatting symbols, tags, and structure. Do not add/remove markers. Keep tag order identical. Translate only human-readable text outside tags. +Punctuation in Code/Text: Do not alter punctuation that is part of code/syntax (e.g., {}, <>, (), []). + +Match Source Capitalization: Preserve capitalization of terms, acronyms, proper nouns (e.g., "Ethereum", "Solidity", "NFT"). Maintain ALL CAPS where used. 
+Follow Target Language Conventions: Apply normal punctuation/grammar rules of the target language except where code syntax would break. +Sentence Structure: Reorder or split/join sentences only to achieve natural grammar; avoid ambiguity changes. +End Punctuation: Mirror source intent; headings without periods usually remain without periods. + +Use Consistent Terminology: Reuse prior translations for repeated terms unless context demands a change. +Ethereum Glossary and Termbase: If provided, follow those preferred translations strictly. +External Translation Memory (TM): Use exact matches from TM if context fits. +No Glossary or TM? Pick a clear translation and keep it consistent thereafter. + +(If translation memory/termbase resources are available to Crowdin AI, they should be applied to maintain consistency.) + +Preserve Tags and Placeholders: Keep tags/placeholders exactly ordered. Do not duplicate, omit, or reorder them. +Do Not Break Variables: Leave placeholders such as {userName} unchanged; adapt surrounding punctuation only if required. +Avoid Tag Duplication or Omission: Every opening tag must have its closing counterpart. Never remove tags. +Maintain Markdown Structure: Lists, tables, headings remain structurally identical. Custom IDs stay identical to English. +Line Breaks and Whitespace: Avoid introducing/removing line breaks. Keep opening/closing block HTML tags on their own lines when the source does. + +Inclusive Language: Use gender-neutral constructions where possible. +Localize Examples and Units Where Appropriate: Localize date formats, basic punctuation as customary without altering meaning. Do not convert currencies. +Cultural References: Prefer clarity over forced local analogies. Keep original if unsure. +Avoid Slang and Colloquialisms: Maintain professional, accessible tone. 
+ +Untranslatable Strings: Keep product names, trademarks, protocol names, abbreviations ("ETH", "NFT", "HTML", "PoW", "PoS", "EIP-1559") unless widely accepted localized form exists. +Placeholders and Dummy Text: Do not translate placeholder tokens or dummy values (e.g., "Lorem ipsum", "user@example.com"). +Flagging Issues: If a string is ambiguous and unsafe to translate confidently, produce a literal translation or leave it for review (do not guess). +No Guessing for Missing Context: Choose neutral wording when context is unclear; retain English term where ambiguity could mislead. + +Preserve Intended Meaning: Prioritize accurate meaning over literal wording. +Literal vs. Free Translation: Avoid overly literal output if unnatural; adjust for clarity. +Clarity Over Literalness: Prefer clear, idiomatic phrasing that conveys the source meaning. +Avoid Adding Extra Information: Do not introduce new content or explanations. + +High-Quality Output: Output should need minimal post-editing: correct spelling, grammar, style. +Avoiding Errors: Do not omit content or invert meaning; retain numeric values precisely. +Consistency in Style: Maintain uniform formal, explanatory tone throughout. +Minimal Creativity: Rephrase only when necessary for clarity/grammar. +Post-Editing Ready: Deliver translation suitable for quick human approval. 
\ No newline at end of file From 1351ba62ff7761de6264a22e14df6b81205fed3b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 1 Dec 2025 10:10:07 -0300 Subject: [PATCH 17/99] update(i18n): pre-translate prompt --- .../i18n/lib/crowdin/pre-translate-prompt.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt index dc9050b65b3..fe02bc2ada6 100644 --- a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt +++ b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt @@ -6,10 +6,12 @@ The target language is automatically defined in the project configuration and th Always output translations in the target language only. CRITICAL DO-NOT-BREAK RULES (must follow exactly): +- JSON escaping: When translating JSON files, ALL double quotes (") inside string values MUST be escaped as \" to maintain valid JSON. Similarly, escape backslashes (\) as \\, newlines as \n, tabs as \t. The output MUST be parseable JSON—invalid JSON will break the build. Example: translate "Learn about "Ethereum"" as "Aprenda sobre \"Ethereum\"", NOT "Aprenda sobre "Ethereum"". - Custom header IDs: If a Markdown heading includes a custom anchor like `{#custom-id}`, the ID MUST remain identical to the English source, ASCII-only (no accents or special characters). Do NOT alter, translate, add, or remove braces. Keep the exact ID string. - HTML/MDX tag line placement: If an opening HTML tag appears on its own line, the matching closing tag MUST also be on its own line. Preserve line breaks around paired block-level tags. - JSX/MDX attributes: Translate human-readable text found inside attribute values (e.g., `title="..."`, `aria-label="..."`, `alt="..."`) while preserving placeholders, variables, and code. Do NOT translate attribute names or change quoting/escaping. 
-- Protected names: Do NOT translate obvious proper names, brands, or team names (e.g., "Ethereum", "ETH", "Solidity", "MetaMask", "GitHub", "Crowdin"). Leave these as in the source unless a community-approved localized form exists. +- Protected names: Do NOT translate obvious proper names, brands, or team names (e.g., "Ethereum", "ETH", "Solidity", "MetaMask", "GitHub", "Crowdin", "ethereum.org"). Leave these as in the source unless a community-approved localized form exists. + - URL/path destinations MUST be preserved character-for-character: keep exact case, hyphens, slashes, fragments (`#...`), and query parameters (`?...`). Do NOT change, normalize, or localize any part of a link destination. This rule also applies to any links contained within JSON string values used in React/MDX pages. Maintain Clarity and Professionalism: Ensure the translated text is clear, accurate, and professional in tone, just like the source. Match the tone and register of the English content – if the source is explanatory and formal, the translation should mirror that style. Remember that Ethereum’s content serves both experts and complete beginners, so the translation should be accessible to technical and non-technical readers alike. Consistency with Source Tone: Use a tone that is neither too casual nor overly stiff, unless the source text itself has a specific tone. For example, if the English text uses a friendly and encouraging tone, reflect that in the translation while maintaining professionalism. @@ -42,6 +44,11 @@ Avoid Tag Duplication or Omission: Every opening tag must have its closing count Maintain Markdown Structure: Lists, tables, headings remain structurally identical. Custom IDs stay identical to English. Line Breaks and Whitespace: Avoid introducing/removing line breaks. Keep opening/closing block HTML tags on their own lines when the source does. 
+Consistency Reminders (non-strict, but preferred): +- Headings: Keep section and subsection heading choices consistent with the English source across the document. If the source uses a particular heading term (e.g., "Overview", "Examples", "Resources"), choose a single clear localized equivalent and reuse it throughout the page. +- Example Arrays and Lists: When the source contains example items (lists of technologies, wallets, tools, etc.), translate common nouns/adjectives to the target language. Retain English only for proper names and brands. Do not revert entire lists to English unless items are proper nouns. +- Stable Canonical Terms: Prefer previously used localized headings/labels for recurring sections when known (e.g., consistent translation for "Learn", "Developers"). If unsure, pick the most natural single term and stick to it within the page. + Inclusive Language: Use gender-neutral constructions where possible. Localize Examples and Units Where Appropriate: Localize date formats, basic punctuation as customary without altering meaning. Do not convert currencies. Cultural References: Prefer clarity over forced local analogies. Keep original if unsure. @@ -49,9 +56,9 @@ Avoid Slang and Colloquialisms: Maintain professional, accessible tone. Untranslatable Strings: Keep product names, trademarks, protocol names, abbreviations ("ETH", "NFT", "HTML", "PoW", "PoS", "EIP-1559") unless widely accepted localized form exists. Placeholders and Dummy Text: Do not translate placeholder tokens or dummy values (e.g., "Lorem ipsum", "user@example.com"). -Flagging Issues: If a string is ambiguous and unsafe to translate confidently, produce a literal translation or leave it for review (do not guess). +Flagging Issues: If a string is ambiguous and unsafe to translate confidently, produce a literal translation or leave it for review (do not guess). 
When consistency conflicts arise (e.g., competing heading variants), prefer the most widely used term in the target language or the termbase entry if available. No Guessing for Missing Context: Choose neutral wording when context is unclear; retain English term where ambiguity could mislead. - + Preserve Intended Meaning: Prioritize accurate meaning over literal wording. Literal vs. Free Translation: Avoid overly literal output if unnatural; adjust for clarity. Clarity Over Literalness: Prefer clear, idiomatic phrasing that conveys the source meaning. From 68d4e39d002d0c9e428e250955a17076220be57b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 1 Dec 2025 11:45:15 -0300 Subject: [PATCH 18/99] feat: use in-repo prompt as canonical --- src/scripts/i18n/lib/crowdin/prompt.ts | 28 +++++++++++++++ .../i18n/lib/github/pr-review-comments.ts | 14 ++++++++ src/scripts/i18n/main.ts | 36 ++++++++++++------- 3 files changed, 66 insertions(+), 12 deletions(-) create mode 100644 src/scripts/i18n/lib/crowdin/prompt.ts diff --git a/src/scripts/i18n/lib/crowdin/prompt.ts b/src/scripts/i18n/lib/crowdin/prompt.ts new file mode 100644 index 00000000000..e3dacfb75f1 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/prompt.ts @@ -0,0 +1,28 @@ +import * as fs from "fs" + +import { crowdinBearerHeaders } from "../../config" + +/** + * Update a Crowdin AI prompt's content from a local file. 
+ * Uses Crowdin API v2: PATCH /users/{userId}/ai/prompts/{promptId} + */ +export async function updatePromptFromFile( + userId: number, + promptId: number, + filePath: string +): Promise { + const content = await fs.promises.readFile(filePath, "utf8") + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` + const resp = await fetch(url, { + method: "PATCH", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ content }), + }) + if (!resp.ok) { + const text = await resp.text().catch(() => "") + throw new Error(`Failed to update prompt (${resp.status}): ${text}`) + } +} diff --git a/src/scripts/i18n/lib/github/pr-review-comments.ts b/src/scripts/i18n/lib/github/pr-review-comments.ts index 91715074be3..30ff5229c7e 100644 --- a/src/scripts/i18n/lib/github/pr-review-comments.ts +++ b/src/scripts/i18n/lib/github/pr-review-comments.ts @@ -33,6 +33,20 @@ export async function postPrReviewComment( comment += "This PR contains automated translations that need quality review.\n\n" + // Compact snapshot of canonical prompt rules and glossary/TM awareness + comment += "### Prompt Rules Snapshot\n\n" + comment += "Key non-negotiables for review:\n" + comment += "- Protected names include `ethereum.org`; do not change casing.\n" + comment += "- Header IDs `{#...}` must remain identical to English.\n" + comment += + "- URL/path destinations must be preserved character-for-character (case, hyphens, slashes, fragments, query params). 
This also applies to links inside JSON strings.\n" + comment += + '- JSON escaping: inside JSON values, escape quotes ("), backslashes (\\), newlines (\\n), tabs (\\t).\n\n' + comment += + "Canonical prompt source: `src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt` (synced to Crowdin before pre-translation).\n\n" + comment += + "Glossary/TM note: Community glossary/TM is synced from Supabase into Crowdin at the start of the run to guide terminology consistency.\n\n" + if (copilotLangs.length > 0) { comment += "### @copilot\n\n" comment += diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index b63767f5342..48375d72c5f 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -17,6 +17,7 @@ import { getPreTranslationStatus, postApplyPreTranslation, } from "./lib/crowdin/pre-translate" +import { updatePromptFromFile } from "./lib/crowdin/prompt" import { getPromptModelKey } from "./lib/crowdin/prompt-model" import { awaitQaCompletion, @@ -25,6 +26,7 @@ import { postQaCompletions, type QaCompletionJob, type QaIssue, + resolveCrowdinUserId, summarizeQaIssues, } from "./lib/crowdin/qa-completions" import { postCreateBranchFrom } from "./lib/github/branches" @@ -125,6 +127,22 @@ async function main(options?: { allLangs: boolean }) { // Normal flow: Start new pre-translation console.log(`\n[START] ========== Starting new pre-translation ==========`) + // Ensure Crowdin AI prompt content is synced from repo canonical file + try { + const userId = await resolveCrowdinUserId() + const promptPath = `${process.cwd()}/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt` + await updatePromptFromFile( + Number(userId), + config.preTranslatePromptId, + promptPath + ) + console.log( + "[PROMPT] ✓ Updated Crowdin pre-translate prompt from repo file" + ) + } catch (e) { + console.warn("[PROMPT] Failed to update prompt; continuing:", e) + } + // Fetch English files with limit + start offset const allEnglishFiles = await getAllEnglishFiles( 
config.fileLimit, @@ -414,18 +432,12 @@ async function main(options?: { allLangs: boolean }) { ) let modelKey: string | undefined try { - const userId = process.env.I18N_CROWDIN_USER_ID - if (userId) { - modelKey = await getPromptModelKey( - Number(userId), - config.preTranslatePromptId - ) - console.log(`[MODEL-DETECTION] Current model: ${modelKey}`) - } else { - console.log( - `[MODEL-DETECTION] I18N_CROWDIN_USER_ID not set, skipping model detection` - ) - } + const userId = await resolveCrowdinUserId() + modelKey = await getPromptModelKey( + Number(userId), + config.preTranslatePromptId + ) + console.log(`[MODEL-DETECTION] Current model: ${modelKey}`) } catch (err) { console.warn(`[MODEL-DETECTION] Failed to detect model:`, err) } From de3117e567b9225b53ec1c6f40df199bda824c11 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:02:07 -0300 Subject: [PATCH 19/99] refactor: streamline i18n automation pipeline to MVP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major refactor to simplify the translation automation workflow: REMOVED: - Supabase glossary integration and all related files - Multi-tier QA system with trust matrix and language scoring - OpenAI trust matrix generation - Post-translation Crowdin re-sync logic - Multiple PR workflow (high/medium/low trust tiers) - 13 files deleted total NEW FEATURES: - Unified target_path input (auto-detects file vs directory) - Smart translation modes: single file, directory, or full translation - Intelligent timeout handling (waits for file/dir, exits for full) - Pre-translation artifact with job metadata for resuming - Verbose logging flag for cleaner output - Single PR workflow for all languages UPDATES: - GitHub workflow: simplified inputs, removed unused env vars - config.ts: replaced fileLimit/startOffset with targetPath - main.ts: complete rewrite (667 → 422 lines) - lib/github/files.ts: smart path detection 
with excluded-paths.json - lib/crowdin/files.ts: verbose logging support - .gitignore: added artifacts/ directory The pipeline now focuses on Spanish MVP with ability to expand to 25+ languages defined in canonical-llm-language-list.json. Post-import sanitization ensures build stability without additional QA overhead. --- .github/workflows/crowdin-ai-import.yml | 36 +- .gitignore | 3 + .../i18n/GLOSSARY_SYNC_IMPLEMENTATION.md | 239 ------ src/scripts/i18n/config.ts | 50 +- .../config/canonical-llm-language-list.json | 27 + src/scripts/i18n/config/excluded-paths.json | 1 + src/scripts/i18n/config/language-trust.json | 44 - src/scripts/i18n/gen_trust_matrix.ts | 57 -- src/scripts/i18n/glossary-sync.md | 149 ---- src/scripts/i18n/lib/crowdin/files.ts | 45 +- src/scripts/i18n/lib/crowdin/glossary.ts | 354 -------- .../i18n/lib/crowdin/qa-completions.ts | 225 ------ src/scripts/i18n/lib/github/files.ts | 197 ++++- src/scripts/i18n/lib/glossary/backup.ts | 245 ------ src/scripts/i18n/lib/glossary/supabase.ts | 211 ----- .../i18n/lib/openai/trust-matrix-generator.ts | 149 ---- src/scripts/i18n/lib/qa-routing.ts | 101 --- src/scripts/i18n/main.ts | 757 ++++++------------ src/scripts/i18n/sync-glossary.ts | 278 ------- 19 files changed, 511 insertions(+), 2657 deletions(-) delete mode 100644 src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md create mode 100644 src/scripts/i18n/config/canonical-llm-language-list.json create mode 100644 src/scripts/i18n/config/excluded-paths.json delete mode 100644 src/scripts/i18n/config/language-trust.json delete mode 100644 src/scripts/i18n/gen_trust_matrix.ts delete mode 100644 src/scripts/i18n/glossary-sync.md delete mode 100644 src/scripts/i18n/lib/crowdin/glossary.ts delete mode 100644 src/scripts/i18n/lib/crowdin/qa-completions.ts delete mode 100644 src/scripts/i18n/lib/glossary/backup.ts delete mode 100644 src/scripts/i18n/lib/glossary/supabase.ts delete mode 100644 src/scripts/i18n/lib/openai/trust-matrix-generator.ts delete mode 
100644 src/scripts/i18n/lib/qa-routing.ts delete mode 100644 src/scripts/i18n/sync-glossary.ts diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index d5a35b3509a..e347d3b3d3f 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -7,15 +7,9 @@ on: description: "Pre-translation ID to resume from (optional - leave empty to start new)" required: false type: string - file_limit: - description: "Number of files to process (default: 100, use 1-10 for testing)" + target_path: + description: "File or directory path to translate (optional - e.g., public/content/developers/index.md or public/content/developers)" required: false - default: "100" - type: string - start_offset: - description: "Starting offset for files (skip first N files; default: 0)" - required: false - default: "0" type: string target_languages: description: "Comma-separated internal language codes (default: es)" @@ -42,21 +36,11 @@ on: required: false default: "326942" type: string - qa_prompt_id: - description: "AI prompt ID for qa_check (default: 168592)" - required: false - default: "168592" - type: string - glossary_min_votes: - description: "Minimum votes for glossary terms (default: 2)" - required: false - default: "2" - type: string - skip_glossary_backup_pr: - description: "Skip creating a PR for glossary backups (default: false)" + verbose: + description: "Enable verbose logging (default: false)" required: false default: "false" - type: string + type: boolean jobs: import_translations: @@ -82,19 +66,13 @@ jobs: env: I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} I18N_CROWDIN_USER_ID: ${{ secrets.I18N_CROWDIN_USER_ID }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} - 
FILE_LIMIT: ${{ github.event.inputs.file_limit }} - START_OFFSET: ${{ github.event.inputs.start_offset }} + TARGET_PATH: ${{ github.event.inputs.target_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} PRETRANSLATE_TIMEOUT_MS: ${{ github.event.inputs.pretranslate_timeout_ms }} PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} - QA_PROMPT_ID: ${{ github.event.inputs.qa_prompt_id }} - GLOSSARY_MIN_VOTES: ${{ github.event.inputs.glossary_min_votes }} - SKIP_GLOSSARY_BACKUP_PR: ${{ github.event.inputs.skip_glossary_backup_pr }} + VERBOSE: ${{ github.event.inputs.verbose }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/.gitignore b/.gitignore index ed6b81bc873..cd33723238a 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,6 @@ src/data/crowdin/bucketsAwaitingReviewReport.csv build-storybook.log build-archive.log storybook-static + +# I18n translation artifacts +artifacts/ diff --git a/src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md b/src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md deleted file mode 100644 index 5b982588383..00000000000 --- a/src/scripts/i18n/GLOSSARY_SYNC_IMPLEMENTATION.md +++ /dev/null @@ -1,239 +0,0 @@ -# Glossary Sync Implementation Summary - -## Overview - -Implemented automated synchronization of community-approved translations from Supabase to Crowdin glossaries, with automatic backup and PR creation for glossary version control. - -## Architecture - -### Data Flow - -``` -Supabase (top_translations table) - ↓ Fetch glossary entries (min votes filter) - ↓ Format as TBX (Term Base eXchange) - ↓ -Crowdin Glossaries - ↓ Before import: export existing - ↓ Calculate hash, detect changes - ↓ -.crowdin-backups/ (timestamped backups) - ↓ Commit to separate branch - ↓ -GitHub PR (backup for reversion) -``` - -### Key Features - -1. 
**Content-based change detection** - Only backs up when glossary content changes (SHA-256 hash comparison) -2. **Timestamped backups** - Each file named with `{timestamp}_{hash}_{name}.{ext}` for easy sorting and identification -3. **Separate backup PRs** - Glossary backups don't clutter translation PRs -4. **Language mapping** - Automatic conversion between internal codes (`es`) and Crowdin codes (`es-EM`) -5. **Vote filtering** - Only imports terms with minimum community consensus -6. **Fail-safe execution** - Glossary sync failures don't block translation workflow - -## Files Created - -### Core Modules - -- **src/scripts/i18n/sync-glossary.ts** (275 lines) - - - Main orchestrator - - Coordinates backup → fetch → import → PR flow - - CLI executable: `npx ts-node src/scripts/i18n/sync-glossary.ts` - -- **src/scripts/i18n/lib/glossary/supabase.ts** (206 lines) - - - REST API client (no dependencies, uses native `fetch`) - - Functions: `fetchGlossaryForLanguage`, `fetchGlossaryForAllLanguages` - - Formatters: `formatGlossaryAsCSV`, `formatGlossaryAsTBX` - -- **src/scripts/i18n/lib/crowdin/glossary.ts** (309 lines) - - - Crowdin API wrappers for glossary and TM operations - - Functions: `listGlossaries`, `exportGlossary`, `importGlossary`, `listTranslationMemories`, `exportTranslationMemory` - - Handles storage upload, import polling, and glossary creation - -- **src/scripts/i18n/lib/glossary/backup.ts** (223 lines) - - Hash calculation and change detection - - Timestamped backup file creation - - Backup history management (cleanup old backups) - - Git integration helpers - -### Configuration - -- **src/scripts/i18n/main.ts** (modified) - - - Added glossary sync step at workflow start - - Runs before pre-translation - - Skipped if resuming existing job - -- **.github/workflows/crowdin-ai-import.yml** (modified) - - - Added `SUPABASE_URL` and `SUPABASE_SERVICE_ROLE_KEY` secrets - - Added workflow inputs: `glossary_min_votes`, `skip_glossary_backup_pr` - - Environment 
variables passed to script - -- **.gitignore** (modified) - - Added `.crowdin-backups/` to ignore backups locally (they go to separate PR only) - -### Documentation - -- **docs/glossary-sync.md** (153 lines) - - Complete usage guide - - Configuration reference - - Troubleshooting - - Backup restoration instructions - -## Environment Variables - -### Required - -- `SUPABASE_URL` - Supabase project URL -- `SUPABASE_SERVICE_ROLE_KEY` - Service role API key -- `I18N_CROWDIN_API_KEY` - Crowdin API key (existing) -- `I18N_GITHUB_API_KEY` - GitHub token (existing) -- `TARGET_LANGUAGES` - Comma-separated internal codes - -### Optional - -- `GLOSSARY_MIN_VOTES` (default: `2`) - Min upvotes for inclusion -- `SKIP_GLOSSARY_BACKUP_PR` (default: `false`) - Skip PR creation - -## Workflow Integration - -### Normal Run - -1. User triggers `crowdin-ai-import.yml` workflow -2. **Glossary Sync Phase**: - - Export existing Crowdin glossaries - - Compare hashes, backup if changed - - Fetch from Supabase `top_translations` table - - Import to Crowdin per language - - Create backup PR (if changes detected) -3. Pre-translate phase (uses updated glossaries) -4. 
Build, commit, create translation PR - -### Resume Run - -- Glossary sync skipped (already completed in initial run) -- Proceeds directly to build/commit from existing pre-translation ID - -## Supabase Schema - -The script expects this table/view structure: - -```sql -CREATE OR REPLACE VIEW top_translations AS -SELECT - string_term, - translation_text, - total_votes, - language_code -FROM glossary_entries -WHERE status = 'approved' -ORDER BY total_votes DESC; -``` - -## Example Workflow Run - -``` -[GLOSSARY-SYNC] ========== Starting Glossary Sync ========== -[GLOSSARY-SYNC] Supabase URL: https://cppthnnwfvkfwgoqmhjl.supabase.co -[GLOSSARY-SYNC] Min votes: 2 - -[GLOSSARY-SYNC] Step 1: Backing up existing Crowdin glossaries -[GLOSSARY-SYNC] Found 2 existing glossaries -[GLOSSARY-SYNC] Exporting glossary: Ethereum.org Community (es-EM) -[BACKUP] Content changed for glossary:Ethereum.org Community (es-EM) -[BACKUP] Saved glossary backup: 1733011200_abc12345_ethereum_org_community_es.tbx - -[GLOSSARY-SYNC] Step 3: Fetching glossary from Supabase -[GLOSSARY] Fetching from Supabase for language: es -[GLOSSARY] Fetched 47 glossary entries for es -[GLOSSARY-SYNC] Importing glossary: Ethereum.org Community (es-EM) -[CROWDIN-GLOSSARY] Using existing glossary ID: 123456 -[CROWDIN-GLOSSARY] Import started: abc-def-123 -[CROWDIN-GLOSSARY] Import status: finished -[GLOSSARY-SYNC] ✓ Successfully updated glossary for es-EM - -[GLOSSARY-SYNC] Step 5: Creating backup PR -[GLOSSARY-SYNC] Creating branch: i18n-glossary-backup-2024-12-01 -[GLOSSARY-SYNC] Committing 3 backup files -[GLOSSARY-SYNC] Creating pull request -[GLOSSARY-SYNC] ✓ Created PR: https://github.com/ethereum/ethereum-org-website/pull/12345 - -[GLOSSARY-SYNC] ========== Sync Complete ========== -[GLOSSARY-SYNC] Updated glossaries: 2 -[GLOSSARY-SYNC] Languages: es-EM, pt-BR -[GLOSSARY-SYNC] Backup PR: https://github.com/ethereum/ethereum-org-website/pull/12345 - -[MAIN] Proceeding with pre-translation... 
-``` - -## Testing Checklist - -### Unit Testing - -- [ ] Supabase API connection (test with real endpoint) -- [ ] Hash calculation and change detection -- [ ] TBX formatting (validate XML) -- [ ] Language code mapping (internal ↔ Crowdin) - -### Integration Testing - -- [ ] Export existing glossary from Crowdin -- [ ] Fetch glossary from Supabase (test with 1-2 languages) -- [ ] Import to Crowdin (use test glossary) -- [ ] Backup file creation and Git operations -- [ ] PR creation with backup files - -### End-to-End Testing - -- [ ] Run full workflow with `GLOSSARY_MIN_VOTES=10` (limited entries) -- [ ] Verify backup PR created -- [ ] Verify glossaries updated in Crowdin -- [ ] Run again without changes (should skip backup) -- [ ] Test resume mode (glossary sync should be skipped) - -## Security Considerations - -1. **Service Role Key** - Never commit or log; use GitHub Secrets only -2. **Backup Files** - Stored in Git (ensure no sensitive data in glossary terms) -3. **API Rate Limits** - Sequential per-language imports (no parallel to avoid 429s) -4. **Error Handling** - Glossary sync failures don't break translation workflow - -## Performance Impact - -- **Additional time**: ~1-3 minutes per workflow run -- **Skipped when**: No glossary changes detected (most runs) -- **Network calls**: - - 1 Supabase query per language - - 2 Crowdin API calls per glossary (export + import) - - 1 GitHub API call per backup file - -## Future Enhancements - -1. **Incremental sync** - Only import changed terms (requires term-level tracking) -2. **TM population** - Push high-confidence translations to TM, not just glossary -3. **Bidirectional sync** - Pull Crowdin translator feedback back to Supabase -4. **Scheduled sync** - Daily cron job independent of translation workflow -5. **Term categories** - Support Crowdin glossary term tags/categories -6. 
**Conflict resolution** - Handle term collisions between languages - -## Dependencies - -**Zero new npm packages!** All implemented using: - -- Native `fetch` API (Node 18+) -- Built-in `crypto`, `fs`, `path` modules -- Existing project dependencies - -## Rollback Plan - -If issues arise: - -1. Set `SKIP_GLOSSARY_BACKUP_PR=true` to disable sync -2. Manually revert glossaries in Crowdin UI -3. Or restore from backup PR files -4. Comment out glossary sync call in `main.ts` if needed diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 984049d1f86..62725015406 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -25,7 +25,6 @@ if (!gitHubApiKey) { ) throw new Error("No GitHub API Key found (I18N_GITHUB_API_KEY)") } -console.log("[DEBUG] GitHub API key found ✓") export const gitHubBearerHeaders = { Authorization: `Bearer ${gitHubApiKey}`, @@ -41,7 +40,6 @@ if (!crowdinApiKey) { ) throw new Error("No Crowdin API Key found (I18N_CROWDIN_API_KEY)") } -console.log("[DEBUG] Crowdin API key found ✓") export const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } @@ -57,13 +55,7 @@ const targetLanguages = targetLanguagesInput.map((code) => const baseBranch = process.env.BASE_BRANCH || "dev" -const fileLimit = process.env.FILE_LIMIT - ? parseInt(process.env.FILE_LIMIT, 10) - : 100 - -const startOffset = process.env.START_OFFSET - ? 
parseInt(process.env.START_OFFSET, 10) - : 0 +const targetPath = process.env.TARGET_PATH || "" // Adaptive polling / timeout configuration (milliseconds) const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS @@ -76,28 +68,33 @@ const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS const existingPreTranslationId = process.env.PRETRANSLATION_ID || "" +const verbose = process.env.VERBOSE === "true" + // Parse GitHub repository from env (format: "owner/repo") const githubRepo = process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website" const [ghOrganization, ghRepo] = githubRepo.split("/") -console.log("[DEBUG] Configuration:") -console.log( - `[DEBUG] - Target languages (internal): ${targetLanguagesInput.join(", ")}` -) -console.log( - `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}` -) -console.log(`[DEBUG] - Base branch: ${baseBranch}`) -console.log(`[DEBUG] - File limit: ${fileLimit}`) -console.log(`[DEBUG] - Start offset: ${startOffset}`) -console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) -console.log(`[DEBUG] - Pretranslate timeout ms: ${pretranslateTimeoutMs}`) -console.log(`[DEBUG] - Pretranslate poll base ms: ${pretranslatePollBaseMs}`) -if (existingPreTranslationId) { +if (verbose) { + console.log("[DEBUG] Configuration:") + console.log( + `[DEBUG] - Target languages (internal): ${targetLanguagesInput.join(", ")}` + ) + console.log( + `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}` + ) + console.log(`[DEBUG] - Base branch: ${baseBranch}`) console.log( - `[DEBUG] - Resuming from pre-translation ID: ${existingPreTranslationId}` + `[DEBUG] - Target path: ${targetPath || "none (full translation)"}` ) + console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) + console.log(`[DEBUG] - Pretranslate timeout ms: ${pretranslateTimeoutMs}`) + console.log(`[DEBUG] - Pretranslate poll base ms: ${pretranslatePollBaseMs}`) + if (existingPreTranslationId) { + console.log( + 
`[DEBUG] - Resuming from pre-translation ID: ${existingPreTranslationId}` + ) + } } // Main configuration object @@ -110,14 +107,13 @@ export const config = { preTranslatePromptId: Number.parseInt( process.env.PRE_TRANSLATE_PROMPT_ID || "326942" ), - qaPromptId: Number.parseInt(process.env.QA_PROMPT_ID || "168592"), allCrowdinCodes: targetLanguages, baseBranch, - fileLimit, - startOffset, + targetPath, pretranslateTimeoutMs, pretranslatePollBaseMs, existingPreTranslationId, + verbose, } // Constants diff --git a/src/scripts/i18n/config/canonical-llm-language-list.json b/src/scripts/i18n/config/canonical-llm-language-list.json new file mode 100644 index 00000000000..fca8cb14d29 --- /dev/null +++ b/src/scripts/i18n/config/canonical-llm-language-list.json @@ -0,0 +1,27 @@ +[ + { "code": "zh", "language": "Chinese (Simplified)", "coverageRank": 1 }, + { "code": "es", "language": "Spanish", "coverageRank": 2 }, + { "code": "hi", "language": "Hindi", "coverageRank": 3 }, + { "code": "ar", "language": "Arabic", "coverageRank": 4 }, + { "code": "pt-br", "language": "Portuguese (Brazil)", "coverageRank": 5 }, + { "code": "bn", "language": "Bengali", "coverageRank": 6 }, + { "code": "ru", "language": "Russian", "coverageRank": 7 }, + { "code": "id", "language": "Indonesian", "coverageRank": 8 }, + { "code": "fr", "language": "French", "coverageRank": 9 }, + { "code": "ja", "language": "Japanese", "coverageRank": 10 }, + { "code": "de", "language": "German", "coverageRank": 11 }, + { "code": "ur", "language": "Urdu", "coverageRank": 12 }, + { "code": "zh-tw", "language": "Chinese (Traditional)", "coverageRank": 13 }, + { "code": "tr", "language": "Turkish", "coverageRank": 14 }, + { "code": "vi", "language": "Vietnamese", "coverageRank": 15 }, + { "code": "ko", "language": "Korean", "coverageRank": 16 }, + { "code": "te", "language": "Telugu", "coverageRank": 17 }, + { "code": "mr", "language": "Marathi", "coverageRank": 18 }, + { "code": "ta", "language": "Tamil", 
"coverageRank": 19 }, + { "code": "it", "language": "Italian", "coverageRank": 20 }, + { "code": "pt", "language": "Portuguese (Euro)", "coverageRank": 21 }, + { "code": "pl", "language": "Polish", "coverageRank": 22 }, + { "code": "uk", "language": "Ukrainian", "coverageRank": 23 }, + { "code": "sw", "language": "Swahili", "coverageRank": 24 }, + { "code": "cs", "language": "Czech", "coverageRank": 25 } +] \ No newline at end of file diff --git a/src/scripts/i18n/config/excluded-paths.json b/src/scripts/i18n/config/excluded-paths.json new file mode 100644 index 00000000000..e179216a205 --- /dev/null +++ b/src/scripts/i18n/config/excluded-paths.json @@ -0,0 +1 @@ +["/cookie-policy/", "/privacy-policy/", "/terms-of-use/", "/style-guide/"] diff --git a/src/scripts/i18n/config/language-trust.json b/src/scripts/i18n/config/language-trust.json deleted file mode 100644 index 8fd94e78241..00000000000 --- a/src/scripts/i18n/config/language-trust.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "default": { - "lastUpdated": "2025-11-28T00:00:00Z", - "Aplus": ["es", "fr", "de", "zh"], - "A": ["ar", "it", "ja", "pt-br", "ru", "zh-tw"], - "Aminus": ["ko", "pt", "nl", "se", "uk"], - "Bplus": ["tr", "pl", "cs", "hi", "vi", "el", "he", "id", "da", "nb", "ca"], - "B": [ - "bn", - "th", - "fa", - "ro", - "hu", - "fi", - "fil", - "tl", - "ur", - "ms", - "bg", - "hr", - "sr", - "bs", - "sk", - "sl", - "gl" - ], - "Bminus": ["ta", "te", "kn", "gu", "mr", "ml", "lt"], - "Cplus": [ - "hy-am", - "ka", - "kk", - "uz", - "az", - "be", - "ga", - "sw", - "am", - "ne-np", - "km" - ], - "C": ["ha", "ig", "yo", "sn", "tw", "tk"], - "Dplus": ["pcm"] - } -} diff --git a/src/scripts/i18n/gen_trust_matrix.ts b/src/scripts/i18n/gen_trust_matrix.ts deleted file mode 100644 index bc4cd594bcf..00000000000 --- a/src/scripts/i18n/gen_trust_matrix.ts +++ /dev/null @@ -1,57 +0,0 @@ -import fs from "fs" -import path from "path" - -import i18nConfig from "../../../i18n.config.json" - -import { getPromptModelKey } 
from "./lib/crowdin/prompt-model" -import type { I18nConfigItem } from "./lib/types" - -// Helper to get all internal language codes -function getInternalLanguageCodes(): string[] { - return i18nConfig.map((lang: I18nConfigItem) => lang.code) -} - -// Helper to call Copilot/GPT for trust matrix generation -async function generateTrustMatrix( - modelKey: string, - internalCodes: string[] -): Promise { - // Compose prompt for Copilot/GPT - const prompt = `You are an expert in language quality assessment for AI translation models. Given the model ${modelKey} and the following internal language codes: ${internalCodes.join(", ")}, group these codes into buckets by expected translation quality (Aplus, A, Aminus, Bplus, B, Bminus, Cplus, C, Dplus). Output a JSON object with these groups as keys and arrays of codes as values. Only use the provided codes.` - - // Call Copilot (GPT-4.1) via API (pseudo-code, replace with actual API call) - // const response = await copilotApi.generate({ prompt }) - // return JSON.parse(response) - - // For now, just log the prompt and return an empty object - console.log("Prompt for Copilot/GPT:", prompt) - return {} -} - -async function main() { - const userId = process.env.I18N_CROWDIN_USER_ID - const promptId = process.env.I18N_CROWDIN_PROMPT_ID - if (!userId || !promptId) { - throw new Error( - "Set I18N_CROWDIN_USER_ID and I18N_CROWDIN_PROMPT_ID in your .env.local" - ) - } - const modelKey = await getPromptModelKey(Number(userId), Number(promptId)) - const internalCodes = getInternalLanguageCodes() - const matrix = await generateTrustMatrix(modelKey, internalCodes) - - // Write to language-trust.json - const outPath = path.join( - process.cwd(), - "src/scripts/i18n/config/language-trust.json" - ) - fs.writeFileSync(outPath, JSON.stringify({ [modelKey]: matrix }, null, 2)) - console.log( - `Trust matrix for model ${modelKey} written to language-trust.json` - ) -} - -main().catch((err) => { - console.error(err) - process.exit(1) -}) diff 
--git a/src/scripts/i18n/glossary-sync.md b/src/scripts/i18n/glossary-sync.md deleted file mode 100644 index edfaad7b0f2..00000000000 --- a/src/scripts/i18n/glossary-sync.md +++ /dev/null @@ -1,149 +0,0 @@ -# Glossary Synchronization - -Automatically syncs community-approved translations from Supabase to Crowdin glossaries before each translation workflow run. - -## How It Works - -1. **Backup Existing**: Exports current Crowdin glossaries and Translation Memories, calculates content hashes, and saves timestamped backups if content changed -2. **Fetch from Supabase**: Queries the `top_translations` table for terms with minimum vote threshold per language -3. **Import to Crowdin**: Formats as TBX and imports into Crowdin glossaries (creates new glossary per language if doesn't exist) -4. **Create Backup PR**: Commits backup files to a separate branch (`i18n-glossary-backup-YYYY-MM-DD`) and creates a PR - -## Backup Structure - -``` -.crowdin-backups/ -├── glossary/ -│ ├── 1733011200_abc12345_ethereum_org_community_es.tbx -│ └── 1733011200_def67890_ethereum_org_community_pt.tbx -├── tm/ -│ ├── 1733011200_xyz98765_main_translation_memory.tmx -│ └── ... 
-└── hashes.json # Tracks content hashes to detect changes -``` - -Each backup filename includes: - -- Unix timestamp (for sorting/chronology) -- Short hash (first 8 chars of SHA-256, for quick verification) -- Sanitized resource name - -## Configuration - -### Required Secrets (GitHub Actions) - -Add these to your repository secrets: - -- `SUPABASE_URL`: Your Supabase project URL (default: `https://cppthnnwfvkfwgoqmhjl.supabase.co`) -- `SUPABASE_SERVICE_ROLE_KEY`: Supabase service role key for API access - -### Environment Variables - -- `GLOSSARY_MIN_VOTES` (default: `2`): Minimum upvotes required for a term to be included -- `SKIP_GLOSSARY_BACKUP_PR` (default: `false`): Set to `true` to save backups locally without creating a PR - -### Supabase Schema - -The script expects a `top_translations` table (or view) with these columns: - -```sql -CREATE TABLE top_translations ( - string_term TEXT NOT NULL, -- The English term - translation_text TEXT NOT NULL, -- The translated term - total_votes INTEGER NOT NULL, -- Number of upvotes - language_code TEXT NOT NULL -- Internal language code (e.g., 'es', 'pt', 'fr') -); -``` - -**Note**: Use internal language codes (`es`, `pt`, `zh`, etc.) in Supabase. The script automatically maps them to Crowdin codes (`es-EM`, `pt-BR`, `zh-CN`, etc.). - -## Workflow Integration - -The glossary sync runs automatically at the start of each `crowdin-ai-import.yml` workflow before pre-translation: - -```yaml -- name: Run Crowdin AI translation import - run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main.ts - env: - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} - GLOSSARY_MIN_VOTES: ${{ github.event.inputs.glossary_min_votes }} - SKIP_GLOSSARY_BACKUP_PR: ${{ github.event.inputs.skip_glossary_backup_pr }} - # ... 
other env vars -``` - -## Manual Sync - -You can also run the glossary sync manually: - -```bash -# Set required environment variables -export SUPABASE_URL="https://your-project.supabase.co" -export SUPABASE_SERVICE_ROLE_KEY="your-service-role-key" -export I18N_CROWDIN_API_KEY="your-crowdin-api-key" -export I18N_GITHUB_API_KEY="your-github-token" -export TARGET_LANGUAGES="es,pt,fr" # Internal codes -export GLOSSARY_MIN_VOTES="2" -export SKIP_GLOSSARY_BACKUP_PR="false" - -# Run the sync -npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/sync-glossary.ts -``` - -## Backup Restoration - -If you need to revert to a previous glossary version: - -1. Find the backup file in `.crowdin-backups/glossary/` (sorted by timestamp, most recent first) -2. Download the TBX file -3. Manually import it to Crowdin via UI or API: - ```bash - # Using the Crowdin API - curl -X POST "https://api.crowdin.com/api/v2/glossaries/{glossaryId}/imports" \ - -H "Authorization: Bearer $CROWDIN_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{"storageId": "...", "scheme": {...}}' - ``` - -Or merge the backup PR to preserve it in Git history, then cherry-pick specific versions as needed. 
- -## How Crowdin Uses Glossaries - -- **For AI/MT**: Crowdin AI and Machine Translation engines prefer glossary terms when translating -- **For Human Translators**: Terms are highlighted in the editor with suggested translations -- **For QA**: Crowdin can flag inconsistent terminology (if QA checks are enabled) -- **For TM Matching**: Glossary terms boost Translation Memory match confidence - -## Troubleshooting - -### "No glossary entries found for language X" - -- Check that `top_translations` has rows for that language code -- Verify `GLOSSARY_MIN_VOTES` isn't too high -- Confirm you're using internal codes (`es`) not Crowdin codes (`es-EM`) - -### "Failed to import glossary" - -- Check Crowdin API rate limits -- Verify TBX format is valid (UTF-8, well-formed XML) -- Ensure glossary name doesn't conflict with existing non-project glossaries - -### "Backup PR creation failed" - -- Verify `I18N_GITHUB_API_KEY` has `repo` scope -- Check branch doesn't already exist (delete old backup branches if needed) -- Ensure `.crowdin-backups/` is in `.gitignore` so backups go to PR only - -## Performance Notes - -- Glossary export/import for 10 languages: ~30-60 seconds -- TM export (large): ~2-5 minutes -- Backup PR creation: ~10-30 seconds (depends on file count) -- Total overhead: ~1-3 minutes per workflow run (skipped if no changes) - -## Files - -- `src/scripts/i18n/sync-glossary.ts` - Main orchestrator -- `src/scripts/i18n/lib/glossary/supabase.ts` - Supabase REST API client -- `src/scripts/i18n/lib/crowdin/glossary.ts` - Crowdin glossary/TM API wrappers -- `src/scripts/i18n/lib/glossary/backup.ts` - Hash calculation, file I/O, Git operations diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index 9c93b2d0fe5..50330733e95 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -20,15 +20,17 @@ export const getCrowdinProjectFiles = async (): Promise => { ) url.searchParams.set("limit", 
"500") - console.log(`[DEBUG] Fetching Crowdin project files from: ${url.toString()}`) + if (config.verbose) { + console.log( + `[DEBUG] Fetching Crowdin project files from: ${url.toString()}` + ) + } try { const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) if (!res.ok) { - console.warn(`[ERROR] Crowdin API response not OK: ${res.status}`) const body = await res.text().catch(() => "") - console.error(`[ERROR] Response body:`, body) throw new Error( `Crowdin getCrowdinProjectFiles failed (${res.status}): ${body}` ) @@ -39,10 +41,11 @@ export const getCrowdinProjectFiles = async (): Promise => { const mappedData = json.data.map(({ data }) => data) - console.log( - `[DEBUG] Successfully fetched ${mappedData.length} Crowdin files` - ) - console.log(`[DEBUG] First Crowdin file:`, mappedData[0]) + if (config.verbose) { + console.log( + `[DEBUG] Successfully fetched ${mappedData.length} Crowdin files` + ) + } return mappedData } catch (error) { console.error(`[ERROR] Failed to fetch Crowdin project files:`, error) @@ -57,17 +60,11 @@ export const findCrowdinFile = ( targetFile: GitHubCrowdinFileMetadata, crowdinFiles: CrowdinFileData[] ): CrowdinFileData => { - console.log( - `[DEBUG] Looking for Crowdin file matching: ${targetFile.filePath}` - ) - console.log(`[DEBUG] Target file name: ${targetFile["Crowdin-API-FileName"]}`) - - // Log first few Crowdin files for comparison - console.log(`[DEBUG] Total Crowdin files found: ${crowdinFiles.length}`) - console.log( - `[DEBUG] First 3 Crowdin file paths:`, - crowdinFiles.slice(0, 3).map((f) => f.path) - ) + if (config.verbose) { + console.log( + `[DEBUG] Looking for Crowdin file matching: ${targetFile.filePath}` + ) + } const found = crowdinFiles.find(({ path }) => path.endsWith(targetFile.filePath) @@ -86,9 +83,11 @@ export const findCrowdinFile = ( ) } - console.log( - `[DEBUG] Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` - ) + if (config.verbose) { + console.log( + 
`[DEBUG] Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` + ) + } return found } @@ -98,7 +97,9 @@ export const findCrowdinFile = ( * This function makes them visible so they can be processed by pre-translation. */ export const unhideStringsInFile = async (fileId: number): Promise => { - console.log(`[UNHIDE] Checking for hidden strings in fileId=${fileId}`) + if (config.verbose) { + console.log(`[DEBUG] Checking for hidden strings in fileId=${fileId}`) + } // Get all strings from the file const listUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings?fileId=${fileId}&limit=500` diff --git a/src/scripts/i18n/lib/crowdin/glossary.ts b/src/scripts/i18n/lib/crowdin/glossary.ts deleted file mode 100644 index 26012dc2b41..00000000000 --- a/src/scripts/i18n/lib/crowdin/glossary.ts +++ /dev/null @@ -1,354 +0,0 @@ -/** - * Crowdin Glossary and Translation Memory API operations - */ - -import { - config, - CROWDIN_API_BASE_URL, - crowdinBearerHeaders, -} from "../../config" - -export interface CrowdinGlossary { - id: number - name: string - languageId: string - terms: number - createdAt: string -} - -export interface CrowdinTMSegment { - id: number - text: string - translation: string - createdAt: string -} - -/** - * List all glossaries in the project - */ -export async function listGlossaries(): Promise { - const url = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/glossaries` - console.log(`[CROWDIN-GLOSSARY] Fetching glossaries from: ${url}`) - - try { - const response = await fetch(url, { headers: crowdinBearerHeaders }) - - if (!response.ok) { - const errorText = await response.text() - throw new Error( - `Crowdin glossaries list failed (${response.status}): ${errorText}` - ) - } - - const json: { data: { data: CrowdinGlossary }[] } = await response.json() - const glossaries = json.data.map(({ data }) => data) - - console.log(`[CROWDIN-GLOSSARY] Found ${glossaries.length} glossaries`) - return glossaries - } catch (error) 
{ - console.error(`[CROWDIN-GLOSSARY] Failed to list glossaries:`, error) - throw error - } -} - -/** - * Export a glossary to TBX format - */ -export async function exportGlossary(glossaryId: number): Promise { - const url = `${CROWDIN_API_BASE_URL}/glossaries/${glossaryId}/exports` - console.log(`[CROWDIN-GLOSSARY] Exporting glossary ${glossaryId}`) - - try { - // Start export - const exportResponse = await fetch(url, { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ format: "tbx" }), - }) - - if (!exportResponse.ok) { - const errorText = await exportResponse.text() - throw new Error( - `Crowdin glossary export failed (${exportResponse.status}): ${errorText}` - ) - } - - const exportJson: { data: { url: string; identifier: string } } = - await exportResponse.json() - const downloadUrl = exportJson.data.url - - console.log( - `[CROWDIN-GLOSSARY] Export ready, downloading from: ${downloadUrl}` - ) - - // Download the exported file - const downloadResponse = await fetch(downloadUrl) - if (!downloadResponse.ok) { - throw new Error( - `Failed to download glossary export (${downloadResponse.status})` - ) - } - - const content = await downloadResponse.text() - console.log( - `[CROWDIN-GLOSSARY] Downloaded glossary (${content.length} bytes)` - ) - - return content - } catch (error) { - console.error(`[CROWDIN-GLOSSARY] Failed to export glossary:`, error) - throw error - } -} - -/** - * Import a glossary from TBX content (creates or updates glossary) - */ -export async function importGlossary( - name: string, - languageId: string, - tbxContent: string -): Promise<{ glossaryId: number; imported: number }> { - console.log(`[CROWDIN-GLOSSARY] Importing glossary: ${name} (${languageId})`) - - try { - // Check if glossary exists - const existingGlossaries = await listGlossaries() - const existing = existingGlossaries.find((g) => g.name === name) - - let glossaryId: number - - if (existing) { - 
console.log( - `[CROWDIN-GLOSSARY] Using existing glossary ID: ${existing.id}` - ) - glossaryId = existing.id - } else { - // Create new glossary - console.log(`[CROWDIN-GLOSSARY] Creating new glossary: ${name}`) - const createUrl = `${CROWDIN_API_BASE_URL}/glossaries` - const createResponse = await fetch(createUrl, { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ name, languageId }), - }) - - if (!createResponse.ok) { - const errorText = await createResponse.text() - throw new Error( - `Failed to create glossary (${createResponse.status}): ${errorText}` - ) - } - - const createJson: { data: { id: number } } = await createResponse.json() - glossaryId = createJson.data.id - console.log(`[CROWDIN-GLOSSARY] Created glossary with ID: ${glossaryId}`) - } - - // Upload TBX file to storage first - const storageId = await uploadToStorage(tbxContent, "glossary.tbx") - - // Import the glossary - const importUrl = `${CROWDIN_API_BASE_URL}/glossaries/${glossaryId}/imports` - const importResponse = await fetch(importUrl, { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - storageId, - scheme: { - sourceLanguageId: "en", - targetLanguageId: languageId, - }, - }), - }) - - if (!importResponse.ok) { - const errorText = await importResponse.text() - throw new Error( - `Failed to import glossary (${importResponse.status}): ${errorText}` - ) - } - - const importJson: { data: { identifier: string } } = - await importResponse.json() - console.log( - `[CROWDIN-GLOSSARY] Import started: ${importJson.data.identifier}` - ) - - // Wait for import to complete (simple polling) - await waitForImport(glossaryId, importJson.data.identifier) - - console.log( - `[CROWDIN-GLOSSARY] Successfully imported glossary ${glossaryId}` - ) - return { glossaryId, imported: 0 } // Crowdin doesn't return count immediately - } catch (error) { - 
console.error(`[CROWDIN-GLOSSARY] Failed to import glossary:`, error) - throw error - } -} - -/** - * Upload content to Crowdin storage - */ -async function uploadToStorage( - content: string, - filename: string -): Promise { - const url = `${CROWDIN_API_BASE_URL}/storages` - console.log(`[CROWDIN-GLOSSARY] Uploading to storage: ${filename}`) - - const formData = new FormData() - const blob = new Blob([content], { type: "application/xml" }) - formData.append("file", blob, filename) - - try { - const response = await fetch(url, { - method: "POST", - headers: { - Authorization: crowdinBearerHeaders.Authorization, - }, - body: formData, - }) - - if (!response.ok) { - const errorText = await response.text() - throw new Error( - `Storage upload failed (${response.status}): ${errorText}` - ) - } - - const json: { data: { id: number } } = await response.json() - console.log(`[CROWDIN-GLOSSARY] Uploaded to storage ID: ${json.data.id}`) - return json.data.id - } catch (error) { - console.error(`[CROWDIN-GLOSSARY] Storage upload failed:`, error) - throw error - } -} - -/** - * Wait for glossary import to complete - */ -async function waitForImport( - glossaryId: number, - identifier: string -): Promise { - const maxAttempts = 30 - const delayMs = 2000 - - for (let i = 0; i < maxAttempts; i++) { - await new Promise((resolve) => setTimeout(resolve, delayMs)) - - const url = `${CROWDIN_API_BASE_URL}/glossaries/${glossaryId}/imports/${identifier}` - const response = await fetch(url, { headers: crowdinBearerHeaders }) - - if (!response.ok) continue - - const json: { data: { status: string } } = await response.json() - console.log(`[CROWDIN-GLOSSARY] Import status: ${json.data.status}`) - - if (json.data.status === "finished") { - return - } - - if (json.data.status === "failed") { - throw new Error("Glossary import failed") - } - } - - throw new Error("Glossary import timeout") -} - -/** - * List Translation Memory (TM) resources - */ -export async function 
listTranslationMemories(): Promise< - Array<{ id: number; name: string; languageId: string }> -> { - const url = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/tms` - console.log(`[CROWDIN-TM] Fetching TMs from: ${url}`) - - try { - const response = await fetch(url, { headers: crowdinBearerHeaders }) - - if (!response.ok) { - const errorText = await response.text() - throw new Error( - `Crowdin TM list failed (${response.status}): ${errorText}` - ) - } - - const json: { - data: { data: { id: number; name: string; languageId: string } }[] - } = await response.json() - const tms = json.data.map(({ data }) => data) - - console.log(`[CROWDIN-TM] Found ${tms.length} TMs`) - return tms - } catch (error) { - console.error(`[CROWDIN-TM] Failed to list TMs:`, error) - throw error - } -} - -/** - * Export Translation Memory to TMX format - */ -export async function exportTranslationMemory(tmId: number): Promise { - const url = `${CROWDIN_API_BASE_URL}/tms/${tmId}/exports` - console.log(`[CROWDIN-TM] Exporting TM ${tmId}`) - - try { - // Start export - const exportResponse = await fetch(url, { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - sourceLanguageId: "en", - targetLanguageId: "all", - format: "tmx", - }), - }) - - if (!exportResponse.ok) { - const errorText = await exportResponse.text() - throw new Error( - `Crowdin TM export failed (${exportResponse.status}): ${errorText}` - ) - } - - const exportJson: { data: { url: string } } = await exportResponse.json() - const downloadUrl = exportJson.data.url - - console.log(`[CROWDIN-TM] Export ready, downloading from: ${downloadUrl}`) - - // Download the exported file - const downloadResponse = await fetch(downloadUrl) - if (!downloadResponse.ok) { - throw new Error( - `Failed to download TM export (${downloadResponse.status})` - ) - } - - const content = await downloadResponse.text() - console.log(`[CROWDIN-TM] Downloaded TM 
(${content.length} bytes)`) - - return content - } catch (error) { - console.error(`[CROWDIN-TM] Failed to export TM:`, error) - throw error - } -} diff --git a/src/scripts/i18n/lib/crowdin/qa-completions.ts b/src/scripts/i18n/lib/crowdin/qa-completions.ts deleted file mode 100644 index 0f1d9206f6f..00000000000 --- a/src/scripts/i18n/lib/crowdin/qa-completions.ts +++ /dev/null @@ -1,225 +0,0 @@ -// Crowdin AI Completions (QA check) helpers - -import { - config, - CROWDIN_API_BASE_URL, - crowdinBearerHeaders, -} from "../../config" - -export type QaCompletionRequest = { - projectId: number - sourceLanguageId: string - targetLanguageId: string - stringIds: number[] -} - -export type QaCompletionJob = { - id: string - status: "in_progress" | "finished" | string - progress?: number -} - -export type QaIssue = { - fileId: number - stringId: number - severity: "error" | "warning" | "info" - title: string - details?: string -} - -const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) - -/** - * Resolve the Crowdin user ID from the API - */ -export const resolveCrowdinUserId = async (): Promise => { - const url = new URL(`${CROWDIN_API_BASE_URL}/user`) - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`resolveCrowdinUserId (${res.status}): ${text}`) - } - const json = await res.json() - const id = String(json.data?.id || json.id) - if (!id) throw new Error("Failed to resolve Crowdin user id from /user") - return id -} - -/** - * List all string IDs for a given file - */ -export const listStringIdsForFile = async ( - fileId: number -): Promise => { - const url = new URL( - `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings` - ) - url.searchParams.set("fileId", String(fileId)) - url.searchParams.set("limit", "500") - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await 
res.text().catch(() => "") - throw new Error(`listStringIdsForFile (${res.status}): ${text}`) - } - const json = await res.json() - type StringItem = { data: { id: number } } - const items: StringItem[] = json.data || [] - const ids: number[] = items.map((d) => d.data.id) - return ids -} - -/** - * Post QA completions request - */ -export const postQaCompletions = async ( - qaPromptId: number, - payload: QaCompletionRequest -): Promise => { - const userId = await resolveCrowdinUserId() - if (!userId) - throw new Error("Failed to resolve Crowdin user ID for completions") - const url = new URL( - `${CROWDIN_API_BASE_URL}/users/${userId}/ai/prompts/${qaPromptId}/completions` - ) - const bodyPayload = { resources: payload } - console.log(`[QA-CHECK][DEBUG] POST ${url.toString()}`) - console.log(`[QA-CHECK][DEBUG] Body:`, JSON.stringify(bodyPayload, null, 2)) - const res = await fetch(url.toString(), { - method: "POST", - headers: { ...crowdinBearerHeaders, "Content-Type": "application/json" }, - body: JSON.stringify(bodyPayload), - }) - console.log(`[QA-CHECK][DEBUG] Response status: ${res.status}`) - if (!res.ok) { - const text = await res.text().catch(() => "") - console.log(`[QA-CHECK][DEBUG] Error response:`, text) - if (res.status === 403) { - throw new Error( - `QA completions endpoint not accessible (403). ` + - `This may require Crowdin Enterprise or AI credits. 
URL: ${url.toString()} Raw: ${text}` - ) - } - throw new Error(`postQaCompletions (${res.status}): ${text}`) - } - const json = await res.json() - console.log( - `[QA-CHECK][DEBUG] Success response:`, - JSON.stringify(json, null, 2) - ) - return json.data as QaCompletionJob -} - -/** - * Get QA completion status - */ -export const getQaCompletion = async ( - completionId: string -): Promise => { - const userId = await resolveCrowdinUserId() - const url = new URL( - `${CROWDIN_API_BASE_URL}/users/${userId}/ai/prompts/completions/${completionId}` - ) - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`getQaCompletion (${res.status}): ${text}`) - } - const json = await res.json() - return json.data as QaCompletionJob -} - -/** - * Poll QA completion until finished with adaptive intervals - */ -export const awaitQaCompletion = async ( - completionId: string, - timeoutMs = config.pretranslateTimeoutMs, - baseIntervalMs = config.pretranslatePollBaseMs -): Promise => { - const start = Date.now() - let attempt = 0 - const computeInterval = (elapsedMs: number): number => { - const minutes = elapsedMs / 60000 - if (minutes < 10) return baseIntervalMs - if (minutes < 30) return Math.max(baseIntervalMs * 2, 60_000) - if (minutes < 60) return Math.max(baseIntervalMs * 4, 180_000) - return Math.max(baseIntervalMs * 10, 300_000) - } - while (Date.now() - start <= timeoutMs) { - attempt++ - const elapsed = Date.now() - start - let job: QaCompletionJob - try { - job = await getQaCompletion(completionId) - } catch (e) { - const wait = computeInterval(elapsed) - console.warn( - `[QA-CHECK][POLL] Error on attempt ${attempt}: ${(e as Error).message}. Waiting ${wait}ms.` - ) - await delay(wait) - continue - } - if (job.status !== "in_progress") return job - const wait = computeInterval(elapsed) - console.log( - `[QA-CHECK][POLL] attempt=${attempt} progress=${job.progress ?? 
0}% nextWait=${wait}ms` - ) - await delay(wait) - } - throw new Error("Timed out awaiting QA completion") -} - -/** - * Download QA completion results - */ -export const downloadQaCompletionResult = async ( - completionId: string -): Promise => { - const userId = await resolveCrowdinUserId() - const url = new URL( - `${CROWDIN_API_BASE_URL}/users/${userId}/ai/prompts/completions/${completionId}/download` - ) - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`downloadQaCompletionResult (${res.status}): ${text}`) - } - // Assume JSON structure containing issues; adjust as per actual response - const arrayBuffer = await res.arrayBuffer() - const text = Buffer.from(arrayBuffer).toString("utf-8") - try { - const parsed = JSON.parse(text) - const issues: QaIssue[] = parsed.issues || parsed.data || [] - return issues - } catch { - // If plain text, return empty and attach raw for summary - return [] - } -} - -/** - * Summarize QA issues for PR body - */ -export const summarizeQaIssues = ( - issues: QaIssue[], - fileIdToPath: Record, - lang: string -): string => { - if (!issues.length) return `No QA issues detected for ${lang}.` - const counts = { error: 0, warning: 0, info: 0 } - for (const i of issues) { - const sev = i.severity - if (sev === "error" || sev === "warning" || sev === "info") { - counts[sev]++ - } - } - const top = issues.slice(0, 10) - const lines = [ - `QA for ${lang}: ${counts.error} errors, ${counts.warning} warnings, ${counts.info} info`, - ] - for (const i of top) { - const path = fileIdToPath[i.fileId] || `fileId=${i.fileId}` - lines.push(`- [${i.severity}] ${path} string=${i.stringId} — ${i.title}`) - } - return lines.join("\n") -} diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts index 1314df0b67c..fab7e4dfa8f 100644 --- a/src/scripts/i18n/lib/github/files.ts +++ b/src/scripts/i18n/lib/github/files.ts 
@@ -1,5 +1,8 @@ // GitHub file operations +import * as fs from "fs" +import * as path from "path" + import { config, gitHubBearerHeaders } from "../../config" import type { ContentType, @@ -9,31 +12,103 @@ import type { import { fetchWithRetry } from "../utils/fetch" /** - * Get English files with pagination, allowing limit + offset. - * GitHub Search API caps `per_page` at 100; we fetch pages until - * we accumulate `offset + limit` items, then return the slice. + * Load excluded paths from config + */ +function loadExcludedPaths(): string[] { + try { + const excludedPathsFile = path.join( + process.cwd(), + "src/scripts/i18n/config/excluded-paths.json" + ) + const raw = fs.readFileSync(excludedPathsFile, "utf8") + return JSON.parse(raw) as string[] + } catch { + return [] + } +} + +/** + * Check if a path should be excluded + */ +function isPathExcluded(filePath: string, excludedPaths: string[]): boolean { + return excludedPaths.some((excluded) => filePath.includes(excluded)) +} + +/** + * Check if a path is a file (has .md or .json extension) or directory + */ +function isFilePath(targetPath: string): boolean { + return targetPath.endsWith(".md") || targetPath.endsWith(".json") +} + +/** + * Get English files with optional file/directory filtering and excluded paths. + * If targetPath is a file (ends with .md or .json), returns only that file. + * If targetPath is a directory, returns all files recursively within that directory. + * Otherwise, returns all English content files. 
*/ -export const getAllEnglishFiles = async ( - limit = 100, - offset = 0 -): Promise => { +export const getAllEnglishFiles = async (): Promise< + GitHubQueryResponseItem[] +> => { + const { targetPath, verbose } = config + const excludedPaths = loadExcludedPaths() + + if (verbose) { + console.log( + `[DEBUG] Excluded paths loaded: ${excludedPaths.length} entries` + ) + } + + // Determine if targetPath is a file or directory + if (targetPath) { + if (isPathExcluded(targetPath, excludedPaths)) { + console.log(`[INFO] Path ${targetPath} is in excluded paths, skipping`) + return [] + } + + if (isFilePath(targetPath)) { + // Single file mode + console.log(`[INFO] Fetching single file: ${targetPath}`) + return await fetchSingleFile(targetPath) + } else { + // Directory mode + console.log(`[INFO] Fetching files from directory: ${targetPath}`) + } + } + + // Directory mode or full translation const ghSearchEndpointBase = "https://api.github.com/search/code" - const query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${config.mdRoot}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${config.jsonRoot}"` + let query: string + + if (targetPath && !isFilePath(targetPath)) { + // Search within specific directory + query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${targetPath}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${targetPath}"` + } else { + // Search all content files + query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${config.mdRoot}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${config.jsonRoot}"` + if (!targetPath) { + console.log(`[INFO] Fetching all English content files`) + } + } - console.log(`[DEBUG] GitHub search query: ${query}`) + if (verbose) { + console.log(`[DEBUG] GitHub search query: 
${query}`) + } const perPage = 100 - const needed = offset + limit const collected: GitHubQueryResponseItem[] = [] let page = 1 - while (collected.length < needed) { + let hasMorePages = true + while (hasMorePages) { const url = new URL(ghSearchEndpointBase) url.searchParams.set("q", query) url.searchParams.set("per_page", perPage.toString()) url.searchParams.set("page", page.toString()) - console.log(`[DEBUG] Fetching search page ${page} ...`) + if (verbose) { + console.log(`[DEBUG] Fetching search page ${page}...`) + } try { const res = await fetchWithRetry(url.toString(), { @@ -41,9 +116,7 @@ export const getAllEnglishFiles = async ( }) if (!res.ok) { - console.warn(`[ERROR] GitHub API response not OK: ${res.status}`) const body = await res.text().catch(() => "") - console.error(`[ERROR] Response body:`, body) throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) } @@ -51,19 +124,23 @@ export const getAllEnglishFiles = async ( const json: JsonResponse = await res.json() if (!json.items.length) { - console.log(`[DEBUG] No more results at page ${page}.`) + if (verbose) { + console.log(`[DEBUG] No more results at page ${page}`) + } + hasMorePages = false break } collected.push(...json.items) - console.log(`[DEBUG] Collected ${collected.length} items so far.`) + + if (verbose) { + console.log(`[DEBUG] Collected ${collected.length} items so far`) + } page += 1 if (page > 10) { - // Safety cap: avoid excessive paging; typical search caps ~1000 results - console.warn( - `[WARN] Reached pagination safety cap at page ${page - 1}.` - ) + console.warn(`[WARN] Reached pagination safety cap at page ${page - 1}`) + hasMorePages = false break } } catch (error) { @@ -72,12 +149,82 @@ export const getAllEnglishFiles = async ( } } - const sliced = collected.slice(offset, offset + limit) - console.log( - `[DEBUG] Returning ${sliced.length} files (offset=${offset}, limit=${limit})` + // Filter out excluded paths + const filtered = collected.filter( + (item) => 
!isPathExcluded(item.path, excludedPaths) ) - if (sliced.length) console.log(`[DEBUG] First GitHub file:`, sliced[0]) - return sliced + + const excludedCount = collected.length - filtered.length + if (excludedCount > 0) { + console.log(`[INFO] Filtered out ${excludedCount} excluded files`) + } + + console.log(`[INFO] Total files to translate: ${filtered.length}`) + + return filtered +} + +/** + * Fetch a single file by path from GitHub + */ +async function fetchSingleFile( + filePath: string +): Promise { + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${filePath}?ref=${config.baseBranch}` + + try { + const res = await fetchWithRetry(url, { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + throw new Error(`Failed to fetch file ${filePath}: ${res.status}`) + } + + const data = await res.json() + + // Convert to GitHubQueryResponseItem format + return [ + { + name: data.name, + path: data.path, + sha: data.sha, + url: data.url, + git_url: data.git_url, + html_url: data.html_url, + repository: { + id: 0, + name: config.ghRepo, + full_name: `${config.ghOrganization}/${config.ghRepo}`, + owner: { + login: config.ghOrganization, + id: 0, + node_id: "", + avatar_url: "", + gravatar_id: "", + url: "", + html_url: "", + followers_url: "", + following_url: "", + gists_url: "", + starred_url: "", + subscriptions_url: "", + organizations_url: "", + repos_url: "", + events_url: "", + received_events_url: "", + type: "Organization", + user_view_type: "", + site_admin: false, + }, + } as GitHubQueryResponseItem["repository"], + score: 1, + }, + ] + } catch (error) { + console.error(`[ERROR] Failed to fetch single file ${filePath}:`, error) + throw error + } } /** diff --git a/src/scripts/i18n/lib/glossary/backup.ts b/src/scripts/i18n/lib/glossary/backup.ts deleted file mode 100644 index c57dd24a1ec..00000000000 --- a/src/scripts/i18n/lib/glossary/backup.ts +++ /dev/null @@ -1,245 +0,0 @@ -/** - * Glossary and TM backup 
utilities - * Handles hashing, Git operations, and timestamped backups - */ - -import * as crypto from "crypto" -import * as fs from "fs" -import * as path from "path" - -const ROOT = process.cwd() -const BACKUP_ROOT = path.join(ROOT, "src/scripts/i18n/backups") -const GLOSSARY_BACKUP_DIR = path.join(BACKUP_ROOT, "glossary") -const TM_BACKUP_DIR = path.join(BACKUP_ROOT, "tm") -const HASH_FILE = path.join(BACKUP_ROOT, "hashes.json") - -export interface BackupHashes { - glossary?: Record // glossaryName -> hash - tm?: Record // tmName -> hash - lastUpdated?: string -} - -/** - * Calculate SHA-256 hash of content - */ -export function calculateHash(content: string): string { - return crypto.createHash("sha256").update(content, "utf8").digest("hex") -} - -/** - * Get short hash (first 8 characters) - */ -export function getShortHash(content: string): string { - return calculateHash(content).substring(0, 8) -} - -/** - * Ensure backup directories exist - */ -export function ensureBackupDirs(): void { - if (!fs.existsSync(BACKUP_ROOT)) { - fs.mkdirSync(BACKUP_ROOT, { recursive: true }) - } - if (!fs.existsSync(GLOSSARY_BACKUP_DIR)) { - fs.mkdirSync(GLOSSARY_BACKUP_DIR, { recursive: true }) - } - if (!fs.existsSync(TM_BACKUP_DIR)) { - fs.mkdirSync(TM_BACKUP_DIR, { recursive: true }) - } -} - -/** - * Load existing backup hashes - */ -export function loadBackupHashes(): BackupHashes { - if (!fs.existsSync(HASH_FILE)) { - return { glossary: {}, tm: {} } - } - - try { - const content = fs.readFileSync(HASH_FILE, "utf8") - return JSON.parse(content) - } catch (error) { - console.warn(`[BACKUP] Failed to load hashes, using empty:`, error) - return { glossary: {}, tm: {} } - } -} - -/** - * Save backup hashes - */ -export function saveBackupHashes(hashes: BackupHashes): void { - ensureBackupDirs() - hashes.lastUpdated = new Date().toISOString() - fs.writeFileSync(HASH_FILE, JSON.stringify(hashes, null, 2), "utf8") -} - -/** - * Check if content has changed (compare with stored 
hash) - */ -export function hasContentChanged( - name: string, - content: string, - type: "glossary" | "tm" -): boolean { - const hashes = loadBackupHashes() - const storedHash = (type === "glossary" ? hashes.glossary : hashes.tm)?.[name] - - if (!storedHash) { - console.log(`[BACKUP] No previous hash found for ${type}:${name}`) - return true - } - - const currentHash = calculateHash(content) - const changed = currentHash !== storedHash - - if (changed) { - console.log(`[BACKUP] Content changed for ${type}:${name}`) - console.log(`[BACKUP] - Old hash: ${storedHash}`) - console.log(`[BACKUP] - New hash: ${currentHash}`) - } else { - console.log(`[BACKUP] No changes detected for ${type}:${name}`) - } - - return changed -} - -/** - * Save backup file with timestamp and hash - */ -export function saveBackup( - name: string, - content: string, - type: "glossary" | "tm", - extension: string = "tbx" -): string { - ensureBackupDirs() - - const timestamp = Date.now() - const shortHash = getShortHash(content) - const sanitizedName = name.replace(/[^a-z0-9-_]/gi, "_").toLowerCase() - const filename = `${timestamp}_${shortHash}_${sanitizedName}.${extension}` - - const dir = type === "glossary" ? GLOSSARY_BACKUP_DIR : TM_BACKUP_DIR - const filepath = path.join(dir, filename) - - fs.writeFileSync(filepath, content, "utf8") - console.log(`[BACKUP] Saved ${type} backup: ${filename}`) - - // Update hash record - const hashes = loadBackupHashes() - if (type === "glossary") { - if (!hashes.glossary) hashes.glossary = {} - hashes.glossary[name] = calculateHash(content) - } else { - if (!hashes.tm) hashes.tm = {} - hashes.tm[name] = calculateHash(content) - } - saveBackupHashes(hashes) - - return filepath -} - -/** - * Get list of backup files for a resource - */ -export function listBackups( - name: string, - type: "glossary" | "tm" -): Array<{ path: string; timestamp: number; hash: string }> { - const dir = type === "glossary" ? 
GLOSSARY_BACKUP_DIR : TM_BACKUP_DIR - - if (!fs.existsSync(dir)) { - return [] - } - - const sanitizedName = name.replace(/[^a-z0-9-_]/gi, "_").toLowerCase() - const files = fs.readdirSync(dir) - - return files - .filter((file) => file.includes(sanitizedName)) - .map((file) => { - const [timestampStr, hash] = file.split("_") - return { - path: path.join(dir, file), - timestamp: parseInt(timestampStr, 10), - hash, - } - }) - .sort((a, b) => b.timestamp - a.timestamp) // Most recent first -} - -/** - * Get most recent backup for a resource - */ -export function getMostRecentBackup( - name: string, - type: "glossary" | "tm" -): string | null { - const backups = listBackups(name, type) - if (backups.length === 0) return null - - const mostRecent = backups[0] - return fs.readFileSync(mostRecent.path, "utf8") -} - -/** - * Clean up old backups, keeping only the most recent N - */ -export function cleanupOldBackups( - name: string, - type: "glossary" | "tm", - keepCount: number = 10 -): void { - const backups = listBackups(name, type) - - if (backups.length <= keepCount) { - console.log( - `[BACKUP] Only ${backups.length} backups for ${type}:${name}, no cleanup needed` - ) - return - } - - const toDelete = backups.slice(keepCount) - console.log( - `[BACKUP] Cleaning up ${toDelete.length} old backups for ${type}:${name}` - ) - - for (const backup of toDelete) { - try { - fs.unlinkSync(backup.path) - console.log(`[BACKUP] Deleted: ${path.basename(backup.path)}`) - } catch (error) { - console.warn(`[BACKUP] Failed to delete ${backup.path}:`, error) - } - } -} - -/** - * Get relative paths for all backup files (for Git commit) - */ -export function getAllBackupPaths(): string[] { - const paths: string[] = [] - - if (fs.existsSync(GLOSSARY_BACKUP_DIR)) { - const glossaryFiles = fs.readdirSync(GLOSSARY_BACKUP_DIR) - paths.push( - ...glossaryFiles.map((f) => - path.relative(ROOT, path.join(GLOSSARY_BACKUP_DIR, f)) - ) - ) - } - - if (fs.existsSync(TM_BACKUP_DIR)) { - const 
tmFiles = fs.readdirSync(TM_BACKUP_DIR) - paths.push( - ...tmFiles.map((f) => path.relative(ROOT, path.join(TM_BACKUP_DIR, f))) - ) - } - - if (fs.existsSync(HASH_FILE)) { - paths.push(path.relative(ROOT, HASH_FILE)) - } - - return paths -} diff --git a/src/scripts/i18n/lib/glossary/supabase.ts b/src/scripts/i18n/lib/glossary/supabase.ts deleted file mode 100644 index 440b2ac0c2b..00000000000 --- a/src/scripts/i18n/lib/glossary/supabase.ts +++ /dev/null @@ -1,211 +0,0 @@ -/** - * Supabase glossary fetcher using REST API - * Fetches community-approved translations from the top_translations table - */ - -export interface GlossaryEntry { - term: string - translation: string - votes: number - languageCode: string -} - -export interface SupabaseRow { - string_term: string - translation_text: string - total_votes: number - language_code: string -} - -/** - * Fetch top-voted glossary terms for a specific language from Supabase - */ -export async function fetchGlossaryForLanguage( - supabaseUrl: string, - serviceRoleKey: string, - languageCode: string, - minVotes: number = 1 -): Promise { - const url = new URL(`${supabaseUrl}/rest/v1/top_translations`) - url.searchParams.set("language_code", `eq.${languageCode}`) - url.searchParams.set("total_votes", `gte.${minVotes}`) - url.searchParams.set("order", "total_votes.desc,string_term.asc") - url.searchParams.set( - "select", - "string_term,translation_text,total_votes,language_code" - ) - - console.log(`[GLOSSARY] Fetching from Supabase for language: ${languageCode}`) - console.log(`[GLOSSARY] URL: ${url.toString()}`) - - try { - const response = await fetch(url.toString(), { - headers: { - apikey: serviceRoleKey, - Authorization: `Bearer ${serviceRoleKey}`, - "Content-Type": "application/json", - }, - }) - - if (!response.ok) { - const errorText = await response.text() - throw new Error(`Supabase API error (${response.status}): ${errorText}`) - } - - const rows: SupabaseRow[] = await response.json() - console.log( - 
`[GLOSSARY] Fetched ${rows.length} glossary entries for ${languageCode}` - ) - - return rows.map((row) => ({ - term: row.string_term, - translation: row.translation_text, - votes: row.total_votes, - languageCode: row.language_code, - })) - } catch (error) { - console.error( - `[GLOSSARY] Failed to fetch glossary for ${languageCode}:`, - error - ) - throw error - } -} - -/** - * Fetch glossary entries for all specified languages - */ -export async function fetchGlossaryForAllLanguages( - supabaseUrl: string, - serviceRoleKey: string, - languageCodes: string[], - minVotes: number = 1 -): Promise> { - console.log( - `[GLOSSARY] Fetching glossary for ${languageCodes.length} languages` - ) - - const results: Record = {} - - for (const langCode of languageCodes) { - try { - const entries = await fetchGlossaryForLanguage( - supabaseUrl, - serviceRoleKey, - langCode, - minVotes - ) - results[langCode] = entries - } catch (error) { - console.warn(`[GLOSSARY] Skipping ${langCode} due to error:`, error) - results[langCode] = [] - } - } - - const totalEntries = Object.values(results).reduce( - (sum, entries) => sum + entries.length, - 0 - ) - console.log( - `[GLOSSARY] Fetched ${totalEntries} total entries across all languages` - ) - - return results -} - -/** - * Format glossary entries as CSV for Crowdin import - * Format: term,translation,description,note - */ -export function formatGlossaryAsCSV(entries: GlossaryEntry[]): string { - const header = "term,translation,description,note\n" - const rows = entries.map((entry) => { - const term = escapeCSV(entry.term) - const translation = escapeCSV(entry.translation) - const description = escapeCSV(`Community-voted (${entry.votes} votes)`) - const note = escapeCSV("") - return `${term},${translation},${description},${note}` - }) - - return header + rows.join("\n") -} - -/** - * Escape CSV values (quote if contains comma, quote, or newline) - */ -function escapeCSV(value: string): string { - if (value.includes(",") || 
value.includes('"') || value.includes("\n")) { - return `"${value.replace(/"/g, '""')}"` - } - return value -} - -/** - * Format glossary entries as TBX (Term Base eXchange) for Crowdin import - */ -export function formatGlossaryAsTBX( - entries: GlossaryEntry[], - sourceLanguage: string, - targetLanguage: string -): string { - const now = new Date().toISOString() - - const termEntries = entries - .map((entry) => { - const escapedTerm = escapeXML(entry.term) - const escapedTranslation = escapeXML(entry.translation) - const escapedNote = escapeXML(`Community-voted: ${entry.votes} votes`) - - return ` - - ${escapedNote} - - - - ${escapedTerm} - - - - - ${escapedTranslation} - - - ` - }) - .join("\n") - - return ` - - - - - - Ethereum.org Community Glossary - - -

Generated from Supabase community glossary on ${now}

-
-
- -

http://www.lisa.org/fileadmin/standards/tbx/TBXXCSV02.xcs

-
-
- - -${termEntries} - - -
` -} - -/** - * Escape XML special characters - */ -function escapeXML(value: string): string { - return value - .replace(/&/g, "&") - .replace(//g, ">") - .replace(/"/g, """) - .replace(/'/g, "'") -} diff --git a/src/scripts/i18n/lib/openai/trust-matrix-generator.ts b/src/scripts/i18n/lib/openai/trust-matrix-generator.ts deleted file mode 100644 index e692c45bfd9..00000000000 --- a/src/scripts/i18n/lib/openai/trust-matrix-generator.ts +++ /dev/null @@ -1,149 +0,0 @@ -// OpenAI integration for generating language trust matrices -import fs from "fs" -import path from "path" - -import i18nConfig from "../../../../../i18n.config.json" - -type TrustBucket = { - lastUpdated?: string - Aplus?: string[] - A?: string[] - Aminus?: string[] - Bplus?: string[] - B?: string[] - Bminus?: string[] - Cplus?: string[] - C?: string[] - Dplus?: string[] -} - -/** - * Generate a trust matrix using OpenAI GPT-4 - * @param modelKey The Crowdin AI model identifier (provider:model:version) - * @returns The generated trust bucket with quality grades for each language - */ -export async function generateTrustMatrixWithOpenAI( - modelKey: string -): Promise { - const apiKey = process.env.OPENAI_API_KEY - if (!apiKey) { - throw new Error( - "OPENAI_API_KEY not found. Cannot generate trust matrix without API access." - ) - } - - const languageList = i18nConfig - .map((lang) => `${lang.code} (${lang.name})`) - .join(", ") - - const prompt = `You are an expert in evaluating AI translation model quality across different languages. 
- -Given the Crowdin AI translation model identifier: "${modelKey}" - -Please assess the expected translation quality for each of the following languages: ${languageList} - -Group the language codes into these quality buckets: -- Aplus: Exceptional quality, native-level fluency expected -- A: High quality, minimal post-editing needed -- Aminus: Good quality, occasional review needed -- Bplus: Above-average quality, regular review recommended -- B: Average quality, consistent review needed -- Bminus: Below-average quality, careful review required -- Cplus: Fair quality, significant review needed -- C: Poor quality, extensive review required -- Dplus: Very poor quality, requires thorough human translation review - -Respond ONLY with a valid JSON object in this exact format: -{ - "Aplus": ["code1", "code2"], - "A": ["code3"], - "Aminus": ["code4", "code5"], - "Bplus": ["code6"], - "B": ["code7", "code8"], - "Bminus": ["code9"], - "Cplus": ["code10"], - "C": ["code11"], - "Dplus": ["code12"] -} - -Use ONLY the internal codes provided (e.g., "es", "fr", "zh", "pt-br"). Do not include any explanatory text, only the JSON object.` - - const response = await fetch("https://api.openai.com/v1/chat/completions", { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${apiKey}`, - }, - body: JSON.stringify({ - model: "gpt-4-turbo-preview", - messages: [ - { - role: "system", - content: - "You are a language quality assessment expert. 
Respond only with valid JSON.", - }, - { role: "user", content: prompt }, - ], - temperature: 0.3, - max_tokens: 2000, - }), - }) - - if (!response.ok) { - const errorText = await response.text() - throw new Error(`OpenAI API error (${response.status}): ${errorText}`) - } - - const data = await response.json() - const content = data.choices?.[0]?.message?.content?.trim() - if (!content) { - throw new Error("OpenAI returned empty response") - } - - // Parse the JSON response - let trustBucket: TrustBucket - try { - trustBucket = JSON.parse(content) - } catch (err) { - console.error("[OPENAI] Failed to parse response:", content) - throw new Error(`OpenAI response was not valid JSON: ${err}`) - } - - // Add timestamp - trustBucket.lastUpdated = new Date().toISOString() - - console.log(`[OPENAI] Generated trust matrix for model: ${modelKey}`) - return trustBucket -} - -/** - * Update the language-trust.json file with a new model's trust matrix - * @param modelKey The model identifier - * @param trustBucket The trust bucket to add - */ -export function saveTrustMatrixToFile( - modelKey: string, - trustBucket: TrustBucket -): void { - const filePath = path.join( - process.cwd(), - "src/scripts/i18n/config/language-trust.json" - ) - - let matrix: Record = {} - try { - const raw = fs.readFileSync(filePath, "utf8") - matrix = JSON.parse(raw) - } catch { - console.warn( - "[TRUST-MATRIX] Could not read existing matrix, creating new file" - ) - } - - matrix[modelKey] = trustBucket - - fs.writeFileSync(filePath, JSON.stringify(matrix, null, 2) + "\n") - console.log( - `[TRUST-MATRIX] Saved trust matrix for model "${modelKey}" to language-trust.json` - ) -} diff --git a/src/scripts/i18n/lib/qa-routing.ts b/src/scripts/i18n/lib/qa-routing.ts deleted file mode 100644 index 7d7eb68de70..00000000000 --- a/src/scripts/i18n/lib/qa-routing.ts +++ /dev/null @@ -1,101 +0,0 @@ -// NOTE: language-trust.json now uses ONLY internal codes (see i18n.config.json 'code' field) -import fs from 
"fs" -import path from "path" - -type TrustBucket = { - lastUpdated?: string - Aplus?: string[] - A?: string[] - Aminus?: string[] - Bplus?: string[] - B?: string[] - Bminus?: string[] - Cplus?: string[] - C?: string[] - Dplus?: string[] -} - -type TrustMatrix = Record - -export type QaLevel = "skip" | "copilot" | "copilot+claude" - -export function loadTrustMatrix(): TrustMatrix { - const p = path.join( - process.cwd(), - "src/scripts/i18n/config/language-trust.json" - ) - try { - const raw = fs.readFileSync(p, "utf8") - return JSON.parse(raw) - } catch { - return { default: {} } - } -} - -/** - * Find the most recent model key in the trust matrix by lastUpdated timestamp - */ -export function getMostRecentModelKey(matrix: TrustMatrix): string | null { - let mostRecent: string | null = null - let latestTime = 0 - for (const [key, bucket] of Object.entries(matrix)) { - if (bucket.lastUpdated) { - const timestamp = new Date(bucket.lastUpdated).getTime() - if (timestamp > latestTime) { - latestTime = timestamp - mostRecent = key - } - } - } - return mostRecent -} - -export function planQaForLanguages( - languageIds: string[], - modelKey?: string -): Record { - const matrix = loadTrustMatrix() - // Try to use the specified model, fallback to most recent, then default - let bucket: TrustBucket = {} - if (modelKey && matrix[modelKey]) { - bucket = matrix[modelKey] - console.log(`[QA-ROUTING] Using trust matrix for model: ${modelKey}`) - } else { - const fallbackKey = getMostRecentModelKey(matrix) || "default" - bucket = matrix[fallbackKey] || {} - if (modelKey) { - console.log( - `[QA-ROUTING] Model "${modelKey}" not found, using fallback: ${fallbackKey}` - ) - } else { - console.log(`[QA-ROUTING] Using fallback trust matrix: ${fallbackKey}`) - } - } - - const groupIndex = new Map([ - ["Aplus", "skip"], - ["A", "skip"], - ["Aminus", "skip"], - ["Bplus", "copilot"], - ["B", "copilot"], - ["Bminus", "copilot"], - ["Cplus", "copilot+claude"], - ["C", "copilot+claude"], - 
["Dplus", "copilot+claude"], - ]) - - const index = new Map() - for (const [group, list] of Object.entries(bucket)) { - if (group === "lastUpdated") continue // skip metadata - const level = groupIndex.get(group as string) - if (!level) continue - for (const code of list || []) index.set(code, level) - } - - const plan: Record = {} - // All languageIds should be internal codes (not Crowdin codes) - for (const lang of languageIds) { - plan[lang] = index.get(lang) || "copilot" // conservative default - } - return plan -} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 48375d72c5f..318830c941d 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1,10 +1,10 @@ import * as fs from "fs" +import * as path from "path" import { getBuiltFile, postBuildProjectFileTranslation, } from "./lib/crowdin/build" -// Crowdin operations import { findCrowdinFile, getCrowdinProjectFiles, @@ -18,79 +18,70 @@ import { postApplyPreTranslation, } from "./lib/crowdin/pre-translate" import { updatePromptFromFile } from "./lib/crowdin/prompt" -import { getPromptModelKey } from "./lib/crowdin/prompt-model" -import { - awaitQaCompletion, - downloadQaCompletionResult, - listStringIdsForFile, - postQaCompletions, - type QaCompletionJob, - type QaIssue, - resolveCrowdinUserId, - summarizeQaIssues, -} from "./lib/crowdin/qa-completions" import { postCreateBranchFrom } from "./lib/github/branches" import { getDestinationFromPath, putCommitFile } from "./lib/github/commits" -// GitHub operations import { downloadGitHubFile, getAllEnglishFiles, getFileMetadata, } from "./lib/github/files" -import { postPrReviewComment } from "./lib/github/pr-review-comments" import { postPullRequest } from "./lib/github/pull-requests" -import { - generateTrustMatrixWithOpenAI, - saveTrustMatrixToFile, -} from "./lib/openai/trust-matrix-generator" -import { loadTrustMatrix, planQaForLanguages } from "./lib/qa-routing" -import type { - CrowdinAddFileResponse, - CrowdinFileData, - 
CrowdinPreTranslateResponse, -} from "./lib/types" -// Utilities +import type { CrowdinFileData, CrowdinPreTranslateResponse } from "./lib/types" import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" -import { config, MAX_STRINGS_PER_REQUEST } from "./config" +import { config } from "./config" import { runSanitizer } from "./post_import_sanitize" + // Small helper for async waits const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)) +/** + * Write pre-translation artifact for GitHub Actions + */ +function writePreTranslationArtifact( + preTranslationId: string, + fileCount: number, + languages: string[] +) { + const artifactData = { + preTranslationId, + timestamp: new Date().toISOString(), + fileCount, + languages, + targetPath: config.targetPath || null, + } + + const artifactDir = path.join(process.cwd(), "artifacts") + if (!fs.existsSync(artifactDir)) { + fs.mkdirSync(artifactDir, { recursive: true }) + } + + const artifactPath = path.join(artifactDir, "pre-translation-info.json") + fs.writeFileSync(artifactPath, JSON.stringify(artifactData, null, 2)) + + console.log(`\n[ARTIFACT] Pre-translation info written to ${artifactPath}`) + console.log(`[ARTIFACT] Pre-translation ID: ${preTranslationId}`) + console.log( + `[ARTIFACT] To resume this job later, use: PRETRANSLATION_ID=${preTranslationId}` + ) +} + /** * Main orchestration function */ -async function main(options?: { allLangs: boolean }) { - console.log(`[DEBUG] Starting main function with options:`, options) - console.log(`[DEBUG] Environment config:`, { - projectId: config.projectId, - baseBranch: config.baseBranch, - jsonRoot: config.jsonRoot, - mdRoot: config.mdRoot, - allCrowdinCodes: config.allCrowdinCodes, - }) - - // Step 0: Sync glossary from Supabase to Crowdin - if (!config.existingPreTranslationId) { - console.log("\n[GLOSSARY] ========== Syncing Glossary ==========") - try { - const { syncGlossary } = await import("./sync-glossary") - const glossaryResult = 
await syncGlossary() - console.log( - `[GLOSSARY] ✓ Updated ${glossaryResult.updatedGlossaries.length} languages` - ) - if (glossaryResult.backupPrUrl) { - console.log(`[GLOSSARY] ✓ Backup PR: ${glossaryResult.backupPrUrl}`) - } - } catch (error) { - console.error("[GLOSSARY] Failed to sync glossary:", error) - console.error("[GLOSSARY] Continuing with workflow anyway...") - } +async function main() { + const { verbose, targetPath, existingPreTranslationId } = config + + console.log(`\n========== Crowdin AI Translation Import ==========`) + console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) + if (targetPath) { + const isFile = targetPath.endsWith(".md") || targetPath.endsWith(".json") + console.log(`Mode: ${isFile ? "Single file" : "Directory"} (${targetPath})`) } else { - console.log("\n[GLOSSARY] Skipping glossary sync (resuming existing job)") + console.log(`Mode: Full translation (all files)`) } - // Shared state used in both resume and new flows + // Shared state const crowdinProjectFiles = await getCrowdinProjectFiles() const fileIdsSet = new Set() const processedFileIdToPath: Record = {} @@ -98,310 +89,202 @@ async function main(options?: { allLangs: boolean }) { // If resuming, determine completed pre-translation response; otherwise start new let preTranslateJobCompletedResponse: CrowdinPreTranslateResponse - if (config.existingPreTranslationId) { + + if (existingPreTranslationId) { console.log( - `\n[RESUME] ========== Resuming from pre-translation ID: ${config.existingPreTranslationId} ==========` - ) - console.log(`[RESUME] Checking status of existing pre-translation...`) - const statusResp = await getPreTranslationStatus( - config.existingPreTranslationId + `\n========== Resuming Pre-Translation ${existingPreTranslationId} ==========` ) + const statusResp = await getPreTranslationStatus(existingPreTranslationId) + if (statusResp.status === "in_progress") { console.log( - `[RESUME] Pre-translation still in progress 
(${statusResp.progress}%). Waiting for completion...` + `Pre-translation in progress (${statusResp.progress}%), waiting for completion...` ) preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( - config.existingPreTranslationId + existingPreTranslationId ) } else if (statusResp.status === "finished") { - console.log( - `[RESUME] Pre-translation already finished. Building translations...` - ) + console.log(`Pre-translation already finished, proceeding to download...`) preTranslateJobCompletedResponse = statusResp } else { throw new Error( - `Pre-translation ${config.existingPreTranslationId} has unexpected status: ${statusResp.status}` + `Pre-translation ${existingPreTranslationId} has unexpected status: ${statusResp.status}` ) } } else { // Normal flow: Start new pre-translation - console.log(`\n[START] ========== Starting new pre-translation ==========`) + console.log(`\n========== Starting New Pre-Translation ==========`) // Ensure Crowdin AI prompt content is synced from repo canonical file - try { - const userId = await resolveCrowdinUserId() - const promptPath = `${process.cwd()}/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt` - await updatePromptFromFile( - Number(userId), - config.preTranslatePromptId, - promptPath - ) + const userId = process.env.I18N_CROWDIN_USER_ID + if (userId) { + try { + const promptPath = path.join( + process.cwd(), + "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" + ) + await updatePromptFromFile( + Number(userId), + config.preTranslatePromptId, + promptPath + ) + console.log("✓ Updated Crowdin pre-translate prompt from repo file") + } catch (e) { + console.warn("Failed to update prompt, continuing:", e) + } + } + + // Fetch English files + const allEnglishFiles = await getAllEnglishFiles() + + if (!allEnglishFiles.length) { + console.log("No files to translate, exiting") + return + } + + if (verbose) { + console.log(`[DEBUG] Found ${allEnglishFiles.length} English files`) console.log( - "[PROMPT] ✓ 
Updated Crowdin pre-translate prompt from repo file" + `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` ) - } catch (e) { - console.warn("[PROMPT] Failed to update prompt; continuing:", e) } - // Fetch English files with limit + start offset - const allEnglishFiles = await getAllEnglishFiles( - config.fileLimit, - config.startOffset - ) - console.log( - `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub (offset=${config.startOffset}, limit=${config.fileLimit})` - ) - const fileMetadata = await getFileMetadata(allEnglishFiles) - console.log(`[DEBUG] Generated metadata for ${fileMetadata.length} files`) - console.log(`[DEBUG] First file metadata:`, fileMetadata[0]) - - console.log( - `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` - ) // Iterate through each file and upload for (const file of fileMetadata) { - console.log(`[DEBUG] Processing file: ${file.filePath}`) - await (async () => { - let foundFile: CrowdinFileData | undefined - try { - foundFile = findCrowdinFile(file, crowdinProjectFiles) - } catch { - console.log("File not found in Crowdin, attempting to add new file") + if (verbose) { + console.log(`[DEBUG] Processing file: ${file.filePath}`) + } + + let foundFile: CrowdinFileData | undefined + try { + foundFile = findCrowdinFile(file, crowdinProjectFiles) + } catch { + if (verbose) { + console.log("File not found in Crowdin, will add new file") } + } - let crowdinFileResponse: CrowdinAddFileResponse | undefined - let effectiveFileId: number - let effectivePath: string + let effectiveFileId: number + let effectivePath: string - if (foundFile) { - // File exists - DO NOT update to preserve parsed string structure + if (foundFile) { + // File exists - DO NOT update to preserve parsed string structure + if (verbose) { console.log( - `[SKIP-UPDATE] File already exists in Crowdin with ID: ${foundFile.id}, using existing structure` + `[DEBUG] File exists in Crowdin (ID: ${foundFile.id}), using existing 
structure` ) - console.log( - `[SKIP-UPDATE] Skipping upload/update to preserve existing parsed strings` - ) - effectiveFileId = foundFile.id - effectivePath = foundFile.path + } + effectiveFileId = foundFile.id + effectivePath = foundFile.path + + // Download English for buffer comparison later + const fileBuffer = await downloadGitHubFile(file.download_url) + englishBuffers[effectiveFileId] = fileBuffer + } else { + // File doesn't exist - create it + console.log(`Creating new file in Crowdin: ${file.filePath}`) + const fileBuffer = await downloadGitHubFile(file.download_url) + + const storageInfo = await postFileToStorage( + fileBuffer, + file["Crowdin-API-FileName"] + ) - // Still download English for buffer comparison later - console.log( - `[DOWNLOAD] Downloading English source for buffer comparison: ${file.download_url}` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) - englishBuffers[effectiveFileId] = fileBuffer - } else { - // File doesn't exist - create it - console.log(`[UPLOAD] File NOT found in Crowdin, creating new file`) - console.log( - `[UPLOAD] Downloading English source from: ${file.download_url}` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) - console.log(`[UPLOAD] Downloaded ${fileBuffer.length} bytes`) + // Derive full parent directory path (exclude filename) + const parts = file.filePath.split("/").filter(Boolean) + parts.pop() // remove filename + const parentDirPath = parts.join("/") || "/" - const storageInfo = await postFileToStorage( - fileBuffer, - file["Crowdin-API-FileName"] - ) - console.log( - `[UPLOAD] Uploaded to Crowdin storage with ID: ${storageInfo.id}` - ) + const crowdinFileResponse = await postCrowdinFile( + storageInfo.id, + file["Crowdin-API-FileName"], + parentDirPath + ) - // Derive full parent directory path (exclude filename) - const parts = file.filePath.split("/").filter(Boolean) - parts.pop() // remove filename - const parentDirPath = parts.join("/") || "/" - console.log( 
- `[UPLOAD] Creating new Crowdin file in directory path: ${parentDirPath}` - ) - crowdinFileResponse = await postCrowdinFile( - storageInfo.id, - file["Crowdin-API-FileName"], - parentDirPath - ) - console.log( - `[UPLOAD] ✓ Created new Crowdin file with ID: ${crowdinFileResponse.id}` - ) + console.log( + `✓ Created new Crowdin file (ID: ${crowdinFileResponse.id})` + ) - effectiveFileId = crowdinFileResponse.id - effectivePath = crowdinFileResponse.path - englishBuffers[effectiveFileId] = fileBuffer + effectiveFileId = crowdinFileResponse.id + effectivePath = crowdinFileResponse.path + englishBuffers[effectiveFileId] = fileBuffer - // Wait for new file parsing - const delayMs = 10000 + // Wait for new file parsing + const delayMs = 10000 + if (verbose) { console.log( - `[UPLOAD] ⏱️ Waiting ${delayMs / 1000}s for Crowdin to parse new file...` + `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to parse new file...` ) - await delay(delayMs) - console.log(`[UPLOAD] ✓ Parsing delay complete`) } + await delay(delayMs) + } - fileIdsSet.add(effectiveFileId) - if (effectivePath) - processedFileIdToPath[effectiveFileId] = effectivePath - })() + fileIdsSet.add(effectiveFileId) + if (effectivePath) processedFileIdToPath[effectiveFileId] = effectivePath } // Unhide any hidden/duplicate strings before pre-translation console.log( - `\n[UNHIDE] ========== Unhiding strings in ${fileIdsSet.size} files ==========` + `\n========== Unhiding Strings in ${fileIdsSet.size} Files ==========` ) for (const fileId of Array.from(fileIdsSet)) { await unhideStringsInFile(fileId) } - console.log( - `\n[PRE-TRANSLATE] ========== Requesting AI Pre-Translation ==========` - ) - console.log(`[PRE-TRANSLATE] FileIds to translate:`, Array.from(fileIdsSet)) - console.log(`[PRE-TRANSLATE] Target languages:`, config.allCrowdinCodes) - console.log(`[PRE-TRANSLATE] AI Prompt ID:`, config.preTranslatePromptId) + console.log(`\n========== Requesting AI Pre-Translation ==========`) + console.log(`Files to 
translate: ${fileIdsSet.size}`) + console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) + console.log(`AI Prompt ID: ${config.preTranslatePromptId}`) const applyPreTranslationResponse = await postApplyPreTranslation( Array.from(fileIdsSet), - options?.allLangs ? config.allCrowdinCodes : config.allCrowdinCodes - ) - console.log( - `[PRE-TRANSLATE] ✓ Pre-translation job created with ID: ${applyPreTranslationResponse.identifier}` + config.allCrowdinCodes ) + console.log( - `[PRE-TRANSLATE] Initial status:`, - applyPreTranslationResponse.status + `✓ Pre-translation job created (ID: ${applyPreTranslationResponse.identifier})` ) - console.log(`\n[PRE-TRANSLATE] Waiting for job to complete...`) - preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( - applyPreTranslationResponse.identifier + // Write artifact with pre-translation ID + writePreTranslationArtifact( + applyPreTranslationResponse.identifier, + fileIdsSet.size, + config.allCrowdinCodes ) - if (preTranslateJobCompletedResponse.status !== "finished") { - console.error( - "[PRE-TRANSLATE] ❌ Pre-translation did not finish successfully. Full response:", - preTranslateJobCompletedResponse + // If no targetPath specified (full translation), exit now and let Crowdin work + if (!targetPath) { + console.log(`\n========== Full Translation Job Started ==========`) + console.log( + `This is a large job that will take significant time to complete.` ) - throw new Error( - `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` + console.log( + `The workflow will exit now. 
Resume later with the pre-translation ID above.` ) + console.log( + `Check Crowdin dashboard for progress: https://crowdin.com/project/ethereum-org` + ) + return } - console.log(`[PRE-TRANSLATE] ✓ Job completed successfully!`) - console.log( - `[PRE-TRANSLATE] Progress: ${preTranslateJobCompletedResponse.progress}%` - ) - console.log( - `[PRE-TRANSLATE] Full response:`, - JSON.stringify(preTranslateJobCompletedResponse, null, 2) - ) - } - - // QA via Crowdin AI Prompt Completions - console.log(`\n[QA-CHECK] ========== AI QA via Prompt Completions ==========`) - const qaSummaries: string[] = [] - const { languageIds: qaLanguageIds, fileIds: qaFileIds } = - preTranslateJobCompletedResponse.attributes - - // Build stringId lists per file - const fileStringMap: Record = {} - for (const fid of qaFileIds) { - try { - fileStringMap[fid] = await listStringIdsForFile(fid) - } catch (e) { - console.warn(`[QA-CHECK] Failed listing strings for fileId=${fid}:`, e) - fileStringMap[fid] = [] - } - } - - const sourceLanguageId = "en" - - // For each language, run QA per file (naturally batches and ties issues to specific files) - for (const lang of qaLanguageIds) { - console.log( - `[QA-CHECK] Running QA for ${lang} across ${qaFileIds.length} files` + // For file/directory mode, wait for completion + console.log(`\nWaiting for pre-translation to complete...`) + preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( + applyPreTranslationResponse.identifier ) - const allIssues: QaIssue[] = [] - let skipped = false - - for (const fid of qaFileIds) { - const stringIds = fileStringMap[fid] || [] - if (!stringIds.length) { - console.log(`[QA-CHECK] Skipping fileId=${fid} (no strings)`) - continue - } - console.log( - `[QA-CHECK] QA for ${lang} fileId=${fid} (${stringIds.length} strings)` + if (preTranslateJobCompletedResponse.status !== "finished") { + throw new Error( + `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` ) - - // 
Chunk large files to stay within API limits - const chunks = - stringIds.length > MAX_STRINGS_PER_REQUEST - ? Array.from( - { length: Math.ceil(stringIds.length / MAX_STRINGS_PER_REQUEST) }, - (_, i) => - stringIds.slice( - i * MAX_STRINGS_PER_REQUEST, - (i + 1) * MAX_STRINGS_PER_REQUEST - ) - ) - : [stringIds] - - for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) { - const chunk = chunks[chunkIdx] - if (chunks.length > 1) { - console.log( - `[QA-CHECK] Chunk ${chunkIdx + 1}/${chunks.length} (${chunk.length} strings)` - ) - } - - let job: QaCompletionJob | undefined - try { - job = await postQaCompletions(config.qaPromptId, { - projectId: config.projectId, - sourceLanguageId, - targetLanguageId: lang, - stringIds: chunk, - }) - } catch (e) { - const msg = String((e as Error).message || e) - console.warn( - `[QA-CHECK] Failed for fileId=${fid} chunk ${chunkIdx + 1}: ${msg}` - ) - if (msg.includes("403")) { - // If 403, skip entire language (endpoint not accessible) - qaSummaries.push( - `QA for ${lang}: skipped (endpoint not accessible - may require Enterprise or AI credits).` - ) - skipped = true - break - } - continue - } - - const finished = await awaitQaCompletion(job.id) - if (finished.status !== "finished") { - console.warn( - `[QA-CHECK] Completion for fileId=${fid} chunk ${chunkIdx + 1} status=${finished.status}` - ) - continue - } - const issues = await downloadQaCompletionResult(job.id) - allIssues.push(...issues) - } - - if (skipped) break } - if ( - !skipped && - (allIssues.length > 0 || Object.keys(fileStringMap).length > 0) - ) { - const summary = summarizeQaIssues(allIssues, processedFileIdToPath, lang) - qaSummaries.push(summary) - } + console.log(`✓ Pre-translation completed successfully!`) } + // Build and download translations const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes // Build mapping for commit phase @@ -413,10 +296,8 @@ async function main(options?: { allLangs: boolean }) { const existing = 
crowdinProjectFiles.find((f) => f.id === fid) if (existing) fileIdToPathMapping[fid] = existing.path } - if (!fileIdToPathMapping[fid]) { - console.warn( - `[WARN] Missing path mapping for fileId=${fid} (may impact destination path calculation)` - ) + if (!fileIdToPathMapping[fid] && verbose) { + console.warn(`[WARN] Missing path mapping for fileId=${fid}`) } } @@ -426,241 +307,113 @@ async function main(options?: { allLangs: boolean }) { internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), })) - // Step 1: Detect the current model for pre-translation prompt - console.log( - `\n[MODEL-DETECTION] Fetching model for promptId: ${config.preTranslatePromptId}` - ) - let modelKey: string | undefined - try { - const userId = await resolveCrowdinUserId() - modelKey = await getPromptModelKey( - Number(userId), - config.preTranslatePromptId - ) - console.log(`[MODEL-DETECTION] Current model: ${modelKey}`) - } catch (err) { - console.warn(`[MODEL-DETECTION] Failed to detect model:`, err) - } - - // Step 2: Check if trust matrix exists for this model - const matrix = loadTrustMatrix() - const needsNewMatrix = modelKey && !matrix[modelKey] - - if (needsNewMatrix) { - console.log( - `\n[TRUST-MATRIX] Model "${modelKey}" not found in trust matrix` - ) - const openAiKey = process.env.OPENAI_API_KEY - if (openAiKey) { - console.log( - `[TRUST-MATRIX] OpenAI key available, generating new trust matrix...` - ) - try { - const newBucket = await generateTrustMatrixWithOpenAI(modelKey!) 
- saveTrustMatrixToFile(modelKey!, newBucket) - console.log( - `[TRUST-MATRIX] ✓ Generated and saved trust matrix for ${modelKey}` - ) - } catch (err) { - console.warn(`[TRUST-MATRIX] Failed to generate matrix:`, err) - console.log(`[TRUST-MATRIX] Will use most recent fallback`) - } - } else { - console.log( - `[TRUST-MATRIX] No OpenAI key available, using most recent model fallback` - ) - } - } - - // Step 3: QA routing based on trust matrix (with model-aware lookup) - const internalCodes = languagePairs.map((p) => p.internalLanguageCode) - const qaPlan = planQaForLanguages(internalCodes, modelKey) + console.log(`\n========== Creating Translation PR ==========`) - // Step 4: Group languages by trust tier - const highTrustLangs = languagePairs.filter( - (p) => qaPlan[p.internalLanguageCode] === "skip" - ) - const mediumTrustLangs = languagePairs.filter( - (p) => qaPlan[p.internalLanguageCode] === "copilot" - ) - const lowTrustLangs = languagePairs.filter( - (p) => qaPlan[p.internalLanguageCode] === "copilot+claude" + const { branch } = await postCreateBranchFrom( + config.baseBranch, + "crowdin-translations" ) + console.log(`✓ Created branch: ${branch}`) - console.log( - `\n[TIER-GROUPING] High trust (no review): ${highTrustLangs.length} languages` - ) - console.log( - `[TIER-GROUPING] Medium trust (@copilot): ${mediumTrustLangs.length} languages` - ) - console.log( - `[TIER-GROUPING] Low trust (@copilot + @claude): ${lowTrustLangs.length} languages` - ) - - // Helper function to process one tier - const processTierPr = async ( - tierLabel: "high-trust" | "medium-trust" | "low-trust", - tierName: string, - langs: typeof languagePairs - ) => { - if (langs.length === 0) { - console.log(`\n[TIER-${tierLabel.toUpperCase()}] No languages, skipping`) - return - } - + // For each language + for (const { crowdinId, internalLanguageCode } of languagePairs) { console.log( - `\n[TIER-${tierLabel.toUpperCase()}] ========== Processing ${langs.length} languages ==========` + 
`\n--- Building translations for ${crowdinId} (${internalLanguageCode}) ---` ) - const { branch } = await postCreateBranchFrom(config.baseBranch, tierLabel) - console.log(`[BRANCH] ✓ Created branch: ${branch}`) - - // For each language in this tier - for (const { crowdinId, internalLanguageCode } of langs) { - console.log( - `\n[BUILD] ========== Building translations for language: ${crowdinId} (internal: ${internalLanguageCode}) ==========` - ) - - // Build, download and commit each file - for (const fileId of fileIds) { - console.log(`\n[BUILD] --- Processing fileId: ${fileId} ---`) - const crowdinPath = fileIdToPathMapping[fileId] - console.log(`[BUILD] Crowdin path: ${crowdinPath}`) - - // 1- Build - console.log( - `[BUILD] Requesting build for fileId=${fileId}, language=${crowdinId}` - ) - const { url: downloadUrl } = await postBuildProjectFileTranslation( - fileId, - crowdinId, - config.projectId - ) - console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) + // Build, download and commit each file + for (const fileId of fileIds) { + const crowdinPath = fileIdToPathMapping[fileId] - // 2- Download - console.log(`[BUILD] Downloading translated file...`) - const { buffer } = await getBuiltFile(downloadUrl) - console.log(`[BUILD] Downloaded ${buffer.length} bytes`) + if (verbose) { + console.log(`[DEBUG] Processing fileId: ${fileId} (${crowdinPath})`) + } - // Check if translation differs from English - const originalEnglish = englishBuffers[fileId] - if (originalEnglish) { - console.log( - `[BUILD] Original English size: ${originalEnglish.length} bytes` - ) - if (originalEnglish.compare(buffer) === 0) { - console.warn( - `[BUILD] ⚠️ Skipping commit - content identical to English (no translation occurred)` - ) - continue - } else { - console.log( - `[BUILD] ✓ Translation differs from English, will commit` - ) - } - } + // 1- Build + const { url: downloadUrl } = await postBuildProjectFileTranslation( + fileId, + crowdinId, + config.projectId + ) - 
// 3a- Get destination path - const destinationPath = getDestinationFromPath( - crowdinPath, - internalLanguageCode - ) - console.log(`[BUILD] Destination path: ${destinationPath}`) + // 2- Download + const { buffer } = await getBuiltFile(downloadUrl) - // 3b- Commit - console.log(`[BUILD] Committing to branch: ${branch}`) - await putCommitFile(buffer, destinationPath, branch) - console.log(`[BUILD] ✓ Committed successfully`) + if (verbose) { + console.log(`[DEBUG] Downloaded ${buffer.length} bytes`) } - } - // Run post-import sanitizer for this tier's languages only - console.log( - `\n[SANITIZE] ========== Running sanitizer for ${tierLabel} languages ==========` - ) - const tierCrowdinCodes = langs.map((p) => p.crowdinId) - const sanitizeResult = runSanitizer(tierCrowdinCodes) - const changedFiles = sanitizeResult.changedFiles || [] - if (changedFiles.length) { - console.log( - `[SANITIZE] Files changed by sanitizer: ${changedFiles.length}` - ) - for (const abs of changedFiles) { - const relPath = abs.startsWith(process.cwd()) - ? 
abs.slice(process.cwd().length + 1) - : abs - try { - const buf = fs.readFileSync(abs) - await putCommitFile(buf, relPath, branch) - console.log(`[SANITIZE] ✓ Committed sanitized file: ${relPath}`) - } catch (e) { + // Check if translation differs from English + const originalEnglish = englishBuffers[fileId] + if (originalEnglish && originalEnglish.compare(buffer) === 0) { + if (verbose) { console.warn( - `[SANITIZE] Failed to commit sanitized file ${relPath}:`, - e + `[DEBUG] Skipping commit - content identical to English (no translation)` ) } + continue } - } else { - console.log("[SANITIZE] No sanitation changes to commit") - } - // Create PR with tier-appropriate title and body - console.log( - `\n[PR] ========== Creating ${tierName} Pull Request ==========` - ) - console.log(`[PR] Head branch: ${branch}`) - console.log(`[PR] Base branch: ${config.baseBranch}`) - - const langCodes = langs.map((p) => p.internalLanguageCode).join(", ") - let prTitle = `[${tierName}] Automated Crowdin translations (${langCodes})` - if (tierLabel !== "high-trust") { - const reviewers = - tierLabel === "medium-trust" ? "@copilot" : "@copilot @claude" - prTitle += ` - ${reviewers} review requested` - } + // 3- Get destination path and commit + const destinationPath = getDestinationFromPath( + crowdinPath, + internalLanguageCode + ) - // Filter QA summaries to this tier's languages if available - const tierQaSummaries = qaSummaries.filter((s) => - langs.some((p) => s.includes(p.crowdinId)) - ) - const prBody = tierQaSummaries.length - ? 
`${prTitle}\n\nQA Summary:\n\n${tierQaSummaries.join("\n\n")}` - : prTitle + if (verbose) { + console.log(`[DEBUG] Committing to: ${destinationPath}`) + } - const pr = await postPullRequest(branch, config.baseBranch, prBody) + await putCommitFile(buffer, destinationPath, branch) + } - console.log(`\n[SUCCESS] Pull Request created: ${pr.html_url}`) - console.log(`[SUCCESS] PR Number: #${pr.number}`) + console.log(`✓ Committed translations for ${internalLanguageCode}`) + } - // Post follow-up comment with scoped AI review mentions - console.log(`\n[PR-COMMENT] Posting AI review comment...`) - const tierQaPlan: Record = {} - for (const { internalLanguageCode } of langs) { - tierQaPlan[internalLanguageCode] = qaPlan[internalLanguageCode] - } - try { - await postPrReviewComment(pr.number, tierQaPlan) - } catch (err) { - console.warn(`[PR-COMMENT] Failed to post review comment:`, err) + // Run post-import sanitizer + console.log(`\n========== Running Post-Import Sanitizer ==========`) + const sanitizeResult = runSanitizer(config.allCrowdinCodes) + const changedFiles = sanitizeResult.changedFiles || [] + + if (changedFiles.length) { + console.log(`Sanitizer modified ${changedFiles.length} files`) + for (const absPath of changedFiles) { + const relPath = absPath.startsWith(process.cwd()) + ? 
absPath.slice(process.cwd().length + 1) + : absPath + try { + const buf = fs.readFileSync(absPath) + await putCommitFile(buf, relPath, branch) + if (verbose) { + console.log(`[DEBUG] Committed sanitized file: ${relPath}`) + } + } catch (e) { + console.warn(`Failed to commit sanitized file ${relPath}:`, e) + } } - - console.log( - `\n[SUCCESS] ========== ${tierName} PR complete: ${pr.html_url} ==========` - ) + console.log(`✓ Committed ${changedFiles.length} sanitized files`) + } else { + console.log("No sanitization changes needed") } - // Process each tier - await processTierPr("high-trust", "High Trust", highTrustLangs) - await processTierPr("medium-trust", "Medium Trust", mediumTrustLangs) - await processTierPr("low-trust", "Low Trust", lowTrustLangs) + // Create PR + console.log(`\n========== Creating Pull Request ==========`) - console.log( - `\n[SUCCESS] ========== All translation imports complete! ==========` - ) + const langCodes = languagePairs.map((p) => p.internalLanguageCode).join(", ") + const prTitle = `Automated Crowdin translations (${langCodes})` + const prBody = `${prTitle}\n\nThis PR contains automated translations from Crowdin for the following languages: ${langCodes}\n\n**Translation Details:**\n- Files translated: ${fileIds.length}\n- Languages: ${languageIds.join(", ")}\n- Branch: ${branch}` + + const pr = await postPullRequest(branch, config.baseBranch, prBody) + + console.log(`\n========== SUCCESS ==========`) + console.log(`Pull Request created: ${pr.html_url}`) + console.log(`PR Number: #${pr.number}`) + console.log(`Languages: ${langCodes}`) + console.log(`Files: ${fileIds.length}`) } main().catch((err) => { - console.error("Fatal error:", err) + console.error("\n========== ERROR ==========") + console.error(err) process.exit(1) }) diff --git a/src/scripts/i18n/sync-glossary.ts b/src/scripts/i18n/sync-glossary.ts deleted file mode 100644 index e7140cf395e..00000000000 --- a/src/scripts/i18n/sync-glossary.ts +++ /dev/null @@ -1,278 +0,0 
@@ -/** - * Glossary synchronization orchestrator - * - * Workflow: - * 1. Export existing Crowdin glossaries/TMs - * 2. Check if content has changed (compare hashes) - * 3. If changed, save timestamped backup to .crowdin-backups/ - * 4. Fetch latest glossary from Supabase - * 5. Import updated glossary to Crowdin - * 6. Create Git branch and PR with backup files - */ - -import { - exportGlossary, - exportTranslationMemory, - importGlossary, - listGlossaries, - listTranslationMemories, -} from "./lib/crowdin/glossary" -import { postCreateBranchFrom } from "./lib/github/branches" -import { putCommitFile } from "./lib/github/commits" -import { postPullRequest } from "./lib/github/pull-requests" -import { - getAllBackupPaths, - hasContentChanged, - saveBackup, -} from "./lib/glossary/backup" -import { - fetchGlossaryForLanguage, - formatGlossaryAsTBX, -} from "./lib/glossary/supabase" -import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" -import { config } from "./config" - -const SUPABASE_URL = process.env.SUPABASE_URL || "" -const SUPABASE_SERVICE_ROLE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY || "" - -if (!SUPABASE_SERVICE_ROLE_KEY) { - console.error("[GLOSSARY-SYNC] Missing SUPABASE_SERVICE_ROLE_KEY") - throw new Error("SUPABASE_SERVICE_ROLE_KEY environment variable is required") -} - -const MIN_VOTES = parseInt(process.env.GLOSSARY_MIN_VOTES || "2", 10) -const SKIP_BACKUP_PR = process.env.SKIP_GLOSSARY_BACKUP_PR === "true" - -/** - * Main sync function - */ -export async function syncGlossary(): Promise<{ - backupBranch?: string - backupPrUrl?: string - updatedGlossaries: string[] -}> { - console.log("\n[GLOSSARY-SYNC] ========== Starting Glossary Sync ==========") - console.log(`[GLOSSARY-SYNC] Supabase URL: ${SUPABASE_URL}`) - console.log(`[GLOSSARY-SYNC] Min votes: ${MIN_VOTES}`) - console.log(`[GLOSSARY-SYNC] Skip backup PR: ${SKIP_BACKUP_PR}`) - - const backupPaths: string[] = [] - const updatedGlossaries: string[] = [] - let backupNeeded = false 
- - // Step 1: Export and backup existing Crowdin glossaries - console.log( - "\n[GLOSSARY-SYNC] Step 1: Backing up existing Crowdin glossaries" - ) - try { - const glossaries = await listGlossaries() - console.log( - `[GLOSSARY-SYNC] Found ${glossaries.length} existing glossaries` - ) - - for (const glossary of glossaries) { - console.log( - `[GLOSSARY-SYNC] Exporting glossary: ${glossary.name} (ID: ${glossary.id})` - ) - const content = await exportGlossary(glossary.id) - - if (hasContentChanged(glossary.name, content, "glossary")) { - const backupPath = saveBackup(glossary.name, content, "glossary", "tbx") - backupPaths.push(backupPath) - backupNeeded = true - } - } - } catch (error) { - console.warn( - "[GLOSSARY-SYNC] Failed to backup glossaries (continuing anyway):", - error - ) - } - - // Step 2: Export and backup Translation Memories (optional) - console.log("\n[GLOSSARY-SYNC] Step 2: Backing up Translation Memories") - try { - const tms = await listTranslationMemories() - console.log(`[GLOSSARY-SYNC] Found ${tms.length} TMs`) - - for (const tm of tms) { - console.log(`[GLOSSARY-SYNC] Exporting TM: ${tm.name} (ID: ${tm.id})`) - const content = await exportTranslationMemory(tm.id) - - if (hasContentChanged(tm.name, content, "tm")) { - const backupPath = saveBackup(tm.name, content, "tm", "tmx") - backupPaths.push(backupPath) - backupNeeded = true - } - } - } catch (error) { - console.warn( - "[GLOSSARY-SYNC] Failed to backup TMs (continuing anyway):", - error - ) - } - - // Step 3: Fetch latest glossary from Supabase for each language - console.log("\n[GLOSSARY-SYNC] Step 3: Fetching glossary from Supabase") - const languageCodes = config.allCrowdinCodes - console.log(`[GLOSSARY-SYNC] Target languages: ${languageCodes.join(", ")}`) - - for (const crowdinCode of languageCodes) { - try { - // Map Crowdin code to internal code for Supabase query - const internalCode = mapCrowdinCodeToInternal(crowdinCode) - console.log( - `\n[GLOSSARY-SYNC] Processing language: 
${crowdinCode} (internal: ${internalCode})` - ) - - const entries = await fetchGlossaryForLanguage( - SUPABASE_URL, - SUPABASE_SERVICE_ROLE_KEY, - internalCode, - MIN_VOTES - ) - - if (entries.length === 0) { - console.log( - `[GLOSSARY-SYNC] No glossary entries found for ${crowdinCode}` - ) - continue - } - - console.log( - `[GLOSSARY-SYNC] Found ${entries.length} glossary entries for ${crowdinCode}` - ) - - // Step 4: Import to Crowdin - const tbxContent = formatGlossaryAsTBX(entries, "en", crowdinCode) - const glossaryName = `Ethereum.org Community (${crowdinCode})` - - console.log(`[GLOSSARY-SYNC] Importing glossary: ${glossaryName}`) - await importGlossary(glossaryName, crowdinCode, tbxContent) - - updatedGlossaries.push(crowdinCode) - console.log( - `[GLOSSARY-SYNC] ✓ Successfully updated glossary for ${crowdinCode}` - ) - } catch (error) { - console.error( - `[GLOSSARY-SYNC] Failed to update glossary for ${crowdinCode}:`, - error - ) - // Continue with other languages - } - } - - // Step 5: Create backup PR if needed - let backupBranch: string | undefined - let backupPrUrl: string | undefined - - if (backupNeeded && !SKIP_BACKUP_PR) { - console.log("\n[GLOSSARY-SYNC] Step 5: Creating backup PR") - try { - const result = await createBackupPR() - backupBranch = result.branch - backupPrUrl = result.prUrl - } catch (error) { - console.error("[GLOSSARY-SYNC] Failed to create backup PR:", error) - console.error( - "[GLOSSARY-SYNC] Backups are saved locally but not committed" - ) - } - } else if (backupNeeded) { - console.log( - "\n[GLOSSARY-SYNC] Backups saved locally (PR creation skipped via SKIP_GLOSSARY_BACKUP_PR)" - ) - } else { - console.log("\n[GLOSSARY-SYNC] No backups needed (no changes detected)") - } - - console.log("\n[GLOSSARY-SYNC] ========== Sync Complete ==========") - console.log(`[GLOSSARY-SYNC] Updated glossaries: ${updatedGlossaries.length}`) - console.log(`[GLOSSARY-SYNC] Languages: ${updatedGlossaries.join(", ")}`) - if (backupBranch) { - 
console.log(`[GLOSSARY-SYNC] Backup branch: ${backupBranch}`) - } - if (backupPrUrl) { - console.log(`[GLOSSARY-SYNC] Backup PR: ${backupPrUrl}`) - } - - return { - backupBranch, - backupPrUrl, - updatedGlossaries, - } -} - -/** - * Create a Git branch and PR with backup files - */ -async function createBackupPR(): Promise<{ branch: string; prUrl: string }> { - const timestamp = new Date().toISOString().split("T")[0] - const branchName = `i18n-glossary-backup-${timestamp}` - - console.log(`[GLOSSARY-SYNC] Creating branch: ${branchName}`) - await postCreateBranchFrom(config.baseBranch, branchName) - - // Get all backup files (including newly created ones) - const allBackupPaths = getAllBackupPaths() - console.log( - `[GLOSSARY-SYNC] Committing ${allBackupPaths.length} backup files` - ) - - // Commit each backup file - const fs = await import("fs") - const path = await import("path") - - for (const relativePath of allBackupPaths) { - console.log(`[GLOSSARY-SYNC] Committing: ${relativePath}`) - const absolutePath = path.join(process.cwd(), relativePath) - const buffer = fs.readFileSync(absolutePath) - await putCommitFile(buffer, relativePath, branchName) - } - - // Create PR - console.log("[GLOSSARY-SYNC] Creating pull request") - const prTitle = `🗂️ Crowdin Glossary/TM Backup - ${timestamp}` - const prBody = `# Crowdin Glossary and Translation Memory Backup - -This automated PR backs up Crowdin glossary and translation memory exports before syncing with the Supabase community glossary. - -## Backup Details -- **Date**: ${new Date().toISOString()} -- **Files**: ${allBackupPaths.length} total backups -- **Glossary backups**: ${allBackupPaths.filter((p) => p.includes("glossary")).length} -- **TM backups**: ${allBackupPaths.filter((p) => p.includes("tm")).length} - -## Purpose -These backups enable easy reversion if the Supabase glossary sync introduces issues. Each backup is timestamped and content-hashed for traceability. 
- -## Next Steps -- Review the backup files -- Merge to preserve the backup history -- Monitor the main translation workflow for any glossary-related issues - -**Auto-generated by the i18n glossary sync workflow** -` - - const prUrl = await postPullRequest(branchName, prTitle, prBody) - console.log(`[GLOSSARY-SYNC] ✓ Created PR: ${prUrl}`) - - return { branch: branchName, prUrl } -} - -// CLI execution -if (require.main === module) { - syncGlossary() - .then((result) => { - console.log("\n[GLOSSARY-SYNC] Success!") - if (result.backupPrUrl) { - console.log(`[GLOSSARY-SYNC] Backup PR: ${result.backupPrUrl}`) - } - process.exit(0) - }) - .catch((error) => { - console.error("\n[GLOSSARY-SYNC] Fatal error:", error) - process.exit(1) - }) -} From fde76971d162324e4af033d2883be44d58dec967 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:49:04 -0300 Subject: [PATCH 20/99] feat(i18n): fix MDX build failures from Crowdin translations Implements upstream fixes to prevent MDX parser errors in AI-translated content: - Update existing Crowdin files before translation to ensure latest English source - Add PUT /files/{fileId} request when file exists instead of skipping update - Add 10s parsing delay after file updates - Add defensive sanitizer for block component line breaks - New fixBlockComponentLineBreaks() function catches inline tags - Fixes 12 component types (Card, ExpandableCard, Alert, etc.) - Reports fix count in sanitizer issues - Scope sanitizer to current translation job languages - Change from all configured languages to languagePairs from job - Fetch AI model name dynamically from Crowdin API - New getPromptInfo() function with PromptResource type - Replace hard-coded "Gemini 2.5 Pro" in PR body with actual model Fixes critical MDX parser errors like "Expected the closing tag either after the end of paragraph" caused by Crowdin AI using stale English files and outputting inline block component tags. 
--- src/scripts/i18n/lib/crowdin/prompt.ts | 29 ++++++++++ src/scripts/i18n/main.ts | 74 +++++++++++++++++++----- src/scripts/i18n/post_import_sanitize.ts | 55 ++++++++++++++++++ 3 files changed, 145 insertions(+), 13 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/prompt.ts b/src/scripts/i18n/lib/crowdin/prompt.ts index e3dacfb75f1..1c9a325129b 100644 --- a/src/scripts/i18n/lib/crowdin/prompt.ts +++ b/src/scripts/i18n/lib/crowdin/prompt.ts @@ -2,6 +2,35 @@ import * as fs from "fs" import { crowdinBearerHeaders } from "../../config" +type PromptResource = { + id: number + name: string + action: string + aiProviderId?: number | null + aiModelId?: string | null +} + +/** + * Get information about a Crowdin AI prompt including the model being used. + * Uses Crowdin API v2: GET /users/{userId}/ai/prompts/{promptId} + */ +export async function getPromptInfo( + userId: number, + promptId: number +): Promise { + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` + const resp = await fetch(url, { + method: "GET", + headers: crowdinBearerHeaders, + }) + if (!resp.ok) { + const text = await resp.text().catch(() => "") + throw new Error(`Failed to get prompt info (${resp.status}): ${text}`) + } + const json = await resp.json() + return json.data as PromptResource +} + /** * Update a Crowdin AI prompt's content from a local file. 
* Uses Crowdin API v2: PATCH /users/{userId}/ai/prompts/{promptId} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 318830c941d..9924dc78fff 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -17,7 +17,7 @@ import { getPreTranslationStatus, postApplyPreTranslation, } from "./lib/crowdin/pre-translate" -import { updatePromptFromFile } from "./lib/crowdin/prompt" +import { getPromptInfo, updatePromptFromFile } from "./lib/crowdin/prompt" import { postCreateBranchFrom } from "./lib/github/branches" import { getDestinationFromPath, putCommitFile } from "./lib/github/commits" import { @@ -28,7 +28,7 @@ import { import { postPullRequest } from "./lib/github/pull-requests" import type { CrowdinFileData, CrowdinPreTranslateResponse } from "./lib/types" import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" -import { config } from "./config" +import { config, crowdinBearerHeaders } from "./config" import { runSanitizer } from "./post_import_sanitize" // Small helper for async waits @@ -170,18 +170,49 @@ async function main() { let effectivePath: string if (foundFile) { - // File exists - DO NOT update to preserve parsed string structure - if (verbose) { - console.log( - `[DEBUG] File exists in Crowdin (ID: ${foundFile.id}), using existing structure` + // File exists - UPDATE it to ensure Crowdin has the latest English version + console.log( + `Updating existing file in Crowdin: ${file.filePath} (ID: ${foundFile.id})` + ) + const fileBuffer = await downloadGitHubFile(file.download_url) + + const storageInfo = await postFileToStorage( + fileBuffer, + file["Crowdin-API-FileName"] + ) + + // Update the existing file using PUT /files/{fileId} + const updateUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` + const updateResp = await fetch(updateUrl, { + method: "PUT", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ storageId: 
storageInfo.id }), + }) + + if (!updateResp.ok) { + const text = await updateResp.text().catch(() => "") + throw new Error( + `Failed to update Crowdin file ${foundFile.id} (${updateResp.status}): ${text}` ) } + + console.log(`✓ Updated Crowdin file (ID: ${foundFile.id})`) + effectiveFileId = foundFile.id effectivePath = foundFile.path - - // Download English for buffer comparison later - const fileBuffer = await downloadGitHubFile(file.download_url) englishBuffers[effectiveFileId] = fileBuffer + + // Wait for file parsing after update + const delayMs = 10000 + if (verbose) { + console.log( + `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to re-parse updated file...` + ) + } + await delay(delayMs) } else { // File doesn't exist - create it console.log(`Creating new file in Crowdin: ${file.filePath}`) @@ -370,9 +401,12 @@ async function main() { console.log(`✓ Committed translations for ${internalLanguageCode}`) } - // Run post-import sanitizer + // Run post-import sanitizer only on languages in this translation job console.log(`\n========== Running Post-Import Sanitizer ==========`) - const sanitizeResult = runSanitizer(config.allCrowdinCodes) + const targetLangsForSanitizer = languagePairs.map( + (pair) => pair.internalLanguageCode + ) + const sanitizeResult = runSanitizer(targetLangsForSanitizer) const changedFiles = sanitizeResult.changedFiles || [] if (changedFiles.length) { @@ -399,9 +433,23 @@ async function main() { // Create PR console.log(`\n========== Creating Pull Request ==========`) + // Fetch AI model name dynamically + let aiModelName = "LLM" + const userId = process.env.I18N_CROWDIN_USER_ID + if (userId) { + try { + const promptInfo = await getPromptInfo( + Number(userId), + config.preTranslatePromptId + ) + aiModelName = promptInfo.aiModelId || "LLM" + } catch (e) { + console.warn("Could not fetch AI model name from Crowdin:", e) + } + } + const langCodes = languagePairs.map((p) => p.internalLanguageCode).join(", ") - const prTitle = `Automated 
Crowdin translations (${langCodes})` - const prBody = `${prTitle}\n\nThis PR contains automated translations from Crowdin for the following languages: ${langCodes}\n\n**Translation Details:**\n- Files translated: ${fileIds.length}\n- Languages: ${languageIds.join(", ")}\n- Branch: ${branch}` + const prBody = `## Description\n\nThis PR contains automated ${aiModelName} translations from Crowdin\n\n### Translation Details\n\n- Files translated: ${fileIds.length}\n- Languages: ${langCodes}` const pr = await postPullRequest(branch, config.baseBranch, prBody) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 54a9c7cfeee..07b9b77bc23 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -131,6 +131,53 @@ function normalizeBlockHtmlLines(md: string): string { return md } +/** + * Fix block-level React components that have opening/closing tags inline with content. + * MDX parser requires these tags to be on separate lines. + * Returns number of fixes applied. 
+ */ +function fixBlockComponentLineBreaks(md: string): { + content: string + fixCount: number +} { + const blockComponents = [ + "Card", + "ExpandableCard", + "Alert", + "AlertEmoji", + "AlertContent", + "AlertDescription", + "CardGrid", + "InfoGrid", + "InfoBanner", + "ButtonLink", + "Tabs", + "TabItem", + ] + + let content = md + let fixCount = 0 + + for (const component of blockComponents) { + // Fix inline closing tags: content → content\n + const inlineCloseRe = new RegExp(`([^\\n])\\s*`, "g") + content = content.replace(inlineCloseRe, (_, before) => { + fixCount++ + return `${before}\n` + }) + + // Fix inline opening tags: content → \ncontent + // Only if there's actual content after the tag (not another tag or newline) + const inlineOpenRe = new RegExp(`(<${component}[^>]*>)([^\\n<])`, "g") + content = content.replace(inlineOpenRe, (_, tag, after) => { + fixCount++ + return `${tag}\n${after}` + }) + } + + return { content, fixCount } +} + function protectNames(text: string): string { // Replace common incorrectly localized variants back to protected names. // This is heuristic; extend as needed per locale QA. @@ -177,6 +224,14 @@ function processMarkdownFile(mdPath: string): { } const before = content + + // Fix block component line breaks (critical for MDX parser) + const blockResult = fixBlockComponentLineBreaks(content) + content = blockResult.content + if (blockResult.fixCount > 0) { + issues.push(`Fixed ${blockResult.fixCount} inline block component tags`) + } + content = normalizeBlockHtmlLines(content) content = protectNames(content) From 3d325af8e4f760cbfc4ef7b8fc026193394268b4 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 12 Dec 2025 12:35:19 -0300 Subject: [PATCH 21/99] fix(i18n): constrain sanitizer to only translated files Changed sanitizer from processing all files in target languages to only processing the specific files that were just committed in the current job. 
Changes: - Track committed file paths during translation loop in committedFilePaths array - Pass specific file paths to runSanitizer() instead of language codes - Update runSanitizer() to accept specificFiles parameter - When specificFiles provided, only process those exact files - Falls back to language-based scanning when specificFiles not provided This prevents sanitizer from touching hundreds of unrelated translation files when only translating a single file or directory. --- src/scripts/i18n/main.ts | 15 ++++-- src/scripts/i18n/post_import_sanitize.ts | 65 +++++++++++++++--------- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 9924dc78fff..60f0d42aced 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -346,6 +346,9 @@ async function main() { ) console.log(`✓ Created branch: ${branch}`) + // Track all committed file paths for sanitizer + const committedFilePaths: string[] = [] + // For each language for (const { crowdinId, internalLanguageCode } of languagePairs) { console.log( @@ -396,17 +399,21 @@ async function main() { } await putCommitFile(buffer, destinationPath, branch) + + // Track this file for sanitizer + const absolutePath = path.join(process.cwd(), destinationPath) + committedFilePaths.push(absolutePath) } console.log(`✓ Committed translations for ${internalLanguageCode}`) } - // Run post-import sanitizer only on languages in this translation job + // Run post-import sanitizer only on files that were just committed console.log(`\n========== Running Post-Import Sanitizer ==========`) - const targetLangsForSanitizer = languagePairs.map( - (pair) => pair.internalLanguageCode + console.log( + `[SANITIZE] Processing ${committedFilePaths.length} committed files` ) - const sanitizeResult = runSanitizer(targetLangsForSanitizer) + const sanitizeResult = runSanitizer(undefined, committedFilePaths) const changedFiles = sanitizeResult.changedFiles || [] if 
(changedFiles.length) { diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 07b9b77bc23..5e97c7c6a39 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -378,23 +378,32 @@ function languagesFromEnv(): string[] | undefined { .filter(Boolean) } -export function runSanitizer(langs?: string[]) { - const effectiveLangs = langs || languagesFromEnv() +export function runSanitizer(langs?: string[], specificFiles?: string[]) { console.log("[SANITIZE] Starting post-import sanitizer") - console.log( - "[SANITIZE] Target languages:", - effectiveLangs ?? "ALL detected in translations/" - ) - const mdFiles = listFiles(CONTENT_ROOT, (f) => { - if (!f.endsWith(".md")) return false - if (!f.includes(`${path.sep}translations${path.sep}`)) return false - if (effectiveLangs) - return effectiveLangs.some((l) => - f.includes(`${path.sep}translations${path.sep}${l}${path.sep}`) - ) - return true - }) + let mdFiles: string[] + + if (specificFiles && specificFiles.length > 0) { + // Process only the specific files provided + console.log(`[SANITIZE] Target: ${specificFiles.length} specific file(s)`) + mdFiles = specificFiles.filter((f) => f.endsWith(".md")) + } else { + // Fallback to language-based scanning + const effectiveLangs = langs || languagesFromEnv() + console.log( + "[SANITIZE] Target languages:", + effectiveLangs ?? 
"ALL detected in translations/" + ) + mdFiles = listFiles(CONTENT_ROOT, (f) => { + if (!f.endsWith(".md")) return false + if (!f.includes(`${path.sep}translations${path.sep}`)) return false + if (effectiveLangs) + return effectiveLangs.some((l) => + f.includes(`${path.sep}translations${path.sep}${l}${path.sep}`) + ) + return true + }) + } let mdFixed = 0 const mdIssues: Array<{ file: string; issues: string[] }> = [] @@ -408,14 +417,24 @@ export function runSanitizer(langs?: string[]) { if (issues.length) mdIssues.push({ file: path.relative(ROOT, f), issues }) } - const jsonFiles = listFiles(INTL_ROOT, (f) => { - if (!f.endsWith(".json")) return false - const p = path.relative(INTL_ROOT, f).split(path.sep) - const langDir = p[0] - if (!langDir) return false - if (effectiveLangs) return effectiveLangs.some((l) => l.startsWith(langDir)) - return true - }) + let jsonFiles: string[] + + if (specificFiles && specificFiles.length > 0) { + // Process only the specific files provided + jsonFiles = specificFiles.filter((f) => f.endsWith(".json")) + } else { + // Fallback to language-based scanning + const effectiveLangs = langs || languagesFromEnv() + jsonFiles = listFiles(INTL_ROOT, (f) => { + if (!f.endsWith(".json")) return false + const p = path.relative(INTL_ROOT, f).split(path.sep) + const langDir = p[0] + if (!langDir) return false + if (effectiveLangs) + return effectiveLangs.some((l) => l.startsWith(langDir)) + return true + }) + } let jsonFixed = 0 const jsonIssues: Array<{ file: string; issues: string[] }> = [] From 9352f47e9a53d0637689f42dd57d9eb796add2f9 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 12 Dec 2025 12:52:09 -0300 Subject: [PATCH 22/99] fix(i18n): remove content sanitization, only fix syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed protectNames() function that was changing translated content strings: - Was capitalizing 'ethereum.org' to 
'Ethereum.org' in URLs - Was replacing translated terms like 'Etéreo' with 'Ethereum' - Was changing brand name capitalization in prose content The sanitizer should ONLY fix code syntax issues that break the build: - Block component line breaks (MDX parser requirement) - Block HTML tag line breaks - Header ID ASCII normalization - Validation reporting (broken links, malformed markdown) Content terminology and brand name handling should be done by the LLM or a separate content sanitizer with more nuanced rules (e.g., 'Ethereum' vs 'ethereum.org', never touch URLs/hrefs). --- src/scripts/i18n/post_import_sanitize.ts | 35 ------------------------ 1 file changed, 35 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 5e97c7c6a39..fbd9f60c396 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -20,20 +20,6 @@ const ROOT = process.cwd() const CONTENT_ROOT = path.join(ROOT, "public", "content") const INTL_ROOT = path.join(ROOT, "src", "intl") -const _protectedNames = [ - "Ethereum", - "ETH", - "Solidity", - "MetaMask", - "GitHub", - "Crowdin", - "EIP", - "NFT", - "HTML", - "PoW", - "PoS", -] - const BLOCK_HTML_TAGS = [ "section", "div", @@ -178,26 +164,6 @@ function fixBlockComponentLineBreaks(md: string): { return { content, fixCount } } -function protectNames(text: string): string { - // Replace common incorrectly localized variants back to protected names. - // This is heuristic; extend as needed per locale QA. 
- const replacements: Array<[RegExp, string]> = [ - [/\bEtéreo\b/gi, "Ethereum"], - [/\bEtéreum\b/gi, "Ethereum"], - [/\bMetamask\b/gi, "MetaMask"], - [/\bGithub\b/gi, "GitHub"], - [/\bNft\b/g, "NFT"], - ] - let out = text - for (const [re, val] of replacements) out = out.replace(re, val) - // Normalize canonical capitalization of protected names - for (const name of _protectedNames) { - const re = new RegExp(`\\b${name}\\b`, "gi") - out = out.replace(re, name) - } - return out -} - function processMarkdownFile(mdPath: string): { fixed: boolean issues: string[] @@ -233,7 +199,6 @@ function processMarkdownFile(mdPath: string): { } content = normalizeBlockHtmlLines(content) - content = protectNames(content) const fixed = before !== content if (fixed) fs.writeFileSync(mdPath, content, "utf8") From 8ab178870d5eb7ed8cee7408195826078dd187f0 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 12 Dec 2025 13:35:51 -0300 Subject: [PATCH 23/99] fix(i18n): sanitizer uses in-memory content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sanitizer was reading stale files from disk instead of just-committed translated content, causing it to overwrite translations with English. Changes: - Pass in-memory content from committed files to sanitizer - Sanitizer operates on provided content, only reads disk as fallback - Support both .md and .json files with in-memory content - Fix header ID sync to match by structure/position, not text - Extract header structure (level + position) from both files - Match headers by index: 1st H2 → 1st H2, etc. - Copy English IDs to corresponding translated headers - Warn on structure mismatches - Add JSON sanitization: BOM removal, smart quote normalization Previous flow (broken): 1. Commit translated file to branch 2. Sanitizer reads same path from LOCAL DISK (gets old English file) 3. 
Sanitizer processes English, commits back → translation overwritten New flow (fixed): 1. Commit translated file, keep content in memory 2. Pass in-memory content to sanitizer 3. Sanitizer processes actual translation 4. Commits sanitized translation → translation preserved --- src/scripts/i18n/main.ts | 26 +-- src/scripts/i18n/post_import_sanitize.ts | 278 ++++++++++++----------- 2 files changed, 155 insertions(+), 149 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 60f0d42aced..7f9580325a5 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -346,8 +346,8 @@ async function main() { ) console.log(`✓ Created branch: ${branch}`) - // Track all committed file paths for sanitizer - const committedFilePaths: string[] = [] + // Track all committed files with their content for sanitizer + const committedFiles: Array<{ path: string; content: string }> = [] // For each language for (const { crowdinId, internalLanguageCode } of languagePairs) { @@ -400,9 +400,11 @@ async function main() { await putCommitFile(buffer, destinationPath, branch) - // Track this file for sanitizer - const absolutePath = path.join(process.cwd(), destinationPath) - committedFilePaths.push(absolutePath) + // Track this file's path and content for sanitizer + committedFiles.push({ + path: destinationPath, + content: buffer.toString("utf8"), + }) } console.log(`✓ Committed translations for ${internalLanguageCode}`) @@ -410,20 +412,16 @@ async function main() { // Run post-import sanitizer only on files that were just committed console.log(`\n========== Running Post-Import Sanitizer ==========`) - console.log( - `[SANITIZE] Processing ${committedFilePaths.length} committed files` - ) - const sanitizeResult = runSanitizer(undefined, committedFilePaths) + console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`) + const sanitizeResult = runSanitizer(committedFiles) const changedFiles = sanitizeResult.changedFiles || [] if 
(changedFiles.length) { console.log(`Sanitizer modified ${changedFiles.length} files`) - for (const absPath of changedFiles) { - const relPath = absPath.startsWith(process.cwd()) - ? absPath.slice(process.cwd().length + 1) - : absPath + for (const file of changedFiles) { + const relPath = file.path try { - const buf = fs.readFileSync(absPath) + const buf = Buffer.from(file.content, "utf8") await putCommitFile(buf, relPath, branch) if (verbose) { console.log(`[DEBUG] Committed sanitized file: ${relPath}`) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index fbd9f60c396..3a9537ebd52 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -18,7 +18,7 @@ import * as path from "path" const ROOT = process.cwd() const CONTENT_ROOT = path.join(ROOT, "public", "content") -const INTL_ROOT = path.join(ROOT, "src", "intl") +// const INTL_ROOT = path.join(ROOT, "src", "intl") // Not currently used const BLOCK_HTML_TAGS = [ "section", @@ -82,31 +82,71 @@ function lineAt(file: string, index: number): string { const lineNumber = `${linePosition}:${charPosition}` return lineNumber } -function extractHeadingIds(md: string): Map { - // Map of heading text -> custom id found in English source - const map = new Map() +type HeaderInfo = { + level: number // Number of # symbols + text: string // Header text (translated or English) + id: string // Custom ID from {#id} + fullMatch: string // Full matched string for replacement +} + +function extractHeaderStructure(md: string): HeaderInfo[] { + const headers: HeaderInfo[] = [] const headingRe = /^(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/gm let m: RegExpExecArray | null while ((m = headingRe.exec(md))) { - const text = m[2].trim() - const id = m[3].trim() - map.set(text, id) + headers.push({ + level: m[1].length, + text: m[2].trim(), + id: m[3].trim(), + fullMatch: m[0], + }) } - return map + return headers } function syncHeaderIdsWithEnglish( 
translatedMd: string, englishMd: string ): string { - const englishIds = extractHeadingIds(englishMd) - const headingRe = /^(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/gm - return translatedMd.replace(headingRe, (full, hashes, text) => { - const englishId = englishIds.get(text.trim()) - if (!englishId) return full // no corresponding English heading; leave as is - const asciiId = toAsciiId(englishId) - return `${hashes} ${text} {#${asciiId}}` - }) + // Extract header structure from both files + const englishHeaders = extractHeaderStructure(englishMd) + const translatedHeaders = extractHeaderStructure(translatedMd) + + // Match headers by position and level in the document structure + // If structure matches, copy English IDs to translated headers + if (englishHeaders.length !== translatedHeaders.length) { + console.warn( + `[WARN] Header count mismatch: English has ${englishHeaders.length}, translated has ${translatedHeaders.length}` + ) + } + + let result = translatedMd + // Match headers by index - same position = same semantic header + for (let i = 0; i < translatedHeaders.length; i++) { + const translatedHeader = translatedHeaders[i] + const englishHeader = englishHeaders[i] + + if (!englishHeader) { + // More headers in translation than English - skip + continue + } + + if (translatedHeader.level !== englishHeader.level) { + console.warn( + `[WARN] Header level mismatch at position ${i}: English H${englishHeader.level} vs translated H${translatedHeader.level}` + ) + // Still try to sync the ID even if levels don't match + } + + // Replace the translated header's ID with the English ID (ASCII-normalized) + const asciiId = toAsciiId(englishHeader.id) + const updatedHeader = `${"#".repeat(translatedHeader.level)} ${translatedHeader.text} {#${asciiId}}` + + // Use a more specific replacement to avoid affecting other occurrences + result = result.replace(translatedHeader.fullMatch, updatedHeader) + } + + return result } function normalizeBlockHtmlLines(md: string): string 
{ @@ -164,12 +204,16 @@ function fixBlockComponentLineBreaks(md: string): { return { content, fixCount } } -function processMarkdownFile(mdPath: string): { +function processMarkdownFile( + mdPath: string, + providedContent?: string +): { fixed: boolean issues: string[] + content: string } { const issues: string[] = [] - let content = fs.readFileSync(mdPath, "utf8") + let content = providedContent || fs.readFileSync(mdPath, "utf8") // Map translated path to English path: remove `/translations//` segment const parts = mdPath.split(path.sep) @@ -201,7 +245,10 @@ function processMarkdownFile(mdPath: string): { content = normalizeBlockHtmlLines(content) const fixed = before !== content - if (fixed) fs.writeFileSync(mdPath, content, "utf8") + // Only write to disk if no content was provided (legacy mode) + if (fixed && !providedContent) { + fs.writeFileSync(mdPath, content, "utf8") + } // Run critical checks (report-only) let m: RegExpExecArray | null // Broken links containing spaces inside URL @@ -242,96 +289,42 @@ function processMarkdownFile(mdPath: string): { ) } } - return { fixed, issues } + return { fixed, issues, content } } -function processJsonFile(jsonPath: string): { +function processJsonFile( + jsonPath: string, + providedContent?: string +): { fixed: boolean issues: string[] + content: string } { const issues: string[] = [] - let content = fs.readFileSync(jsonPath, "utf8") - let fixed = false + let content = providedContent || fs.readFileSync(jsonPath, "utf8") + const before = content + // Normalize BOM and smart quotes - const cleaned = content + content = content .replace(/^\uFEFF/, "") .replace(/[""]/g, '"') .replace(/['']/g, "'") - if (cleaned !== content) { - content = cleaned - fixed = true - } - // Try parsing; if it fails, attempt to fix unescaped quotes - let parseError: Error | null = null + // Try parsing to validate JSON try { JSON.parse(content) } catch (e) { - parseError = e as Error - issues.push(`Initial JSON parse error: 
${parseError.message}`) - - // Attempt to fix unescaped quotes in JSON string values - // Strategy: scan for patterns like "text "word" text" and escape the internal quotes - try { - let fixedContent = content - - // Find all string values that might have unescaped internal quotes - // Pattern: ": "...content..." - we look for quotes after a colon - let modified = false - const lines = fixedContent.split("\n") - const fixedLines = lines.map((line) => { - // Match JSON key-value pairs with string values - // Look for pattern: "key": "value potentially with "quotes"" - const match = line.match(/^(\s*"[^"]+"\s*:\s*")(.*)("\s*,?\s*)$/) - if (!match) return line - - const prefix = match[1] // ' "key": "' - const value = match[2] // 'text with "quotes" inside' - const suffix = match[3] // '",\n' or '"\n' - - // Check if value contains unescaped quotes - if (!value.includes('"')) return line - - // Escape unescaped quotes in the value - let fixedValue = "" - for (let i = 0; i < value.length; i++) { - const char = value[i] - if (char === '"') { - // Count preceding backslashes - let backslashCount = 0 - for (let j = i - 1; j >= 0 && value[j] === "\\"; j--) { - backslashCount++ - } - // If not escaped (even number of backslashes), escape it - if (backslashCount % 2 === 0) { - fixedValue += '\\"' - modified = true - } else { - fixedValue += char - } - } else { - fixedValue += char - } - } - - return prefix + fixedValue + suffix - }) - - if (modified) { - fixedContent = fixedLines.join("\n") - content = fixedContent - fixed = true - // Re-validate after fix - JSON.parse(content) - issues.push("Auto-fixed unescaped quotes in JSON string values") - } - } catch (fixError) { - issues.push(`Failed to auto-fix JSON: ${(fixError as Error).message}`) - } + const error = e as Error + issues.push(`JSON parse error: ${error.message}`) + } + + const fixed = before !== content + // Only write to disk if no content was provided (legacy mode) + if (fixed && !providedContent) { + 
fs.writeFileSync(jsonPath, content, "utf8") } - if (fixed) fs.writeFileSync(jsonPath, content, "utf8") - return { fixed, issues } + return { fixed, issues, content } } function languagesFromEnv(): string[] | undefined { @@ -343,23 +336,32 @@ function languagesFromEnv(): string[] | undefined { .filter(Boolean) } -export function runSanitizer(langs?: string[], specificFiles?: string[]) { +export function runSanitizer( + filesWithContent?: Array<{ path: string; content: string }>, + langs?: string[] +) { console.log("[SANITIZE] Starting post-import sanitizer") - let mdFiles: string[] + let mdFilesToProcess: Array<{ path: string; content: string }> = [] + let jsonFilesToProcess: Array<{ path: string; content: string }> = [] - if (specificFiles && specificFiles.length > 0) { - // Process only the specific files provided - console.log(`[SANITIZE] Target: ${specificFiles.length} specific file(s)`) - mdFiles = specificFiles.filter((f) => f.endsWith(".md")) + if (filesWithContent && filesWithContent.length > 0) { + // Process only the specific files provided with their in-memory content + console.log( + `[SANITIZE] Target: ${filesWithContent.length} specific file(s)` + ) + mdFilesToProcess = filesWithContent.filter((f) => f.path.endsWith(".md")) + jsonFilesToProcess = filesWithContent.filter((f) => + f.path.endsWith(".json") + ) } else { - // Fallback to language-based scanning + // Fallback to language-based scanning (reads from disk) const effectiveLangs = langs || languagesFromEnv() console.log( "[SANITIZE] Target languages:", effectiveLangs ?? 
"ALL detected in translations/" ) - mdFiles = listFiles(CONTENT_ROOT, (f) => { + const mdFilePaths = listFiles(CONTENT_ROOT, (f) => { if (!f.endsWith(".md")) return false if (!f.includes(`${path.sep}translations${path.sep}`)) return false if (effectiveLangs) @@ -368,56 +370,59 @@ export function runSanitizer(langs?: string[], specificFiles?: string[]) { ) return true }) + const jsonFilePaths = listFiles(CONTENT_ROOT, (f) => { + if (!f.endsWith(".json")) return false + if (!f.includes(`${path.sep}translations${path.sep}`)) return false + if (effectiveLangs) + return effectiveLangs.some((l) => + f.includes(`${path.sep}translations${path.sep}${l}${path.sep}`) + ) + return true + }) + // Convert file paths to objects without content (will be read from disk) + mdFilesToProcess = mdFilePaths.map((p) => ({ path: p, content: "" })) + jsonFilesToProcess = jsonFilePaths.map((p) => ({ path: p, content: "" })) } let mdFixed = 0 const mdIssues: Array<{ file: string; issues: string[] }> = [] - const mdChanged: string[] = [] - for (const f of mdFiles) { - const { fixed, issues } = processMarkdownFile(f) + const mdChanged: Array<{ path: string; content: string }> = [] + + for (const fileInfo of mdFilesToProcess) { + const { fixed, issues, content } = processMarkdownFile( + fileInfo.path, + fileInfo.content + ) if (fixed) { mdFixed++ - mdChanged.push(f) + mdChanged.push({ path: fileInfo.path, content }) } - if (issues.length) mdIssues.push({ file: path.relative(ROOT, f), issues }) - } - - let jsonFiles: string[] - - if (specificFiles && specificFiles.length > 0) { - // Process only the specific files provided - jsonFiles = specificFiles.filter((f) => f.endsWith(".json")) - } else { - // Fallback to language-based scanning - const effectiveLangs = langs || languagesFromEnv() - jsonFiles = listFiles(INTL_ROOT, (f) => { - if (!f.endsWith(".json")) return false - const p = path.relative(INTL_ROOT, f).split(path.sep) - const langDir = p[0] - if (!langDir) return false - if 
(effectiveLangs) - return effectiveLangs.some((l) => l.startsWith(langDir)) - return true - }) + if (issues.length) + mdIssues.push({ file: path.relative(ROOT, fileInfo.path), issues }) } let jsonFixed = 0 const jsonIssues: Array<{ file: string; issues: string[] }> = [] - const jsonChanged: string[] = [] - for (const f of jsonFiles) { - const { fixed, issues } = processJsonFile(f) + const jsonChanged: Array<{ path: string; content: string }> = [] + + for (const fileInfo of jsonFilesToProcess) { + const { fixed, issues, content } = processJsonFile( + fileInfo.path, + fileInfo.content + ) if (fixed) { jsonFixed++ - jsonChanged.push(f) + jsonChanged.push({ path: fileInfo.path, content }) } - if (issues.length) jsonIssues.push({ file: path.relative(ROOT, f), issues }) + if (issues.length) + jsonIssues.push({ file: path.relative(ROOT, fileInfo.path), issues }) } console.log( - `\n[SANITIZE] Markdown files scanned: ${mdFiles.length}, fixed: ${mdFixed}` + `\n[SANITIZE] Markdown files scanned: ${mdFilesToProcess.length}, fixed: ${mdFixed}` ) console.log( - `[SANITIZE] JSON files scanned: ${jsonFiles.length}, fixed: ${jsonFixed}` + `[SANITIZE] JSON files scanned: ${jsonFilesToProcess.length}, fixed: ${jsonFixed}` ) if (mdIssues.length || jsonIssues.length) { @@ -434,11 +439,14 @@ export function runSanitizer(langs?: string[], specificFiles?: string[]) { console.log("\n[SANITIZE] No issues detected.") } - const changedFiles = [...mdChanged, ...jsonChanged] + const changedFiles = [...mdChanged, ...jsonChanged].map((f) => ({ + path: f.path, + content: f.content, + })) return { changedFiles, - markdown: { scanned: mdFiles.length, fixed: mdFixed }, - json: { scanned: jsonFiles.length, fixed: jsonFixed }, + markdown: { scanned: mdFilesToProcess.length, fixed: mdFixed }, + json: { scanned: jsonFilesToProcess.length, fixed: jsonFixed }, issues: { markdown: mdIssues, json: jsonIssues }, } } From 983dd11c4d821691d9fdde53f0884dac8b36072c Mon Sep 17 00:00:00 2001 From: Paul Wackerow 
<54227730+wackerow@users.noreply.github.com> Date: Fri, 12 Dec 2025 13:53:20 -0300 Subject: [PATCH 24/99] fix(i18n): sanitizer handles inline tags & restores blank lines Fixes two formatting issues in sanitized translations: 1. Opening tags with inline content (especially other tags) - Changed regex from [^\n<] to [^\n] to match ANY character after tag - Now catches: text - Previously missed when content started with < character 2. Missing blank lines after headers and block components - Added restoreBlankLinesFromEnglish() function - Compares translation structure with English source - Adds blank lines where English has them for readability - Preserves proper Markdown/MDX formatting conventions 3. Improved PR body formatting to list translated files These fixes prevent MDX parser errors like: 'Expected a closing tag for before the end of paragraph' --- src/scripts/i18n/main.ts | 2 +- src/scripts/i18n/post_import_sanitize.ts | 84 ++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 7f9580325a5..42e8fa4bc54 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -454,7 +454,7 @@ async function main() { } const langCodes = languagePairs.map((p) => p.internalLanguageCode).join(", ") - const prBody = `## Description\n\nThis PR contains automated ${aiModelName} translations from Crowdin\n\n### Translation Details\n\n- Files translated: ${fileIds.length}\n- Languages: ${langCodes}` + const prBody = `## Description\n\nThis PR contains automated ${aiModelName} translations from Crowdin\n\n### File translated\n\n${changedFiles.map(({ path }) => `- ${path}\n`)}\n### Languages translated\n\n- ${langCodes}` const pr = await postPullRequest(branch, config.baseBranch, prBody) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 3a9537ebd52..79d2e81f281 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ 
b/src/scripts/i18n/post_import_sanitize.ts @@ -157,6 +157,69 @@ function normalizeBlockHtmlLines(md: string): string { return md } +/** + * Restore blank lines after headers and block components by comparing + * with English source structure. This preserves readability and formatting. + */ +function restoreBlankLinesFromEnglish( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const translatedLines = translatedMd.split("\n") + const englishLines = englishMd.split("\n") + + let fixCount = 0 + const result: string[] = [] + + // Patterns that should have blank lines after them + const headerPattern = /^#{1,6}\s+/ + const blockComponentClosePattern = + /<\/(Alert|AlertContent|AlertDescription|Card|ExpandableCard|CardGrid|InfoGrid|ButtonLink|Tabs|TabItem|InfoBanner)>/ + + for (let i = 0; i < translatedLines.length; i++) { + const line = translatedLines[i] + result.push(line) + + // Check if this line should be followed by a blank line + const isHeader = headerPattern.test(line) + const isBlockClose = blockComponentClosePattern.test(line) + + if (isHeader || isBlockClose) { + const nextLine = translatedLines[i + 1] + const hasBlankAfter = nextLine === "" + + // Find corresponding line in English by matching pattern + let englishShouldHaveBlank = false + for (let j = 0; j < englishLines.length; j++) { + const englishLine = englishLines[j] + if (isHeader && headerPattern.test(englishLine)) { + // Headers should match by structure (level) + const transLevel = (line.match(/^#+/) || [""])[0].length + const engLevel = (englishLine.match(/^#+/) || [""])[0].length + if (transLevel === engLevel) { + englishShouldHaveBlank = englishLines[j + 1] === "" + break + } + } else if ( + isBlockClose && + blockComponentClosePattern.test(englishLine) + ) { + englishShouldHaveBlank = englishLines[j + 1] === "" + break + } + } + + // Add blank line if English has it but translation doesn't + if (englishShouldHaveBlank && !hasBlankAfter && nextLine !== 
undefined) { + result.push("") + fixCount++ + } + } + } + + return { content: result.join("\n"), fixCount } +} + /** * Fix block-level React components that have opening/closing tags inline with content. * MDX parser requires these tags to be on separate lines. @@ -193,8 +256,8 @@ function fixBlockComponentLineBreaks(md: string): { }) // Fix inline opening tags: content → \ncontent - // Only if there's actual content after the tag (not another tag or newline) - const inlineOpenRe = new RegExp(`(<${component}[^>]*>)([^\\n<])`, "g") + // Match any non-newline character after the tag (including other tags) + const inlineOpenRe = new RegExp(`(<${component}[^>]*>)([^\\n])`, "g") content = content.replace(inlineOpenRe, (_, tag, after) => { fixCount++ return `${tag}\n${after}` @@ -215,18 +278,20 @@ function processMarkdownFile( const issues: string[] = [] let content = providedContent || fs.readFileSync(mdPath, "utf8") + let englishMd: string | undefined + // Map translated path to English path: remove `/translations//` segment const parts = mdPath.split(path.sep) const idx = parts.lastIndexOf("translations") if (idx === -1 || idx + 2 >= parts.length) { - issues.push("No translations segment found; skipping header ID sync") + issues.push("No translations segment found; skipping formatting sync") } else { const englishPath = path.join( ...parts.slice(0, idx), ...parts.slice(idx + 2) // drop translations/ ) if (fs.existsSync(englishPath)) { - const englishMd = fs.readFileSync(englishPath, "utf8") + englishMd = fs.readFileSync(englishPath, "utf8") content = syncHeaderIdsWithEnglish(content, englishMd) } else { issues.push(`English source missing: ${path.relative(ROOT, englishPath)}`) @@ -244,6 +309,17 @@ function processMarkdownFile( content = normalizeBlockHtmlLines(content) + // Restore blank lines from English source (improves readability) + if (englishMd) { + const blankLineResult = restoreBlankLinesFromEnglish(content, englishMd) + content = blankLineResult.content + if 
(blankLineResult.fixCount > 0) { + issues.push( + `Restored ${blankLineResult.fixCount} blank lines from English` + ) + } + } + const fixed = before !== content // Only write to disk if no content was provided (legacy mode) if (fixed && !providedContent) { From ffe02a447d3c3c7dbb5fa25fcd009505b340af04 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 12 Dec 2025 14:26:54 -0300 Subject: [PATCH 25/99] fix(i18n): remove commas from PR file list Add .join('\n') to prevent array-to-string conversion from inserting commas between bullet points in the PR description. --- src/scripts/i18n/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 42e8fa4bc54..3efb03fcf25 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -454,7 +454,7 @@ async function main() { } const langCodes = languagePairs.map((p) => p.internalLanguageCode).join(", ") - const prBody = `## Description\n\nThis PR contains automated ${aiModelName} translations from Crowdin\n\n### File translated\n\n${changedFiles.map(({ path }) => `- ${path}\n`)}\n### Languages translated\n\n- ${langCodes}` + const prBody = `## Description\n\nThis PR contains automated ${aiModelName} translations from Crowdin\n\n### File${changedFiles.length > 1 ? "s" : ""} translated\n\n${changedFiles.map(({ path }) => `- ${path}`).join("\n")}\n\n### Languages translated\n\n- ${langCodes}` const pr = await postPullRequest(branch, config.baseBranch, prBody) From 135af193983bac9ad3d4b460ca76c6d07d53d106 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 12 Dec 2025 21:00:43 -0300 Subject: [PATCH 26/99] feat(i18n): enable JSX attribute translation with configurable update strategy - Add translateAttributes parser config for markdown files to enable translation of component attributes (title, description, alt, etc.) 
- Add UPDATE_OPTION workflow input with radio choices: - keep_translations_and_approvals (default, preserves existing work) - keep_translations (preserves translations only) - clear_translations_and_approvals (full reset) - Configure both file creation and update operations to use translateAttributes - Whitelist 12 human-readable attributes while excluding technical properties like emoji, eventCategory, href, etc. --- .github/workflows/crowdin-ai-import.yml | 10 +++++++ src/scripts/i18n/config.ts | 7 +++++ src/scripts/i18n/lib/crowdin/files.ts | 39 ++++++++++++++++++++++++- src/scripts/i18n/main.ts | 31 +++++++++++++++++++- 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index e347d3b3d3f..7795e72aa13 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -36,6 +36,15 @@ on: required: false default: "326942" type: string + update_option: + description: "How to handle existing translations when updating files" + required: false + default: "keep_translations_and_approvals" + type: choice + options: + - keep_translations_and_approvals + - keep_translations + - clear_translations_and_approvals verbose: description: "Enable verbose logging (default: false)" required: false @@ -74,5 +83,6 @@ jobs: PRETRANSLATE_TIMEOUT_MS: ${{ github.event.inputs.pretranslate_timeout_ms }} PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} + UPDATE_OPTION: ${{ github.event.inputs.update_option }} VERBOSE: ${{ github.event.inputs.verbose }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 62725015406..300fc1c561a 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -68,6 +68,12 @@ const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS const 
existingPreTranslationId = process.env.PRETRANSLATION_ID || "" +const updateOption = (process.env.UPDATE_OPTION || + "keep_translations_and_approvals") as + | "keep_translations_and_approvals" + | "keep_translations" + | "clear_translations_and_approvals" + const verbose = process.env.VERBOSE === "true" // Parse GitHub repository from env (format: "owner/repo") @@ -113,6 +119,7 @@ export const config = { pretranslateTimeoutMs, pretranslatePollBaseMs, existingPreTranslationId, + updateOption, verbose, } diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index 50330733e95..b78fbc84eee 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -11,6 +11,26 @@ import type { GitHubCrowdinFileMetadata, } from "../types" +/** + * JSX component attributes that should be translated in markdown files. + * These contain human-readable strings, as opposed to technical attributes + * like emoji, eventCategory, href, etc. + */ +const TRANSLATABLE_ATTRIBUTES = [ + "title", + "description", + "alt", + "label", + "aria-label", + "placeholder", + "buttonLabel", + "text", + "name", + "caption", + "contentPreview", + "location", +] + /** * Get all files in the Crowdin project */ @@ -377,6 +397,23 @@ export const postCrowdinFile = async ( `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` ) + // Configure parser options for markdown files + const isMarkdown = name.endsWith(".md") + const importOptions = isMarkdown + ? 
{ + translateAttributes: TRANSLATABLE_ATTRIBUTES, + } + : undefined + + const requestBody: Record = { + storageId, + name, + directoryId, + } + if (importOptions) { + requestBody.importOptions = importOptions + } + try { const res = await fetch(url.toString(), { method: "POST", @@ -385,7 +422,7 @@ export const postCrowdinFile = async ( "Content-Type": "application/json", Accept: "application/json", }, - body: JSON.stringify({ storageId, name, directoryId }), + body: JSON.stringify(requestBody), }) if (!res.ok) { diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 3efb03fcf25..90ea9ccfc52 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -181,6 +181,35 @@ async function main() { file["Crowdin-API-FileName"] ) + // Configure parser options for markdown files + const isMarkdown = file.filePath.endsWith(".md") + const importOptions = isMarkdown + ? { + translateAttributes: [ + "title", + "description", + "alt", + "label", + "aria-label", + "placeholder", + "buttonLabel", + "text", + "name", + "caption", + "contentPreview", + "location", + ], + } + : undefined + + const updateBody: Record = { + storageId: storageInfo.id, + } + if (importOptions) { + updateBody.updateOption = config.updateOption + updateBody.importOptions = importOptions + } + // Update the existing file using PUT /files/{fileId} const updateUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` const updateResp = await fetch(updateUrl, { @@ -189,7 +218,7 @@ async function main() { ...crowdinBearerHeaders, "Content-Type": "application/json", }, - body: JSON.stringify({ storageId: storageInfo.id }), + body: JSON.stringify(updateBody), }) if (!updateResp.ok) { From 153f8a42f5f5b3f07df1a2404f2c12e2dbe79dad Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 15 Dec 2025 08:52:32 -0300 Subject: [PATCH 27/99] patch: translateAttributes parser options --- 
src/scripts/i18n/lib/crowdin/files.ts | 83 ++++++++++++++++++--------- src/scripts/i18n/main.ts | 80 ++++++++++++++++---------- 2 files changed, 106 insertions(+), 57 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index b78fbc84eee..be1f356d727 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -15,21 +15,26 @@ import type { * JSX component attributes that should be translated in markdown files. * These contain human-readable strings, as opposed to technical attributes * like emoji, eventCategory, href, etc. + * + * Note: Crowdin's PATCH API only accepts a boolean flag (translateAttributes: true) + * to enable attribute translation. The actual whitelist may need to be configured + * separately via the Crowdin UI or a different API endpoint. */ -const TRANSLATABLE_ATTRIBUTES = [ - "title", - "description", - "alt", - "label", - "aria-label", - "placeholder", - "buttonLabel", - "text", - "name", - "caption", - "contentPreview", - "location", -] +// Keeping this for documentation purposes - may be used in future API updates +// const TRANSLATABLE_ATTRIBUTES = [ +// "title", +// "description", +// "alt", +// "label", +// "aria-label", +// "placeholder", +// "buttonLabel", +// "text", +// "name", +// "caption", +// "contentPreview", +// "location", +// ] /** * Get all files in the Crowdin project @@ -397,24 +402,14 @@ export const postCrowdinFile = async ( `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` ) - // Configure parser options for markdown files - const isMarkdown = name.endsWith(".md") - const importOptions = isMarkdown - ? 
{ - translateAttributes: TRANSLATABLE_ATTRIBUTES, - } - : undefined - - const requestBody: Record = { + const requestBody = { storageId, name, directoryId, } - if (importOptions) { - requestBody.importOptions = importOptions - } try { + // First, create the file const res = await fetch(url.toString(), { method: "POST", headers: { @@ -433,7 +428,41 @@ export const postCrowdinFile = async ( type JsonResponse = { data: CrowdinAddFileResponse } const json: JsonResponse = await res.json() - console.log("Updated file:", json.data) + console.log("Created file:", json.data) + + // Then, update parser options for markdown files using PATCH + const isMarkdown = name.endsWith(".md") + if (isMarkdown) { + const patchUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files/${json.data.id}` + const patchBody = [ + { + op: "replace", + path: "/parserOptions/translateAttributes", + value: true, + }, + ] + + const patchResp = await fetch(patchUrl, { + method: "PATCH", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(patchBody), + }) + + if (!patchResp.ok) { + const text = await patchResp.text().catch(() => "") + console.warn( + `[WARN] Failed to update parser options for file ${json.data.id}: ${text}` + ) + } else if (config.verbose) { + console.log( + `[DEBUG] Enabled translateAttributes for file ${json.data.id}` + ) + } + } + return json.data } catch (error) { console.error(error) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 90ea9ccfc52..4e09f2a141f 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -181,46 +181,66 @@ async function main() { file["Crowdin-API-FileName"] ) - // Configure parser options for markdown files - const isMarkdown = file.filePath.endsWith(".md") - const importOptions = isMarkdown - ? 
{ - translateAttributes: [ - "title", - "description", - "alt", - "label", - "aria-label", - "placeholder", - "buttonLabel", - "text", - "name", - "caption", - "contentPreview", - "location", - ], - } - : undefined - - const updateBody: Record = { + // First, update the file content using PUT + const putUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` + const putBody: Record = { storageId: storageInfo.id, - } - if (importOptions) { - updateBody.updateOption = config.updateOption - updateBody.importOptions = importOptions + updateOption: config.updateOption, } - // Update the existing file using PUT /files/{fileId} - const updateUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` - const updateResp = await fetch(updateUrl, { + const putResp = await fetch(putUrl, { method: "PUT", headers: { ...crowdinBearerHeaders, "Content-Type": "application/json", }, - body: JSON.stringify(updateBody), + body: JSON.stringify(putBody), }) + if (!putResp.ok) { + const text = await putResp.text().catch(() => "") + throw new Error( + `Failed to update Crowdin file ${foundFile.id} (${putResp.status}): ${text}` + ) + } + + // Then, update parser options using PATCH (for markdown files only) + const isMarkdown = file.filePath.endsWith(".md") + if (isMarkdown) { + const patchUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` + const patchBody = [ + { + op: "replace", + path: "/parserOptions/translateAttributes", + value: true, + }, + ] + + const patchResp = await fetch(patchUrl, { + method: "PATCH", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(patchBody), + }) + + if (!patchResp.ok) { + const text = await patchResp.text().catch(() => "") + console.warn( + `[WARN] Failed to update parser options for file ${foundFile.id}: ${text}` + ) + } else { + if (verbose) { + console.log( + `[DEBUG] Enabled translateAttributes 
for file ${foundFile.id}` + ) + } + } + } + + const updateResp = putResp + if (!updateResp.ok) { const text = await updateResp.text().catch(() => "") throw new Error( From 7c208fe97b94b28b59921f9a300f55c8735768c6 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:52:18 -0300 Subject: [PATCH 28/99] revert(i18n): remove parser option changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert recent attempts to configure Crowdin parser via API. - Remove workflow input and env plumbing for update_option - Drop PATCH /files/{id} for /parserOptions/translateAttributes - Simplify PUT updates to only set storageId - Rely on Crowdin UI-managed parser settings for now Context: PATCH path required 'parserOptions' which may not exist and 'translateAttributes' expects a boolean. Failed attempts indicate this is best managed in Crowdin UI or a different endpoint. We’ll coordinate with our Crowdin liaison and restore a minimal, stable flow in code.
--- .github/workflows/crowdin-ai-import.yml | 10 ----- src/scripts/i18n/config.ts | 7 ---- src/scripts/i18n/lib/crowdin/files.ts | 33 +-------------- src/scripts/i18n/main.ts | 54 ++++--------------------- 4 files changed, 9 insertions(+), 95 deletions(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 7795e72aa13..e347d3b3d3f 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -36,15 +36,6 @@ on: required: false default: "326942" type: string - update_option: - description: "How to handle existing translations when updating files" - required: false - default: "keep_translations_and_approvals" - type: choice - options: - - keep_translations_and_approvals - - keep_translations - - clear_translations_and_approvals verbose: description: "Enable verbose logging (default: false)" required: false @@ -83,6 +74,5 @@ jobs: PRETRANSLATE_TIMEOUT_MS: ${{ github.event.inputs.pretranslate_timeout_ms }} PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} - UPDATE_OPTION: ${{ github.event.inputs.update_option }} VERBOSE: ${{ github.event.inputs.verbose }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 300fc1c561a..62725015406 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -68,12 +68,6 @@ const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS const existingPreTranslationId = process.env.PRETRANSLATION_ID || "" -const updateOption = (process.env.UPDATE_OPTION || - "keep_translations_and_approvals") as - | "keep_translations_and_approvals" - | "keep_translations" - | "clear_translations_and_approvals" - const verbose = process.env.VERBOSE === "true" // Parse GitHub repository from env (format: "owner/repo") @@ -119,7 +113,6 @@ export const config = { 
pretranslateTimeoutMs, pretranslatePollBaseMs, existingPreTranslationId, - updateOption, verbose, } diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index be1f356d727..b42068a7d4e 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -430,38 +430,7 @@ export const postCrowdinFile = async ( const json: JsonResponse = await res.json() console.log("Created file:", json.data) - // Then, update parser options for markdown files using PATCH - const isMarkdown = name.endsWith(".md") - if (isMarkdown) { - const patchUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files/${json.data.id}` - const patchBody = [ - { - op: "replace", - path: "/parserOptions/translateAttributes", - value: true, - }, - ] - - const patchResp = await fetch(patchUrl, { - method: "PATCH", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(patchBody), - }) - - if (!patchResp.ok) { - const text = await patchResp.text().catch(() => "") - console.warn( - `[WARN] Failed to update parser options for file ${json.data.id}: ${text}` - ) - } else if (config.verbose) { - console.log( - `[DEBUG] Enabled translateAttributes for file ${json.data.id}` - ) - } - } + // Note: parser options are managed in Crowdin UI. No PATCH here. 
return json.data } catch (error) { diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 4e09f2a141f..8631f792cf7 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -181,66 +181,28 @@ async function main() { file["Crowdin-API-FileName"] ) - // First, update the file content using PUT - const putUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` - const putBody: Record = { + // Update the file content using PUT + const updateUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` + const updateBody: Record = { storageId: storageInfo.id, - updateOption: config.updateOption, } - const putResp = await fetch(putUrl, { + const updateResp = await fetch(updateUrl, { method: "PUT", headers: { ...crowdinBearerHeaders, "Content-Type": "application/json", }, - body: JSON.stringify(putBody), + body: JSON.stringify(updateBody), }) - if (!putResp.ok) { - const text = await putResp.text().catch(() => "") + if (!updateResp.ok) { + const text = await updateResp.text().catch(() => "") throw new Error( - `Failed to update Crowdin file ${foundFile.id} (${putResp.status}): ${text}` + `Failed to update Crowdin file ${foundFile.id} (${updateResp.status}): ${text}` ) } - // Then, update parser options using PATCH (for markdown files only) - const isMarkdown = file.filePath.endsWith(".md") - if (isMarkdown) { - const patchUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` - const patchBody = [ - { - op: "replace", - path: "/parserOptions/translateAttributes", - value: true, - }, - ] - - const patchResp = await fetch(patchUrl, { - method: "PATCH", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(patchBody), - }) - - if (!patchResp.ok) { - const text = await patchResp.text().catch(() => "") - console.warn( - `[WARN] Failed to update parser options for file ${foundFile.id}: ${text}` - ) - } 
else { - if (verbose) { - console.log( - `[DEBUG] Enabled translateAttributes for file ${foundFile.id}` - ) - } - } - } - - const updateResp = putResp - if (!updateResp.ok) { const text = await updateResp.text().catch(() => "") throw new Error( From b8841c430fb9946156dba3c1006535628e56866c Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:51:52 -0300 Subject: [PATCH 29/99] fix(i18n): list all translated files & diagnose AI model --- src/scripts/i18n/main.ts | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 8631f792cf7..d44e822413d 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -458,14 +458,40 @@ async function main() { Number(userId), config.preTranslatePromptId ) - aiModelName = promptInfo.aiModelId || "LLM" + if (promptInfo?.aiModelId) { + aiModelName = promptInfo.aiModelId + console.log(`✓ Fetched AI model: ${aiModelName}`) + } else { + console.warn("Prompt info missing aiModelId, using default") + } } catch (e) { console.warn("Could not fetch AI model name from Crowdin:", e) } + } else { + console.warn("I18N_CROWDIN_USER_ID not set, using default AI model name") } const langCodes = languagePairs.map((p) => p.internalLanguageCode).join(", ") - const prBody = `## Description\n\nThis PR contains automated ${aiModelName} translations from Crowdin\n\n### File${changedFiles.length > 1 ? "s" : ""} translated\n\n${changedFiles.map(({ path }) => `- ${path}`).join("\n")}\n\n### Languages translated\n\n- ${langCodes}` + + // Include both sanitized files and original committed files + const allChangedPaths = [ + ...new Set([ + ...changedFiles.map(({ path }) => path), + ...committedFiles.map(({ path }) => path), + ]), + ] + + const prBody = `## Description + +This PR contains automated ${aiModelName} translations from Crowdin + +### File${allChangedPaths.length > 1 ? 
"s" : ""} translated + +${allChangedPaths.map((path) => `- ${path}`).join("\n")} + +### Language${langCodes.length > 1 ? "s" : ""} translated + +- ${langCodes}` const pr = await postPullRequest(branch, config.baseBranch, prBody) From bc04456e9741453a41b1870fed5d687b8e491e9b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:16:34 -0300 Subject: [PATCH 30/99] chore(i18n): fetch crowdin user dynamically --- .github/workflows/crowdin-ai-import.yml | 1 - src/scripts/i18n/lib/crowdin/user.ts | 41 +++++++++++++++++ src/scripts/i18n/main.ts | 61 ++++++++++++------------- 3 files changed, 69 insertions(+), 34 deletions(-) create mode 100644 src/scripts/i18n/lib/crowdin/user.ts diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index e347d3b3d3f..771b39449f7 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -66,7 +66,6 @@ jobs: env: I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} - I18N_CROWDIN_USER_ID: ${{ secrets.I18N_CROWDIN_USER_ID }} PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} TARGET_PATH: ${{ github.event.inputs.target_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} diff --git a/src/scripts/i18n/lib/crowdin/user.ts b/src/scripts/i18n/lib/crowdin/user.ts new file mode 100644 index 00000000000..954630b6e40 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/user.ts @@ -0,0 +1,41 @@ +import { crowdinBearerHeaders } from "../../config" + +interface CrowdinUser { + id: number + username: string + email: string + emailVerified: boolean + fullName: string + avatarUrl: string + createdAt: string + lastSeen: string + twoFactor: string + timezone: string +} + +interface CrowdinUserResponse { + data: CrowdinUser +} + +/** + * Get the authenticated Crowdin user's information + * @returns The authenticated user's data + */ 
+export async function getCurrentUser(): Promise { + const url = "https://api.crowdin.com/api/v2/user" + + const response = await fetch(url, { + method: "GET", + headers: crowdinBearerHeaders, + }) + + if (!response.ok) { + const text = await response.text().catch(() => "") + throw new Error( + `Failed to fetch current user (${response.status}): ${text}` + ) + } + + const json = (await response.json()) as CrowdinUserResponse + return json.data +} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index d44e822413d..db6fe7309fc 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -18,6 +18,7 @@ import { postApplyPreTranslation, } from "./lib/crowdin/pre-translate" import { getPromptInfo, updatePromptFromFile } from "./lib/crowdin/prompt" +import { getCurrentUser } from "./lib/crowdin/user" import { postCreateBranchFrom } from "./lib/github/branches" import { getDestinationFromPath, putCommitFile } from "./lib/github/commits" import { @@ -116,22 +117,20 @@ async function main() { console.log(`\n========== Starting New Pre-Translation ==========`) // Ensure Crowdin AI prompt content is synced from repo canonical file - const userId = process.env.I18N_CROWDIN_USER_ID - if (userId) { - try { - const promptPath = path.join( - process.cwd(), - "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" - ) - await updatePromptFromFile( - Number(userId), - config.preTranslatePromptId, - promptPath - ) - console.log("✓ Updated Crowdin pre-translate prompt from repo file") - } catch (e) { - console.warn("Failed to update prompt, continuing:", e) - } + try { + const currentUser = await getCurrentUser() + const promptPath = path.join( + process.cwd(), + "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" + ) + await updatePromptFromFile( + currentUser.id, + config.preTranslatePromptId, + promptPath + ) + console.log("✓ Updated Crowdin pre-translate prompt from repo file") + } catch (e) { + console.warn("Failed to update prompt, continuing:", e) 
} // Fetch English files @@ -451,24 +450,20 @@ async function main() { // Fetch AI model name dynamically let aiModelName = "LLM" - const userId = process.env.I18N_CROWDIN_USER_ID - if (userId) { - try { - const promptInfo = await getPromptInfo( - Number(userId), - config.preTranslatePromptId - ) - if (promptInfo?.aiModelId) { - aiModelName = promptInfo.aiModelId - console.log(`✓ Fetched AI model: ${aiModelName}`) - } else { - console.warn("Prompt info missing aiModelId, using default") - } - } catch (e) { - console.warn("Could not fetch AI model name from Crowdin:", e) + try { + const currentUser = await getCurrentUser() + const promptInfo = await getPromptInfo( + currentUser.id, + config.preTranslatePromptId + ) + if (promptInfo?.aiModelId) { + aiModelName = promptInfo.aiModelId + console.log(`✓ Fetched AI model: ${aiModelName}`) + } else { + console.warn("Prompt info missing aiModelId, using default") } - } else { - console.warn("I18N_CROWDIN_USER_ID not set, using default AI model name") + } catch (e) { + console.warn("Could not fetch AI model name from Crowdin:", e) } const langCodes = languagePairs.map((p) => p.internalLanguageCode).join(", ") From 0f0a5c329ac54146b8c166121ab037a0df19cbc3 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:36:42 -0300 Subject: [PATCH 31/99] feat(i18n): add SKIP_PR_CREATION flag to workflow and gate PR step --- .github/workflows/crowdin-ai-import.yml | 6 ++++++ src/scripts/i18n/main.ts | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 771b39449f7..bc573639703 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -36,6 +36,11 @@ on: required: false default: "326942" type: string + skip_pr: + description: "If true, skip creating the Pull Request" + required: false + default: false + type: boolean verbose: 
description: "Enable verbose logging (default: false)" required: false @@ -74,4 +79,5 @@ jobs: PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} VERBOSE: ${{ github.event.inputs.verbose }} + SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index db6fe7309fc..eff81433727 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -445,6 +445,21 @@ async function main() { console.log("No sanitization changes needed") } + // Optionally skip PR creation based on workflow input + const skipPrCreation = ["1", "true", "yes", "on"].includes( + (process.env.SKIP_PR_CREATION || "").toLowerCase() + ) + if (skipPrCreation) { + console.log(`\n========== Skipping PR Creation ==========`) + console.log( + `Files have been committed to branch: ${branch}. No PR will be opened.` + ) + console.log( + `Set SKIP_PR_CREATION=false to enable automatic PR creation in the workflow.` + ) + return + } + // Create PR console.log(`\n========== Creating Pull Request ==========`) From 67c1c84a86bda94c7f4e1d233677df127a0555f1 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 15 Dec 2025 14:32:32 -0300 Subject: [PATCH 32/99] fix: stop workflow on invalid paths --- src/scripts/i18n/config.ts | 57 ++++++++++++++++++++++++++++++++++++++ src/scripts/i18n/main.ts | 9 +++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 62725015406..f588de16cdb 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -1,3 +1,6 @@ +import * as fs from "fs" +import * as path from "path" + import * as dotenv from "dotenv" import i18nConfig from "../../../i18n.config.json" @@ -116,6 +119,60 @@ export const config = { verbose, } +// Load excluded paths 
from canonical config file +function loadExcludedPaths(): string[] { + try { + const excludedPathsFile = path.join( + process.cwd(), + "src/scripts/i18n/config/excluded-paths.json" + ) + const raw = fs.readFileSync(excludedPathsFile, "utf8") + return JSON.parse(raw) as string[] + } catch { + return [] + } +} + +// Validation for target path +export function validateTargetPath(targetPath: string): void { + if (!targetPath) { + // Full translation mode is allowed + return + } + + // Disallowed: paths under public/content/translations (translated content) + if (targetPath.includes("public/content/translations")) { + throw new Error( + `[ERROR] Invalid target path: "${targetPath}"\n` + + `Target path cannot be under "public/content/translations" (this is translated content)\n` + + `Did you mean to target a file under "public/content" instead?` + ) + } + + // Disallowed: paths under src/intl other than src/intl/en + if ( + targetPath.startsWith("src/intl/") && + !targetPath.startsWith("src/intl/en") + ) { + throw new Error( + `[ERROR] Invalid target path: "${targetPath}"\n` + + `Target path under "src/intl/" can only be "src/intl/en" (English source)\n` + + `Other src/intl directories contain translated content` + ) + } + + // Disallowed: explicitly excluded paths from config file + const excludedPaths = loadExcludedPaths() + for (const excluded of excludedPaths) { + if (targetPath.includes(excluded)) { + throw new Error( + `[ERROR] Invalid target path: "${targetPath}"\n` + + `This path is in the excluded paths list (${excluded})` + ) + } + } +} + // Constants export const CROWDIN_API_BASE_URL = "https://api.crowdin.com/api/v2" export const MAX_STRINGS_PER_REQUEST = 500 diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index eff81433727..002d9245ac7 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -29,7 +29,7 @@ import { import { postPullRequest } from "./lib/github/pull-requests" import type { CrowdinFileData, 
CrowdinPreTranslateResponse } from "./lib/types" import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" -import { config, crowdinBearerHeaders } from "./config" +import { config, crowdinBearerHeaders, validateTargetPath } from "./config" import { runSanitizer } from "./post_import_sanitize" // Small helper for async waits @@ -78,6 +78,13 @@ async function main() { if (targetPath) { const isFile = targetPath.endsWith(".md") || targetPath.endsWith(".json") console.log(`Mode: ${isFile ? "Single file" : "Directory"} (${targetPath})`) + // Validate target path is in allowed location + try { + validateTargetPath(targetPath) + } catch (e) { + console.error(e instanceof Error ? e.message : String(e)) + process.exit(1) + } } else { console.log(`Mode: Full translation (all files)`) } From 72e87d3242824ff215070044666812731d28f51e Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:30:21 -0300 Subject: [PATCH 33/99] feat: add blank language input support - Add use_legacy_languages workflow input (default: false) - When target_languages is blank, translate to ALL languages - Legacy mode: uses i18n.config.json (~60+ languages) - Non-legacy mode: uses canonical-llm-language-list.json (25 languages) - Backward compatible with existing workflows --- .github/workflows/crowdin-ai-import.yml | 24 ++++++++----- src/scripts/i18n/config.ts | 45 +++++++++++++++++++++---- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index bc573639703..1c63d9895d1 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -3,24 +3,29 @@ name: Import Crowdin AI Translations on: workflow_dispatch: inputs: - pretranslation_id: - description: "Pre-translation ID to resume from (optional - leave empty to start new)" - required: false - type: string target_path: - description: "File or directory path 
to translate (optional - e.g., public/content/developers/index.md or public/content/developers)" + description: "File or directory path to translate (e.g., public/content/developers/index.md or public/content/developers or blank for all files)" required: false type: string target_languages: - description: "Comma-separated internal language codes (default: es)" + description: "Comma-separated internal language codes (blank for all locales)" required: false default: "es" type: string + use_legacy_languages: + description: "Use legacy locales i18n.config.json (else uses canonical-llm-language-list.json)" + required: false + default: false + type: boolean base_branch: - description: "Base branch to create PR against (default: dev)" + description: "Base branch to create PR against" required: false default: "dev" type: string + pretranslation_id: + description: "Pre-translation ID to resume from (leave empty to start new)" + required: false + type: string pretranslate_timeout_ms: description: "Max ms to wait for pre-translate (default: 21600000 ~6h)" required: false @@ -37,12 +42,12 @@ on: default: "326942" type: string skip_pr: - description: "If true, skip creating the Pull Request" + description: "Skip PR creation?" required: false default: false type: boolean verbose: - description: "Enable verbose logging (default: false)" + description: "Enable verbose logging?" 
required: false default: "false" type: boolean @@ -74,6 +79,7 @@ jobs: PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} TARGET_PATH: ${{ github.event.inputs.target_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} + USE_LEGACY_LANGUAGES: ${{ github.event.inputs.use_legacy_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} PRETRANSLATE_TIMEOUT_MS: ${{ github.event.inputs.pretranslate_timeout_ms }} PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index f588de16cdb..16522de2162 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -5,6 +5,7 @@ import * as dotenv from "dotenv" import i18nConfig from "../../../i18n.config.json" +import canonicalLanguageList from "./config/canonical-llm-language-list.json" import { mapInternalCodeToCrowdin } from "./lib/utils/mapping" dotenv.config({ path: ".env.local" }) @@ -48,14 +49,37 @@ export const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } // Parse environment variables with defaults // Accept internal codes (e.g., "es") and convert to Crowdin codes (e.g., "es-EM") -const targetLanguagesInput = process.env.TARGET_LANGUAGES - ? process.env.TARGET_LANGUAGES.split(",").map((lang) => lang.trim()) - : ["es"] - -const targetLanguages = targetLanguagesInput.map((code) => - mapInternalCodeToCrowdin(code) +const useLegacyLanguages = ["1", "true", "yes", "on"].includes( + (process.env.USE_LEGACY_LANGUAGES || "").toLowerCase() ) +const targetLanguagesInput = process.env.TARGET_LANGUAGES + ? 
process.env.TARGET_LANGUAGES.split(",") + .map((lang) => lang.trim()) + .filter(Boolean) + : [] + +// If no target languages specified, use all languages from appropriate config +let targetLanguages: string[] +if (targetLanguagesInput.length === 0) { + if (useLegacyLanguages) { + // Use i18n.config.json, excluding 'en' + targetLanguages = i18nConfig + .map(({ code }) => code) + .filter((code) => code !== "en") + .map((code) => mapInternalCodeToCrowdin(code)) + } else { + // Use canonical-llm-language-list.json + targetLanguages = canonicalLanguageList + .map(({ code }) => code) + .map((code) => mapInternalCodeToCrowdin(code)) + } +} else { + targetLanguages = targetLanguagesInput.map((code) => + mapInternalCodeToCrowdin(code) + ) +} + const baseBranch = process.env.BASE_BRANCH || "dev" const targetPath = process.env.TARGET_PATH || "" @@ -81,11 +105,12 @@ const [ghOrganization, ghRepo] = githubRepo.split("/") if (verbose) { console.log("[DEBUG] Configuration:") console.log( - `[DEBUG] - Target languages (internal): ${targetLanguagesInput.join(", ")}` + `[DEBUG] - Target languages (internal): ${targetLanguagesInput.length ? targetLanguagesInput.join(", ") : "ALL"}` ) console.log( `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}` ) + console.log(`[DEBUG] - Use legacy languages: ${useLegacyLanguages}`) console.log(`[DEBUG] - Base branch: ${baseBranch}`) console.log( `[DEBUG] - Target path: ${targetPath || "none (full translation)"}` @@ -111,6 +136,12 @@ export const config = { process.env.PRE_TRANSLATE_PROMPT_ID || "326942" ), allCrowdinCodes: targetLanguages, + allInternalCodes: targetLanguagesInput.length + ? targetLanguagesInput + : useLegacyLanguages + ? 
i18nConfig.map(({ code }) => code).filter((code) => code !== "en") + : canonicalLanguageList.map(({ code }) => code), + useLegacyLanguages, baseBranch, targetPath, pretranslateTimeoutMs, From 0ddcad3ae1303b571d8afb324ba45876af2e7ef0 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:31:32 -0300 Subject: [PATCH 34/99] feat: add dynamic PR title formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add title parameter to postPullRequest function - PR titles now adjust based on language count: - ≤3 locales: shows comma-separated codes - >3 locales (not all): shows '(many languages)' - All languages: shows '(all languages)' --- src/scripts/i18n/lib/github/pull-requests.ts | 37 +++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/lib/github/pull-requests.ts b/src/scripts/i18n/lib/github/pull-requests.ts index d62723db136..18be0aa47d1 100644 --- a/src/scripts/i18n/lib/github/pull-requests.ts +++ b/src/scripts/i18n/lib/github/pull-requests.ts @@ -8,12 +8,14 @@ import { fetchWithRetry } from "../utils/fetch" * * @param head - The head branch (source of changes) * @param base - The base branch (target for merge, defaults to config.baseBranch) + * @param title - PR title * @param bodyText - Optional PR description text * @returns The created pull request object */ export const postPullRequest = async ( head: string, base = config.baseBranch, + title: string, bodyText?: string ) => { const url = new URL( @@ -21,7 +23,7 @@ export const postPullRequest = async ( ) const body = { - title: "i18n: automated Crowdin translation import", + title, head, base, body: bodyText || "Automated Crowdin translation import", @@ -45,3 +47,36 @@ export const postPullRequest = async ( const json = await res.json() return json } + +/** + * Post a comment on a pull request + * + * @param prNumber - The PR number + * @param commentBody - The 
comment body text + * @returns The created comment object + */ +export const postPullRequestComment = async ( + prNumber: number, + commentBody: string +) => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/issues/${prNumber}/comments` + ) + + const res = await fetchWithRetry(url.toString(), { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ body: commentBody }), + }) + + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`Failed to post PR comment (${res.status}): ${body}`) + } + + const json = await res.json() + return json +} From efe6e54955c719625c8366f33c9c2f08a48e9cb5 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:33:06 -0300 Subject: [PATCH 35/99] feat: add syntax validation & PR formatting - Add syntax tree validation for JSON and Markdown files - Validate JSON: key count, missing/extra keys, order - Validate Markdown: heading count, levels, custom IDs - Post PR comment if validation issues found (runs after PR creation) - Polish PR body formatting: - Remove bullet from language codes line - Separate JSON/Markdown file sections - Simplify paths (remove locale-specific prefixes) - Generate dynamic PR titles based on language count --- .../i18n/lib/validation/syntax-tree.ts | 227 ++++++++++++++++++ src/scripts/i18n/main.ts | 182 ++++++++++++-- 2 files changed, 391 insertions(+), 18 deletions(-) create mode 100644 src/scripts/i18n/lib/validation/syntax-tree.ts diff --git a/src/scripts/i18n/lib/validation/syntax-tree.ts b/src/scripts/i18n/lib/validation/syntax-tree.ts new file mode 100644 index 00000000000..b1a671784f0 --- /dev/null +++ b/src/scripts/i18n/lib/validation/syntax-tree.ts @@ -0,0 +1,227 @@ +// Syntax tree validation for JSON and Markdown files + +interface JsonValidationResult { + isValid: boolean + expectedKeyCount: number + 
actualKeyCount: number + missingKeys: string[] + extraKeys: string[] + orderMatches: boolean +} + +interface MarkdownValidationResult { + isValid: boolean + expectedHeadingCount: number + actualHeadingCount: number + mismatchedHeadings: Array<{ + level: number + expectedId: string + actualId: string | null + line: number + }> +} + +/** + * Extract JSON keys in order from a JSON string + */ +function extractJsonKeys(jsonContent: string): string[] { + try { + const obj = JSON.parse(jsonContent) + if (typeof obj !== "object" || obj === null || Array.isArray(obj)) { + return [] + } + return Object.keys(obj) + } catch { + return [] + } +} + +/** + * Validate JSON file structure against English source + */ +export function validateJsonStructure( + englishContent: string, + translatedContent: string +): JsonValidationResult { + const englishKeys = extractJsonKeys(englishContent) + const translatedKeys = extractJsonKeys(translatedContent) + + const englishKeySet = new Set(englishKeys) + const translatedKeySet = new Set(translatedKeys) + + const missingKeys = englishKeys.filter((key) => !translatedKeySet.has(key)) + const extraKeys = translatedKeys.filter((key) => !englishKeySet.has(key)) + + const orderMatches = + JSON.stringify(englishKeys) === JSON.stringify(translatedKeys) + + return { + isValid: missingKeys.length === 0 && extraKeys.length === 0, + expectedKeyCount: englishKeys.length, + actualKeyCount: translatedKeys.length, + missingKeys, + extraKeys, + orderMatches, + } +} + +/** + * Extract markdown headings with their custom IDs + */ +function extractMarkdownHeadings( + content: string +): Array<{ level: number; id: string | null; line: number }> { + const lines = content.split("\n") + const headings: Array<{ level: number; id: string | null; line: number }> = [] + + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + const headingMatch = line.match(/^(#{1,6})\s+(.+)$/) + + if (headingMatch) { + const level = headingMatch[1].length + const 
headingText = headingMatch[2] + + // Extract custom ID if present (e.g., "Heading text {#custom-id}") + const idMatch = headingText.match(/\{#([^}]+)\}\s*$/) + const customId = idMatch ? idMatch[1] : null + + headings.push({ + level, + id: customId, + line: i + 1, + }) + } + } + + return headings +} + +/** + * Validate markdown heading structure against English source + */ +export function validateMarkdownStructure( + englishContent: string, + translatedContent: string +): MarkdownValidationResult { + const englishHeadings = extractMarkdownHeadings(englishContent) + const translatedHeadings = extractMarkdownHeadings(translatedContent) + + const mismatchedHeadings: Array<{ + level: number + expectedId: string + actualId: string | null + line: number + }> = [] + + // Check if heading counts match + if (englishHeadings.length !== translatedHeadings.length) { + return { + isValid: false, + expectedHeadingCount: englishHeadings.length, + actualHeadingCount: translatedHeadings.length, + mismatchedHeadings: [], + } + } + + // Compare each heading + for (let i = 0; i < englishHeadings.length; i++) { + const englishHeading = englishHeadings[i] + const translatedHeading = translatedHeadings[i] + + // Check if level matches + if (englishHeading.level !== translatedHeading.level) { + mismatchedHeadings.push({ + level: translatedHeading.level, + expectedId: englishHeading.id || "(no id)", + actualId: translatedHeading.id, + line: translatedHeading.line, + }) + continue + } + + // Check if custom IDs match (if present in English) + if (englishHeading.id && englishHeading.id !== translatedHeading.id) { + mismatchedHeadings.push({ + level: translatedHeading.level, + expectedId: englishHeading.id, + actualId: translatedHeading.id, + line: translatedHeading.line, + }) + } + } + + return { + isValid: mismatchedHeadings.length === 0, + expectedHeadingCount: englishHeadings.length, + actualHeadingCount: translatedHeadings.length, + mismatchedHeadings, + } +} + +/** + * Format 
validation results into a markdown comment + */ +export function formatValidationComment( + validationResults: Array<{ + path: string + type: "json" | "markdown" + result: JsonValidationResult | MarkdownValidationResult + }> +): string | null { + const issues = validationResults.filter((v) => !v.result.isValid) + + if (issues.length === 0) { + return null + } + + let comment = "## ⚠️ Syntax Tree Validation Issues\n\n" + comment += + "The following files have structural differences from their English source:\n\n" + + for (const issue of issues) { + comment += `### \`${issue.path}\`\n\n` + + if (issue.type === "json") { + const result = issue.result as JsonValidationResult + comment += `**JSON Structure Issues:**\n` + comment += `- Expected keys: ${result.expectedKeyCount}\n` + comment += `- Actual keys: ${result.actualKeyCount}\n` + + if (result.missingKeys.length > 0) { + comment += `- Missing keys: ${result.missingKeys.map((k) => `\`${k}\``).join(", ")}\n` + } + + if (result.extraKeys.length > 0) { + comment += `- Extra keys: ${result.extraKeys.map((k) => `\`${k}\``).join(", ")}\n` + } + + if ( + !result.orderMatches && + result.missingKeys.length === 0 && + result.extraKeys.length === 0 + ) { + comment += `- ⚠️ Key order differs from English version\n` + } + } else { + const result = issue.result as MarkdownValidationResult + comment += `**Markdown Structure Issues:**\n` + comment += `- Expected headings: ${result.expectedHeadingCount}\n` + comment += `- Actual headings: ${result.actualHeadingCount}\n` + + if (result.mismatchedHeadings.length > 0) { + comment += `\n**Mismatched Headings:**\n` + for (const mismatch of result.mismatchedHeadings) { + comment += `- Line ${mismatch.line}: Expected ID \`${mismatch.expectedId}\`, found \`${mismatch.actualId || "(none)"}\`\n` + } + } + } + + comment += `\n` + } + + comment += `\n---\n` + comment += `*This validation check ensures translated files maintain the same structure as the English source.*` + + return comment +} 
diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 002d9245ac7..7dd92f3f28a 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -26,9 +26,17 @@ import { getAllEnglishFiles, getFileMetadata, } from "./lib/github/files" -import { postPullRequest } from "./lib/github/pull-requests" +import { + postPullRequest, + postPullRequestComment, +} from "./lib/github/pull-requests" import type { CrowdinFileData, CrowdinPreTranslateResponse } from "./lib/types" import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" +import { + formatValidationComment, + validateJsonStructure, + validateMarkdownStructure, +} from "./lib/validation/syntax-tree" import { config, crowdinBearerHeaders, validateTargetPath } from "./config" import { runSanitizer } from "./post_import_sanitize" @@ -488,34 +496,172 @@ async function main() { console.warn("Could not fetch AI model name from Crowdin:", e) } - const langCodes = languagePairs.map((p) => p.internalLanguageCode).join(", ") + const langCodes = languagePairs.map((p) => p.internalLanguageCode) + + // Determine all language codes based on config (for title comparison) + const allPossibleLanguages = config.allInternalCodes + const isAllLanguages = langCodes.length === allPossibleLanguages.length + + // Build PR title + let prTitle = "i18n: automated Crowdin translation import" + if (langCodes.length <= 3) { + prTitle += ` (${langCodes.join(", ")})` + } else if (isAllLanguages) { + prTitle += ` (all languages)` + } else { + prTitle += ` (many languages)` + } // Include both sanitized files and original committed files - const allChangedPaths = [ - ...new Set([ - ...changedFiles.map(({ path }) => path), - ...committedFiles.map(({ path }) => path), - ]), - ] + const allChangedPathsSet = new Set([ + ...changedFiles.map(({ path }) => path), + ...committedFiles.map(({ path }) => path), + ]) + const allChangedPaths = Array.from(allChangedPathsSet) + + // Separate JSON and Markdown files + const jsonFiles = 
allChangedPaths.filter((path) => + path.toLowerCase().endsWith(".json") + ) + const markdownFiles = allChangedPaths.filter((path) => + path.toLowerCase().endsWith(".md") + ) + + // Build PR body + let prBody = `## Description\n\n` + prBody += `This PR contains automated ${aiModelName} translations from Crowdin\n\n` + + // Language section + prBody += `### Languages translated\n\n` + prBody += `${langCodes.join(", ")}\n\n` + + // Files section + if (jsonFiles.length > 0) { + prBody += `#### JSON changes (\`src/intl/{locale}/\`)\n\n` + for (const path of jsonFiles) { + // Remove src/intl/{locale}/ prefix + const simplifiedPath = path.replace(/^src\/intl\/[^/]+\//, "") + prBody += `- ${simplifiedPath}\n` + } + prBody += `\n` + } - const prBody = `## Description + if (markdownFiles.length > 0) { + prBody += `#### Markdown changes (\`public/content/translations/{locale}/\`)\n\n` + for (const path of markdownFiles) { + // Remove public/content/translations/{locale}/ prefix + const simplifiedPath = path.replace( + /^public\/content\/translations\/[^/]+\//, + "" + ) + prBody += `- ${simplifiedPath}\n` + } + prBody += `\n` + } -This PR contains automated ${aiModelName} translations from Crowdin + const pr = await postPullRequest(branch, config.baseBranch, prTitle, prBody) -### File${allChangedPaths.length > 1 ? 
"s" : ""} translated + console.log(`\n✓ Pull Request created: ${pr.html_url}`) + console.log(`PR Number: #${pr.number}`) -${allChangedPaths.map((path) => `- ${path}`).join("\n")} + // Run syntax tree validation + console.log(`\n========== Running Syntax Tree Validation ==========`) + const validationResults: Array<{ + path: string + type: "json" | "markdown" + result: unknown + }> = [] + + for (const file of committedFiles) { + const isJson = file.path.toLowerCase().endsWith(".json") + const isMarkdown = file.path.toLowerCase().endsWith(".md") + + if (!isJson && !isMarkdown) continue + + // Find the corresponding English file + let englishContent: string | null = null + + // Determine the English source path + if (isJson) { + // Extract the file name from the destination path + const match = file.path.match(/src\/intl\/[^/]+\/(.+)$/) + if (match) { + const fileName = match[1] + // Find the English buffer from our tracked files + for (const [fileId, buffer] of Object.entries(englishBuffers)) { + const crowdinPath = fileIdToPathMapping[Number(fileId)] + if (crowdinPath && crowdinPath.includes(fileName)) { + englishContent = buffer.toString("utf8") + break + } + } + } + } else if (isMarkdown) { + // Extract the relative path from translations + const match = file.path.match( + /public\/content\/translations\/[^/]+\/(.+)$/ + ) + if (match) { + const relPath = match[1] + // Find the English buffer + for (const [fileId, buffer] of Object.entries(englishBuffers)) { + const crowdinPath = fileIdToPathMapping[Number(fileId)] + if (crowdinPath && crowdinPath.includes(relPath)) { + englishContent = buffer.toString("utf8") + break + } + } + } + } -### Language${langCodes.length > 1 ? 
"s" : ""} translated + if (!englishContent) { + if (verbose) { + console.warn(`[DEBUG] Could not find English source for ${file.path}`) + } + continue + } -- ${langCodes}` + // Validate structure + if (isJson) { + const result = validateJsonStructure(englishContent, file.content) + validationResults.push({ + path: file.path, + type: "json", + result, + }) + if (!result.isValid && verbose) { + console.log(`[DEBUG] JSON validation failed for ${file.path}`) + } + } else if (isMarkdown) { + const result = validateMarkdownStructure(englishContent, file.content) + validationResults.push({ + path: file.path, + type: "markdown", + result, + }) + if (!result.isValid && verbose) { + console.log(`[DEBUG] Markdown validation failed for ${file.path}`) + } + } + } - const pr = await postPullRequest(branch, config.baseBranch, prBody) + // Post validation comment if there are issues + const validationComment = formatValidationComment(validationResults) + if (validationComment) { + console.log(`\n⚠️ Syntax validation issues found, posting comment...`) + try { + await postPullRequestComment(pr.number, validationComment) + console.log(`✓ Posted validation comment to PR`) + } catch (e) { + console.warn(`Failed to post validation comment:`, e) + } + } else { + console.log(`✓ All files passed syntax tree validation`) + } console.log(`\n========== SUCCESS ==========`) - console.log(`Pull Request created: ${pr.html_url}`) - console.log(`PR Number: #${pr.number}`) - console.log(`Languages: ${langCodes}`) + console.log(`Pull Request: ${pr.html_url}`) + console.log(`Languages: ${langCodes.join(", ")}`) console.log(`Files: ${fileIds.length}`) } From 2a9433ac93ae42ada9bfc8a689a3b6282ade89b8 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:53:40 -0300 Subject: [PATCH 36/99] refactor: add workflow types and shared utilities - Add TypeScript interfaces for workflow phases - Add shared utilities (delay, logSection, logSubsection) - 
Foundation for extracting workflow modules from main.ts --- src/scripts/i18n/lib/workflows/types.ts | 64 +++++++++++++++++++++++++ src/scripts/i18n/lib/workflows/utils.ts | 21 ++++++++ 2 files changed, 85 insertions(+) create mode 100644 src/scripts/i18n/lib/workflows/types.ts create mode 100644 src/scripts/i18n/lib/workflows/utils.ts diff --git a/src/scripts/i18n/lib/workflows/types.ts b/src/scripts/i18n/lib/workflows/types.ts new file mode 100644 index 00000000000..0ae96521d62 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/types.ts @@ -0,0 +1,64 @@ +// Types for i18n workflow phases + +import type { CrowdinFileData, CrowdinPreTranslateResponse } from "../types" + +/** + * Shared context passed between workflow phases + */ +export interface WorkflowContext { + crowdinProjectFiles: CrowdinFileData[] + fileIdsSet: Set + processedFileIdToPath: Record + englishBuffers: Record +} + +/** + * Result of file preparation phase + */ +export interface FilePreparationResult { + fileIdsSet: Set + processedFileIdToPath: Record + englishBuffers: Record +} + +/** + * File committed to GitHub branch + */ +export interface CommittedFile { + path: string + content: string +} + +/** + * Language pair mapping + */ +export interface LanguagePair { + crowdinId: string + internalLanguageCode: string +} + +/** + * Result of translation download phase + */ +export interface TranslationDownloadResult { + branch: string + committedFiles: CommittedFile[] + languagePairs: LanguagePair[] + fileIdToPathMapping: Record +} + +/** + * Pull request data + */ +export interface PullRequest { + html_url: string + number: number +} + +/** + * Pre-translation job result + */ +export interface PreTranslationResult { + response: CrowdinPreTranslateResponse + fileIdToPathMapping: Record +} diff --git a/src/scripts/i18n/lib/workflows/utils.ts b/src/scripts/i18n/lib/workflows/utils.ts new file mode 100644 index 00000000000..e6776676704 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/utils.ts @@ -0,0 
+1,21 @@ +// Common utilities for i18n workflows + +/** + * Delay execution for specified milliseconds + */ +export const delay = (ms: number): Promise => + new Promise((resolve) => setTimeout(resolve, ms)) + +/** + * Log a section header with consistent formatting + */ +export function logSection(title: string): void { + console.log(`\n========== ${title} ==========`) +} + +/** + * Log a subsection with lighter formatting + */ +export function logSubsection(title: string): void { + console.log(`\n--- ${title} ---`) +} From 5cfef56605631cdabce52465a1d8be5823de691f Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:53:59 -0300 Subject: [PATCH 37/99] refactor: extract initialization workflow - Extract Phase 1 (initialization) from main.ts - Validate environment variables and load config - Return WorkflowContext with Crowdin branches --- src/scripts/i18n/lib/workflows/initialize.ts | 43 ++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/scripts/i18n/lib/workflows/initialize.ts diff --git a/src/scripts/i18n/lib/workflows/initialize.ts b/src/scripts/i18n/lib/workflows/initialize.ts new file mode 100644 index 00000000000..873664d1050 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/initialize.ts @@ -0,0 +1,43 @@ +// Workflow initialization phase + +import { config, validateTargetPath } from "../../config" +import { getCrowdinProjectFiles } from "../crowdin/files" + +import type { WorkflowContext } from "./types" +import { logSection } from "./utils" + +/** + * Initialize workflow: validate config, log settings, fetch Crowdin state + */ +export async function initializeWorkflow(): Promise { + const { targetPath } = config + + logSection("Crowdin AI Translation Import") + console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) + + if (targetPath) { + const isFile = targetPath.endsWith(".md") || targetPath.endsWith(".json") + console.log(`Mode: ${isFile ? 
"Single file" : "Directory"} (${targetPath})`) + + // Validate target path is in allowed location + try { + validateTargetPath(targetPath) + } catch (e) { + console.error(e instanceof Error ? e.message : String(e)) + process.exit(1) + } + } else { + console.log(`Mode: Full translation (all files)`) + } + + // Fetch Crowdin project state + const crowdinProjectFiles = await getCrowdinProjectFiles() + + // Initialize shared state + return { + crowdinProjectFiles, + fileIdsSet: new Set(), + processedFileIdToPath: {}, + englishBuffers: {}, + } +} From 671421780e4c8baf73b52d0e1d2d5fed557bd966 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:54:19 -0300 Subject: [PATCH 38/99] refactor: extract file preparation workflow - Extract Phase 2 (file upload/update) from main.ts - Upload or update English files to Crowdin - Include 10-second delays required by Crowdin API --- .../i18n/lib/workflows/file-preparation.ts | 219 ++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 src/scripts/i18n/lib/workflows/file-preparation.ts diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts new file mode 100644 index 00000000000..1c08a94c085 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -0,0 +1,219 @@ +// File preparation workflow phase + +import * as path from "path" + +import { config, crowdinBearerHeaders } from "../../config" +import { + findCrowdinFile, + postCrowdinFile, + postFileToStorage, + unhideStringsInFile, +} from "../crowdin/files" +import { updatePromptFromFile } from "../crowdin/prompt" +import { getCurrentUser } from "../crowdin/user" +import { + downloadGitHubFile, + getAllEnglishFiles, + getFileMetadata, +} from "../github/files" +import type { CrowdinFileData } from "../types" + +import type { FilePreparationResult, WorkflowContext } from "./types" +import { delay, logSection } from "./utils" + 
+/** + * Update existing file in Crowdin with latest English content + */ +async function updateCrowdinFile( + file: { + filePath: string + download_url: string + "Crowdin-API-FileName": string + }, + foundFile: CrowdinFileData, + verbose: boolean +): Promise<{ fileId: number; path: string; buffer: Buffer }> { + console.log( + `Updating existing file in Crowdin: ${file.filePath} (ID: ${foundFile.id})` + ) + + const fileBuffer = await downloadGitHubFile(file.download_url) + const storageInfo = await postFileToStorage( + fileBuffer, + file["Crowdin-API-FileName"] + ) + + // Update the file content using PUT + const updateUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` + const updateBody = { storageId: storageInfo.id } + + const updateResp = await fetch(updateUrl, { + method: "PUT", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(updateBody), + }) + + if (!updateResp.ok) { + const text = await updateResp.text().catch(() => "") + throw new Error( + `Failed to update Crowdin file ${foundFile.id} (${updateResp.status}): ${text}` + ) + } + + console.log(`✓ Updated Crowdin file (ID: ${foundFile.id})`) + + // Wait for file parsing after update + const delayMs = 10000 + if (verbose) { + console.log( + `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to re-parse updated file...` + ) + } + await delay(delayMs) + + return { + fileId: foundFile.id, + path: foundFile.path, + buffer: fileBuffer, + } +} + +/** + * Create new file in Crowdin + */ +async function createCrowdinFile( + file: { + filePath: string + download_url: string + "Crowdin-API-FileName": string + }, + verbose: boolean +): Promise<{ fileId: number; path: string; buffer: Buffer }> { + console.log(`Creating new file in Crowdin: ${file.filePath}`) + + const fileBuffer = await downloadGitHubFile(file.download_url) + const storageInfo = await postFileToStorage( + fileBuffer, + file["Crowdin-API-FileName"] + ) + + // 
Derive full parent directory path (exclude filename) + const parts = file.filePath.split("/").filter(Boolean) + parts.pop() // remove filename + const parentDirPath = parts.join("/") || "/" + + const crowdinFileResponse = await postCrowdinFile( + storageInfo.id, + file["Crowdin-API-FileName"], + parentDirPath + ) + + console.log(`✓ Created new Crowdin file (ID: ${crowdinFileResponse.id})`) + + // Wait for new file parsing + const delayMs = 10000 + if (verbose) { + console.log( + `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to parse new file...` + ) + } + await delay(delayMs) + + return { + fileId: crowdinFileResponse.id, + path: crowdinFileResponse.path, + buffer: fileBuffer, + } +} + +/** + * Upload/update English files to Crowdin and prepare for translation + */ +export async function prepareEnglishFiles( + context: WorkflowContext +): Promise { + const { verbose } = config + const { + crowdinProjectFiles, + fileIdsSet, + processedFileIdToPath, + englishBuffers, + } = context + + logSection("Starting New Pre-Translation") + + // Ensure Crowdin AI prompt content is synced from repo canonical file + try { + const currentUser = await getCurrentUser() + const promptPath = path.join( + process.cwd(), + "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" + ) + await updatePromptFromFile( + currentUser.id, + config.preTranslatePromptId, + promptPath + ) + console.log("✓ Updated Crowdin pre-translate prompt from repo file") + } catch (e) { + console.warn("Failed to update prompt, continuing:", e) + } + + // Fetch English files + const allEnglishFiles = await getAllEnglishFiles() + + if (!allEnglishFiles.length) { + console.log("No files to translate, exiting") + process.exit(0) + } + + if (verbose) { + console.log(`[DEBUG] Found ${allEnglishFiles.length} English files`) + console.log( + `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` + ) + } + + const fileMetadata = await getFileMetadata(allEnglishFiles) + + // Iterate through each file and 
upload/update + for (const file of fileMetadata) { + if (verbose) { + console.log(`[DEBUG] Processing file: ${file.filePath}`) + } + + let foundFile: CrowdinFileData | undefined + try { + foundFile = findCrowdinFile(file, crowdinProjectFiles) + } catch { + if (verbose) { + console.log("File not found in Crowdin, will add new file") + } + } + + const result = foundFile + ? await updateCrowdinFile(file, foundFile, verbose) + : await createCrowdinFile(file, verbose) + + fileIdsSet.add(result.fileId) + if (result.path) { + processedFileIdToPath[result.fileId] = result.path + } + englishBuffers[result.fileId] = result.buffer + } + + // Unhide any hidden/duplicate strings before pre-translation + logSection(`Unhiding Strings in ${fileIdsSet.size} Files`) + for (const fileId of Array.from(fileIdsSet)) { + await unhideStringsInFile(fileId) + } + + return { + fileIdsSet, + processedFileIdToPath, + englishBuffers, + } +} From 3317ab398fb218556857bee050a71a06c15712a4 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:54:34 -0300 Subject: [PATCH 39/99] refactor: extract pre-translation workflow - Extract Phase 3 (pre-translation) from main.ts - Resume existing or start new pre-translation - Handle artifact file for job tracking --- .../i18n/lib/workflows/pre-translation.ts | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 src/scripts/i18n/lib/workflows/pre-translation.ts diff --git a/src/scripts/i18n/lib/workflows/pre-translation.ts b/src/scripts/i18n/lib/workflows/pre-translation.ts new file mode 100644 index 00000000000..2b76b756414 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/pre-translation.ts @@ -0,0 +1,165 @@ +// Pre-translation workflow phase + +import * as fs from "fs" +import * as path from "path" + +import { config } from "../../config" +import { + awaitPreTranslationCompleted, + getPreTranslationStatus, + postApplyPreTranslation, +} from "../crowdin/pre-translate" 
+import type { CrowdinPreTranslateResponse } from "../types" + +import type { PreTranslationResult, WorkflowContext } from "./types" +import { logSection } from "./utils" + +/** + * Write pre-translation artifact for GitHub Actions + */ +function writePreTranslationArtifact( + preTranslationId: string, + fileCount: number, + languages: string[] +): void { + const artifactData = { + preTranslationId, + timestamp: new Date().toISOString(), + fileCount, + languages, + targetPath: config.targetPath || null, + } + + const artifactDir = path.join(process.cwd(), "artifacts") + if (!fs.existsSync(artifactDir)) { + fs.mkdirSync(artifactDir, { recursive: true }) + } + + const artifactPath = path.join(artifactDir, "pre-translation-info.json") + fs.writeFileSync(artifactPath, JSON.stringify(artifactData, null, 2)) + + console.log(`\n[ARTIFACT] Pre-translation info written to ${artifactPath}`) + console.log(`[ARTIFACT] Pre-translation ID: ${preTranslationId}`) + console.log( + `[ARTIFACT] To resume this job later, use: PRETRANSLATION_ID=${preTranslationId}` + ) +} + +/** + * Resume existing pre-translation job + */ +async function resumePreTranslation( + preTranslationId: string +): Promise { + logSection(`Resuming Pre-Translation ${preTranslationId}`) + + const statusResp = await getPreTranslationStatus(preTranslationId) + + if (statusResp.status === "in_progress") { + console.log( + `Pre-translation in progress (${statusResp.progress}%), waiting for completion...` + ) + return await awaitPreTranslationCompleted(preTranslationId) + } else if (statusResp.status === "finished") { + console.log(`Pre-translation already finished, proceeding to download...`) + return statusResp + } else { + throw new Error( + `Pre-translation ${preTranslationId} has unexpected status: ${statusResp.status}` + ) + } +} + +/** + * Start new pre-translation job + */ +async function startNewPreTranslation( + fileIdsSet: Set +): Promise { + logSection("Requesting AI Pre-Translation") + console.log(`Files 
to translate: ${fileIdsSet.size}`) + console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) + console.log(`AI Prompt ID: ${config.preTranslatePromptId}`) + + const applyPreTranslationResponse = await postApplyPreTranslation( + Array.from(fileIdsSet), + config.allCrowdinCodes + ) + + console.log( + `✓ Pre-translation job created (ID: ${applyPreTranslationResponse.identifier})` + ) + + // Write artifact with pre-translation ID + writePreTranslationArtifact( + applyPreTranslationResponse.identifier, + fileIdsSet.size, + config.allCrowdinCodes + ) + + // If no targetPath specified (full translation), exit now and let Crowdin work + if (!config.targetPath) { + logSection("Full Translation Job Started") + console.log( + `This is a large job that will take significant time to complete.` + ) + console.log( + `The workflow will exit now. Resume later with the pre-translation ID above.` + ) + console.log( + `Check Crowdin dashboard for progress: https://crowdin.com/project/ethereum-org` + ) + process.exit(0) + } + + // For file/directory mode, wait for completion + console.log(`\nWaiting for pre-translation to complete...`) + const completedResponse = await awaitPreTranslationCompleted( + applyPreTranslationResponse.identifier + ) + + if (completedResponse.status !== "finished") { + throw new Error( + `Pre-translation ended with unexpected status: ${completedResponse.status}` + ) + } + + console.log(`✓ Pre-translation completed successfully!`) + return completedResponse +} + +/** + * Handle pre-translation: resume existing or start new + */ +export async function handlePreTranslation( + context: WorkflowContext +): Promise { + const { existingPreTranslationId, verbose } = config + const { fileIdsSet, processedFileIdToPath, crowdinProjectFiles } = context + + // Resume existing or start new + const preTranslateResponse = existingPreTranslationId + ? 
await resumePreTranslation(existingPreTranslationId) + : await startNewPreTranslation(fileIdsSet) + + // Build mapping for commit phase + const { fileIds } = preTranslateResponse.attributes + const fileIdToPathMapping: Record = {} + + for (const fid of fileIds) { + if (processedFileIdToPath[fid]) { + fileIdToPathMapping[fid] = processedFileIdToPath[fid] + } else { + const existing = crowdinProjectFiles.find((f) => f.id === fid) + if (existing) fileIdToPathMapping[fid] = existing.path + } + if (!fileIdToPathMapping[fid] && verbose) { + console.warn(`[WARN] Missing path mapping for fileId=${fid}`) + } + } + + return { + response: preTranslateResponse, + fileIdToPathMapping, + } +} From 1b82bc53578a30df361907eae25fe356b36b09ea Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:55:03 -0300 Subject: [PATCH 40/99] refactor: extract translation download workflow - Extract Phase 4 (download & commit) from main.ts - Build translations and commit to GitHub - Map language codes between Crowdin and GitHub --- .../lib/workflows/translation-download.ts | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 src/scripts/i18n/lib/workflows/translation-download.ts diff --git a/src/scripts/i18n/lib/workflows/translation-download.ts b/src/scripts/i18n/lib/workflows/translation-download.ts new file mode 100644 index 00000000000..ef2ea8f559a --- /dev/null +++ b/src/scripts/i18n/lib/workflows/translation-download.ts @@ -0,0 +1,123 @@ +// Translation download workflow phase + +import { config } from "../../config" +import { getBuiltFile, postBuildProjectFileTranslation } from "../crowdin/build" +import { postCreateBranchFrom } from "../github/branches" +import { getDestinationFromPath, putCommitFile } from "../github/commits" +import { mapCrowdinCodeToInternal } from "../utils/mapping" + +import type { + CommittedFile, + LanguagePair, + PreTranslationResult, + TranslationDownloadResult, + 
WorkflowContext, +} from "./types" +import { logSection, logSubsection } from "./utils" + +/** + * Build language pair mappings from Crowdin IDs to internal codes + */ +export function buildLanguageMappings(languageIds: string[]): LanguagePair[] { + return languageIds.map((crowdinId) => ({ + crowdinId, + internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), + })) +} + +/** + * Download translations from Crowdin and commit to GitHub branch + */ +export async function downloadAndCommitTranslations( + preTranslateResult: PreTranslationResult, + context: WorkflowContext +): Promise { + const { verbose } = config + const { englishBuffers } = context + const { response, fileIdToPathMapping } = preTranslateResult + + const { languageIds, fileIds } = response.attributes + + // Build language pair mappings + const languagePairs = buildLanguageMappings(languageIds) + + logSection("Creating Translation PR") + + // Create GitHub branch + const { branch } = await postCreateBranchFrom( + config.baseBranch, + "crowdin-translations" + ) + console.log(`✓ Created branch: ${branch}`) + + // Track all committed files with their content for sanitizer/validation + const committedFiles: CommittedFile[] = [] + + // For each language, download and commit translations + for (const { crowdinId, internalLanguageCode } of languagePairs) { + logSubsection( + `Building translations for ${crowdinId} (${internalLanguageCode})` + ) + + // Build, download and commit each file + for (const fileId of fileIds) { + const crowdinPath = fileIdToPathMapping[fileId] + + if (verbose) { + console.log(`[DEBUG] Processing fileId: ${fileId} (${crowdinPath})`) + } + + // 1- Build translation + const { url: downloadUrl } = await postBuildProjectFileTranslation( + fileId, + crowdinId, + config.projectId + ) + + // 2- Download + const { buffer } = await getBuiltFile(downloadUrl) + + if (verbose) { + console.log(`[DEBUG] Downloaded ${buffer.length} bytes`) + } + + // Check if translation differs from English + 
const originalEnglish = englishBuffers[fileId] + if (originalEnglish && originalEnglish.compare(buffer) === 0) { + if (verbose) { + console.warn( + `[DEBUG] Skipping commit - content identical to English (no translation)` + ) + } + continue + } + + // 3- Get destination path and commit + const destinationPath = getDestinationFromPath( + crowdinPath, + internalLanguageCode + ) + + if (verbose) { + console.log(`[DEBUG] Committing to: ${destinationPath}`) + } + + await putCommitFile(buffer, destinationPath, branch) + + // Track this file's path and content for sanitizer/validation + committedFiles.push({ + path: destinationPath, + content: buffer.toString("utf8"), + }) + } + + console.log(`✓ Committed translations for ${internalLanguageCode}`) + } + + return { + branch, + committedFiles, + languagePairs, + fileIdToPathMapping, + } +} From 02aba1a30ffeab8e3edd55afe03b0523437e6112 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:55:18 -0300 Subject: [PATCH 41/99] refactor: extract PR creation workflow - Extract Phase 6 (PR creation) from main.ts - Generate dynamic PR title based on language count - Format PR body with sections and simplified paths --- src/scripts/i18n/lib/workflows/pr-creation.ts | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 src/scripts/i18n/lib/workflows/pr-creation.ts diff --git a/src/scripts/i18n/lib/workflows/pr-creation.ts b/src/scripts/i18n/lib/workflows/pr-creation.ts new file mode 100644 index 00000000000..e5e384c24c8 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/pr-creation.ts @@ -0,0 +1,150 @@ +// PR creation workflow phase + +import { config } from "../../config" +import { getPromptInfo } from "../crowdin/prompt" +import { getCurrentUser } from "../crowdin/user" +import { postPullRequest } from "../github/pull-requests" + +import type { CommittedFile, LanguagePair, PullRequest } from "./types" +import { logSection } from "./utils" + +/** + 
* Generate dynamic PR title based on language count + */ +export function generatePRTitle( + langCodes: string[], + allPossibleLanguages: string[] +): string { + const isAllLanguages = langCodes.length === allPossibleLanguages.length + + let prTitle = "i18n: automated Crowdin translation import" + + if (langCodes.length <= 3) { + prTitle += ` (${langCodes.join(", ")})` + } else if (isAllLanguages) { + prTitle += ` (all languages)` + } else { + prTitle += ` (many languages)` + } + + return prTitle +} + +/** + * Generate PR body with organized file listings + */ +export function generatePRBody( + aiModelName: string, + langCodes: string[], + committedFiles: CommittedFile[], + sanitizedFiles: CommittedFile[] +): string { + // Include both sanitized files and original committed files + const allChangedPathsSet = new Set([ + ...sanitizedFiles.map(({ path }) => path), + ...committedFiles.map(({ path }) => path), + ]) + const allChangedPaths = Array.from(allChangedPathsSet) + + // Separate JSON and Markdown files + const jsonFiles = allChangedPaths.filter((path) => + path.toLowerCase().endsWith(".json") + ) + const markdownFiles = allChangedPaths.filter((path) => + path.toLowerCase().endsWith(".md") + ) + + // Build PR body + let prBody = `## Description\n\n` + prBody += `This PR contains automated ${aiModelName} translations from Crowdin\n\n` + + // Language section + prBody += `### Languages translated\n\n` + prBody += `${langCodes.join(", ")}\n\n` + + // Files section - JSON + if (jsonFiles.length > 0) { + prBody += `#### JSON changes (\`src/intl/{locale}/\`)\n\n` + for (const path of jsonFiles) { + // Remove src/intl/{locale}/ prefix + const simplifiedPath = path.replace(/^src\/intl\/[^/]+\//, "") + prBody += `- ${simplifiedPath}\n` + } + prBody += `\n` + } + + // Files section - Markdown + if (markdownFiles.length > 0) { + prBody += `#### Markdown changes (\`public/content/translations/{locale}/\`)\n\n` + for (const path of markdownFiles) { + // Remove 
public/content/translations/{locale}/ prefix + const simplifiedPath = path.replace( + /^public\/content\/translations\/[^/]+\//, + "" + ) + prBody += `- ${simplifiedPath}\n` + } + prBody += `\n` + } + + return prBody +} + +/** + * Fetch AI model name from Crowdin + */ +async function fetchAIModelName(): Promise { + try { + const currentUser = await getCurrentUser() + const promptInfo = await getPromptInfo( + currentUser.id, + config.preTranslatePromptId + ) + + if (promptInfo?.aiModelId) { + console.log(`✓ Fetched AI model: ${promptInfo.aiModelId}`) + return promptInfo.aiModelId + } else { + console.warn("Prompt info missing aiModelId, using default") + return "LLM" + } + } catch (e) { + console.warn("Could not fetch AI model name from Crowdin:", e) + return "LLM" + } +} + +/** + * Create pull request with formatted title and body + */ +export async function createTranslationPR( + branch: string, + committedFiles: CommittedFile[], + sanitizedFiles: CommittedFile[], + languagePairs: LanguagePair[] +): Promise { + logSection("Creating Pull Request") + + // Fetch AI model name dynamically + const aiModelName = await fetchAIModelName() + + // Extract language codes + const langCodes = languagePairs.map((p) => p.internalLanguageCode) + + // Generate PR title and body + const prTitle = generatePRTitle(langCodes, config.allInternalCodes) + const prBody = generatePRBody( + aiModelName, + langCodes, + committedFiles, + sanitizedFiles + ) + + // Create PR + const pr = await postPullRequest(branch, config.baseBranch, prTitle, prBody) + + console.log(`\n✓ Pull Request created: ${pr.html_url}`) + console.log(`PR Number: #${pr.number}`) + + return pr +} From 80b3f770a088bb2accd1df08739500b8437ed3f8 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:55:33 -0300 Subject: [PATCH 42/99] refactor: extract validation workflow - Extract Phase 7 (syntax validation) from main.ts - Validate translated files against English 
structure - Post PR comment if validation issues found --- src/scripts/i18n/lib/workflows/validation.ts | 115 +++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 src/scripts/i18n/lib/workflows/validation.ts diff --git a/src/scripts/i18n/lib/workflows/validation.ts b/src/scripts/i18n/lib/workflows/validation.ts new file mode 100644 index 00000000000..90c131ece2a --- /dev/null +++ b/src/scripts/i18n/lib/workflows/validation.ts @@ -0,0 +1,115 @@ +// Syntax tree validation workflow phase + +import { config } from "../../config" +import { postPullRequestComment } from "../github/pull-requests" +import { + formatValidationComment, + validateJsonStructure, + validateMarkdownStructure, +} from "../validation/syntax-tree" + +import type { CommittedFile, PullRequest } from "./types" +import { logSection } from "./utils" + +/** + * Run syntax tree validation and post comment if issues found + */ +export async function runSyntaxValidation( + pr: PullRequest, + committedFiles: CommittedFile[], + englishBuffers: Record, + fileIdToPathMapping: Record +): Promise { + const { verbose } = config + + logSection("Running Syntax Tree Validation") + + const validationResults: Parameters[0] = [] + + for (const file of committedFiles) { + const isJson = file.path.toLowerCase().endsWith(".json") + const isMarkdown = file.path.toLowerCase().endsWith(".md") + + if (!isJson && !isMarkdown) continue + + // Find the corresponding English file + let englishContent: string | null = null + + // Determine the English source path + if (isJson) { + // Extract the file name from the destination path + const match = file.path.match(/src\/intl\/[^/]+\/(.+)$/) + if (match) { + const fileName = match[1] + // Find the English buffer from our tracked files + for (const [fileId, buffer] of Object.entries(englishBuffers)) { + const crowdinPath = fileIdToPathMapping[Number(fileId)] + if (crowdinPath && crowdinPath.includes(fileName)) { + englishContent = buffer.toString("utf8") + break + } 
+ } + } + } else if (isMarkdown) { + // Extract the relative path from translations + const match = file.path.match( + /public\/content\/translations\/[^/]+\/(.+)$/ + ) + if (match) { + const relPath = match[1] + // Find the English buffer + for (const [fileId, buffer] of Object.entries(englishBuffers)) { + const crowdinPath = fileIdToPathMapping[Number(fileId)] + if (crowdinPath && crowdinPath.includes(relPath)) { + englishContent = buffer.toString("utf8") + break + } + } + } + } + + if (!englishContent) { + if (verbose) { + console.warn(`[DEBUG] Could not find English source for ${file.path}`) + } + continue + } + + // Validate structure + if (isJson) { + const result = validateJsonStructure(englishContent, file.content) + validationResults.push({ + path: file.path, + type: "json", + result, + }) + if (!result.isValid && verbose) { + console.log(`[DEBUG] JSON validation failed for ${file.path}`) + } + } else if (isMarkdown) { + const result = validateMarkdownStructure(englishContent, file.content) + validationResults.push({ + path: file.path, + type: "markdown", + result, + }) + if (!result.isValid && verbose) { + console.log(`[DEBUG] Markdown validation failed for ${file.path}`) + } + } + } + + // Post validation comment if there are issues + const validationComment = formatValidationComment(validationResults) + if (validationComment) { + console.log(`\n⚠️ Syntax validation issues found, posting comment...`) + try { + await postPullRequestComment(pr.number, validationComment) + console.log(`✓ Posted validation comment to PR`) + } catch (e) { + console.warn(`Failed to post validation comment:`, e) + } + } else { + console.log(`✓ All files passed syntax tree validation`) + } +} From a464ef3e44881608aba079a9b3d976f3df51a4be Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:55:56 -0300 Subject: [PATCH 43/99] refactor: simplify main.ts orchestration - Reduce from 673 lines to ~110 lines - Replace 
monolithic logic with workflow module calls - Maintain functional parity with improved readability --- src/scripts/i18n/main.ts | 666 +++------------------------------------ 1 file changed, 50 insertions(+), 616 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 7dd92f3f28a..0ef158a300f 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1,444 +1,43 @@ -import * as fs from "fs" -import * as path from "path" - -import { - getBuiltFile, - postBuildProjectFileTranslation, -} from "./lib/crowdin/build" -import { - findCrowdinFile, - getCrowdinProjectFiles, - postCrowdinFile, - postFileToStorage, - unhideStringsInFile, -} from "./lib/crowdin/files" -import { - awaitPreTranslationCompleted, - getPreTranslationStatus, - postApplyPreTranslation, -} from "./lib/crowdin/pre-translate" -import { getPromptInfo, updatePromptFromFile } from "./lib/crowdin/prompt" -import { getCurrentUser } from "./lib/crowdin/user" -import { postCreateBranchFrom } from "./lib/github/branches" -import { getDestinationFromPath, putCommitFile } from "./lib/github/commits" -import { - downloadGitHubFile, - getAllEnglishFiles, - getFileMetadata, -} from "./lib/github/files" -import { - postPullRequest, - postPullRequestComment, -} from "./lib/github/pull-requests" -import type { CrowdinFileData, CrowdinPreTranslateResponse } from "./lib/types" -import { mapCrowdinCodeToInternal } from "./lib/utils/mapping" -import { - formatValidationComment, - validateJsonStructure, - validateMarkdownStructure, -} from "./lib/validation/syntax-tree" -import { config, crowdinBearerHeaders, validateTargetPath } from "./config" +import { putCommitFile } from "./lib/github/commits" +import { prepareEnglishFiles } from "./lib/workflows/file-preparation" +import { initializeWorkflow } from "./lib/workflows/initialize" +import { createTranslationPR } from "./lib/workflows/pr-creation" +import { handlePreTranslation } from "./lib/workflows/pre-translation" +import { 
downloadAndCommitTranslations } from "./lib/workflows/translation-download" +import { logSection } from "./lib/workflows/utils" +import { runSyntaxValidation } from "./lib/workflows/validation" +import { config } from "./config" import { runSanitizer } from "./post_import_sanitize" -// Small helper for async waits -const delay = (ms: number) => - new Promise((resolve) => setTimeout(resolve, ms)) - -/** - * Write pre-translation artifact for GitHub Actions - */ -function writePreTranslationArtifact( - preTranslationId: string, - fileCount: number, - languages: string[] -) { - const artifactData = { - preTranslationId, - timestamp: new Date().toISOString(), - fileCount, - languages, - targetPath: config.targetPath || null, - } - - const artifactDir = path.join(process.cwd(), "artifacts") - if (!fs.existsSync(artifactDir)) { - fs.mkdirSync(artifactDir, { recursive: true }) - } - - const artifactPath = path.join(artifactDir, "pre-translation-info.json") - fs.writeFileSync(artifactPath, JSON.stringify(artifactData, null, 2)) - - console.log(`\n[ARTIFACT] Pre-translation info written to ${artifactPath}`) - console.log(`[ARTIFACT] Pre-translation ID: ${preTranslationId}`) - console.log( - `[ARTIFACT] To resume this job later, use: PRETRANSLATION_ID=${preTranslationId}` - ) -} - /** * Main orchestration function */ async function main() { - const { verbose, targetPath, existingPreTranslationId } = config - - console.log(`\n========== Crowdin AI Translation Import ==========`) - console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) - if (targetPath) { - const isFile = targetPath.endsWith(".md") || targetPath.endsWith(".json") - console.log(`Mode: ${isFile ? "Single file" : "Directory"} (${targetPath})`) - // Validate target path is in allowed location - try { - validateTargetPath(targetPath) - } catch (e) { - console.error(e instanceof Error ? 
e.message : String(e)) - process.exit(1) - } - } else { - console.log(`Mode: Full translation (all files)`) - } - - // Shared state - const crowdinProjectFiles = await getCrowdinProjectFiles() - const fileIdsSet = new Set() - const processedFileIdToPath: Record = {} - const englishBuffers: Record = {} - - // If resuming, determine completed pre-translation response; otherwise start new - let preTranslateJobCompletedResponse: CrowdinPreTranslateResponse - - if (existingPreTranslationId) { - console.log( - `\n========== Resuming Pre-Translation ${existingPreTranslationId} ==========` - ) - const statusResp = await getPreTranslationStatus(existingPreTranslationId) - - if (statusResp.status === "in_progress") { - console.log( - `Pre-translation in progress (${statusResp.progress}%), waiting for completion...` - ) - preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( - existingPreTranslationId - ) - } else if (statusResp.status === "finished") { - console.log(`Pre-translation already finished, proceeding to download...`) - preTranslateJobCompletedResponse = statusResp - } else { - throw new Error( - `Pre-translation ${existingPreTranslationId} has unexpected status: ${statusResp.status}` - ) - } - } else { - // Normal flow: Start new pre-translation - console.log(`\n========== Starting New Pre-Translation ==========`) - - // Ensure Crowdin AI prompt content is synced from repo canonical file - try { - const currentUser = await getCurrentUser() - const promptPath = path.join( - process.cwd(), - "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" - ) - await updatePromptFromFile( - currentUser.id, - config.preTranslatePromptId, - promptPath - ) - console.log("✓ Updated Crowdin pre-translate prompt from repo file") - } catch (e) { - console.warn("Failed to update prompt, continuing:", e) - } - - // Fetch English files - const allEnglishFiles = await getAllEnglishFiles() - - if (!allEnglishFiles.length) { - console.log("No files to translate, 
exiting") - return - } - - if (verbose) { - console.log(`[DEBUG] Found ${allEnglishFiles.length} English files`) - console.log( - `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` - ) - } - - const fileMetadata = await getFileMetadata(allEnglishFiles) - - // Iterate through each file and upload - for (const file of fileMetadata) { - if (verbose) { - console.log(`[DEBUG] Processing file: ${file.filePath}`) - } - - let foundFile: CrowdinFileData | undefined - try { - foundFile = findCrowdinFile(file, crowdinProjectFiles) - } catch { - if (verbose) { - console.log("File not found in Crowdin, will add new file") - } - } - - let effectiveFileId: number - let effectivePath: string - - if (foundFile) { - // File exists - UPDATE it to ensure Crowdin has the latest English version - console.log( - `Updating existing file in Crowdin: ${file.filePath} (ID: ${foundFile.id})` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) - - const storageInfo = await postFileToStorage( - fileBuffer, - file["Crowdin-API-FileName"] - ) - - // Update the file content using PUT - const updateUrl = `https://api.crowdin.com/api/v2/projects/${config.projectId}/files/${foundFile.id}` - const updateBody: Record = { - storageId: storageInfo.id, - } - - const updateResp = await fetch(updateUrl, { - method: "PUT", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(updateBody), - }) - - if (!updateResp.ok) { - const text = await updateResp.text().catch(() => "") - throw new Error( - `Failed to update Crowdin file ${foundFile.id} (${updateResp.status}): ${text}` - ) - } - - if (!updateResp.ok) { - const text = await updateResp.text().catch(() => "") - throw new Error( - `Failed to update Crowdin file ${foundFile.id} (${updateResp.status}): ${text}` - ) - } - - console.log(`✓ Updated Crowdin file (ID: ${foundFile.id})`) - - effectiveFileId = foundFile.id - effectivePath = foundFile.path - 
englishBuffers[effectiveFileId] = fileBuffer - - // Wait for file parsing after update - const delayMs = 10000 - if (verbose) { - console.log( - `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to re-parse updated file...` - ) - } - await delay(delayMs) - } else { - // File doesn't exist - create it - console.log(`Creating new file in Crowdin: ${file.filePath}`) - const fileBuffer = await downloadGitHubFile(file.download_url) - - const storageInfo = await postFileToStorage( - fileBuffer, - file["Crowdin-API-FileName"] - ) - - // Derive full parent directory path (exclude filename) - const parts = file.filePath.split("/").filter(Boolean) - parts.pop() // remove filename - const parentDirPath = parts.join("/") || "/" - - const crowdinFileResponse = await postCrowdinFile( - storageInfo.id, - file["Crowdin-API-FileName"], - parentDirPath - ) - - console.log( - `✓ Created new Crowdin file (ID: ${crowdinFileResponse.id})` - ) - - effectiveFileId = crowdinFileResponse.id - effectivePath = crowdinFileResponse.path - englishBuffers[effectiveFileId] = fileBuffer - - // Wait for new file parsing - const delayMs = 10000 - if (verbose) { - console.log( - `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to parse new file...` - ) - } - await delay(delayMs) - } - - fileIdsSet.add(effectiveFileId) - if (effectivePath) processedFileIdToPath[effectiveFileId] = effectivePath - } - - // Unhide any hidden/duplicate strings before pre-translation - console.log( - `\n========== Unhiding Strings in ${fileIdsSet.size} Files ==========` - ) - for (const fileId of Array.from(fileIdsSet)) { - await unhideStringsInFile(fileId) - } - - console.log(`\n========== Requesting AI Pre-Translation ==========`) - console.log(`Files to translate: ${fileIdsSet.size}`) - console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) - console.log(`AI Prompt ID: ${config.preTranslatePromptId}`) - - const applyPreTranslationResponse = await postApplyPreTranslation( - Array.from(fileIdsSet), - 
config.allCrowdinCodes - ) - - console.log( - `✓ Pre-translation job created (ID: ${applyPreTranslationResponse.identifier})` - ) - - // Write artifact with pre-translation ID - writePreTranslationArtifact( - applyPreTranslationResponse.identifier, - fileIdsSet.size, - config.allCrowdinCodes - ) - - // If no targetPath specified (full translation), exit now and let Crowdin work - if (!targetPath) { - console.log(`\n========== Full Translation Job Started ==========`) - console.log( - `This is a large job that will take significant time to complete.` - ) - console.log( - `The workflow will exit now. Resume later with the pre-translation ID above.` - ) - console.log( - `Check Crowdin dashboard for progress: https://crowdin.com/project/ethereum-org` - ) - return - } + const { verbose, existingPreTranslationId } = config - // For file/directory mode, wait for completion - console.log(`\nWaiting for pre-translation to complete...`) - preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( - applyPreTranslationResponse.identifier - ) - - if (preTranslateJobCompletedResponse.status !== "finished") { - throw new Error( - `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` - ) - } - - console.log(`✓ Pre-translation completed successfully!`) - } + // Phase 1: Initialize workflow + const context = await initializeWorkflow() - // Build and download translations - const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes - - // Build mapping for commit phase - const fileIdToPathMapping: Record = {} - for (const fid of fileIds) { - if (processedFileIdToPath[fid]) { - fileIdToPathMapping[fid] = processedFileIdToPath[fid] - } else { - const existing = crowdinProjectFiles.find((f) => f.id === fid) - if (existing) fileIdToPathMapping[fid] = existing.path - } - if (!fileIdToPathMapping[fid] && verbose) { - console.warn(`[WARN] Missing path mapping for fileId=${fid}`) - } + // Phase 2: Prepare English files (skip 
if resuming existing job) + if (!existingPreTranslationId) { + await prepareEnglishFiles(context) } - // Build mapping between Crowdin IDs and internal codes - const languagePairs = languageIds.map((crowdinId) => ({ - crowdinId, - internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), - })) + // Phase 3: Handle pre-translation (resume or start new) + const preTranslateResult = await handlePreTranslation(context) - console.log(`\n========== Creating Translation PR ==========`) - - const { branch } = await postCreateBranchFrom( - config.baseBranch, - "crowdin-translations" + // Phase 4: Download and commit translations + const translationResult = await downloadAndCommitTranslations( + preTranslateResult, + context ) - console.log(`✓ Created branch: ${branch}`) - - // Track all committed files with their content for sanitizer - const committedFiles: Array<{ path: string; content: string }> = [] - - // For each language - for (const { crowdinId, internalLanguageCode } of languagePairs) { - console.log( - `\n--- Building translations for ${crowdinId} (${internalLanguageCode}) ---` - ) - - // Build, download and commit each file - for (const fileId of fileIds) { - const crowdinPath = fileIdToPathMapping[fileId] - - if (verbose) { - console.log(`[DEBUG] Processing fileId: ${fileId} (${crowdinPath})`) - } - - // 1- Build - const { url: downloadUrl } = await postBuildProjectFileTranslation( - fileId, - crowdinId, - config.projectId - ) - - // 2- Download - const { buffer } = await getBuiltFile(downloadUrl) - - if (verbose) { - console.log(`[DEBUG] Downloaded ${buffer.length} bytes`) - } - // Check if translation differs from English - const originalEnglish = englishBuffers[fileId] - if (originalEnglish && originalEnglish.compare(buffer) === 0) { - if (verbose) { - console.warn( - `[DEBUG] Skipping commit - content identical to English (no translation)` - ) - } - continue - } - - // 3- Get destination path and commit - const destinationPath = getDestinationFromPath( - 
crowdinPath, - internalLanguageCode - ) - - if (verbose) { - console.log(`[DEBUG] Committing to: ${destinationPath}`) - } - - await putCommitFile(buffer, destinationPath, branch) - - // Track this file's path and content for sanitizer - committedFiles.push({ - path: destinationPath, - content: buffer.toString("utf8"), - }) - } - - console.log(`✓ Committed translations for ${internalLanguageCode}`) - } - - // Run post-import sanitizer only on files that were just committed - console.log(`\n========== Running Post-Import Sanitizer ==========`) - console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`) - const sanitizeResult = runSanitizer(committedFiles) + // Phase 5: Run post-import sanitizer + logSection("Running Post-Import Sanitizer") + console.log( + `[SANITIZE] Processing ${translationResult.committedFiles.length} committed files` + ) + const sanitizeResult = runSanitizer(translationResult.committedFiles) const changedFiles = sanitizeResult.changedFiles || [] if (changedFiles.length) { @@ -447,7 +46,7 @@ async function main() { const relPath = file.path try { const buf = Buffer.from(file.content, "utf8") - await putCommitFile(buf, relPath, branch) + await putCommitFile(buf, relPath, translationResult.branch) if (verbose) { console.log(`[DEBUG] Committed sanitized file: ${relPath}`) } @@ -460,14 +59,14 @@ async function main() { console.log("No sanitization changes needed") } - // Optionally skip PR creation based on workflow input + // Check if PR creation should be skipped const skipPrCreation = ["1", "true", "yes", "on"].includes( (process.env.SKIP_PR_CREATION || "").toLowerCase() ) if (skipPrCreation) { - console.log(`\n========== Skipping PR Creation ==========`) + logSection("Skipping PR Creation") console.log( - `Files have been committed to branch: ${branch}. No PR will be opened.` + `Files have been committed to branch: ${translationResult.branch}. 
No PR will be opened.` ) console.log( `Set SKIP_PR_CREATION=false to enable automatic PR creation in the workflow.` @@ -475,194 +74,29 @@ async function main() { return } - // Create PR - console.log(`\n========== Creating Pull Request ==========`) - - // Fetch AI model name dynamically - let aiModelName = "LLM" - try { - const currentUser = await getCurrentUser() - const promptInfo = await getPromptInfo( - currentUser.id, - config.preTranslatePromptId - ) - if (promptInfo?.aiModelId) { - aiModelName = promptInfo.aiModelId - console.log(`✓ Fetched AI model: ${aiModelName}`) - } else { - console.warn("Prompt info missing aiModelId, using default") - } - } catch (e) { - console.warn("Could not fetch AI model name from Crowdin:", e) - } - - const langCodes = languagePairs.map((p) => p.internalLanguageCode) - - // Determine all language codes based on config (for title comparison) - const allPossibleLanguages = config.allInternalCodes - const isAllLanguages = langCodes.length === allPossibleLanguages.length - - // Build PR title - let prTitle = "i18n: automated Crowdin translation import" - if (langCodes.length <= 3) { - prTitle += ` (${langCodes.join(", ")})` - } else if (isAllLanguages) { - prTitle += ` (all languages)` - } else { - prTitle += ` (many languages)` - } - - // Include both sanitized files and original committed files - const allChangedPathsSet = new Set([ - ...changedFiles.map(({ path }) => path), - ...committedFiles.map(({ path }) => path), - ]) - const allChangedPaths = Array.from(allChangedPathsSet) - - // Separate JSON and Markdown files - const jsonFiles = allChangedPaths.filter((path) => - path.toLowerCase().endsWith(".json") + // Phase 6: Create PR + const pr = await createTranslationPR( + translationResult.branch, + translationResult.committedFiles, + changedFiles, + translationResult.languagePairs ) - const markdownFiles = allChangedPaths.filter((path) => - path.toLowerCase().endsWith(".md") - ) - - // Build PR body - let prBody = `## 
Description\n\n` - prBody += `This PR contains automated ${aiModelName} translations from Crowdin\n\n` - - // Language section - prBody += `### Languages translated\n\n` - prBody += `${langCodes.join(", ")}\n\n` - - // Files section - if (jsonFiles.length > 0) { - prBody += `#### JSON changes (\`src/intl/{locale}/\`)\n\n` - for (const path of jsonFiles) { - // Remove src/intl/{locale}/ prefix - const simplifiedPath = path.replace(/^src\/intl\/[^/]+\//, "") - prBody += `- ${simplifiedPath}\n` - } - prBody += `\n` - } - - if (markdownFiles.length > 0) { - prBody += `#### Markdown changes (\`public/content/translations/{locale}/\`)\n\n` - for (const path of markdownFiles) { - // Remove public/content/translations/{locale}/ prefix - const simplifiedPath = path.replace( - /^public\/content\/translations\/[^/]+\//, - "" - ) - prBody += `- ${simplifiedPath}\n` - } - prBody += `\n` - } - - const pr = await postPullRequest(branch, config.baseBranch, prTitle, prBody) - - console.log(`\n✓ Pull Request created: ${pr.html_url}`) - console.log(`PR Number: #${pr.number}`) - - // Run syntax tree validation - console.log(`\n========== Running Syntax Tree Validation ==========`) - const validationResults: Array<{ - path: string - type: "json" | "markdown" - result: unknown - }> = [] - - for (const file of committedFiles) { - const isJson = file.path.toLowerCase().endsWith(".json") - const isMarkdown = file.path.toLowerCase().endsWith(".md") - - if (!isJson && !isMarkdown) continue - - // Find the corresponding English file - let englishContent: string | null = null - // Determine the English source path - if (isJson) { - // Extract the file name from the destination path - const match = file.path.match(/src\/intl\/[^/]+\/(.+)$/) - if (match) { - const fileName = match[1] - // Find the English buffer from our tracked files - for (const [fileId, buffer] of Object.entries(englishBuffers)) { - const crowdinPath = fileIdToPathMapping[Number(fileId)] - if (crowdinPath && 
crowdinPath.includes(fileName)) { - englishContent = buffer.toString("utf8") - break - } - } - } - } else if (isMarkdown) { - // Extract the relative path from translations - const match = file.path.match( - /public\/content\/translations\/[^/]+\/(.+)$/ - ) - if (match) { - const relPath = match[1] - // Find the English buffer - for (const [fileId, buffer] of Object.entries(englishBuffers)) { - const crowdinPath = fileIdToPathMapping[Number(fileId)] - if (crowdinPath && crowdinPath.includes(relPath)) { - englishContent = buffer.toString("utf8") - break - } - } - } - } - - if (!englishContent) { - if (verbose) { - console.warn(`[DEBUG] Could not find English source for ${file.path}`) - } - continue - } - - // Validate structure - if (isJson) { - const result = validateJsonStructure(englishContent, file.content) - validationResults.push({ - path: file.path, - type: "json", - result, - }) - if (!result.isValid && verbose) { - console.log(`[DEBUG] JSON validation failed for ${file.path}`) - } - } else if (isMarkdown) { - const result = validateMarkdownStructure(englishContent, file.content) - validationResults.push({ - path: file.path, - type: "markdown", - result, - }) - if (!result.isValid && verbose) { - console.log(`[DEBUG] Markdown validation failed for ${file.path}`) - } - } - } - - // Post validation comment if there are issues - const validationComment = formatValidationComment(validationResults) - if (validationComment) { - console.log(`\n⚠️ Syntax validation issues found, posting comment...`) - try { - await postPullRequestComment(pr.number, validationComment) - console.log(`✓ Posted validation comment to PR`) - } catch (e) { - console.warn(`Failed to post validation comment:`, e) - } - } else { - console.log(`✓ All files passed syntax tree validation`) - } + // Phase 7: Run syntax tree validation + await runSyntaxValidation( + pr, + translationResult.committedFiles, + context.englishBuffers, + translationResult.fileIdToPathMapping + ) - 
console.log(`\n========== SUCCESS ==========`) + // Success! + logSection("SUCCESS") console.log(`Pull Request: ${pr.html_url}`) - console.log(`Languages: ${langCodes.join(", ")}`) - console.log(`Files: ${fileIds.length}`) + console.log( + `Languages: ${translationResult.languagePairs.map((p) => p.internalLanguageCode).join(", ")}` + ) + console.log(`Files: ${preTranslateResult.response.attributes.fileIds.length}`) } main().catch((err) => { From 9f9c2002ad559d1d18c901d211dfbb63f90567ba Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:56:56 -0300 Subject: [PATCH 44/99] fix: export validation result types - Export JsonValidationResult and MarkdownValidationResult - Required by validation.ts workflow module --- src/scripts/i18n/lib/validation/syntax-tree.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/i18n/lib/validation/syntax-tree.ts b/src/scripts/i18n/lib/validation/syntax-tree.ts index b1a671784f0..6ee8ba538a7 100644 --- a/src/scripts/i18n/lib/validation/syntax-tree.ts +++ b/src/scripts/i18n/lib/validation/syntax-tree.ts @@ -1,6 +1,6 @@ // Syntax tree validation for JSON and Markdown files -interface JsonValidationResult { +export interface JsonValidationResult { isValid: boolean expectedKeyCount: number actualKeyCount: number @@ -9,7 +9,7 @@ interface JsonValidationResult { orderMatches: boolean } -interface MarkdownValidationResult { +export interface MarkdownValidationResult { isValid: boolean expectedHeadingCount: number actualHeadingCount: number From 37d25a0fc861cb56dc6fc854969633093d61fd52 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:34:59 -0300 Subject: [PATCH 45/99] refactor(i18n): consolidate delay() utility Import delay() from lib/workflows/utils instead of defining locally. Removes duplicate implementations in fetch.ts, commits.ts, pre-translate.ts. 
--- src/scripts/i18n/lib/crowdin/pre-translate.ts | 3 +-- src/scripts/i18n/lib/github/commits.ts | 3 +-- src/scripts/i18n/lib/utils/fetch.ts | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/pre-translate.ts b/src/scripts/i18n/lib/crowdin/pre-translate.ts index 62b312b6217..149d2a42891 100644 --- a/src/scripts/i18n/lib/crowdin/pre-translate.ts +++ b/src/scripts/i18n/lib/crowdin/pre-translate.ts @@ -6,8 +6,7 @@ import { crowdinBearerHeaders, } from "../../config" import type { CrowdinPreTranslateResponse } from "../types" - -const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) +import { delay } from "../workflows/utils" /** * Apply pre-translation to files diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/i18n/lib/github/commits.ts index f4aca2e8ef7..a9bfc4c911c 100644 --- a/src/scripts/i18n/lib/github/commits.ts +++ b/src/scripts/i18n/lib/github/commits.ts @@ -2,8 +2,7 @@ import { config, gitHubBearerHeaders } from "../../config" import { fetchWithRetry } from "../utils/fetch" - -const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) +import { delay } from "../workflows/utils" /** * Get the destination path for a translated file diff --git a/src/scripts/i18n/lib/utils/fetch.ts b/src/scripts/i18n/lib/utils/fetch.ts index 7191a531527..2295311b2e9 100644 --- a/src/scripts/i18n/lib/utils/fetch.ts +++ b/src/scripts/i18n/lib/utils/fetch.ts @@ -1,6 +1,6 @@ // Utilities: resilient fetch with retry logic -const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) +import { delay } from "../workflows/utils" export type RetryOptions = { retries?: number From 8d259a662b332e2765257e087447dfb900378098 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:36:17 -0300 Subject: [PATCH 46/99] refactor(i18n): consolidate loadExcludedPaths() utility Export loadExcludedPaths from config.ts, import it in files.ts. 
Removes duplicate implementation. --- src/scripts/i18n/config.ts | 2 +- src/scripts/i18n/lib/github/files.ts | 21 +-------------------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 16522de2162..5149929dee8 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -151,7 +151,7 @@ export const config = { } // Load excluded paths from canonical config file -function loadExcludedPaths(): string[] { +export function loadExcludedPaths(): string[] { try { const excludedPathsFile = path.join( process.cwd(), diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts index fab7e4dfa8f..732648e2f0e 100644 --- a/src/scripts/i18n/lib/github/files.ts +++ b/src/scripts/i18n/lib/github/files.ts @@ -1,9 +1,6 @@ // GitHub file operations -import * as fs from "fs" -import * as path from "path" - -import { config, gitHubBearerHeaders } from "../../config" +import { config, gitHubBearerHeaders, loadExcludedPaths } from "../../config" import type { ContentType, GitHubCrowdinFileMetadata, @@ -11,22 +8,6 @@ import type { } from "../types" import { fetchWithRetry } from "../utils/fetch" -/** - * Load excluded paths from config - */ -function loadExcludedPaths(): string[] { - try { - const excludedPathsFile = path.join( - process.cwd(), - "src/scripts/i18n/config/excluded-paths.json" - ) - const raw = fs.readFileSync(excludedPathsFile, "utf8") - return JSON.parse(raw) as string[] - } catch { - return [] - } -} - /** * Check if a path should be excluded */ From 1dd31e351285c09c4c40bc17a9411d6cd0225956 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:36:28 -0300 Subject: [PATCH 47/99] refactor(i18n): define QaLevel locally Define QaLevel type locally instead of importing from nonexistent qa-routing module. Removes dead import, type is planned for v0.2.0 implementation. 
--- src/scripts/i18n/lib/github/pr-review-comments.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/lib/github/pr-review-comments.ts b/src/scripts/i18n/lib/github/pr-review-comments.ts index 30ff5229c7e..168724183e9 100644 --- a/src/scripts/i18n/lib/github/pr-review-comments.ts +++ b/src/scripts/i18n/lib/github/pr-review-comments.ts @@ -1,8 +1,10 @@ // GitHub PR review comment helper with scoped @mentions import { config, gitHubBearerHeaders } from "../../config" -import type { QaLevel } from "../qa-routing" import { fetchWithRetry } from "../utils/fetch" +// QA level for AI review routing (planned for v0.2.0) +export type QaLevel = "copilot" | "copilot+claude" + /** * Post a follow-up comment on a PR with AI reviewer mentions and clear scope * @param prNumber The PR number From 0b8f91103373a21ce50590f15c1c4e3ead4f88a7 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:36:41 -0300 Subject: [PATCH 48/99] feat(i18n): add JSX attribute extraction types Add types for JSX attribute extraction and translation: - TRANSLATABLE_ATTRIBUTES list (title, description, alt, etc.) 
- ExtractedAttribute, TranslatedAttribute interfaces - FileExtractionResult, FileTranslationResult interfaces - JsxTranslationSummary for batch processing stats --- src/scripts/i18n/lib/jsx-attributes/types.ts | 78 ++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/scripts/i18n/lib/jsx-attributes/types.ts diff --git a/src/scripts/i18n/lib/jsx-attributes/types.ts b/src/scripts/i18n/lib/jsx-attributes/types.ts new file mode 100644 index 00000000000..f248f6e286b --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/types.ts @@ -0,0 +1,78 @@ +/** + * Types for JSX attribute extraction and translation + */ + +/** Attributes that contain human-readable text requiring translation */ +export const TRANSLATABLE_ATTRIBUTES = [ + "title", + "description", + "alt", + "label", + "aria-label", + "placeholder", + "buttonLabel", + "text", + "name", + "caption", + "contentPreview", + "location", +] as const + +export type TranslatableAttribute = (typeof TRANSLATABLE_ATTRIBUTES)[number] + +/** A single extracted attribute from a JSX component */ +export interface ExtractedAttribute { + /** File path the attribute was found in */ + filePath: string + /** Line number (1-indexed) where the attribute appears */ + line: number + /** Column position where the attribute value starts */ + column: number + /** The attribute name (e.g., "title", "description") */ + attributeName: TranslatableAttribute + /** The component name (e.g., "Card", "ExpandableCard") */ + componentName: string + /** The original English attribute value */ + originalValue: string + /** Surrounding context (1-2 sentences before/after) for translation accuracy */ + context: string +} + +/** Result of extracting attributes from a single file */ +export interface FileExtractionResult { + filePath: string + attributes: ExtractedAttribute[] + /** Original file content for re-insertion */ + content: string +} + +/** A translated attribute ready for re-insertion */ +export interface 
TranslatedAttribute extends ExtractedAttribute { + translatedValue: string +} + +/** Result of translating attributes for a file */ +export interface FileTranslationResult { + filePath: string + translatedAttributes: TranslatedAttribute[] + /** Updated file content with translations inserted */ + updatedContent: string + /** Whether any attributes were translated */ + hasChanges: boolean +} + +/** Summary of JSX attribute translation for a batch of files */ +export interface JsxTranslationSummary { + /** Total files processed */ + filesProcessed: number + /** Files that had attributes translated */ + filesWithChanges: number + /** Total attributes translated */ + attributesTranslated: number + /** Attributes that failed translation */ + attributesFailed: number + /** Whether Gemini API was available */ + geminiAvailable: boolean + /** Files with updated content */ + updatedFiles: FileTranslationResult[] +} From cca30221b0f8b3fadca8ce184a73e2ad95e73750 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:36:53 -0300 Subject: [PATCH 49/99] feat(i18n): add JSX attribute extraction logic Extract translatable JSX attributes from markdown files. Uses regex to find JSX components and their translatable attributes. Includes heuristics to identify English text (vs URLs, variables, CSS classes). Extracts surrounding context for translation accuracy. 
--- .../i18n/lib/jsx-attributes/extract.ts | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 src/scripts/i18n/lib/jsx-attributes/extract.ts diff --git a/src/scripts/i18n/lib/jsx-attributes/extract.ts b/src/scripts/i18n/lib/jsx-attributes/extract.ts new file mode 100644 index 00000000000..cf3ce41602b --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/extract.ts @@ -0,0 +1,178 @@ +/** + * Extract translatable JSX attributes from markdown files + */ + +import type { + ExtractedAttribute, + FileExtractionResult, + TranslatableAttribute, +} from "./types" +import { TRANSLATABLE_ATTRIBUTES } from "./types" + +/** + * Regex to match JSX/HTML-style attributes with quoted values. + * Captures: attributeName="value" or attributeName='value' + */ +const ATTRIBUTE_REGEX = + /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g + +/** + * Regex to identify JSX component opening tags. + * Captures the component name and all attributes. + */ +const JSX_COMPONENT_REGEX = /<([A-Z][a-zA-Z0-9]*)\s+([^>]*?)(?:\/>|>)/g + +/** + * Check if a string appears to be English text (not a variable, URL, or code). + * Uses heuristics: contains spaces, common English words, or sentence structure. 
/**
 * Value shapes that are never treated as translatable prose.
 * A value matching any one of these is rejected by `isLikelyEnglishText`.
 */
const NON_PROSE_PATTERNS: readonly RegExp[] = [
  /^https?:\/\//, // absolute URLs
  /^[/.]/, // values starting with "/" or "." (paths)
  /\.(png|jpg|svg|gif|json|md)$/i, // values ending in a known file extension
  /^\{.*\}$/, // placeholders such as {variable} or {{variable}}
  /^[a-z][a-zA-Z0-9-]*$/, // lone camelCase / kebab-case identifiers (no spaces possible)
  /^[\p{Emoji}\s]+$/u, // emoji (and whitespace) only
  /^[\d.,\s%$€£]+$/, // numbers, separators and currency/percent signs only
]

/**
 * Heuristic check for "human-readable English text" as opposed to a URL,
 * path, placeholder, identifier, emoji run or bare number.
 *
 * @param value Attribute value to classify.
 * @returns `true` when the value survives every exclusion and either
 *   contains a space or is a single capitalized word; `false` otherwise
 *   (including empty values and values shorter than 3 characters).
 */
function isLikelyEnglishText(value: string): boolean {
  const tooShort = !value || value.length < 3
  if (tooShort) return false

  const looksTechnical = NON_PROSE_PATTERNS.some((pattern) =>
    pattern.test(value)
  )
  if (looksTechnical) return false

  // Multi-word values count as prose; otherwise accept a single capitalized
  // word, optionally ending in a common English suffix.
  return (
    value.includes(" ") ||
    /^[A-Z][a-z]+(?:ing|ed|er|est|ly|tion|ness)?$/.test(value)
  )
}

/**
 * Build a short context string from the lines surrounding a given line.
 *
 * @param content Full file content.
 * @param lineNumber 1-indexed line the context is centred on.
 * @param contextLines Number of lines to include on each side (default 2).
 * @returns The window's non-blank lines, trimmed and joined with single
 *   spaces, truncated to at most 500 characters.
 */
function extractContext(
  content: string,
  lineNumber: number,
  contextLines = 2
): string {
  const allLines = content.split("\n")
  const from = Math.max(0, lineNumber - 1 - contextLines)
  const to = Math.min(allLines.length, lineNumber + contextLines)

  const pieces: string[] = []
  for (const rawLine of allLines.slice(from, to)) {
    const trimmed = rawLine.trim()
    if (trimmed.length > 0) pieces.push(trimmed)
  }

  // Cap the context length
  return pieces.join(" ").slice(0, 500)
}
/**
 * Extract translatable attributes from a single file's content.
 *
 * Scans `content` for JSX component opening tags (`JSX_COMPONENT_REGEX`) and,
 * inside each tag, for quoted attributes (`ATTRIBUTE_REGEX`). An attribute is
 * kept only when its name is listed in `TRANSLATABLE_ATTRIBUTES` and its
 * value passes `isLikelyEnglishText`.
 *
 * Position fields on each result:
 * - `line` is the 1-indexed line on which the component's opening `<`
 *   appears; every attribute of a tag gets that same line, even when the tag
 *   spans several lines.
 * - `column` is the offset of the attribute within the tag's attribute
 *   string (the text after the component name), not a column in the file.
 *
 * @param content Full text of the file.
 * @param filePath Path recorded on every extracted attribute.
 * @returns Extracted attributes in document order.
 */
export function extractAttributesFromContent(
  content: string,
  filePath: string
): ExtractedAttribute[] {
  const attributes: ExtractedAttribute[] = []

  // Incremental line tracking: matches arrive in increasing offset order, so
  // we only count the newlines between the previous match and this one.
  let scannedUpTo = 0
  let lineAtScan = 1 // 1-indexed line containing offset `scannedUpTo`

  // The regexes are module-level and global; reset state before each scan.
  JSX_COMPONENT_REGEX.lastIndex = 0
  let componentMatch: RegExpExecArray | null

  while ((componentMatch = JSX_COMPONENT_REGEX.exec(content)) !== null) {
    const componentName = componentMatch[1]
    const attributeString = componentMatch[2]

    // Count newlines strictly before the tag's `<`. Counting newlines (rather
    // than skipping whole lines) keeps the result correct when the component
    // is indented or follows other text on the same line.
    for (; scannedUpTo < componentMatch.index; scannedUpTo++) {
      if (content[scannedUpTo] === "\n") lineAtScan++
    }
    const componentLine = lineAtScan

    ATTRIBUTE_REGEX.lastIndex = 0
    let attrMatch: RegExpExecArray | null

    while ((attrMatch = ATTRIBUTE_REGEX.exec(attributeString)) !== null) {
      const attrName = attrMatch[1]
      // Group 2 is the double-quoted value, group 3 the single-quoted one.
      const attrValue = attrMatch[2] ?? attrMatch[3] ?? ""

      // Only attributes known to carry human-readable text are considered.
      if (
        !TRANSLATABLE_ATTRIBUTES.includes(attrName as TranslatableAttribute)
      ) {
        continue
      }

      // Skip URLs, identifiers, numbers and other non-prose values.
      if (!isLikelyEnglishText(attrValue)) {
        continue
      }

      attributes.push({
        filePath,
        line: componentLine,
        column: attrMatch.index,
        attributeName: attrName as TranslatableAttribute,
        componentName,
        originalValue: attrValue,
        context: extractContext(content, componentLine),
      })
    }
  }

  return attributes
}
+ */ +export function extractAttributesFromFile( + content: string, + filePath: string +): FileExtractionResult { + const attributes = extractAttributesFromContent(content, filePath) + + return { + filePath, + attributes, + content, + } +} + +/** + * Extract attributes from multiple files. + */ +export function extractAttributesFromFiles( + files: { path: string; content: string }[] +): FileExtractionResult[] { + return files.map((file) => extractAttributesFromFile(file.content, file.path)) +} + +/** + * Count total attributes across multiple extraction results. + */ +export function countExtractedAttributes( + results: FileExtractionResult[] +): number { + return results.reduce((sum, result) => sum + result.attributes.length, 0) +} From 44fb5577fd1172e1881b3889e5675767f22549f2 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:37:03 -0300 Subject: [PATCH 50/99] feat(i18n): add JSX attribute reinsertion logic Re-insert translated attribute values into file content. Replaces original attribute values with translated values. Handles both double and single quoted attributes. --- .../i18n/lib/jsx-attributes/reinsert.ts | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 src/scripts/i18n/lib/jsx-attributes/reinsert.ts diff --git a/src/scripts/i18n/lib/jsx-attributes/reinsert.ts b/src/scripts/i18n/lib/jsx-attributes/reinsert.ts new file mode 100644 index 00000000000..d4a3db75290 --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/reinsert.ts @@ -0,0 +1,81 @@ +/** + * Re-insert translated attribute values into file content + */ + +import type { + FileExtractionResult, + FileTranslationResult, + TranslatedAttribute, +} from "./types" + +/** + * Escape special regex characters in a string + */ +function escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") +} + +/** + * Replace a single attribute value in content. 
/**
 * Replace a single attribute value in content.
 *
 * Every occurrence of `attributeName="originalValue"` or
 * `attributeName='originalValue'` is rewritten with the translated value.
 *
 * - The translated text is inserted literally: `$` sequences such as `$1`,
 *   `$&` or `$$` in a translation (e.g. a price) are not interpreted as
 *   replacement patterns.
 * - Output uses double quotes. If the translation itself contains `"`, single
 *   quotes are used instead; if it contains both quote characters, `"` is
 *   written as `&quot;` so the attribute stays well-formed.
 * - The attribute name must not be the tail of a longer hyphenated name, so
 *   `data-title="…"` is left alone when replacing `title`.
 *
 * @param content File content to update.
 * @param attr Attribute carrying the original and translated values.
 * @returns The updated content (unchanged when nothing matched).
 */
function replaceAttributeValue(
  content: string,
  attr: TranslatedAttribute
): string {
  // Match the original value literally: escape every regex metacharacter.
  const escapedOriginal = attr.originalValue.replace(
    /[.*+?^${}()|[\]\\]/g,
    "\\$&"
  )
  const pattern = new RegExp(
    `(?<![\\w-])(${attr.attributeName}\\s*=\\s*)(?:"${escapedOriginal}"|'${escapedOriginal}')`,
    "g"
  )

  // Pick a quoting that cannot be terminated early by the translation.
  const translated = attr.translatedValue
  let quotedValue: string
  if (!translated.includes('"')) {
    quotedValue = `"${translated}"`
  } else if (!translated.includes("'")) {
    quotedValue = `'${translated}'`
  } else {
    quotedValue = `"${translated.replace(/"/g, "&quot;")}"`
  }

  // A replacer function (not a replacement string) keeps `$` literal.
  return content.replace(
    pattern,
    (_match: string, prefix: string) => prefix + quotedValue
  )
}

/**
 * Re-insert all translated attributes into a file's content.
 *
 * Attributes are applied one after another, last position first (by `line`,
 * then `column`), each via a whole-content pattern replacement.
 *
 * @param extraction Extraction result holding the original file content.
 * @param translatedAttributes Translations to apply; the array is not mutated.
 * @returns The file path, the translations as given, the updated content, and
 *   `hasChanges`, which is `true` when at least one replacement altered the
 *   content.
 */
export function reinsertTranslatedAttributes(
  extraction: FileExtractionResult,
  translatedAttributes: TranslatedAttribute[]
): FileTranslationResult {
  let updatedContent = extraction.content
  let successCount = 0

  // Copy before sorting so the caller's array keeps its order.
  const sortedAttrs = [...translatedAttributes].sort(
    (a, b) => b.line - a.line || b.column - a.column
  )

  for (const attr of sortedAttrs) {
    const replaced = replaceAttributeValue(updatedContent, attr)
    if (replaced !== updatedContent) {
      successCount++
    }
    updatedContent = replaced
  }

  return {
    filePath: extraction.filePath,
    translatedAttributes,
    updatedContent,
    hasChanges: successCount > 0,
  }
}
+ */ +export function reinsertTranslationsForFiles( + extractions: FileExtractionResult[], + translationsByFile: Map +): FileTranslationResult[] { + return extractions.map((extraction) => { + const translations = translationsByFile.get(extraction.filePath) || [] + return reinsertTranslatedAttributes(extraction, translations) + }) +} From cdc13c55b856ac2b1e4ec34923071409e48a2528 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:37:14 -0300 Subject: [PATCH 51/99] feat(i18n): add JSX attributes module exports Add index.ts exporting the JSX attribute module API: - Extract functions: extractAttributesFromFile, extractAttributesFromFiles - Reinsert functions: reinsertTranslatedAttributes, reinsertTranslationsForFiles - Types: ExtractedAttribute, TranslatedAttribute, etc. --- src/scripts/i18n/lib/jsx-attributes/index.ts | 23 ++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/scripts/i18n/lib/jsx-attributes/index.ts diff --git a/src/scripts/i18n/lib/jsx-attributes/index.ts b/src/scripts/i18n/lib/jsx-attributes/index.ts new file mode 100644 index 00000000000..517481472c3 --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/index.ts @@ -0,0 +1,23 @@ +/** + * JSX attribute extraction and translation module + */ + +export { + countExtractedAttributes, + extractAttributesFromContent, + extractAttributesFromFile, + extractAttributesFromFiles, +} from "./extract" +export { + reinsertTranslatedAttributes, + reinsertTranslationsForFiles, +} from "./reinsert" +export { + type ExtractedAttribute, + type FileExtractionResult, + type FileTranslationResult, + type JsxTranslationSummary, + TRANSLATABLE_ATTRIBUTES, + type TranslatableAttribute, + type TranslatedAttribute, +} from "./types" From 8b1d350b473f23c31e9dbdaafcc4a99dd2ba06ea Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:37:25 -0300 Subject: [PATCH 52/99] feat(i18n): 
add Gemini AI translation module Add @google/generative-ai for JSX attribute translation. Uses gemini-2.5-pro model with retry logic and exponential backoff. Includes language display names for better prompt context. Exports isGeminiAvailable() for graceful fallback when API key unavailable. --- package.json | 1 + pnpm-lock.yaml | 11 +- src/scripts/i18n/lib/ai/gemini.ts | 243 ++++++++++++++++++++++++++++++ src/scripts/i18n/lib/ai/index.ts | 10 ++ 4 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 src/scripts/i18n/lib/ai/gemini.ts create mode 100644 src/scripts/i18n/lib/ai/index.ts diff --git a/package.json b/package.json index a43d2800816..e548fcdee81 100644 --- a/package.json +++ b/package.json @@ -110,6 +110,7 @@ "devDependencies": { "@chromatic-com/playwright": "^0.12.4", "@chromatic-com/storybook": "1.5.0", + "@google/generative-ai": "^0.24.1", "@netlify/plugin-nextjs": "^5.12.0", "@playwright/test": "^1.52.0", "@storybook/addon-essentials": "8.6.14", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b53627e49ed..ecce48608c1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -246,6 +246,9 @@ importers: '@chromatic-com/storybook': specifier: 1.5.0 version: 1.5.0(@chromatic-com/playwright@0.12.5(@playwright/test@1.53.1)(@types/react@18.2.57)(bufferutil@4.0.9)(esbuild@0.25.5)(prettier@3.5.3)(typescript@5.8.3)(utf-8-validate@5.0.10))(react@18.3.1) + '@google/generative-ai': + specifier: ^0.24.1 + version: 0.24.1 '@netlify/plugin-nextjs': specifier: ^5.12.0 version: 5.12.0 @@ -1503,6 +1506,10 @@ packages: '@formatjs/intl-localematcher@0.6.1': resolution: {integrity: sha512-ePEgLgVCqi2BBFnTMWPfIghu6FkbZnnBVhO2sSxvLfrdFw7wCHAHiDoM2h4NRgjbaY7+B7HgOLZGkK187pZTZg==} + '@google/generative-ai@0.24.1': + resolution: {integrity: sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q==} + engines: {node: '>=18.0.0'} + '@hookform/resolvers@3.10.0': resolution: {integrity: 
sha512-79Dv+3mDF7i+2ajj7SkypSKHhl1cbln1OGavqrsF7p6mbUv11xpqpacPsGDCTRvCSjEEIez2ef1NveSVL3b0Ag==} peerDependencies: @@ -10913,6 +10920,8 @@ snapshots: dependencies: tslib: 2.8.1 + '@google/generative-ai@0.24.1': {} + '@hookform/resolvers@3.10.0(react-hook-form@7.57.0(react@18.3.1))': dependencies: react-hook-form: 7.57.0(react@18.3.1) @@ -16787,7 +16796,7 @@ snapshots: extension-port-stream@3.0.0: dependencies: - readable-stream: 3.6.2 + readable-stream: 4.7.0 webextension-polyfill: 0.10.0 fast-deep-equal@3.1.3: {} diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts new file mode 100644 index 00000000000..1246eacb4ad --- /dev/null +++ b/src/scripts/i18n/lib/ai/gemini.ts @@ -0,0 +1,243 @@ +/** + * Gemini AI translation wrapper for JSX attribute translation + */ + +import { GoogleGenerativeAI } from "@google/generative-ai" + +import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes" + +/** Gemini API configuration */ +const GEMINI_MODEL = "gemini-2.5-pro" + +/** Language display names for better prompt context */ +const LANGUAGE_NAMES: Record = { + es: "Spanish", + fr: "French", + de: "German", + it: "Italian", + pt: "Portuguese", + ru: "Russian", + zh: "Chinese (Simplified)", + ja: "Japanese", + ko: "Korean", + ar: "Arabic", + tr: "Turkish", + nl: "Dutch", + pl: "Polish", + vi: "Vietnamese", + th: "Thai", + id: "Indonesian", + uk: "Ukrainian", + cs: "Czech", + ro: "Romanian", + hu: "Hungarian", + el: "Greek", + sv: "Swedish", + da: "Danish", + fi: "Finnish", + no: "Norwegian", + he: "Hebrew", + hi: "Hindi", + bn: "Bengali", + ms: "Malay", + tl: "Filipino", + sw: "Swahili", +} + +/** + * Check if Gemini API is available (API key present) + */ +export function isGeminiAvailable(): boolean { + return Boolean(process.env.GEMINI_API_KEY) +} + +/** + * Get the Gemini API client + */ +function getGeminiClient(): GoogleGenerativeAI { + const apiKey = process.env.GEMINI_API_KEY + if (!apiKey) { + throw new 
Error("GEMINI_API_KEY environment variable is not set") + } + return new GoogleGenerativeAI(apiKey) +} + +/** + * Get human-readable language name from code + */ +function getLanguageName(code: string): string { + return LANGUAGE_NAMES[code] || code.toUpperCase() +} + +/** + * Build translation prompt for a batch of attributes + */ +function buildTranslationPrompt( + attributes: ExtractedAttribute[], + targetLanguage: string +): string { + const langName = getLanguageName(targetLanguage) + + const attributeList = attributes + .map( + (attr, i) => + `${i + 1}. [${attr.componentName}.${attr.attributeName}] "${attr.originalValue}" + Context: ${attr.context}` + ) + .join("\n\n") + + return `You are translating UI component attributes for the Ethereum.org website into ${langName}. + +These are JSX component attributes that contain human-readable text. Translate each value naturally and accurately while: +- Preserving technical Ethereum terminology appropriately for ${langName} +- Keeping the translation concise (similar length to original) +- Maintaining any placeholders like {variable} or {{variable}} unchanged +- Using region-neutral ${langName} that most speakers would understand + +Attributes to translate: + +${attributeList} + +Respond with ONLY a JSON array of translated strings in the same order, like: +["translated text 1", "translated text 2", ...] 
+ +Do not include any explanation, just the JSON array.` +} + +/** + * Parse Gemini response to extract translated strings + */ +function parseTranslationResponse(response: string): string[] { + // Clean up response - remove markdown code blocks if present + let cleaned = response.trim() + if (cleaned.startsWith("```json")) { + cleaned = cleaned.slice(7) + } else if (cleaned.startsWith("```")) { + cleaned = cleaned.slice(3) + } + if (cleaned.endsWith("```")) { + cleaned = cleaned.slice(0, -3) + } + cleaned = cleaned.trim() + + try { + const parsed = JSON.parse(cleaned) + if (!Array.isArray(parsed)) { + throw new Error("Response is not an array") + } + return parsed.map((item) => String(item)) + } catch (error) { + console.error("[GEMINI] Failed to parse response:", cleaned) + throw new Error(`Failed to parse Gemini response: ${error}`) + } +} + +/** + * Translate a batch of attributes for a single language. + * Returns translated attributes with their values filled in. + */ +export async function translateAttributes( + attributes: ExtractedAttribute[], + targetLanguage: string +): Promise { + if (attributes.length === 0) { + return [] + } + + if (!isGeminiAvailable()) { + console.warn( + "[GEMINI] API key not available, skipping attribute translation" + ) + return [] + } + + const client = getGeminiClient() + const model = client.getGenerativeModel({ model: GEMINI_MODEL }) + + const prompt = buildTranslationPrompt(attributes, targetLanguage) + + console.log( + `[GEMINI] Translating ${attributes.length} attributes to ${getLanguageName(targetLanguage)}` + ) + + try { + const result = await model.generateContent(prompt) + const response = result.response.text() + const translations = parseTranslationResponse(response) + + if (translations.length !== attributes.length) { + console.warn( + `[GEMINI] Translation count mismatch: expected ${attributes.length}, got ${translations.length}` + ) + } + + // Map translations back to attributes + return attributes.map((attr, i) 
=> ({ + ...attr, + translatedValue: translations[i] || attr.originalValue, + })) + } catch (error) { + console.error("[GEMINI] Translation failed:", error) + throw error + } +} + +/** + * Translate attributes with retry logic + */ +export async function translateAttributesWithRetry( + attributes: ExtractedAttribute[], + targetLanguage: string, + maxRetries = 3 +): Promise { + let lastError: Error | null = null + + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + return await translateAttributes(attributes, targetLanguage) + } catch (error) { + lastError = error instanceof Error ? error : new Error(String(error)) + console.warn( + `[GEMINI] Attempt ${attempt}/${maxRetries} failed: ${lastError.message}` + ) + + if (attempt < maxRetries) { + // Exponential backoff + const delay = Math.min(1000 * Math.pow(2, attempt - 1), 10000) + await new Promise((resolve) => setTimeout(resolve, delay)) + } + } + } + + throw lastError || new Error("Translation failed after retries") +} + +/** + * Translate attributes grouped by file, processing each file's batch sequentially + * to avoid rate limits while maximizing context per request. 
+ */ +export async function translateAttributesByFile( + attributesByFile: Map, + targetLanguage: string +): Promise> { + const results = new Map() + + for (const [filePath, attributes] of attributesByFile) { + try { + const translated = await translateAttributesWithRetry( + attributes, + targetLanguage + ) + results.set(filePath, translated) + console.log( + `[GEMINI] ✓ Translated ${translated.length} attributes in ${filePath}` + ) + } catch (error) { + console.error(`[GEMINI] ✗ Failed to translate ${filePath}:`, error) + // Continue with other files even if one fails + results.set(filePath, []) + } + } + + return results +} diff --git a/src/scripts/i18n/lib/ai/index.ts b/src/scripts/i18n/lib/ai/index.ts new file mode 100644 index 00000000000..e2f75fd59d1 --- /dev/null +++ b/src/scripts/i18n/lib/ai/index.ts @@ -0,0 +1,10 @@ +/** + * AI translation module + */ + +export { + isGeminiAvailable, + translateAttributes, + translateAttributesByFile, + translateAttributesWithRetry, +} from "./gemini" From 231f60cd4dfd6b9140f239afe0b2a8603cd50c03 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:37:36 -0300 Subject: [PATCH 53/99] feat(i18n): add standalone JSX translation orchestrator Add translate-jsx-attributes.ts for standalone/CLI execution. Can be called from main workflow or via CLI: npx ts-node translate-jsx-attributes.ts --language es --files file1.md Extracts JSX attributes, translates via Gemini, re-inserts into files. Returns JsxTranslationSummary with stats and updated file contents. 
--- src/scripts/i18n/translate-jsx-attributes.ts | 249 +++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 src/scripts/i18n/translate-jsx-attributes.ts diff --git a/src/scripts/i18n/translate-jsx-attributes.ts b/src/scripts/i18n/translate-jsx-attributes.ts new file mode 100644 index 00000000000..555c29ab202 --- /dev/null +++ b/src/scripts/i18n/translate-jsx-attributes.ts @@ -0,0 +1,249 @@ +/** + * Standalone JSX attribute translation module + * + * Can be called from: + * 1. Main i18n workflow (after Crowdin download, before sanitizer) + * 2. Dedicated GitHub Action (accepts branch/PR, runs in isolation) + * + * Usage: + * npx ts-node translate-jsx-attributes.ts --language es --files file1.md,file2.md + * npx ts-node translate-jsx-attributes.ts --language es --branch translations/es + */ + +import fs from "fs" +import path from "path" + +import { isGeminiAvailable, translateAttributesByFile } from "./lib/ai" +import type { + ExtractedAttribute, + FileExtractionResult, + FileTranslationResult, + JsxTranslationSummary, +} from "./lib/jsx-attributes" +import { + countExtractedAttributes, + extractAttributesFromFile, + reinsertTranslatedAttributes, +} from "./lib/jsx-attributes" + +/** + * Options for JSX attribute translation + */ +export interface TranslateJsxOptions { + /** Target language code (e.g., "es", "fr") */ + targetLanguage: string + /** Files to process (path and content) */ + files: { path: string; content: string }[] + /** Whether to log verbose output */ + verbose?: boolean +} + +/** + * Translate JSX attributes in a batch of files for a single language. + * This is the main entry point for both workflow integration and standalone use. 
+ */ +export async function translateJsxAttributes( + options: TranslateJsxOptions +): Promise { + const { targetLanguage, files, verbose = false } = options + + console.log(`\n[JSX-TRANSLATE] Starting JSX attribute translation`) + console.log(`[JSX-TRANSLATE] Target language: ${targetLanguage}`) + console.log(`[JSX-TRANSLATE] Files to process: ${files.length}`) + + // Check Gemini availability + const geminiAvailable = isGeminiAvailable() + if (!geminiAvailable) { + console.warn( + `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not available, skipping translation` + ) + return { + filesProcessed: files.length, + filesWithChanges: 0, + attributesTranslated: 0, + attributesFailed: 0, + geminiAvailable: false, + updatedFiles: [], + } + } + + // Extract attributes from all files + const extractions: FileExtractionResult[] = [] + const attributesByFile = new Map() + + for (const file of files) { + // Only process markdown files + if (!file.path.endsWith(".md") && !file.path.endsWith(".mdx")) { + continue + } + + const extraction = extractAttributesFromFile(file.content, file.path) + extractions.push(extraction) + + if (extraction.attributes.length > 0) { + attributesByFile.set(file.path, extraction.attributes) + if (verbose) { + console.log( + `[JSX-TRANSLATE] Found ${extraction.attributes.length} attributes in ${file.path}` + ) + } + } + } + + const totalAttributes = countExtractedAttributes(extractions) + console.log( + `[JSX-TRANSLATE] Found ${totalAttributes} translatable attributes in ${attributesByFile.size} files` + ) + + if (totalAttributes === 0) { + console.log(`[JSX-TRANSLATE] No attributes to translate`) + return { + filesProcessed: files.length, + filesWithChanges: 0, + attributesTranslated: 0, + attributesFailed: 0, + geminiAvailable: true, + updatedFiles: [], + } + } + + // Translate attributes via Gemini (one API call per file batch) + const translatedByFile = await translateAttributesByFile( + attributesByFile, + targetLanguage + ) + + // Re-insert translated 
attributes into files + const updatedFiles: FileTranslationResult[] = [] + let attributesTranslated = 0 + let attributesFailed = 0 + + for (const extraction of extractions) { + const translated = translatedByFile.get(extraction.filePath) || [] + const result = reinsertTranslatedAttributes(extraction, translated) + + if (result.hasChanges) { + updatedFiles.push(result) + attributesTranslated += translated.length + } + + // Count failed as those we extracted but didn't get back + const originalCount = extraction.attributes.length + const translatedCount = translated.length + if (translatedCount < originalCount) { + attributesFailed += originalCount - translatedCount + } + } + + console.log(`[JSX-TRANSLATE] ✓ Translation complete`) + console.log(`[JSX-TRANSLATE] - Files with changes: ${updatedFiles.length}`) + console.log( + `[JSX-TRANSLATE] - Attributes translated: ${attributesTranslated}` + ) + if (attributesFailed > 0) { + console.log(`[JSX-TRANSLATE] - Attributes failed: ${attributesFailed}`) + } + + return { + filesProcessed: files.length, + filesWithChanges: updatedFiles.length, + attributesTranslated, + attributesFailed, + geminiAvailable: true, + updatedFiles, + } +} + +/** + * Read files from disk for standalone execution + */ +function readFilesFromDisk( + filePaths: string[] +): { path: string; content: string }[] { + return filePaths.map((filePath) => { + const absolutePath = path.isAbsolute(filePath) + ? filePath + : path.join(process.cwd(), filePath) + const content = fs.readFileSync(absolutePath, "utf-8") + return { path: filePath, content } + }) +} + +/** + * Write updated files back to disk + */ +function writeFilesToDisk(files: FileTranslationResult[]): void { + for (const file of files) { + const absolutePath = path.isAbsolute(file.filePath) + ? 
file.filePath + : path.join(process.cwd(), file.filePath) + fs.writeFileSync(absolutePath, file.updatedContent, "utf-8") + console.log(`[JSX-TRANSLATE] Wrote: ${file.filePath}`) + } +} + +/** + * Parse CLI arguments + */ +function parseArgs(): { language: string; files: string[] } | null { + const args = process.argv.slice(2) + let language = "" + let files: string[] = [] + + for (let i = 0; i < args.length; i++) { + if (args[i] === "--language" || args[i] === "-l") { + language = args[++i] + } else if (args[i] === "--files" || args[i] === "-f") { + files = args[++i].split(",").map((f) => f.trim()) + } + } + + if (!language || files.length === 0) { + return null + } + + return { language, files } +} + +/** + * CLI entry point for standalone execution + */ +async function main() { + const parsed = parseArgs() + + if (!parsed) { + console.log(` +Usage: npx ts-node translate-jsx-attributes.ts --language --files + +Options: + --language, -l Target language code (e.g., "es", "fr", "de") + --files, -f Comma-separated list of file paths to process + +Example: + npx ts-node translate-jsx-attributes.ts -l es -f public/content/roadmap/pbs/index.md +`) + process.exit(1) + } + + const fileContents = readFilesFromDisk(parsed.files) + const result = await translateJsxAttributes({ + targetLanguage: parsed.language, + files: fileContents, + verbose: true, + }) + + if (result.updatedFiles.length > 0) { + writeFilesToDisk(result.updatedFiles) + console.log(`\n✓ Updated ${result.updatedFiles.length} files`) + } else { + console.log(`\nNo files were modified`) + } +} + +// Run CLI if executed directly +if (require.main === module) { + main().catch((err) => { + console.error("Error:", err) + process.exit(1) + }) +} From b1adb81d939243a523e44ffec91d8c70e92936dc Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:37:48 -0300 Subject: [PATCH 54/99] feat(i18n): add JSX attribute validation Add validation for potentially 
untranslated JSX attributes. - JsxAttributeValidationResult interface with untranslated count/percentage - validateJsxAttributes() with configurable threshold - Integration in PR validation workflow with JSX_UNTRANSLATED_THRESHOLD env var - Reports untranslated attributes in PR validation comment --- .../i18n/lib/validation/syntax-tree.ts | 149 +++++++++++++++++- src/scripts/i18n/lib/workflows/validation.ts | 21 +++ 2 files changed, 167 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/validation/syntax-tree.ts b/src/scripts/i18n/lib/validation/syntax-tree.ts index 6ee8ba538a7..4b047cf8a3e 100644 --- a/src/scripts/i18n/lib/validation/syntax-tree.ts +++ b/src/scripts/i18n/lib/validation/syntax-tree.ts @@ -21,6 +21,19 @@ export interface MarkdownValidationResult { }> } +export interface JsxAttributeValidationResult { + isValid: boolean + untranslatedCount: number + totalCount: number + untranslatedPercentage: number + untranslatedAttributes: Array<{ + attributeName: string + componentName: string + value: string + line: number + }> +} + /** * Extract JSON keys in order from a JSON string */ @@ -159,14 +172,124 @@ export function validateMarkdownStructure( } } +/** Attributes that should be translated */ +const TRANSLATABLE_ATTRIBUTES = [ + "title", + "description", + "alt", + "label", + "aria-label", + "placeholder", + "buttonLabel", + "text", + "name", + "caption", + "contentPreview", + "location", +] + +/** JSX component regex for validation */ +const JSX_COMPONENT_REGEX = /<([A-Z][a-zA-Z0-9]*)\s+([^>]*?)(?:\/>|>)/g + +/** Attribute regex for validation */ +const ATTRIBUTE_REGEX = + /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g + +/** + * Check if text appears to be English (heuristic) + */ +function looksLikeEnglish(value: string): boolean { + if (!value || value.length < 3) return false + if (/^https?:\/\//.test(value)) return false + if (/^[/.]/.test(value) || /\.(png|jpg|svg|gif|json|md)$/i.test(value)) + 
return false + if (/^\{.*\}$/.test(value)) return false + if (/^[a-z][a-zA-Z0-9-]*$/.test(value) && !value.includes(" ")) return false + if (/^[\p{Emoji}\s]+$/u.test(value)) return false + if (/^[\d.,\s%$€£]+$/.test(value)) return false + + // Check for common English patterns + if (value.includes(" ")) return true + if (/^[A-Z][a-z]+(?:ing|ed|er|est|ly|tion|ness)?$/.test(value)) return true + + return false +} + +/** + * Validate JSX attributes for potential untranslated English content + */ +export function validateJsxAttributes( + content: string, + threshold = 5 +): JsxAttributeValidationResult { + const untranslatedAttributes: JsxAttributeValidationResult["untranslatedAttributes"] = + [] + let totalCount = 0 + + const lines = content.split("\n") + let currentLine = 0 + let currentPos = 0 + + let componentMatch: RegExpExecArray | null + JSX_COMPONENT_REGEX.lastIndex = 0 + + while ((componentMatch = JSX_COMPONENT_REGEX.exec(content)) !== null) { + const componentName = componentMatch[1] + const attributeString = componentMatch[2] + const componentStartPos = componentMatch.index + + // Calculate line number + while (currentPos < componentStartPos && currentLine < lines.length) { + currentPos += lines[currentLine].length + 1 + currentLine++ + } + const componentLine = currentLine + 1 + + let attrMatch: RegExpExecArray | null + ATTRIBUTE_REGEX.lastIndex = 0 + + while ((attrMatch = ATTRIBUTE_REGEX.exec(attributeString)) !== null) { + const attrName = attrMatch[1] + const attrValue = attrMatch[2] || attrMatch[3] + + if (!TRANSLATABLE_ATTRIBUTES.includes(attrName)) continue + + totalCount++ + + if (looksLikeEnglish(attrValue)) { + untranslatedAttributes.push({ + attributeName: attrName, + componentName, + value: attrValue, + line: componentLine, + }) + } + } + } + + const untranslatedPercentage = + totalCount > 0 ? 
(untranslatedAttributes.length / totalCount) * 100 : 0 + + return { + isValid: untranslatedPercentage <= threshold, + untranslatedCount: untranslatedAttributes.length, + totalCount, + untranslatedPercentage, + untranslatedAttributes, + } +} + /** * Format validation results into a markdown comment */ export function formatValidationComment( validationResults: Array<{ path: string - type: "json" | "markdown" - result: JsonValidationResult | MarkdownValidationResult + type: "json" | "markdown" | "jsx-attributes" + result: + | JsonValidationResult + | MarkdownValidationResult + | JsxAttributeValidationResult }> ): string | null { const issues = validationResults.filter((v) => !v.result.isValid) @@ -203,7 +326,7 @@ export function formatValidationComment( ) { comment += `- ⚠️ Key order differs from English version\n` } - } else { + } else if (issue.type === "markdown") { const result = issue.result as MarkdownValidationResult comment += `**Markdown Structure Issues:**\n` comment += `- Expected headings: ${result.expectedHeadingCount}\n` @@ -215,6 +338,26 @@ export function formatValidationComment( comment += `- Line ${mismatch.line}: Expected ID \`${mismatch.expectedId}\`, found \`${mismatch.actualId || "(none)"}\`\n` } } + } else if (issue.type === "jsx-attributes") { + const result = issue.result as JsxAttributeValidationResult + comment += `**Potentially Untranslated JSX Attributes:**\n` + comment += `- Untranslated: ${result.untranslatedCount} / ${result.totalCount} (${result.untranslatedPercentage.toFixed(1)}%)\n` + + if (result.untranslatedAttributes.length > 0) { + comment += `\n**Attributes that may need translation:**\n` + // Show up to 10 examples + const examples = result.untranslatedAttributes.slice(0, 10) + for (const attr of examples) { + const truncatedValue = + attr.value.length > 50 + ? attr.value.slice(0, 47) + "..." 
+ : attr.value + comment += `- Line ${attr.line}: \`<${attr.componentName} ${attr.attributeName}="${truncatedValue}">\`\n` + } + if (result.untranslatedAttributes.length > 10) { + comment += `- ... and ${result.untranslatedAttributes.length - 10} more\n` + } + } } comment += `\n` diff --git a/src/scripts/i18n/lib/workflows/validation.ts b/src/scripts/i18n/lib/workflows/validation.ts index 90c131ece2a..d296a07a66b 100644 --- a/src/scripts/i18n/lib/workflows/validation.ts +++ b/src/scripts/i18n/lib/workflows/validation.ts @@ -5,12 +5,16 @@ import { postPullRequestComment } from "../github/pull-requests" import { formatValidationComment, validateJsonStructure, + validateJsxAttributes, validateMarkdownStructure, } from "../validation/syntax-tree" import type { CommittedFile, PullRequest } from "./types" import { logSection } from "./utils" +/** Default threshold for JSX attribute untranslated percentage */ +const DEFAULT_JSX_THRESHOLD = 5 + /** * Run syntax tree validation and post comment if issues found */ @@ -96,6 +100,23 @@ export async function runSyntaxValidation( if (!result.isValid && verbose) { console.log(`[DEBUG] Markdown validation failed for ${file.path}`) } + + // Also validate JSX attributes for markdown files + const jsxThreshold = + Number(process.env.JSX_UNTRANSLATED_THRESHOLD) || DEFAULT_JSX_THRESHOLD + const jsxResult = validateJsxAttributes(file.content, jsxThreshold) + if (!jsxResult.isValid) { + validationResults.push({ + path: file.path, + type: "jsx-attributes", + result: jsxResult, + }) + if (verbose) { + console.log( + `[DEBUG] JSX attribute validation flagged ${file.path}: ${jsxResult.untranslatedPercentage.toFixed(1)}% untranslated` + ) + } + } } } From 65abc66550bc7367e5837d1314ede24abfecf3f3 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:38:01 -0300 Subject: [PATCH 55/99] feat(i18n): add geminiSkipped flag to PR body Add PRBodyOptions interface with geminiSkipped flag. 
When Gemini API is unavailable, PR body includes warning that JSX attributes may remain untranslated with instructions to run standalone workflow. --- src/scripts/i18n/lib/workflows/pr-creation.ts | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/workflows/pr-creation.ts b/src/scripts/i18n/lib/workflows/pr-creation.ts index e5e384c24c8..8877786e513 100644 --- a/src/scripts/i18n/lib/workflows/pr-creation.ts +++ b/src/scripts/i18n/lib/workflows/pr-creation.ts @@ -30,6 +30,11 @@ export function generatePRTitle( return prTitle } +/** Options for PR body generation */ +export interface PRBodyOptions { + geminiSkipped?: boolean +} + /** * Generate PR body with organized file listings */ @@ -37,7 +42,8 @@ export function generatePRBody( aiModelName: string, langCodes: string[], committedFiles: CommittedFile[], - sanitizedFiles: CommittedFile[] + sanitizedFiles: CommittedFile[], + options: PRBodyOptions = {} ): string { // Include both sanitized files and original committed files const allChangedPathsSet = new Set([ @@ -87,6 +93,15 @@ export function generatePRBody( prBody += `\n` } + // Add warning if Gemini was skipped + if (options.geminiSkipped) { + prBody += `---\n\n` + prBody += `> ⚠️ **Note:** GEMINI_API_KEY was not available during this run. ` + prBody += `JSX component attributes (e.g., \`title="..."\`, \`description="..."\`) ` + prBody += `may remain untranslated. 
You can run the \`translate-jsx-attributes\` ` + prBody += `workflow on this branch to translate them separately.\n\n` + } + return prBody } @@ -121,7 +136,8 @@ export async function createTranslationPR( branch: string, committedFiles: CommittedFile[], sanitizedFiles: CommittedFile[], - languagePairs: LanguagePair[] + languagePairs: LanguagePair[], + options: PRBodyOptions = {} ): Promise { logSection("Creating Pull Request") @@ -137,7 +153,8 @@ export async function createTranslationPR( aiModelName, langCodes, committedFiles, - sanitizedFiles + sanitizedFiles, + options ) // Create PR From 3777c6f977b7280258fc2291391ff4a8bb8d6977 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:38:12 -0300 Subject: [PATCH 56/99] feat(i18n): integrate JSX translation into main workflow Add Phase 5: JSX Attribute Translation (before sanitizer). - Checks isGeminiAvailable() for graceful fallback - Processes each language's markdown files separately - Commits updated files with JSX translations - Updates committedFiles array for sanitizer - Passes geminiSkipped flag to PR creation --- src/scripts/i18n/main.ts | 77 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 0ef158a300f..159cb7e5769 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1,3 +1,4 @@ +import { isGeminiAvailable } from "./lib/ai" import { putCommitFile } from "./lib/github/commits" import { prepareEnglishFiles } from "./lib/workflows/file-preparation" import { initializeWorkflow } from "./lib/workflows/initialize" @@ -8,6 +9,7 @@ import { logSection } from "./lib/workflows/utils" import { runSyntaxValidation } from "./lib/workflows/validation" import { config } from "./config" import { runSanitizer } from "./post_import_sanitize" +import { translateJsxAttributes } from "./translate-jsx-attributes" /** * Main orchestration 
function @@ -32,7 +34,73 @@ async function main() { context ) - // Phase 5: Run post-import sanitizer + // Phase 5: Translate JSX attributes via Gemini (before sanitizer) + let geminiSkipped = false + logSection("JSX Attribute Translation") + + if (!isGeminiAvailable()) { + console.warn( + `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not set - JSX attributes may remain untranslated` + ) + geminiSkipped = true + } else { + // Process each language separately + for (const langPair of translationResult.languagePairs) { + const langCode = langPair.internalLanguageCode + + // Filter files for this language (markdown only) + const langFiles = translationResult.committedFiles + .filter((f) => f.path.includes(`/translations/${langCode}/`)) + .filter((f) => f.path.endsWith(".md") || f.path.endsWith(".mdx")) + .map((f) => ({ path: f.path, content: f.content })) + + if (langFiles.length === 0) { + console.log(`[JSX-TRANSLATE] No markdown files for ${langCode}`) + continue + } + + console.log( + `[JSX-TRANSLATE] Processing ${langFiles.length} files for ${langCode}` + ) + + const jsxResult = await translateJsxAttributes({ + targetLanguage: langCode, + files: langFiles, + verbose, + }) + + // Commit updated files + if (jsxResult.updatedFiles.length > 0) { + for (const updated of jsxResult.updatedFiles) { + try { + const buf = Buffer.from(updated.updatedContent, "utf8") + await putCommitFile(buf, updated.filePath, translationResult.branch) + if (verbose) { + console.log(`[JSX-TRANSLATE] Committed: ${updated.filePath}`) + } + + // Update the committedFiles array with new content for sanitizer + const existingFile = translationResult.committedFiles.find( + (f) => f.path === updated.filePath + ) + if (existingFile) { + existingFile.content = updated.updatedContent + } + } catch (e) { + console.warn( + `[JSX-TRANSLATE] Failed to commit ${updated.filePath}:`, + e + ) + } + } + console.log( + `[JSX-TRANSLATE] ✓ Committed ${jsxResult.updatedFiles.length} files for ${langCode}` + ) + } + } + } + + 
// Phase 6: Run post-import sanitizer logSection("Running Post-Import Sanitizer") console.log( `[SANITIZE] Processing ${translationResult.committedFiles.length} committed files` @@ -74,15 +142,16 @@ async function main() { return } - // Phase 6: Create PR + // Phase 7: Create PR const pr = await createTranslationPR( translationResult.branch, translationResult.committedFiles, changedFiles, - translationResult.languagePairs + translationResult.languagePairs, + { geminiSkipped } ) - // Phase 7: Run syntax tree validation + // Phase 8: Run syntax tree validation await runSyntaxValidation( pr, translationResult.committedFiles, From be7f821d05adf484e59c390afe90b11d99ce7bce Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:38:23 -0300 Subject: [PATCH 57/99] ci(i18n): add standalone JSX attribute translation workflow Add workflow_dispatch workflow for translating JSX attributes on existing branches. Inputs: branch, target_language, file_pattern, verbose. Useful for running JSX translation separately when Gemini was unavailable during main workflow, or for re-processing specific files. 
--- .../workflows/translate-jsx-attributes.yml | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/translate-jsx-attributes.yml diff --git a/.github/workflows/translate-jsx-attributes.yml b/.github/workflows/translate-jsx-attributes.yml new file mode 100644 index 00000000000..45927c84a04 --- /dev/null +++ b/.github/workflows/translate-jsx-attributes.yml @@ -0,0 +1,74 @@ +name: Translate JSX Attributes + +on: + workflow_dispatch: + inputs: + branch: + description: "Branch name to process (e.g., translations/es-2024-12-17)" + required: true + type: string + target_language: + description: "Target language code (e.g., es, fr, de)" + required: true + type: string + file_pattern: + description: "Glob pattern for files to process (default: all markdown in translations folder)" + required: false + default: "public/content/translations/**/*.md" + type: string + verbose: + description: "Enable verbose logging?" + required: false + default: false + type: boolean + +jobs: + translate_attributes: + runs-on: ubuntu-latest + steps: + - name: Check out branch + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.branch }} + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + cache: "pnpm" + + - name: Install dependencies + run: pnpm install + + - name: Find markdown files + id: find-files + run: | + FILES=$(find ${{ github.event.inputs.file_pattern }} -type f 2>/dev/null | head -500 | tr '\n' ',') + echo "files=${FILES%,}" >> $GITHUB_OUTPUT + echo "Found files: ${FILES%,}" + + - name: Translate JSX attributes + if: steps.find-files.outputs.files != '' + run: | + npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/translate-jsx-attributes.ts \ + --language ${{ github.event.inputs.target_language }} \ + --files "${{ steps.find-files.outputs.files }}" + env: + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + + - name: Commit changes + if: 
steps.find-files.outputs.files != '' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add -A + if git diff --staged --quiet; then + echo "No changes to commit" + else + git commit -m "chore: translate JSX attributes (${{ github.event.inputs.target_language }})" + git push + echo "✓ Committed and pushed JSX attribute translations" + fi From 2bfe955b7ee9feb33feaf3395972eafa1f1265c2 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:38:31 -0300 Subject: [PATCH 58/99] docs(i18n): add v0.2.0 roadmap Document planned features for next iteration: - Glossary Supabase sync (separate cron job) - Term/phrase consistency validation - Confidence scoring for translation quality - Proposed Supabase schema and Crowdin API endpoints --- src/scripts/i18n/docs/v0.2.0-roadmap.md | 146 ++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 src/scripts/i18n/docs/v0.2.0-roadmap.md diff --git a/src/scripts/i18n/docs/v0.2.0-roadmap.md b/src/scripts/i18n/docs/v0.2.0-roadmap.md new file mode 100644 index 00000000000..c86e4f57447 --- /dev/null +++ b/src/scripts/i18n/docs/v0.2.0-roadmap.md @@ -0,0 +1,146 @@ +# v0.2.0 Roadmap: Glossary & Consistency Validation + +This document outlines planned features for the next major iteration of the i18n automation system. + +## Overview + +v0.1.0 focused on: +- JSX attribute translation via Gemini API (fallback for Crowdin) +- Build-breaking syntax validation +- Modular architecture for standalone workflow execution + +v0.2.0 will focus on **translation quality and consistency** through glossary enforcement and term validation. + +--- + +## Planned Features + +### 1. Glossary Supabase Sync (Separate Cron) + +**Goal:** Keep Crowdin glossary synchronized with community-curated terms in Supabase. 
+ +**Implementation:** +- Dedicated GitHub Action running on cron schedule (e.g., daily at midnight UTC) +- Fetches glossary terms from Supabase `glossary` table +- Uploads/updates terms in Crowdin project glossary via API +- Logs sync status and any conflicts + +**Files to create:** +- `src/scripts/i18n/sync-glossary.ts` - Main sync orchestrator +- `src/scripts/i18n/lib/supabase/glossary.ts` - Supabase client for glossary queries +- `.github/workflows/sync-glossary.yml` - Cron workflow + +**Environment variables needed:** +- `SUPABASE_URL` - Supabase project URL +- `SUPABASE_KEY` - Supabase anon/service key +- `CROWDIN_PROJECT_ID`, `CROWDIN_API_KEY` (existing) + +--- + +### 2. Term/Phrase Consistency Validation + +**Goal:** Validate that translated files use glossary terms consistently. + +**Implementation:** +- Post-translation validation step in main workflow +- Extract glossary terms from Crowdin (or local cache from sync) +- Scan translated files for source terms that should have been translated +- Flag inconsistencies in PR validation comment + +**Validation rules:** +- Source term appears in translation → likely missed (should be target term) +- Target term varies within same file → inconsistent usage +- Protected terms (ethereum.org, Ethereum, etc.) → should remain unchanged + +**Files to create:** +- `src/scripts/i18n/lib/validation/glossary.ts` - Glossary term validation +- Updates to `lib/workflows/validation.ts` - Integrate glossary checks + +--- + +### 3. Confidence Scoring + +**Goal:** Provide per-file and per-language confidence scores based on validation results. 
+ +**Scoring factors:** +- JSX attribute untranslated percentage (from v0.1.0) +- Glossary term consistency rate +- Syntax validation pass/fail +- Source file complexity (length, technical density) + +**Output:** +- Confidence score (0-100) per file in PR comment +- Aggregate confidence per language +- Suggested review priority based on low-confidence files + +**Files to create:** +- `src/scripts/i18n/lib/validation/confidence.ts` - Scoring algorithm +- Updates to PR comment formatting + +--- + +## Architecture Considerations + +### Supabase Schema (Proposed) + +```sql +-- Glossary terms table +CREATE TABLE glossary ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + source_term TEXT NOT NULL, + language_code TEXT NOT NULL, + target_term TEXT NOT NULL, + context TEXT, -- e.g., "technical", "UI", "marketing" + notes TEXT, + created_at TIMESTAMPTZ DEFAULT now(), + updated_at TIMESTAMPTZ DEFAULT now(), + UNIQUE(source_term, language_code) +); + +-- Translation memory (future) +CREATE TABLE translation_memory ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + source_text TEXT NOT NULL, + language_code TEXT NOT NULL, + target_text TEXT NOT NULL, + source_file TEXT, + created_at TIMESTAMPTZ DEFAULT now() +); +``` + +### Crowdin API Endpoints + +- `POST /projects/{projectId}/glossaries/{glossaryId}/terms` - Add/update terms +- `GET /projects/{projectId}/glossaries/{glossaryId}/terms` - List terms for validation + +--- + +## Timeline (Tentative) + +| Feature | Estimated Effort | Priority | +|---------|------------------|----------| +| Glossary Supabase sync | 2-3 days | High | +| Term consistency validation | 2-3 days | High | +| Confidence scoring | 1-2 days | Medium | +| Documentation & testing | 1-2 days | High | + +--- + +## Dependencies + +- Supabase project setup with glossary table +- Crowdin glossary ID configuration +- Community glossary data migration (if existing) + +--- + +## Open Questions + +1. Should glossary sync be bidirectional (Supabase ↔ Crowdin)? 
+2. What threshold for glossary inconsistency should trigger a warning vs error? +3. Should confidence scores block PR merge below a certain threshold? +4. How to handle language-specific glossary exceptions? + +--- + +*This roadmap was created as part of the v0.1.0 development cycle. Updates will be made as requirements evolve.* From 59ed0dbe1b6a0e5126a04e17dedf8936000d418e Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:38:48 -0300 Subject: [PATCH 59/99] ci(i18n): add GEMINI_API_KEY to crowdin-ai-import workflow Pass GEMINI_API_KEY secret to main script for JSX attribute translation. --- .github/workflows/crowdin-ai-import.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 1c63d9895d1..da65c8dd466 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -76,6 +76,7 @@ jobs: env: I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} TARGET_PATH: ${{ github.event.inputs.target_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} From f527074b334281cf1b1f01bd5f2c97f9a93e89a8 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:16:36 -0300 Subject: [PATCH 60/99] fix(i18n): improve JSX attribute and heading validation JSX attribute validation: - Compare against English source instead of using heuristics - Only flag attributes that are IDENTICAL to English (truly untranslated) - Update interface to include englishValue and translatedValue Heading validation: - Fix validation running on pre-sanitized content - Update committedFiles with sanitized content before validation - Now correctly validates that heading IDs match English 
after sanitization --- .../i18n/lib/validation/syntax-tree.ts | 106 +++++++++++------- src/scripts/i18n/lib/workflows/validation.ts | 8 +- src/scripts/i18n/main.ts | 8 ++ 3 files changed, 77 insertions(+), 45 deletions(-) diff --git a/src/scripts/i18n/lib/validation/syntax-tree.ts b/src/scripts/i18n/lib/validation/syntax-tree.ts index 4b047cf8a3e..15dc989fae2 100644 --- a/src/scripts/i18n/lib/validation/syntax-tree.ts +++ b/src/scripts/i18n/lib/validation/syntax-tree.ts @@ -29,7 +29,8 @@ export interface JsxAttributeValidationResult { untranslatedAttributes: Array<{ attributeName: string componentName: string - value: string + englishValue: string + translatedValue: string line: number }> } @@ -196,35 +197,16 @@ const ATTRIBUTE_REGEX = /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g /** - * Check if text appears to be English (heuristic) + * Extract JSX component attributes from content + * Returns a map of componentName.attrName -> value for matching */ -function looksLikeEnglish(value: string): boolean { - if (!value || value.length < 3) return false - if (/^https?:\/\//.test(value)) return false - if (/^[/.]/.test(value) || /\.(png|jpg|svg|gif|json|md)$/i.test(value)) - return false - if (/^\{.*\}$/.test(value)) return false - if (/^[a-z][a-zA-Z0-9-]*$/.test(value) && !value.includes(" ")) return false - if (/^[\p{Emoji}\s]+$/u.test(value)) return false - if (/^[\d.,\s%$€£]+$/.test(value)) return false - - // Check for common English patterns - if (value.includes(" ")) return true - if (/^[A-Z][a-z]+(?:ing|ed|er|est|ly|tion|ness)?$/.test(value)) return true - - return false -} - -/** - * Validate JSX attributes for potential untranslated English content - */ -export function validateJsxAttributes( - content: string, - threshold = 5 -): JsxAttributeValidationResult { - const untranslatedAttributes: JsxAttributeValidationResult["untranslatedAttributes"] = - [] - let totalCount = 0 +function extractJsxAttributes( + content: 
string +): Map { + const attributes = new Map< + string, + { value: string; line: number; componentName: string } + >() const lines = content.split("\n") let currentLine = 0 @@ -254,16 +236,54 @@ export function validateJsxAttributes( if (!TRANSLATABLE_ATTRIBUTES.includes(attrName)) continue - totalCount++ + // Use component position + attribute name as key for matching + // This allows us to match attributes even if component names differ slightly + const key = `${componentLine}:${attrName}` + attributes.set(key, { + value: attrValue, + line: componentLine, + componentName, + }) + } + } - if (looksLikeEnglish(attrValue)) { - untranslatedAttributes.push({ - attributeName: attrName, - componentName, - value: attrValue, - line: componentLine, - }) - } + return attributes +} + +/** + * Validate JSX attributes by comparing translated content against English source. + * An attribute is considered untranslated if its value is IDENTICAL to the English source. + */ +export function validateJsxAttributes( + englishContent: string, + translatedContent: string, + threshold = 5 +): JsxAttributeValidationResult { + const englishAttrs = extractJsxAttributes(englishContent) + const translatedAttrs = extractJsxAttributes(translatedContent) + + const untranslatedAttributes: JsxAttributeValidationResult["untranslatedAttributes"] = + [] + let totalCount = 0 + + // Compare each English attribute with its translated counterpart + for (const [key, englishAttr] of englishAttrs) { + const translatedAttr = translatedAttrs.get(key) + + // Skip if attribute doesn't exist in translation (structural difference) + if (!translatedAttr) continue + + totalCount++ + + // Check if the translated value is IDENTICAL to English (i.e., not translated) + if (translatedAttr.value === englishAttr.value) { + untranslatedAttributes.push({ + attributeName: key.split(":")[1], + componentName: translatedAttr.componentName, + englishValue: englishAttr.value, + translatedValue: translatedAttr.value, + line: 
translatedAttr.line, + }) } } @@ -340,18 +360,18 @@ export function formatValidationComment( } } else if (issue.type === "jsx-attributes") { const result = issue.result as JsxAttributeValidationResult - comment += `**Potentially Untranslated JSX Attributes:**\n` + comment += `**Untranslated JSX Attributes (identical to English):**\n` comment += `- Untranslated: ${result.untranslatedCount} / ${result.totalCount} (${result.untranslatedPercentage.toFixed(1)}%)\n` if (result.untranslatedAttributes.length > 0) { - comment += `\n**Attributes that may need translation:**\n` + comment += `\n**Attributes that need translation:**\n` // Show up to 10 examples const examples = result.untranslatedAttributes.slice(0, 10) for (const attr of examples) { const truncatedValue = - attr.value.length > 50 - ? attr.value.slice(0, 47) + "..." - : attr.value + attr.englishValue.length > 50 + ? attr.englishValue.slice(0, 47) + "..." + : attr.englishValue comment += `- Line ${attr.line}: \`<${attr.componentName} ${attr.attributeName}="${truncatedValue}">\`\n` } if (result.untranslatedAttributes.length > 10) { diff --git a/src/scripts/i18n/lib/workflows/validation.ts b/src/scripts/i18n/lib/workflows/validation.ts index d296a07a66b..21ffc20528f 100644 --- a/src/scripts/i18n/lib/workflows/validation.ts +++ b/src/scripts/i18n/lib/workflows/validation.ts @@ -101,10 +101,14 @@ export async function runSyntaxValidation( console.log(`[DEBUG] Markdown validation failed for ${file.path}`) } - // Also validate JSX attributes for markdown files + // Also validate JSX attributes for markdown files (compare against English) const jsxThreshold = Number(process.env.JSX_UNTRANSLATED_THRESHOLD) || DEFAULT_JSX_THRESHOLD - const jsxResult = validateJsxAttributes(file.content, jsxThreshold) + const jsxResult = validateJsxAttributes( + englishContent, + file.content, + jsxThreshold + ) if (!jsxResult.isValid) { validationResults.push({ path: file.path, diff --git a/src/scripts/i18n/main.ts 
b/src/scripts/i18n/main.ts index 159cb7e5769..9e5d7980d8c 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -118,6 +118,14 @@ async function main() { if (verbose) { console.log(`[DEBUG] Committed sanitized file: ${relPath}`) } + + // Update committedFiles with sanitized content for validation + const existingFile = translationResult.committedFiles.find( + (f) => f.path === relPath + ) + if (existingFile) { + existingFile.content = file.content + } } catch (e) { console.warn(`Failed to commit sanitized file ${relPath}:`, e) } From 29547a3cda80a9f488c21447fd3b9ccd55a6f0c4 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:18:18 -0300 Subject: [PATCH 61/99] refactor(i18n): extract JSX translation and sanitization into modules Extract inline Phase 5 and Phase 6 logic into dedicated workflow modules: - lib/workflows/jsx-translation.ts - runJsxTranslation() - lib/workflows/sanitization.ts - runPostImportSanitization() main.ts reduced from ~175 lines to ~95 lines. Each phase is now a clean function call with explicit inputs/outputs. 
--- .../i18n/lib/workflows/jsx-translation.ts | 107 ++++++++++++++++ .../i18n/lib/workflows/sanitization.ts | 62 ++++++++++ src/scripts/i18n/main.ts | 116 +++--------------- 3 files changed, 183 insertions(+), 102 deletions(-) create mode 100644 src/scripts/i18n/lib/workflows/jsx-translation.ts create mode 100644 src/scripts/i18n/lib/workflows/sanitization.ts diff --git a/src/scripts/i18n/lib/workflows/jsx-translation.ts b/src/scripts/i18n/lib/workflows/jsx-translation.ts new file mode 100644 index 00000000000..a540338b323 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/jsx-translation.ts @@ -0,0 +1,107 @@ +// JSX attribute translation workflow phase + +import { translateJsxAttributes } from "../../translate-jsx-attributes" +import { isGeminiAvailable } from "../ai" +import { putCommitFile } from "../github/commits" + +import type { CommittedFile, LanguagePair } from "./types" +import { logSection } from "./utils" + +export interface JsxTranslationResult { + /** Whether Gemini was skipped due to missing API key */ + geminiSkipped: boolean + /** Total attributes translated across all files */ + totalAttributesTranslated: number + /** Total files updated */ + totalFilesUpdated: number +} + +/** + * Translate JSX attributes in markdown files via Gemini. + * Updates committedFiles in-place with translated content. 
+ */ +export async function runJsxTranslation( + committedFiles: CommittedFile[], + languagePairs: LanguagePair[], + branch: string, + verbose: boolean +): Promise { + logSection("JSX Attribute Translation") + + if (!isGeminiAvailable()) { + console.warn( + `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not set - JSX attributes may remain untranslated` + ) + return { + geminiSkipped: true, + totalAttributesTranslated: 0, + totalFilesUpdated: 0, + } + } + + let totalAttributesTranslated = 0 + let totalFilesUpdated = 0 + + // Process each language separately + for (const langPair of languagePairs) { + const langCode = langPair.internalLanguageCode + + // Filter files for this language (markdown only) + const langFiles = committedFiles + .filter((f) => f.path.includes(`/translations/${langCode}/`)) + .filter((f) => f.path.endsWith(".md") || f.path.endsWith(".mdx")) + .map((f) => ({ path: f.path, content: f.content })) + + if (langFiles.length === 0) { + console.log(`[JSX-TRANSLATE] No markdown files for ${langCode}`) + continue + } + + console.log( + `[JSX-TRANSLATE] Processing ${langFiles.length} files for ${langCode}` + ) + + const jsxResult = await translateJsxAttributes({ + targetLanguage: langCode, + files: langFiles, + verbose, + }) + + // Commit updated files + if (jsxResult.updatedFiles.length > 0) { + for (const updated of jsxResult.updatedFiles) { + try { + const buf = Buffer.from(updated.updatedContent, "utf8") + await putCommitFile(buf, updated.filePath, branch) + if (verbose) { + console.log(`[JSX-TRANSLATE] Committed: ${updated.filePath}`) + } + + // Update the committedFiles array with new content for sanitizer + const existingFile = committedFiles.find( + (f) => f.path === updated.filePath + ) + if (existingFile) { + existingFile.content = updated.updatedContent + } + } catch (e) { + console.warn( + `[JSX-TRANSLATE] Failed to commit ${updated.filePath}:`, + e + ) + } + } + console.log( + `[JSX-TRANSLATE] ✓ Committed ${jsxResult.updatedFiles.length} files for 
${langCode}` + ) + totalFilesUpdated += jsxResult.updatedFiles.length + totalAttributesTranslated += jsxResult.attributesTranslated + } + } + + return { + geminiSkipped: false, + totalAttributesTranslated, + totalFilesUpdated, + } +} diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts b/src/scripts/i18n/lib/workflows/sanitization.ts new file mode 100644 index 00000000000..3022be4c13d --- /dev/null +++ b/src/scripts/i18n/lib/workflows/sanitization.ts @@ -0,0 +1,62 @@ +// Post-import sanitization workflow phase + +import { runSanitizer } from "../../post_import_sanitize" +import { putCommitFile } from "../github/commits" + +import type { CommittedFile } from "./types" +import { logSection } from "./utils" + +export interface SanitizationResult { + /** Files that were modified by the sanitizer */ + changedFiles: CommittedFile[] + /** Total files processed */ + totalProcessed: number +} + +/** + * Run post-import sanitizer on committed files. + * Updates committedFiles in-place with sanitized content. 
+ */ +export async function runPostImportSanitization( + committedFiles: CommittedFile[], + branch: string, + verbose: boolean +): Promise { + logSection("Running Post-Import Sanitizer") + + console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`) + + const sanitizeResult = runSanitizer(committedFiles) + const changedFiles = sanitizeResult.changedFiles || [] + + if (changedFiles.length) { + console.log(`Sanitizer modified ${changedFiles.length} files`) + + for (const file of changedFiles) { + const relPath = file.path + try { + const buf = Buffer.from(file.content, "utf8") + await putCommitFile(buf, relPath, branch) + if (verbose) { + console.log(`[DEBUG] Committed sanitized file: ${relPath}`) + } + + // Update committedFiles with sanitized content for validation + const existingFile = committedFiles.find((f) => f.path === relPath) + if (existingFile) { + existingFile.content = file.content + } + } catch (e) { + console.warn(`Failed to commit sanitized file ${relPath}:`, e) + } + } + console.log(`✓ Committed ${changedFiles.length} sanitized files`) + } else { + console.log("No sanitization changes needed") + } + + return { + changedFiles, + totalProcessed: committedFiles.length, + } +} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 9e5d7980d8c..69772776e65 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1,15 +1,13 @@ -import { isGeminiAvailable } from "./lib/ai" -import { putCommitFile } from "./lib/github/commits" import { prepareEnglishFiles } from "./lib/workflows/file-preparation" import { initializeWorkflow } from "./lib/workflows/initialize" +import { runJsxTranslation } from "./lib/workflows/jsx-translation" import { createTranslationPR } from "./lib/workflows/pr-creation" import { handlePreTranslation } from "./lib/workflows/pre-translation" +import { runPostImportSanitization } from "./lib/workflows/sanitization" import { downloadAndCommitTranslations } from 
"./lib/workflows/translation-download" import { logSection } from "./lib/workflows/utils" import { runSyntaxValidation } from "./lib/workflows/validation" import { config } from "./config" -import { runSanitizer } from "./post_import_sanitize" -import { translateJsxAttributes } from "./translate-jsx-attributes" /** * Main orchestration function @@ -35,105 +33,19 @@ async function main() { ) // Phase 5: Translate JSX attributes via Gemini (before sanitizer) - let geminiSkipped = false - logSection("JSX Attribute Translation") - - if (!isGeminiAvailable()) { - console.warn( - `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not set - JSX attributes may remain untranslated` - ) - geminiSkipped = true - } else { - // Process each language separately - for (const langPair of translationResult.languagePairs) { - const langCode = langPair.internalLanguageCode - - // Filter files for this language (markdown only) - const langFiles = translationResult.committedFiles - .filter((f) => f.path.includes(`/translations/${langCode}/`)) - .filter((f) => f.path.endsWith(".md") || f.path.endsWith(".mdx")) - .map((f) => ({ path: f.path, content: f.content })) - - if (langFiles.length === 0) { - console.log(`[JSX-TRANSLATE] No markdown files for ${langCode}`) - continue - } - - console.log( - `[JSX-TRANSLATE] Processing ${langFiles.length} files for ${langCode}` - ) - - const jsxResult = await translateJsxAttributes({ - targetLanguage: langCode, - files: langFiles, - verbose, - }) - - // Commit updated files - if (jsxResult.updatedFiles.length > 0) { - for (const updated of jsxResult.updatedFiles) { - try { - const buf = Buffer.from(updated.updatedContent, "utf8") - await putCommitFile(buf, updated.filePath, translationResult.branch) - if (verbose) { - console.log(`[JSX-TRANSLATE] Committed: ${updated.filePath}`) - } - - // Update the committedFiles array with new content for sanitizer - const existingFile = translationResult.committedFiles.find( - (f) => f.path === updated.filePath - ) - if 
(existingFile) { - existingFile.content = updated.updatedContent - } - } catch (e) { - console.warn( - `[JSX-TRANSLATE] Failed to commit ${updated.filePath}:`, - e - ) - } - } - console.log( - `[JSX-TRANSLATE] ✓ Committed ${jsxResult.updatedFiles.length} files for ${langCode}` - ) - } - } - } + const jsxTranslationResult = await runJsxTranslation( + translationResult.committedFiles, + translationResult.languagePairs, + translationResult.branch, + verbose + ) // Phase 6: Run post-import sanitizer - logSection("Running Post-Import Sanitizer") - console.log( - `[SANITIZE] Processing ${translationResult.committedFiles.length} committed files` + const sanitizeResult = await runPostImportSanitization( + translationResult.committedFiles, + translationResult.branch, + verbose ) - const sanitizeResult = runSanitizer(translationResult.committedFiles) - const changedFiles = sanitizeResult.changedFiles || [] - - if (changedFiles.length) { - console.log(`Sanitizer modified ${changedFiles.length} files`) - for (const file of changedFiles) { - const relPath = file.path - try { - const buf = Buffer.from(file.content, "utf8") - await putCommitFile(buf, relPath, translationResult.branch) - if (verbose) { - console.log(`[DEBUG] Committed sanitized file: ${relPath}`) - } - - // Update committedFiles with sanitized content for validation - const existingFile = translationResult.committedFiles.find( - (f) => f.path === relPath - ) - if (existingFile) { - existingFile.content = file.content - } - } catch (e) { - console.warn(`Failed to commit sanitized file ${relPath}:`, e) - } - } - console.log(`✓ Committed ${changedFiles.length} sanitized files`) - } else { - console.log("No sanitization changes needed") - } // Check if PR creation should be skipped const skipPrCreation = ["1", "true", "yes", "on"].includes( @@ -154,9 +66,9 @@ async function main() { const pr = await createTranslationPR( translationResult.branch, translationResult.committedFiles, - changedFiles, + 
sanitizeResult.changedFiles, translationResult.languagePairs, - { geminiSkipped } + { geminiSkipped: jsxTranslationResult.geminiSkipped } ) // Phase 8: Run syntax tree validation From 26c251b1d69d5984a808fa9d8879a48db0be69a1 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:33:21 -0300 Subject: [PATCH 62/99] fix(i18n): exclude 'text' from translatable attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove 'text' from TRANSLATABLE_ATTRIBUTES list. This attribute is typically used for emoji (e.g., ) which don't require translation. --- src/scripts/i18n/lib/jsx-attributes/types.ts | 1 - src/scripts/i18n/lib/validation/syntax-tree.ts | 1 - 2 files changed, 2 deletions(-) diff --git a/src/scripts/i18n/lib/jsx-attributes/types.ts b/src/scripts/i18n/lib/jsx-attributes/types.ts index f248f6e286b..d434c13f9c6 100644 --- a/src/scripts/i18n/lib/jsx-attributes/types.ts +++ b/src/scripts/i18n/lib/jsx-attributes/types.ts @@ -11,7 +11,6 @@ export const TRANSLATABLE_ATTRIBUTES = [ "aria-label", "placeholder", "buttonLabel", - "text", "name", "caption", "contentPreview", diff --git a/src/scripts/i18n/lib/validation/syntax-tree.ts b/src/scripts/i18n/lib/validation/syntax-tree.ts index 15dc989fae2..530cf890e7d 100644 --- a/src/scripts/i18n/lib/validation/syntax-tree.ts +++ b/src/scripts/i18n/lib/validation/syntax-tree.ts @@ -182,7 +182,6 @@ const TRANSLATABLE_ATTRIBUTES = [ "aria-label", "placeholder", "buttonLabel", - "text", "name", "caption", "contentPreview", From f97545459e4a3e588b4bdf98ad632b75491c66c6 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:36:05 -0300 Subject: [PATCH 63/99] refactor(i18n): DRY code consolidation - Remove 'text' from TRANSLATABLE_ATTRIBUTES (emoji attribute, not translatable) - Consolidate JSX regex patterns (JSX_COMPONENT_REGEX, JSX_ATTRIBUTE_REGEX) into 
jsx-attributes/types.ts - Import shared delay() utility in gemini.ts - Consolidate PromptResource type into prompt.ts - Replace duplicate TRANSLATABLE_ATTRIBUTES comment with reference to canonical source --- src/scripts/i18n/lib/ai/gemini.ts | 5 +-- src/scripts/i18n/lib/crowdin/files.ts | 17 ++------- src/scripts/i18n/lib/crowdin/prompt-model.ts | 9 +---- src/scripts/i18n/lib/crowdin/prompt.ts | 5 ++- .../i18n/lib/jsx-attributes/extract.ts | 23 ++++-------- src/scripts/i18n/lib/jsx-attributes/types.ts | 7 ++++ .../i18n/lib/validation/syntax-tree.ts | 36 +++++++------------ 7 files changed, 36 insertions(+), 66 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts index 1246eacb4ad..7d37722bad0 100644 --- a/src/scripts/i18n/lib/ai/gemini.ts +++ b/src/scripts/i18n/lib/ai/gemini.ts @@ -5,6 +5,7 @@ import { GoogleGenerativeAI } from "@google/generative-ai" import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes" +import { delay } from "../workflows/utils" /** Gemini API configuration */ const GEMINI_MODEL = "gemini-2.5-pro" @@ -203,8 +204,8 @@ export async function translateAttributesWithRetry( if (attempt < maxRetries) { // Exponential backoff - const delay = Math.min(1000 * Math.pow(2, attempt - 1), 10000) - await new Promise((resolve) => setTimeout(resolve, delay)) + const backoff = Math.min(1000 * Math.pow(2, attempt - 1), 10000) + await delay(backoff) } } } diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index b42068a7d4e..d672796241d 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -19,22 +19,9 @@ import type { * Note: Crowdin's PATCH API only accepts a boolean flag (translateAttributes: true) * to enable attribute translation. The actual whitelist may need to be configured * separately via the Crowdin UI or a different API endpoint. 
+ * + * See TRANSLATABLE_ATTRIBUTES in jsx-attributes/types.ts for the canonical list. */ -// Keeping this for documentation purposes - may be used in future API updates -// const TRANSLATABLE_ATTRIBUTES = [ -// "title", -// "description", -// "alt", -// "label", -// "aria-label", -// "placeholder", -// "buttonLabel", -// "text", -// "name", -// "caption", -// "contentPreview", -// "location", -// ] /** * Get all files in the Crowdin project diff --git a/src/scripts/i18n/lib/crowdin/prompt-model.ts b/src/scripts/i18n/lib/crowdin/prompt-model.ts index d3a43c751a0..0b3bb581ce6 100644 --- a/src/scripts/i18n/lib/crowdin/prompt-model.ts +++ b/src/scripts/i18n/lib/crowdin/prompt-model.ts @@ -1,13 +1,6 @@ import { crowdinBearerHeaders } from "../../config" -type PromptResource = { - id: number - name: string - action: string - aiProviderId?: number | null - model?: string | null - version?: string | null -} +import type { PromptResource } from "./prompt" export async function getPromptModelKey( userId: number, diff --git a/src/scripts/i18n/lib/crowdin/prompt.ts b/src/scripts/i18n/lib/crowdin/prompt.ts index 1c9a325129b..dce615a1518 100644 --- a/src/scripts/i18n/lib/crowdin/prompt.ts +++ b/src/scripts/i18n/lib/crowdin/prompt.ts @@ -2,12 +2,15 @@ import * as fs from "fs" import { crowdinBearerHeaders } from "../../config" -type PromptResource = { +/** Crowdin AI prompt resource type */ +export type PromptResource = { id: number name: string action: string aiProviderId?: number | null aiModelId?: string | null + model?: string | null + version?: string | null } /** diff --git a/src/scripts/i18n/lib/jsx-attributes/extract.ts b/src/scripts/i18n/lib/jsx-attributes/extract.ts index cf3ce41602b..f74c1b7a1b3 100644 --- a/src/scripts/i18n/lib/jsx-attributes/extract.ts +++ b/src/scripts/i18n/lib/jsx-attributes/extract.ts @@ -7,20 +7,11 @@ import type { FileExtractionResult, TranslatableAttribute, } from "./types" -import { TRANSLATABLE_ATTRIBUTES } from "./types" - -/** - * Regex to 
match JSX/HTML-style attributes with quoted values. - * Captures: attributeName="value" or attributeName='value' - */ -const ATTRIBUTE_REGEX = - /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g - -/** - * Regex to identify JSX component opening tags. - * Captures the component name and all attributes. - */ -const JSX_COMPONENT_REGEX = /<([A-Z][a-zA-Z0-9]*)\s+([^>]*?)(?:\/>|>)/g +import { + JSX_ATTRIBUTE_REGEX, + JSX_COMPONENT_REGEX, + TRANSLATABLE_ATTRIBUTES, +} from "./types" /** * Check if a string appears to be English text (not a variable, URL, or code). @@ -110,9 +101,9 @@ export function extractAttributesFromContent( // Extract attributes from this component let attrMatch: RegExpExecArray | null - ATTRIBUTE_REGEX.lastIndex = 0 + JSX_ATTRIBUTE_REGEX.lastIndex = 0 - while ((attrMatch = ATTRIBUTE_REGEX.exec(attributeString)) !== null) { + while ((attrMatch = JSX_ATTRIBUTE_REGEX.exec(attributeString)) !== null) { const attrName = attrMatch[1] const attrValue = attrMatch[2] || attrMatch[3] // double or single quotes diff --git a/src/scripts/i18n/lib/jsx-attributes/types.ts b/src/scripts/i18n/lib/jsx-attributes/types.ts index d434c13f9c6..4d4823de6fc 100644 --- a/src/scripts/i18n/lib/jsx-attributes/types.ts +++ b/src/scripts/i18n/lib/jsx-attributes/types.ts @@ -2,6 +2,13 @@ * Types for JSX attribute extraction and translation */ +/** Regex to match JSX/HTML-style attributes with quoted values */ +export const JSX_ATTRIBUTE_REGEX = + /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g + +/** Regex to identify JSX component opening tags */ +export const JSX_COMPONENT_REGEX = /<([A-Z][a-zA-Z0-9]*)\s+([^>]*?)(?:\/>|>)/g + /** Attributes that contain human-readable text requiring translation */ export const TRANSLATABLE_ATTRIBUTES = [ "title", diff --git a/src/scripts/i18n/lib/validation/syntax-tree.ts b/src/scripts/i18n/lib/validation/syntax-tree.ts index 530cf890e7d..c1b7762ba5d 100644 --- 
a/src/scripts/i18n/lib/validation/syntax-tree.ts +++ b/src/scripts/i18n/lib/validation/syntax-tree.ts @@ -1,5 +1,12 @@ // Syntax tree validation for JSON and Markdown files +import type { TranslatableAttribute } from "../jsx-attributes/types" +import { + JSX_ATTRIBUTE_REGEX, + JSX_COMPONENT_REGEX, + TRANSLATABLE_ATTRIBUTES, +} from "../jsx-attributes/types" + export interface JsonValidationResult { isValid: boolean expectedKeyCount: number @@ -173,27 +180,7 @@ export function validateMarkdownStructure( } } -/** Attributes that should be translated */ -const TRANSLATABLE_ATTRIBUTES = [ - "title", - "description", - "alt", - "label", - "aria-label", - "placeholder", - "buttonLabel", - "name", - "caption", - "contentPreview", - "location", -] - -/** JSX component regex for validation */ -const JSX_COMPONENT_REGEX = /<([A-Z][a-zA-Z0-9]*)\s+([^>]*?)(?:\/>|>)/g - -/** Attribute regex for validation */ -const ATTRIBUTE_REGEX = - /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g +// JSX_COMPONENT_REGEX and JSX_ATTRIBUTE_REGEX imported from jsx-attributes/types /** * Extract JSX component attributes from content @@ -227,13 +214,14 @@ function extractJsxAttributes( const componentLine = currentLine + 1 let attrMatch: RegExpExecArray | null - ATTRIBUTE_REGEX.lastIndex = 0 + JSX_ATTRIBUTE_REGEX.lastIndex = 0 - while ((attrMatch = ATTRIBUTE_REGEX.exec(attributeString)) !== null) { + while ((attrMatch = JSX_ATTRIBUTE_REGEX.exec(attributeString)) !== null) { const attrName = attrMatch[1] const attrValue = attrMatch[2] || attrMatch[3] - if (!TRANSLATABLE_ATTRIBUTES.includes(attrName)) continue + if (!TRANSLATABLE_ATTRIBUTES.includes(attrName as TranslatableAttribute)) + continue // Use component position + attribute name as key for matching // This allows us to match attributes even if component names differ slightly From cad286372f519cd4eead967ac9db25c66b5dfcec Mon Sep 17 00:00:00 2001 From: Paul Wackerow 
<54227730+wackerow@users.noreply.github.com> Date: Wed, 17 Dec 2025 21:25:47 -0300 Subject: [PATCH 64/99] refactor(i18n): use i18n.config.json for language names in Gemini Replace hardcoded LANGUAGE_NAMES record with dynamic import from i18n.config.json to avoid maintaining duplicate language lists. --- src/scripts/i18n/lib/ai/gemini.ts | 39 ++++--------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts index 7d37722bad0..550238eda52 100644 --- a/src/scripts/i18n/lib/ai/gemini.ts +++ b/src/scripts/i18n/lib/ai/gemini.ts @@ -4,46 +4,17 @@ import { GoogleGenerativeAI } from "@google/generative-ai" +import i18nConfig from "../../../../../i18n.config.json" import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes" import { delay } from "../workflows/utils" /** Gemini API configuration */ const GEMINI_MODEL = "gemini-2.5-pro" -/** Language display names for better prompt context */ -const LANGUAGE_NAMES: Record = { - es: "Spanish", - fr: "French", - de: "German", - it: "Italian", - pt: "Portuguese", - ru: "Russian", - zh: "Chinese (Simplified)", - ja: "Japanese", - ko: "Korean", - ar: "Arabic", - tr: "Turkish", - nl: "Dutch", - pl: "Polish", - vi: "Vietnamese", - th: "Thai", - id: "Indonesian", - uk: "Ukrainian", - cs: "Czech", - ro: "Romanian", - hu: "Hungarian", - el: "Greek", - sv: "Swedish", - da: "Danish", - fi: "Finnish", - no: "Norwegian", - he: "Hebrew", - hi: "Hindi", - bn: "Bengali", - ms: "Malay", - tl: "Filipino", - sw: "Swahili", -} +/** Language names parsed from i18n.config.json */ +const LANGUAGE_NAMES: Record = Object.fromEntries( + i18nConfig.map(({ code, name }) => [code, name]) +) /** * Check if Gemini API is available (API key present) From 020078b5e4f131c5f33017a5ab7b09d24718ef98 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 19 Dec 2025 14:20:33 -0300 Subject: [PATCH 65/99] 
fix(i18n): normalize ButtonLink formatting from English Add sanitizer to collapse ButtonLink content to single line when English source is single-line. MDX wraps multi-line content in
<p>
tags, causing styling issues. - Match ButtonLink instances by href attribute - Only collapse if English is single-line - Remove ButtonLink from block component lists (inline component) --- src/scripts/i18n/post_import_sanitize.ts | 78 +++++++++++++++++++++++- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 79d2e81f281..d67aa087399 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -173,8 +173,9 @@ function restoreBlankLinesFromEnglish( // Patterns that should have blank lines after them const headerPattern = /^#{1,6}\s+/ + // NOTE: ButtonLink is excluded - children should remain inline const blockComponentClosePattern = - /<\/(Alert|AlertContent|AlertDescription|Card|ExpandableCard|CardGrid|InfoGrid|ButtonLink|Tabs|TabItem|InfoBanner)>/ + /<\/(Alert|AlertContent|AlertDescription|Card|ExpandableCard|CardGrid|InfoGrid|Tabs|TabItem|InfoBanner)>/ for (let i = 0; i < translatedLines.length; i++) { const line = translatedLines[i] @@ -225,10 +226,70 @@ function restoreBlankLinesFromEnglish( * MDX parser requires these tags to be on separate lines. * Returns number of fixes applied. */ +/** + * Normalize inline component formatting to match English source. + * If English has the component on one line, collapse translated version too. + * This prevents MDX from wrapping multi-line content in
<p>
tags. + */ +function normalizeInlineComponentsFromEnglish( + translatedMd: string, + englishMd: string +): { + content: string + fixCount: number +} { + const inlineComponents = ["ButtonLink"] + + let content = translatedMd + let fixCount = 0 + + for (const component of inlineComponents) { + // Extract English instances and check if they're single-line + // Key by href attribute since that's preserved in translation + const englishRe = new RegExp( + `<${component}[^>]*href="([^"]*)"[^>]*>([\\s\\S]*?)`, + "g" + ) + const englishFormats = new Map() // href -> isOneLine + + let match + while ((match = englishRe.exec(englishMd))) { + const href = match[1] + const innerContent = match[2] + const isOneLine = !innerContent.includes("\n") + englishFormats.set(href, isOneLine) + } + + // For each translated instance, mirror English format + const translatedRe = new RegExp( + `(<${component}[^>]*href="([^"]*)"[^>]*>)([\\s\\S]*?)()`, + "g" + ) + content = content.replace( + translatedRe, + (fullMatch, openTag, href, innerContent, closeTag) => { + const englishIsOneLine = englishFormats.get(href) + const translatedHasLineBreaks = innerContent.includes("\n") + + // If English is single-line but translated has line breaks, collapse it + if (englishIsOneLine && translatedHasLineBreaks) { + fixCount++ + return `${openTag}${innerContent.trim()}${closeTag}` + } + return fullMatch + } + ) + } + + return { content, fixCount } +} + function fixBlockComponentLineBreaks(md: string): { content: string fixCount: number } { + // Block components that need opening/closing tags on separate lines + // NOTE: ButtonLink is intentionally excluded - it's an inline component const blockComponents = [ "Card", "ExpandableCard", @@ -239,7 +300,6 @@ function fixBlockComponentLineBreaks(md: string): { "CardGrid", "InfoGrid", "InfoBanner", - "ButtonLink", "Tabs", "TabItem", ] @@ -309,8 +369,20 @@ function processMarkdownFile( content = normalizeBlockHtmlLines(content) - // Restore blank lines from 
English source (improves readability) + // Normalize inline components and restore blank lines from English source if (englishMd) { + // Collapse inline component line breaks to match English format + const inlineResult = normalizeInlineComponentsFromEnglish( + content, + englishMd + ) + content = inlineResult.content + if (inlineResult.fixCount > 0) { + issues.push( + `Normalized ${inlineResult.fixCount} inline components to match English` + ) + } + const blankLineResult = restoreBlankLinesFromEnglish(content, englishMd) content = blankLineResult.content if (blankLineResult.fixCount > 0) { From 3e7f8370d5d4ece321853fc663ae781a7c1966ea Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 19 Dec 2025 20:34:36 -0300 Subject: [PATCH 66/99] fix(i18n): improve error handling and logging in file processing - findCrowdinFile returns null instead of throwing when file not found - Removed noisy "Available Crowdin file paths" dump from logs - Changed log level from ERROR to INFO for new files - postCrowdinFile propagates errors instead of process.exit(1) - Added try-catch-continue pattern in prepareEnglishFiles - Single file failures no longer terminate entire workflow - Exit code 1 only if ALL files fail --- src/scripts/i18n/lib/crowdin/files.ts | 64 ++++++++----------- .../i18n/lib/workflows/file-preparation.ts | 48 ++++++++++---- 2 files changed, 62 insertions(+), 50 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index d672796241d..772af4d7727 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -66,12 +66,13 @@ export const getCrowdinProjectFiles = async (): Promise => { } /** - * Find a Crowdin file matching a GitHub file + * Find a Crowdin file matching a GitHub file. + * Returns null if file not found (indicating it's new and needs to be uploaded). 
*/ export const findCrowdinFile = ( targetFile: GitHubCrowdinFileMetadata, crowdinFiles: CrowdinFileData[] -): CrowdinFileData => { +): CrowdinFileData | null => { if (config.verbose) { console.log( `[DEBUG] Looking for Crowdin file matching: ${targetFile.filePath}` @@ -83,16 +84,11 @@ export const findCrowdinFile = ( ) if (!found) { - console.error( - `[ERROR] No matching Crowdin project file found for: ${targetFile.filePath}` - ) - console.error( - `[ERROR] Available Crowdin file paths:`, - crowdinFiles.map((f) => f.path) - ) - throw new Error( - `No matching Crowdin project file found for: ${targetFile.filePath}` + // Not an error - file is new and will be uploaded + console.log( + `[INFO] File not in Crowdin (will upload): ${targetFile.filePath}` ) + return null } if (config.verbose) { @@ -395,33 +391,27 @@ export const postCrowdinFile = async ( directoryId, } - try { - // First, create the file - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify(requestBody), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`Crowdin postCrowdinFile failed (${res.status}): ${body}`) - } + // Create the file (errors propagate to caller for graceful handling) + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify(requestBody), + }) + + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`Crowdin postCrowdinFile failed (${res.status}): ${body}`) + } - type JsonResponse = { data: CrowdinAddFileResponse } - const json: JsonResponse = await res.json() - console.log("Created file:", json.data) + type JsonResponse = { data: CrowdinAddFileResponse } + const json: JsonResponse = await res.json() + 
console.log("Created file:", json.data) - // Note: parser options are managed in Crowdin UI. No PATCH here. + // Note: parser options are managed in Crowdin UI. No PATCH here. - return json.data - } catch (error) { - console.error(error) - process.exit(1) - } + return json.data } diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts index 1c08a94c085..9e5616ef98d 100644 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -179,32 +179,54 @@ export async function prepareEnglishFiles( const fileMetadata = await getFileMetadata(allEnglishFiles) + // Track failed files for summary + const failedFiles: Array<{ path: string; error: string }> = [] + let successCount = 0 + // Iterate through each file and upload/update for (const file of fileMetadata) { if (verbose) { console.log(`[DEBUG] Processing file: ${file.filePath}`) } - let foundFile: CrowdinFileData | undefined try { - foundFile = findCrowdinFile(file, crowdinProjectFiles) - } catch { - if (verbose) { - console.log("File not found in Crowdin, will add new file") + // findCrowdinFile returns null if file doesn't exist (will be created) + const foundFile = findCrowdinFile(file, crowdinProjectFiles) + + const result = foundFile + ? await updateCrowdinFile(file, foundFile, verbose) + : await createCrowdinFile(file, verbose) + + fileIdsSet.add(result.fileId) + if (result.path) { + processedFileIdToPath[result.fileId] = result.path } + englishBuffers[result.fileId] = result.buffer + successCount++ + } catch (error) { + // Log and continue - don't let one file failure kill the entire job + const message = error instanceof Error ? error.message : String(error) + failedFiles.push({ path: file.filePath, error: message }) + console.warn(`[WARN] Skipping ${file.filePath}: ${message}`) } + } - const result = foundFile - ? 
await updateCrowdinFile(file, foundFile, verbose) - : await createCrowdinFile(file, verbose) + // Log summary of failed files + if (failedFiles.length > 0) { + console.log(`\n[SUMMARY] ${failedFiles.length} files skipped:`) + failedFiles.forEach((f) => console.log(` - ${f.path}`)) + } - fileIdsSet.add(result.fileId) - if (result.path) { - processedFileIdToPath[result.fileId] = result.path - } - englishBuffers[result.fileId] = result.buffer + // Exit 1 only if ALL files failed + if (successCount === 0 && failedFiles.length > 0) { + console.error("[ERROR] All files failed to process") + process.exit(1) } + console.log( + `\n[INFO] Processed ${successCount} files successfully${failedFiles.length > 0 ? `, ${failedFiles.length} skipped` : ""}` + ) + // Unhide any hidden/duplicate strings before pre-translation logSection(`Unhiding Strings in ${fileIdsSet.size} Files`) for (const fileId of Array.from(fileIdsSet)) { From 891d1f2dd3616cb7fe0a2f9fd75f982ec701355d Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 19 Dec 2025 20:34:58 -0300 Subject: [PATCH 67/99] feat(i18n): add exclude_path and skip_await workflow options New workflow inputs: - exclude_path: Runtime path exclusion for job-specific filtering (separate from permanent excluded-paths.json) - skip_await: Exit after dispatching pre-translation for manual resume Use case: Translate all content except tutorials: target_path: public/content exclude_path: public/content/developers/tutorials Use case: Large job that will exceed timeout: skip_await: true (then resume later with pretranslation_id) --- .github/workflows/crowdin-ai-import.yml | 11 ++++++++++ src/scripts/i18n/config.ts | 10 +++++++++ src/scripts/i18n/lib/github/files.ts | 16 ++++++++++---- .../i18n/lib/workflows/pre-translation.ts | 22 +++++++++---------- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 
da65c8dd466..a6cc1ce5d4b 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -7,6 +7,10 @@ on: description: "File or directory path to translate (e.g., public/content/developers/index.md or public/content/developers or blank for all files)" required: false type: string + exclude_path: + description: "Path to exclude from this job (e.g., public/content/developers/tutorials)" + required: false + type: string target_languages: description: "Comma-separated internal language codes (blank for all locales)" required: false @@ -46,6 +50,11 @@ on: required: false default: false type: boolean + skip_await: + description: "Exit after dispatching pre-translation (resume later with ID)" + required: false + default: false + type: boolean verbose: description: "Enable verbose logging?" required: false @@ -79,6 +88,7 @@ jobs: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} TARGET_PATH: ${{ github.event.inputs.target_path }} + EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} USE_LEGACY_LANGUAGES: ${{ github.event.inputs.use_legacy_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} @@ -87,4 +97,5 @@ jobs: PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} VERBOSE: ${{ github.event.inputs.verbose }} SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }} + SKIP_AWAIT: ${{ github.event.inputs.skip_await }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 5149929dee8..d9a79e61266 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -83,6 +83,12 @@ if (targetLanguagesInput.length === 0) { const baseBranch = process.env.BASE_BRANCH || "dev" const targetPath = process.env.TARGET_PATH || "" +const excludePath = process.env.EXCLUDE_PATH?.trim() || "" + +// Skip awaiting pre-translation 
completion (exit early with ID for manual resume) +const skipAwait = ["1", "true", "yes", "on"].includes( + (process.env.SKIP_AWAIT || "").toLowerCase() +) // Adaptive polling / timeout configuration (milliseconds) const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS @@ -115,6 +121,8 @@ if (verbose) { console.log( `[DEBUG] - Target path: ${targetPath || "none (full translation)"}` ) + console.log(`[DEBUG] - Exclude path: ${excludePath || "none"}`) + console.log(`[DEBUG] - Skip await: ${skipAwait}`) console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) console.log(`[DEBUG] - Pretranslate timeout ms: ${pretranslateTimeoutMs}`) console.log(`[DEBUG] - Pretranslate poll base ms: ${pretranslatePollBaseMs}`) @@ -144,6 +152,8 @@ export const config = { useLegacyLanguages, baseBranch, targetPath, + excludePath, + skipAwait, pretranslateTimeoutMs, pretranslatePollBaseMs, existingPreTranslationId, diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts index 732648e2f0e..0e9d80c7efb 100644 --- a/src/scripts/i18n/lib/github/files.ts +++ b/src/scripts/i18n/lib/github/files.ts @@ -31,18 +31,26 @@ function isFilePath(targetPath: string): boolean { export const getAllEnglishFiles = async (): Promise< GitHubQueryResponseItem[] > => { - const { targetPath, verbose } = config + const { targetPath, excludePath, verbose } = config const excludedPaths = loadExcludedPaths() + // Add runtime exclusion if specified + const allExcludedPaths = excludePath + ? 
[...excludedPaths, excludePath] + : excludedPaths + if (verbose) { console.log( `[DEBUG] Excluded paths loaded: ${excludedPaths.length} entries` ) + if (excludePath) { + console.log(`[DEBUG] Runtime exclude path: ${excludePath}`) + } } // Determine if targetPath is a file or directory if (targetPath) { - if (isPathExcluded(targetPath, excludedPaths)) { + if (isPathExcluded(targetPath, allExcludedPaths)) { console.log(`[INFO] Path ${targetPath} is in excluded paths, skipping`) return [] } @@ -130,9 +138,9 @@ export const getAllEnglishFiles = async (): Promise< } } - // Filter out excluded paths + // Filter out excluded paths (static + runtime) const filtered = collected.filter( - (item) => !isPathExcluded(item.path, excludedPaths) + (item) => !isPathExcluded(item.path, allExcludedPaths) ) const excludedCount = collected.length - filtered.length diff --git a/src/scripts/i18n/lib/workflows/pre-translation.ts b/src/scripts/i18n/lib/workflows/pre-translation.ts index 2b76b756414..e7f14ae1819 100644 --- a/src/scripts/i18n/lib/workflows/pre-translation.ts +++ b/src/scripts/i18n/lib/workflows/pre-translation.ts @@ -97,22 +97,22 @@ async function startNewPreTranslation( config.allCrowdinCodes ) - // If no targetPath specified (full translation), exit now and let Crowdin work - if (!config.targetPath) { - logSection("Full Translation Job Started") + // Exit early if skipAwait is set or if full translation mode (no targetPath) + if (config.skipAwait || !config.targetPath) { + const reason = config.skipAwait + ? "skip_await option enabled" + : "full translation job" + logSection(`Exiting for Manual Resume (${reason})`) + console.log(`Pre-translation ID: ${applyPreTranslationResponse.identifier}`) + console.log(`\nTo resume later, dispatch workflow with:`) console.log( - `This is a large job that will take significant time to complete.` - ) - console.log( - `The workflow will exit now. 
Resume later with the pre-translation ID above.` - ) - console.log( - `Check Crowdin dashboard for progress: https://crowdin.com/project/ethereum-org` + ` pretranslation_id: ${applyPreTranslationResponse.identifier}` ) + console.log(`\nCheck progress: https://crowdin.com/project/ethereum-org`) process.exit(0) } - // For file/directory mode, wait for completion + // For file/directory mode without skipAwait, wait for completion console.log(`\nWaiting for pre-translation to complete...`) const completedResponse = await awaitPreTranslationCompleted( applyPreTranslationResponse.identifier From e41149b9ad1d9b2665177fcc6395e595b33c4fea Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 12:23:50 -0300 Subject: [PATCH 68/99] feat(i18n): add supabase glossary module --- src/scripts/i18n/lib/supabase/glossary.ts | 117 ++++++++++++++++++++++ src/scripts/i18n/lib/supabase/index.ts | 9 ++ 2 files changed, 126 insertions(+) create mode 100644 src/scripts/i18n/lib/supabase/glossary.ts create mode 100644 src/scripts/i18n/lib/supabase/index.ts diff --git a/src/scripts/i18n/lib/supabase/glossary.ts b/src/scripts/i18n/lib/supabase/glossary.ts new file mode 100644 index 00000000000..2eee0a0ef74 --- /dev/null +++ b/src/scripts/i18n/lib/supabase/glossary.ts @@ -0,0 +1,117 @@ +/** + * Supabase glossary client for fetching community-approved translations + * + * Fetches from the `top_translations` view which contains the highest-voted + * translation for each term/language pair. 
+ */ + +/** Glossary entry from Supabase top_translations view */ +export interface GlossaryEntry { + string_term: string + translation_text: string + language_code: string + total_votes: number +} + +/** Glossary grouped by language code */ +export type GlossaryByLanguage = Map<string, Map<string, string>> + +/** Tone for translation register */ +export type Tone = "informal" | "formal" + +/** + * Fetch all glossary entries from Supabase + */ +export async function fetchGlossaryEntries(): Promise<GlossaryEntry[]> { + const supabaseUrl = process.env.SUPABASE_URL + const supabaseKey = process.env.SUPABASE_SERVICE_ROLE_KEY + + if (!supabaseUrl || !supabaseKey) { + console.warn( + "[GLOSSARY] Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY, skipping glossary fetch" + ) + return [] + } + + const url = `${supabaseUrl}/rest/v1/top_translations?select=string_term,translation_text,language_code,total_votes` + + try { + const response = await fetch(url, { + headers: { + apikey: supabaseKey, + Authorization: `Bearer ${supabaseKey}`, + "Content-Type": "application/json", + }, + }) + + if (!response.ok) { + const text = await response.text().catch(() => "") + throw new Error(`Supabase API error (${response.status}): ${text}`) + } + + const entries: GlossaryEntry[] = await response.json() + console.log(`[GLOSSARY] Fetched ${entries.length} glossary entries`) + return entries + } catch (error) { + console.warn("[GLOSSARY] Failed to fetch glossary:", error) + return [] + } +} + +/** + * Group glossary entries by language code for efficient lookup + * Returns Map<languageCode, Map<term, translation>> + */ +export function groupGlossaryByLanguage( + entries: GlossaryEntry[] +): GlossaryByLanguage { + const byLanguage: GlossaryByLanguage = new Map() + + for (const entry of entries) { + if (!byLanguage.has(entry.language_code)) { + byLanguage.set(entry.language_code, new Map()) + } + byLanguage + .get(entry.language_code)! 
+ .set(entry.string_term, entry.translation_text) + } + + return byLanguage +} + +/** + * Get glossary terms for a specific language code + * Returns Map or empty map if not found + */ +export function getGlossaryForLanguage( + glossary: GlossaryByLanguage, + languageCode: string +): Map { + return glossary.get(languageCode) ?? new Map() +} + +/** + * Format glossary as string for inclusion in AI prompts + */ +export function formatGlossaryForPrompt( + glossaryTerms: Map, + tone: Tone = "informal" +): string { + if (glossaryTerms.size === 0) return "" + + const toneInstruction = + tone === "formal" + ? "Use formal register." + : "Use informal, friendly register." + + const terms = Array.from(glossaryTerms.entries()) + .map(([term, translation]) => `- "${term}" → "${translation}"`) + .join("\n") + + return `## REQUIRED TERMINOLOGY + +Use these exact translations. Do not substitute synonyms. +${toneInstruction} + +${terms}` +} diff --git a/src/scripts/i18n/lib/supabase/index.ts b/src/scripts/i18n/lib/supabase/index.ts new file mode 100644 index 00000000000..1689b520c25 --- /dev/null +++ b/src/scripts/i18n/lib/supabase/index.ts @@ -0,0 +1,9 @@ +// Supabase integration exports + +export type { GlossaryByLanguage, GlossaryEntry, Tone } from "./glossary" +export { + fetchGlossaryEntries, + formatGlossaryForPrompt, + getGlossaryForLanguage, + groupGlossaryByLanguage, +} from "./glossary" From 4f7606fcc95df615fe779ecdf496a80c7e53453f Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 12:24:18 -0300 Subject: [PATCH 69/99] feat(i18n): integrate glossary into workflow --- .github/workflows/crowdin-ai-import.yml | 2 ++ src/scripts/i18n/lib/workflows/initialize.ts | 9 +++++++++ src/scripts/i18n/lib/workflows/types.ts | 2 ++ 3 files changed, 13 insertions(+) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index a6cc1ce5d4b..39f57ad7e89 100644 --- 
a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -86,6 +86,8 @@ jobs: I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} TARGET_PATH: ${{ github.event.inputs.target_path }} EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }} diff --git a/src/scripts/i18n/lib/workflows/initialize.ts b/src/scripts/i18n/lib/workflows/initialize.ts index 873664d1050..a4cd4fb9ffc 100644 --- a/src/scripts/i18n/lib/workflows/initialize.ts +++ b/src/scripts/i18n/lib/workflows/initialize.ts @@ -2,6 +2,7 @@ import { config, validateTargetPath } from "../../config" import { getCrowdinProjectFiles } from "../crowdin/files" +import { fetchGlossaryEntries, groupGlossaryByLanguage } from "../supabase" import type { WorkflowContext } from "./types" import { logSection } from "./utils" @@ -33,11 +34,19 @@ export async function initializeWorkflow(): Promise { // Fetch Crowdin project state const crowdinProjectFiles = await getCrowdinProjectFiles() + // Fetch glossary from Supabase (graceful degradation if unavailable) + const glossaryEntries = await fetchGlossaryEntries() + const glossary = groupGlossaryByLanguage(glossaryEntries) + console.log( + `[INIT] Loaded glossary: ${glossaryEntries.length} terms across ${glossary.size} languages` + ) + // Initialize shared state return { crowdinProjectFiles, fileIdsSet: new Set(), processedFileIdToPath: {}, englishBuffers: {}, + glossary, } } diff --git a/src/scripts/i18n/lib/workflows/types.ts b/src/scripts/i18n/lib/workflows/types.ts index 0ae96521d62..a85815e2a40 100644 --- a/src/scripts/i18n/lib/workflows/types.ts +++ b/src/scripts/i18n/lib/workflows/types.ts @@ -1,5 +1,6 @@ // Types for i18n workflow phases +import type { 
GlossaryByLanguage } from "../supabase" import type { CrowdinFileData, CrowdinPreTranslateResponse } from "../types" /** @@ -10,6 +11,7 @@ export interface WorkflowContext { fileIdsSet: Set processedFileIdToPath: Record englishBuffers: Record + glossary: GlossaryByLanguage } /** From 67cdde9ab770c81f2a7bce6451d52d5d4e56693b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 12:33:06 -0300 Subject: [PATCH 70/99] feat(i18n): inject glossary into Crowdin prompt --- src/scripts/i18n/lib/crowdin/prompt.ts | 12 ++++++++ .../i18n/lib/workflows/file-preparation.ts | 30 +++++++++++++++---- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/prompt.ts b/src/scripts/i18n/lib/crowdin/prompt.ts index dce615a1518..7f0c43a923e 100644 --- a/src/scripts/i18n/lib/crowdin/prompt.ts +++ b/src/scripts/i18n/lib/crowdin/prompt.ts @@ -44,6 +44,18 @@ export async function updatePromptFromFile( filePath: string ): Promise { const content = await fs.promises.readFile(filePath, "utf8") + await updatePromptContent(userId, promptId, content) +} + +/** + * Update a Crowdin AI prompt with provided content. 
+ * Uses Crowdin API v2: PATCH /users/{userId}/ai/prompts/{promptId} + */ +export async function updatePromptContent( + userId: number, + promptId: number, + content: string +): Promise { const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` const resp = await fetch(url, { method: "PATCH", diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts index 9e5616ef98d..d1057d266a3 100644 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -1,5 +1,6 @@ // File preparation workflow phase +import * as fs from "fs" import * as path from "path" import { config, crowdinBearerHeaders } from "../../config" @@ -9,13 +10,14 @@ import { postFileToStorage, unhideStringsInFile, } from "../crowdin/files" -import { updatePromptFromFile } from "../crowdin/prompt" +import { updatePromptContent } from "../crowdin/prompt" import { getCurrentUser } from "../crowdin/user" import { downloadGitHubFile, getAllEnglishFiles, getFileMetadata, } from "../github/files" +import { formatGlossaryForPrompt, getGlossaryForLanguage } from "../supabase" import type { CrowdinFileData } from "../types" import type { FilePreparationResult, WorkflowContext } from "./types" @@ -135,27 +137,45 @@ async function createCrowdinFile( export async function prepareEnglishFiles( context: WorkflowContext ): Promise { - const { verbose } = config + const { verbose, allInternalCodes } = config const { crowdinProjectFiles, fileIdsSet, processedFileIdToPath, englishBuffers, + glossary, } = context logSection("Starting New Pre-Translation") - // Ensure Crowdin AI prompt content is synced from repo canonical file + // Ensure Crowdin AI prompt content is synced from repo canonical file with glossary try { const currentUser = await getCurrentUser() const promptPath = path.join( process.cwd(), "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" ) - await 
updatePromptFromFile( + const basePrompt = fs.readFileSync(promptPath, "utf8") + + // Get glossary for target language and append to prompt + const targetLang = allInternalCodes[0] + const glossaryTerms = getGlossaryForLanguage(glossary, targetLang) + const glossarySection = formatGlossaryForPrompt(glossaryTerms, "informal") + + const fullPrompt = glossarySection + ? `${basePrompt}\n\n---\n\n${glossarySection}` + : basePrompt + + if (glossaryTerms.size > 0) { + console.log( + `[GLOSSARY] Injecting ${glossaryTerms.size} terms for ${targetLang} into prompt` + ) + } + + await updatePromptContent( currentUser.id, config.preTranslatePromptId, - promptPath + fullPrompt ) console.log("✓ Updated Crowdin pre-translate prompt from repo file") } catch (e) { From 248a5bc79117a2dbf8e90700b58327505b87027b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 12:33:23 -0300 Subject: [PATCH 71/99] feat(i18n): inject glossary into Gemini JSX prompt --- src/scripts/i18n/lib/ai/gemini.ts | 39 ++++++++++++++++--- .../i18n/lib/workflows/jsx-translation.ts | 5 +++ src/scripts/i18n/main.ts | 1 + src/scripts/i18n/translate-jsx-attributes.ts | 7 +++- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts index 550238eda52..1e978ea192c 100644 --- a/src/scripts/i18n/lib/ai/gemini.ts +++ b/src/scripts/i18n/lib/ai/gemini.ts @@ -46,7 +46,8 @@ function getLanguageName(code: string): string { */ function buildTranslationPrompt( attributes: ExtractedAttribute[], - targetLanguage: string + targetLanguage: string, + glossaryTerms?: Map ): string { const langName = getLanguageName(targetLanguage) @@ -58,6 +59,19 @@ function buildTranslationPrompt( ) .join("\n\n") + // Build glossary section if terms provided + let glossarySection = "" + if (glossaryTerms && glossaryTerms.size > 0) { + const termsList = Array.from(glossaryTerms.entries()) + .map(([term, translation]) 
=> `- "${term}" → "${translation}"`) + .join("\n") + glossarySection = ` + +REQUIRED TERMINOLOGY (use these exact translations): +${termsList} +` + } + return `You are translating UI component attributes for the Ethereum.org website into ${langName}. These are JSX component attributes that contain human-readable text. Translate each value naturally and accurately while: @@ -65,6 +79,7 @@ These are JSX component attributes that contain human-readable text. Translate e - Keeping the translation concise (similar length to original) - Maintaining any placeholders like {variable} or {{variable}} unchanged - Using region-neutral ${langName} that most speakers would understand +- Using informal, friendly register${glossarySection} Attributes to translate: @@ -110,7 +125,8 @@ function parseTranslationResponse(response: string): string[] { */ export async function translateAttributes( attributes: ExtractedAttribute[], - targetLanguage: string + targetLanguage: string, + glossaryTerms?: Map ): Promise { if (attributes.length === 0) { return [] @@ -126,7 +142,11 @@ export async function translateAttributes( const client = getGeminiClient() const model = client.getGenerativeModel({ model: GEMINI_MODEL }) - const prompt = buildTranslationPrompt(attributes, targetLanguage) + const prompt = buildTranslationPrompt( + attributes, + targetLanguage, + glossaryTerms + ) console.log( `[GEMINI] Translating ${attributes.length} attributes to ${getLanguageName(targetLanguage)}` @@ -160,13 +180,18 @@ export async function translateAttributes( export async function translateAttributesWithRetry( attributes: ExtractedAttribute[], targetLanguage: string, + glossaryTerms?: Map, maxRetries = 3 ): Promise { let lastError: Error | null = null for (let attempt = 1; attempt <= maxRetries; attempt++) { try { - return await translateAttributes(attributes, targetLanguage) + return await translateAttributes( + attributes, + targetLanguage, + glossaryTerms + ) } catch (error) { lastError = error 
instanceof Error ? error : new Error(String(error)) console.warn( @@ -190,7 +215,8 @@ export async function translateAttributesWithRetry( */ export async function translateAttributesByFile( attributesByFile: Map, - targetLanguage: string + targetLanguage: string, + glossaryTerms?: Map ): Promise> { const results = new Map() @@ -198,7 +224,8 @@ export async function translateAttributesByFile( try { const translated = await translateAttributesWithRetry( attributes, - targetLanguage + targetLanguage, + glossaryTerms ) results.set(filePath, translated) console.log( diff --git a/src/scripts/i18n/lib/workflows/jsx-translation.ts b/src/scripts/i18n/lib/workflows/jsx-translation.ts index a540338b323..0da3f83ece2 100644 --- a/src/scripts/i18n/lib/workflows/jsx-translation.ts +++ b/src/scripts/i18n/lib/workflows/jsx-translation.ts @@ -3,6 +3,8 @@ import { translateJsxAttributes } from "../../translate-jsx-attributes" import { isGeminiAvailable } from "../ai" import { putCommitFile } from "../github/commits" +import type { GlossaryByLanguage } from "../supabase" +import { getGlossaryForLanguage } from "../supabase" import type { CommittedFile, LanguagePair } from "./types" import { logSection } from "./utils" @@ -24,6 +26,7 @@ export async function runJsxTranslation( committedFiles: CommittedFile[], languagePairs: LanguagePair[], branch: string, + glossary: GlossaryByLanguage, verbose: boolean ): Promise { logSection("JSX Attribute Translation") @@ -61,9 +64,11 @@ export async function runJsxTranslation( `[JSX-TRANSLATE] Processing ${langFiles.length} files for ${langCode}` ) + const glossaryTerms = getGlossaryForLanguage(glossary, langCode) const jsxResult = await translateJsxAttributes({ targetLanguage: langCode, files: langFiles, + glossaryTerms, verbose, }) diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 69772776e65..630132b9e35 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -37,6 +37,7 @@ async function main() { 
translationResult.committedFiles, translationResult.languagePairs, translationResult.branch, + context.glossary, verbose ) diff --git a/src/scripts/i18n/translate-jsx-attributes.ts b/src/scripts/i18n/translate-jsx-attributes.ts index 555c29ab202..ae195b65fca 100644 --- a/src/scripts/i18n/translate-jsx-attributes.ts +++ b/src/scripts/i18n/translate-jsx-attributes.ts @@ -34,6 +34,8 @@ export interface TranslateJsxOptions { targetLanguage: string /** Files to process (path and content) */ files: { path: string; content: string }[] + /** Glossary terms for this language (English term -> translated term) */ + glossaryTerms?: Map /** Whether to log verbose output */ verbose?: boolean } @@ -45,7 +47,7 @@ export interface TranslateJsxOptions { export async function translateJsxAttributes( options: TranslateJsxOptions ): Promise { - const { targetLanguage, files, verbose = false } = options + const { targetLanguage, files, glossaryTerms, verbose = false } = options console.log(`\n[JSX-TRANSLATE] Starting JSX attribute translation`) console.log(`[JSX-TRANSLATE] Target language: ${targetLanguage}`) @@ -110,7 +112,8 @@ export async function translateJsxAttributes( // Translate attributes via Gemini (one API call per file batch) const translatedByFile = await translateAttributesByFile( attributesByFile, - targetLanguage + targetLanguage, + glossaryTerms ) // Re-insert translated attributes into files From d97e0cbd7aa767628842235bb8b3d1823f40ddc7 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 12:58:19 -0300 Subject: [PATCH 72/99] fix: crowdin env var name --- .github/workflows/crowdin-ai-import.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 39f57ad7e89..3734f54c3be 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -83,7 +83,7 @@ jobs: - name: Run Crowdin 
AI translation import run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main.ts env: - I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} + I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_WORKFLOW_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} From 0723e890a467bb1d688717d5d6d6bc0c8c70debb Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 13:02:08 -0300 Subject: [PATCH 73/99] feat(i18n): add prompt ID and workflow link to PR body --- src/scripts/i18n/lib/workflows/pr-creation.ts | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/scripts/i18n/lib/workflows/pr-creation.ts b/src/scripts/i18n/lib/workflows/pr-creation.ts index 8877786e513..333b5aa5b49 100644 --- a/src/scripts/i18n/lib/workflows/pr-creation.ts +++ b/src/scripts/i18n/lib/workflows/pr-creation.ts @@ -33,6 +33,8 @@ export function generatePRTitle( /** Options for PR body generation */ export interface PRBodyOptions { geminiSkipped?: boolean + promptId?: number + workflowRunUrl?: string } /** @@ -62,7 +64,17 @@ export function generatePRBody( // Build PR body let prBody = `## Description\n\n` - prBody += `This PR contains automated ${aiModelName} translations from Crowdin\n\n` + prBody += `This PR contains automated ${aiModelName} translations from Crowdin.\n\n` + + if (options.promptId) { + prBody += `**Prompt ID:** ${options.promptId}\n` + } + if (options.workflowRunUrl) { + prBody += `**Workflow Run:** ${options.workflowRunUrl}\n` + } + if (options.promptId || options.workflowRunUrl) { + prBody += `\n` + } // Language section prBody += `### Languages translated\n\n` @@ -70,7 +82,7 @@ export function generatePRBody( // Files section - JSON if (jsonFiles.length > 0) { - prBody += `#### JSON changes (\`src/intl/{locale}/\`)\n\n` + prBody += `### JSON changes (\`src/intl/{locale}/\`)\n\n` for 
(const path of jsonFiles) { // Remove src/intl/{locale}/ prefix const simplifiedPath = path.replace(/^src\/intl\/[^/]+\//, "") @@ -81,7 +93,7 @@ export function generatePRBody( // Files section - Markdown if (markdownFiles.length > 0) { - prBody += `#### Markdown changes (\`public/content/translations/{locale}/\`)\n\n` + prBody += `### Markdown changes (\`public/content/translations/{locale}/\`)\n\n` for (const path of markdownFiles) { // Remove public/content/translations/{locale}/ prefix const simplifiedPath = path.replace( @@ -129,6 +141,20 @@ async function fetchAIModelName(): Promise { } } +/** + * Build workflow run URL from GitHub environment variables + */ +function getWorkflowRunUrl(): string | undefined { + const serverUrl = process.env.GITHUB_SERVER_URL + const repository = process.env.GITHUB_REPOSITORY + const runId = process.env.GITHUB_RUN_ID + + if (serverUrl && repository && runId) { + return `${serverUrl}/${repository}/actions/runs/${runId}` + } + return undefined +} + /** * Create pull request with formatted title and body */ @@ -147,6 +173,13 @@ export async function createTranslationPR( // Extract language codes const langCodes = languagePairs.map((p) => p.internalLanguageCode) + // Add workflow metadata to options + const fullOptions: PRBodyOptions = { + ...options, + promptId: config.preTranslatePromptId, + workflowRunUrl: getWorkflowRunUrl(), + } + // Generate PR title and body const prTitle = generatePRTitle(langCodes, config.allInternalCodes) const prBody = generatePRBody( @@ -154,7 +187,7 @@ export async function createTranslationPR( langCodes, committedFiles, sanitizedFiles, - options + fullOptions ) // Create PR From f381eeaaa3f86e8e1505780bc0ab883953b46e80 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 14:24:40 -0300 Subject: [PATCH 74/99] feat(i18n): use ephemeral prompts for glossary injection Switch from PATCH (which does not support /config/prompt) to POST approach 
that creates per-job prompts with glossary terms baked in. Prompts are cleaned up after successful job completion. --- .../i18n/lib/crowdin/ephemeral-prompts.ts | 151 ++++++++++++++++++ .../i18n/lib/workflows/file-preparation.ts | 60 +++---- .../i18n/lib/workflows/pre-translation.ts | 15 +- src/scripts/i18n/lib/workflows/types.ts | 4 + src/scripts/i18n/main.ts | 16 ++ 5 files changed, 213 insertions(+), 33 deletions(-) create mode 100644 src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts diff --git a/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts b/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts new file mode 100644 index 00000000000..985979bca9c --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts @@ -0,0 +1,151 @@ +/** + * Ephemeral Prompts + * + * Manages Crowdin AI prompts that are created per-job and cleaned up after use. + * Each prompt is uniquely named with language, key, and timestamp to avoid conflicts. + * + * Naming convention: eth-org-{lang}-{key}-{timestamp} + * Example: eth-org-es-glossary-1702987200 + */ + +import { crowdinBearerHeaders } from "../../config" + +import type { PromptResource } from "./prompt" + +/** Parameters for creating an ephemeral prompt */ +export interface CreateEphemeralPromptParams { + /** Crowdin user ID (owner of the prompt) */ + userId: number + /** Language code (e.g., "es", "fr", "de") */ + languageCode: string + /** Prompt key (e.g., "glossary", "formal") */ + promptKey: string + /** The full prompt text */ + promptText: string + /** AI provider ID (optional, uses default if not specified) */ + aiProviderId?: number + /** AI model ID (optional, uses default if not specified) */ + aiModelId?: string +} + +/** Result of creating an ephemeral prompt */ +export interface EphemeralPromptResult { + /** The created prompt's ID */ + promptId: number + /** The prompt's unique name */ + promptName: string +} + +/** Crowdin API response for prompt creation */ +interface CrowdinCreatePromptResponse { + data: 
PromptResource +} + +/** Prefix for all ephemeral prompt names */ +const EPHEMERAL_PREFIX = "eth-org" + +/** Crowdin action type for pre-translation prompts */ +const PRE_TRANSLATE_ACTION = "pre_translate" + +/** + * Generate a unique name for an ephemeral prompt + */ +export function generateEphemeralPromptName( + languageCode: string, + promptKey: string +): string { + const timestamp = Math.floor(Date.now() / 1000) + return `${EPHEMERAL_PREFIX}-${languageCode}-${promptKey}-${timestamp}` +} + +/** + * Create an ephemeral AI prompt in Crowdin + * + * Uses Crowdin API v2: POST /users/{userId}/ai/prompts + */ +export async function createEphemeralPrompt( + params: CreateEphemeralPromptParams +): Promise { + const { + userId, + languageCode, + promptKey, + promptText, + aiProviderId, + aiModelId, + } = params + + const promptName = generateEphemeralPromptName(languageCode, promptKey) + console.log(`[EPHEMERAL-PROMPT] Creating prompt: ${promptName}`) + + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts` + + const body: Record = { + name: promptName, + action: PRE_TRANSLATE_ACTION, + config: { + mode: "advanced", + prompt: promptText, + glossaryTerms: true, + tmSuggestions: true, + }, + } + + if (aiProviderId !== undefined) { + body.aiProviderId = aiProviderId + } + if (aiModelId !== undefined) { + body.aiModelId = aiModelId + } + + const response = await fetch(url, { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }) + + if (!response.ok) { + const text = await response.text().catch(() => "") + throw new Error( + `Failed to create ephemeral prompt "${promptName}" (${response.status}): ${text}` + ) + } + + const json = (await response.json()) as CrowdinCreatePromptResponse + const promptId = json.data.id + + console.log( + `[EPHEMERAL-PROMPT] Created prompt: ${promptName} (ID: ${promptId})` + ) + return { promptId, promptName } +} + +/** + * Delete an ephemeral AI 
prompt from Crowdin + */ +export async function deleteEphemeralPrompt( + userId: number, + promptId: number +): Promise { + console.log(`[EPHEMERAL-PROMPT] Deleting prompt ID: ${promptId}`) + + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` + + const response = await fetch(url, { + method: "DELETE", + headers: crowdinBearerHeaders, + }) + + // 204 No Content is success, 404 is also acceptable (already deleted) + if (!response.ok && response.status !== 404) { + const text = await response.text().catch(() => "") + throw new Error( + `Failed to delete ephemeral prompt ${promptId} (${response.status}): ${text}` + ) + } + + console.log(`[EPHEMERAL-PROMPT] Deleted prompt ID: ${promptId}`) +} diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts index d1057d266a3..bc565e1fbb3 100644 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -4,13 +4,13 @@ import * as fs from "fs" import * as path from "path" import { config, crowdinBearerHeaders } from "../../config" +import { createEphemeralPrompt } from "../crowdin/ephemeral-prompts" import { findCrowdinFile, postCrowdinFile, postFileToStorage, unhideStringsInFile, } from "../crowdin/files" -import { updatePromptContent } from "../crowdin/prompt" import { getCurrentUser } from "../crowdin/user" import { downloadGitHubFile, @@ -148,40 +148,42 @@ export async function prepareEnglishFiles( logSection("Starting New Pre-Translation") - // Ensure Crowdin AI prompt content is synced from repo canonical file with glossary - try { - const currentUser = await getCurrentUser() - const promptPath = path.join( - process.cwd(), - "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" - ) - const basePrompt = fs.readFileSync(promptPath, "utf8") - - // Get glossary for target language and append to prompt - const targetLang = allInternalCodes[0] - const glossaryTerms = 
getGlossaryForLanguage(glossary, targetLang) - const glossarySection = formatGlossaryForPrompt(glossaryTerms, "informal") + // Create ephemeral prompt with glossary terms baked in + const currentUser = await getCurrentUser() + const promptPath = path.join( + process.cwd(), + "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" + ) + const basePrompt = fs.readFileSync(promptPath, "utf8") - const fullPrompt = glossarySection - ? `${basePrompt}\n\n---\n\n${glossarySection}` - : basePrompt + // Get glossary for target language and append to prompt + const targetLang = allInternalCodes[0] + const glossaryTerms = getGlossaryForLanguage(glossary, targetLang) + const glossarySection = formatGlossaryForPrompt(glossaryTerms, "informal") - if (glossaryTerms.size > 0) { - console.log( - `[GLOSSARY] Injecting ${glossaryTerms.size} terms for ${targetLang} into prompt` - ) - } + const fullPrompt = glossarySection + ? `${basePrompt}\n\n---\n\n${glossarySection}` + : basePrompt - await updatePromptContent( - currentUser.id, - config.preTranslatePromptId, - fullPrompt + if (glossaryTerms.size > 0) { + console.log( + `[GLOSSARY] Injecting ${glossaryTerms.size} terms for ${targetLang} into prompt` ) - console.log("✓ Updated Crowdin pre-translate prompt from repo file") - } catch (e) { - console.warn("Failed to update prompt, continuing:", e) } + // Create ephemeral prompt for this job + const { promptId: ephemeralPromptId } = await createEphemeralPrompt({ + userId: currentUser.id, + languageCode: targetLang, + promptKey: "glossary", + promptText: fullPrompt, + }) + + // Store ephemeral prompt ID and user ID in context for pre-translation and cleanup + context.ephemeralPromptId = ephemeralPromptId + context.crowdinUserId = currentUser.id + console.log(`✓ Created ephemeral prompt (ID: ${ephemeralPromptId})`) + // Fetch English files const allEnglishFiles = await getAllEnglishFiles() diff --git a/src/scripts/i18n/lib/workflows/pre-translation.ts 
b/src/scripts/i18n/lib/workflows/pre-translation.ts index e7f14ae1819..42e1a63b53f 100644 --- a/src/scripts/i18n/lib/workflows/pre-translation.ts +++ b/src/scripts/i18n/lib/workflows/pre-translation.ts @@ -74,16 +74,23 @@ async function resumePreTranslation( * Start new pre-translation job */ async function startNewPreTranslation( - fileIdsSet: Set + fileIdsSet: Set, + ephemeralPromptId?: number ): Promise { logSection("Requesting AI Pre-Translation") console.log(`Files to translate: ${fileIdsSet.size}`) console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) - console.log(`AI Prompt ID: ${config.preTranslatePromptId}`) + + // Use ephemeral prompt if available, otherwise fall back to static prompt + const promptId = ephemeralPromptId ?? config.preTranslatePromptId + console.log( + `AI Prompt ID: ${promptId}${ephemeralPromptId ? " (ephemeral)" : ""}` + ) const applyPreTranslationResponse = await postApplyPreTranslation( Array.from(fileIdsSet), - config.allCrowdinCodes + config.allCrowdinCodes, + promptId ) console.log( @@ -140,7 +147,7 @@ export async function handlePreTranslation( // Resume existing or start new const preTranslateResponse = existingPreTranslationId ? 
await resumePreTranslation(existingPreTranslationId) - : await startNewPreTranslation(fileIdsSet) + : await startNewPreTranslation(fileIdsSet, context.ephemeralPromptId) // Build mapping for commit phase const { fileIds } = preTranslateResponse.attributes diff --git a/src/scripts/i18n/lib/workflows/types.ts b/src/scripts/i18n/lib/workflows/types.ts index a85815e2a40..e14f2f8a273 100644 --- a/src/scripts/i18n/lib/workflows/types.ts +++ b/src/scripts/i18n/lib/workflows/types.ts @@ -12,6 +12,10 @@ export interface WorkflowContext { processedFileIdToPath: Record englishBuffers: Record glossary: GlossaryByLanguage + /** Ephemeral prompt ID created for this job (to be cleaned up after) */ + ephemeralPromptId?: number + /** Crowdin user ID (needed for ephemeral prompt cleanup) */ + crowdinUserId?: number } /** diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 630132b9e35..177937ea77a 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1,3 +1,4 @@ +import { deleteEphemeralPrompt } from "./lib/crowdin/ephemeral-prompts" import { prepareEnglishFiles } from "./lib/workflows/file-preparation" import { initializeWorkflow } from "./lib/workflows/initialize" import { runJsxTranslation } from "./lib/workflows/jsx-translation" @@ -87,6 +88,21 @@ async function main() { `Languages: ${translationResult.languagePairs.map((p) => p.internalLanguageCode).join(", ")}` ) console.log(`Files: ${preTranslateResult.response.attributes.fileIds.length}`) + + // Cleanup ephemeral prompt (best effort - don't fail the workflow if cleanup fails) + if (context.ephemeralPromptId && context.crowdinUserId) { + try { + await deleteEphemeralPrompt( + context.crowdinUserId, + context.ephemeralPromptId + ) + } catch (err) { + console.warn( + `[WARN] Failed to cleanup ephemeral prompt ${context.ephemeralPromptId}:`, + err instanceof Error ? 
err.message : err + ) + } + } } main().catch((err) => { From 1c2948f90fa7ab0462b283e6f83a6db6ddcd9ffd Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 14:29:00 -0300 Subject: [PATCH 75/99] revert: artifact writing --- .../i18n/lib/workflows/pre-translation.ts | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/src/scripts/i18n/lib/workflows/pre-translation.ts b/src/scripts/i18n/lib/workflows/pre-translation.ts index 42e1a63b53f..5407b46fdcb 100644 --- a/src/scripts/i18n/lib/workflows/pre-translation.ts +++ b/src/scripts/i18n/lib/workflows/pre-translation.ts @@ -1,8 +1,5 @@ // Pre-translation workflow phase -import * as fs from "fs" -import * as path from "path" - import { config } from "../../config" import { awaitPreTranslationCompleted, @@ -14,37 +11,6 @@ import type { CrowdinPreTranslateResponse } from "../types" import type { PreTranslationResult, WorkflowContext } from "./types" import { logSection } from "./utils" -/** - * Write pre-translation artifact for GitHub Actions - */ -function writePreTranslationArtifact( - preTranslationId: string, - fileCount: number, - languages: string[] -): void { - const artifactData = { - preTranslationId, - timestamp: new Date().toISOString(), - fileCount, - languages, - targetPath: config.targetPath || null, - } - - const artifactDir = path.join(process.cwd(), "artifacts") - if (!fs.existsSync(artifactDir)) { - fs.mkdirSync(artifactDir, { recursive: true }) - } - - const artifactPath = path.join(artifactDir, "pre-translation-info.json") - fs.writeFileSync(artifactPath, JSON.stringify(artifactData, null, 2)) - - console.log(`\n[ARTIFACT] Pre-translation info written to ${artifactPath}`) - console.log(`[ARTIFACT] Pre-translation ID: ${preTranslationId}`) - console.log( - `[ARTIFACT] To resume this job later, use: PRETRANSLATION_ID=${preTranslationId}` - ) -} - /** * Resume existing pre-translation job */ @@ -97,13 +63,6 @@ async function 
startNewPreTranslation( `✓ Pre-translation job created (ID: ${applyPreTranslationResponse.identifier})` ) - // Write artifact with pre-translation ID - writePreTranslationArtifact( - applyPreTranslationResponse.identifier, - fileIdsSet.size, - config.allCrowdinCodes - ) - // Exit early if skipAwait is set or if full translation mode (no targetPath) if (config.skipAwait || !config.targetPath) { const reason = config.skipAwait From aec9dc8377596a5d3d0ed2f03a4fe03101ae55ba Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 14:39:04 -0300 Subject: [PATCH 76/99] refactor(i18n): DRY verbose logging with debugLog utility Add centralized debugLog() that handles verbose check internally. Replace 20+ scattered if(verbose) blocks across 11 files. Remove verbose parameters from functions (now uses config.verbose). - Added debugLog() to utils.ts with guard clause pattern - Removed redundant artifact writing from pre-translation - Net reduction: 65 lines of code --- src/scripts/i18n/lib/crowdin/files.ts | 52 ++++++------------- src/scripts/i18n/lib/github/branches.ts | 5 +- src/scripts/i18n/lib/github/commits.ts | 6 +-- src/scripts/i18n/lib/github/files.ts | 30 ++++------- .../i18n/lib/workflows/file-preparation.ts | 48 ++++++----------- .../i18n/lib/workflows/jsx-translation.ts | 12 ++--- .../i18n/lib/workflows/sanitization.ts | 9 ++-- .../lib/workflows/translation-download.ts | 25 +++------ src/scripts/i18n/lib/workflows/utils.ts | 10 ++++ src/scripts/i18n/lib/workflows/validation.ts | 25 ++++----- src/scripts/i18n/main.ts | 8 ++- 11 files changed, 84 insertions(+), 146 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index 772af4d7727..bcd615e64bc 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -10,6 +10,7 @@ import type { CrowdinFileData, GitHubCrowdinFileMetadata, } from "../types" +import { debugLog } from 
"../workflows/utils" /** * JSX component attributes that should be translated in markdown files. @@ -32,11 +33,7 @@ export const getCrowdinProjectFiles = async (): Promise => { ) url.searchParams.set("limit", "500") - if (config.verbose) { - console.log( - `[DEBUG] Fetching Crowdin project files from: ${url.toString()}` - ) - } + debugLog(`Fetching Crowdin project files from: ${url.toString()}`) try { const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) @@ -52,12 +49,7 @@ export const getCrowdinProjectFiles = async (): Promise => { const json: JsonResponse = await res.json() const mappedData = json.data.map(({ data }) => data) - - if (config.verbose) { - console.log( - `[DEBUG] Successfully fetched ${mappedData.length} Crowdin files` - ) - } + debugLog(`Successfully fetched ${mappedData.length} Crowdin files`) return mappedData } catch (error) { console.error(`[ERROR] Failed to fetch Crowdin project files:`, error) @@ -73,11 +65,7 @@ export const findCrowdinFile = ( targetFile: GitHubCrowdinFileMetadata, crowdinFiles: CrowdinFileData[] ): CrowdinFileData | null => { - if (config.verbose) { - console.log( - `[DEBUG] Looking for Crowdin file matching: ${targetFile.filePath}` - ) - } + debugLog(`Looking for Crowdin file matching: ${targetFile.filePath}`) const found = crowdinFiles.find(({ path }) => path.endsWith(targetFile.filePath) @@ -91,11 +79,9 @@ export const findCrowdinFile = ( return null } - if (config.verbose) { - console.log( - `[DEBUG] Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` - ) - } + debugLog( + `Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` + ) return found } @@ -105,9 +91,7 @@ export const findCrowdinFile = ( * This function makes them visible so they can be processed by pre-translation. 
*/ export const unhideStringsInFile = async (fileId: number): Promise => { - if (config.verbose) { - console.log(`[DEBUG] Checking for hidden strings in fileId=${fileId}`) - } + debugLog(`Checking for hidden strings in fileId=${fileId}`) // Get all strings from the file const listUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings?fileId=${fileId}&limit=500` @@ -189,7 +173,7 @@ export const getCrowdinProjectDirectories = async (): Promise< ) url.searchParams.set("limit", "500") - console.log(`[DEBUG] Fetching Crowdin directories: ${url.toString()}`) + debugLog(`Fetching Crowdin directories: ${url.toString()}`) try { const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) @@ -204,7 +188,7 @@ export const getCrowdinProjectDirectories = async (): Promise< } const json: DirJson = await res.json() const dirs = json.data.map(({ data }) => data) - console.log(`[DEBUG] Loaded ${dirs.length} directories`) + debugLog(`Loaded ${dirs.length} directories`) return dirs } catch (error) { console.error("[ERROR] getCrowdinProjectDirectories:", error) @@ -226,8 +210,8 @@ export const postCrowdinDirectory = async ( const body: Record = { name } if (parentDirectoryId) body.directoryId = parentDirectoryId - console.log( - `[DEBUG] Creating directory segment "${name}" parent=${parentDirectoryId ?? "ROOT"}` + debugLog( + `Creating directory segment "${name}" parent=${parentDirectoryId ?? 
"ROOT"}` ) try { @@ -251,7 +235,7 @@ export const postCrowdinDirectory = async ( type JsonResponse = { data: { id: number } } const json: JsonResponse = await res.json() - console.log(`[DEBUG] Created directory id=${json.data.id} name="${name}"`) + debugLog(`Created directory id=${json.data.id} name="${name}"`) return json.data.id } catch (error) { console.error("[ERROR] postCrowdinDirectory:", error) @@ -274,7 +258,7 @@ export const createCrowdinDirectory = async ( if (!fullPath || typeof fullPath !== "string") { throw new Error("createCrowdinDirectory: path must be a non-empty string") } - console.log(`[DEBUG] Ensuring Crowdin directory path: "${fullPath}"`) + debugLog(`Ensuring Crowdin directory path: "${fullPath}"`) const segments = fullPath .split("/") @@ -308,8 +292,8 @@ export const createCrowdinDirectory = async ( const k = key(currentParentId, segment) let dirId = directoryIndex.get(k) if (dirId) { - console.log( - `[DEBUG] Reusing existing directory "${segment}" id=${dirId} parent=${currentParentId ?? "ROOT"}` + debugLog( + `Reusing existing directory "${segment}" id=${dirId} parent=${currentParentId ?? 
"ROOT"}` ) currentParentId = dirId continue @@ -323,9 +307,7 @@ export const createCrowdinDirectory = async ( if (!currentParentId) throw new Error("Failed to resolve final directory id (unexpected)") - console.log( - `[DEBUG] Final directory id for path "${fullPath}" = ${currentParentId}` - ) + debugLog(`Final directory id for path "${fullPath}" = ${currentParentId}`) return currentParentId } diff --git a/src/scripts/i18n/lib/github/branches.ts b/src/scripts/i18n/lib/github/branches.ts index 2e05a97c077..e523a14f846 100644 --- a/src/scripts/i18n/lib/github/branches.ts +++ b/src/scripts/i18n/lib/github/branches.ts @@ -3,6 +3,7 @@ import { config, gitHubBearerHeaders } from "../../config" import type { BranchDetailsResponse, BranchObject } from "../types" import { fetchWithRetry } from "../utils/fetch" +import { debugLog } from "../workflows/utils" /** * Retrieves the Git object for a branch from the GitHub API @@ -58,8 +59,8 @@ export const postCreateBranchFrom = async ( ) try { - console.log( - `[DEBUG] Creating branch from base="${ref}" sha=${sha} -> new branch="${branch}"` + debugLog( + `Creating branch from base="${ref}" sha=${sha} -> new branch="${branch}"` ) const res = await fetchWithRetry(url.toString(), { method: "POST", diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/i18n/lib/github/commits.ts index a9bfc4c911c..4603afa1ec3 100644 --- a/src/scripts/i18n/lib/github/commits.ts +++ b/src/scripts/i18n/lib/github/commits.ts @@ -2,7 +2,7 @@ import { config, gitHubBearerHeaders } from "../../config" import { fetchWithRetry } from "../utils/fetch" -import { delay } from "../workflows/utils" +import { debugLog, delay } from "../workflows/utils" /** * Get the destination path for a translated file @@ -51,8 +51,8 @@ export const getDestinationFromPath = ( } } - console.log( - `[DEBUG] Destination mapping: ${crowdinFilePath} -> ${destinationPath} (lang=${internalLanguageCode})` + debugLog( + `Destination mapping: ${crowdinFilePath} -> 
${destinationPath} (lang=${internalLanguageCode})` ) return destinationPath } diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts index 0e9d80c7efb..b10120d3cec 100644 --- a/src/scripts/i18n/lib/github/files.ts +++ b/src/scripts/i18n/lib/github/files.ts @@ -7,6 +7,7 @@ import type { GitHubQueryResponseItem, } from "../types" import { fetchWithRetry } from "../utils/fetch" +import { debugLog } from "../workflows/utils" /** * Check if a path should be excluded @@ -31,7 +32,7 @@ function isFilePath(targetPath: string): boolean { export const getAllEnglishFiles = async (): Promise< GitHubQueryResponseItem[] > => { - const { targetPath, excludePath, verbose } = config + const { targetPath, excludePath } = config const excludedPaths = loadExcludedPaths() // Add runtime exclusion if specified @@ -39,13 +40,9 @@ export const getAllEnglishFiles = async (): Promise< ? [...excludedPaths, excludePath] : excludedPaths - if (verbose) { - console.log( - `[DEBUG] Excluded paths loaded: ${excludedPaths.length} entries` - ) - if (excludePath) { - console.log(`[DEBUG] Runtime exclude path: ${excludePath}`) - } + debugLog(`Excluded paths loaded: ${excludedPaths.length} entries`) + if (excludePath) { + debugLog(`Runtime exclude path: ${excludePath}`) } // Determine if targetPath is a file or directory @@ -80,9 +77,7 @@ export const getAllEnglishFiles = async (): Promise< } } - if (verbose) { - console.log(`[DEBUG] GitHub search query: ${query}`) - } + debugLog(`GitHub search query: ${query}`) const perPage = 100 const collected: GitHubQueryResponseItem[] = [] @@ -95,9 +90,7 @@ export const getAllEnglishFiles = async (): Promise< url.searchParams.set("per_page", perPage.toString()) url.searchParams.set("page", page.toString()) - if (verbose) { - console.log(`[DEBUG] Fetching search page ${page}...`) - } + debugLog(`Fetching search page ${page}...`) try { const res = await fetchWithRetry(url.toString(), { @@ -113,18 +106,13 @@ export const 
getAllEnglishFiles = async (): Promise< const json: JsonResponse = await res.json() if (!json.items.length) { - if (verbose) { - console.log(`[DEBUG] No more results at page ${page}`) - } + debugLog(`No more results at page ${page}`) hasMorePages = false break } collected.push(...json.items) - - if (verbose) { - console.log(`[DEBUG] Collected ${collected.length} items so far`) - } + debugLog(`Collected ${collected.length} items so far`) page += 1 if (page > 10) { diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts index bc565e1fbb3..fe4afa7fced 100644 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -21,7 +21,7 @@ import { formatGlossaryForPrompt, getGlossaryForLanguage } from "../supabase" import type { CrowdinFileData } from "../types" import type { FilePreparationResult, WorkflowContext } from "./types" -import { delay, logSection } from "./utils" +import { debugLog, delay, logSection } from "./utils" /** * Update existing file in Crowdin with latest English content @@ -32,8 +32,7 @@ async function updateCrowdinFile( download_url: string "Crowdin-API-FileName": string }, - foundFile: CrowdinFileData, - verbose: boolean + foundFile: CrowdinFileData ): Promise<{ fileId: number; path: string; buffer: Buffer }> { console.log( `Updating existing file in Crowdin: ${file.filePath} (ID: ${foundFile.id})` @@ -69,11 +68,7 @@ async function updateCrowdinFile( // Wait for file parsing after update const delayMs = 10000 - if (verbose) { - console.log( - `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to re-parse updated file...` - ) - } + debugLog(`Waiting ${delayMs / 1000}s for Crowdin to re-parse updated file...`) await delay(delayMs) return { @@ -86,14 +81,11 @@ async function updateCrowdinFile( /** * Create new file in Crowdin */ -async function createCrowdinFile( - file: { - filePath: string - download_url: string - "Crowdin-API-FileName": 
string - }, - verbose: boolean -): Promise<{ fileId: number; path: string; buffer: Buffer }> { +async function createCrowdinFile(file: { + filePath: string + download_url: string + "Crowdin-API-FileName": string +}): Promise<{ fileId: number; path: string; buffer: Buffer }> { console.log(`Creating new file in Crowdin: ${file.filePath}`) const fileBuffer = await downloadGitHubFile(file.download_url) @@ -117,11 +109,7 @@ async function createCrowdinFile( // Wait for new file parsing const delayMs = 10000 - if (verbose) { - console.log( - `[DEBUG] Waiting ${delayMs / 1000}s for Crowdin to parse new file...` - ) - } + debugLog(`Waiting ${delayMs / 1000}s for Crowdin to parse new file...`) await delay(delayMs) return { @@ -137,7 +125,7 @@ async function createCrowdinFile( export async function prepareEnglishFiles( context: WorkflowContext ): Promise { - const { verbose, allInternalCodes } = config + const { allInternalCodes } = config const { crowdinProjectFiles, fileIdsSet, @@ -192,12 +180,8 @@ export async function prepareEnglishFiles( process.exit(0) } - if (verbose) { - console.log(`[DEBUG] Found ${allEnglishFiles.length} English files`) - console.log( - `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` - ) - } + debugLog(`Found ${allEnglishFiles.length} English files`) + debugLog(`Found ${crowdinProjectFiles.length} files in Crowdin project`) const fileMetadata = await getFileMetadata(allEnglishFiles) @@ -207,17 +191,15 @@ export async function prepareEnglishFiles( // Iterate through each file and upload/update for (const file of fileMetadata) { - if (verbose) { - console.log(`[DEBUG] Processing file: ${file.filePath}`) - } + debugLog(`Processing file: ${file.filePath}`) try { // findCrowdinFile returns null if file doesn't exist (will be created) const foundFile = findCrowdinFile(file, crowdinProjectFiles) const result = foundFile - ? await updateCrowdinFile(file, foundFile, verbose) - : await createCrowdinFile(file, verbose) + ? 
await updateCrowdinFile(file, foundFile) + : await createCrowdinFile(file) fileIdsSet.add(result.fileId) if (result.path) { diff --git a/src/scripts/i18n/lib/workflows/jsx-translation.ts b/src/scripts/i18n/lib/workflows/jsx-translation.ts index 0da3f83ece2..8f3821d2581 100644 --- a/src/scripts/i18n/lib/workflows/jsx-translation.ts +++ b/src/scripts/i18n/lib/workflows/jsx-translation.ts @@ -1,5 +1,6 @@ // JSX attribute translation workflow phase +import { config } from "../../config" import { translateJsxAttributes } from "../../translate-jsx-attributes" import { isGeminiAvailable } from "../ai" import { putCommitFile } from "../github/commits" @@ -7,7 +8,7 @@ import type { GlossaryByLanguage } from "../supabase" import { getGlossaryForLanguage } from "../supabase" import type { CommittedFile, LanguagePair } from "./types" -import { logSection } from "./utils" +import { debugLog, logSection } from "./utils" export interface JsxTranslationResult { /** Whether Gemini was skipped due to missing API key */ @@ -26,8 +27,7 @@ export async function runJsxTranslation( committedFiles: CommittedFile[], languagePairs: LanguagePair[], branch: string, - glossary: GlossaryByLanguage, - verbose: boolean + glossary: GlossaryByLanguage ): Promise { logSection("JSX Attribute Translation") @@ -69,7 +69,7 @@ export async function runJsxTranslation( targetLanguage: langCode, files: langFiles, glossaryTerms, - verbose, + verbose: config.verbose, }) // Commit updated files @@ -78,9 +78,7 @@ export async function runJsxTranslation( try { const buf = Buffer.from(updated.updatedContent, "utf8") await putCommitFile(buf, updated.filePath, branch) - if (verbose) { - console.log(`[JSX-TRANSLATE] Committed: ${updated.filePath}`) - } + debugLog(`JSX-TRANSLATE: Committed ${updated.filePath}`) // Update the committedFiles array with new content for sanitizer const existingFile = committedFiles.find( diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts 
b/src/scripts/i18n/lib/workflows/sanitization.ts index 3022be4c13d..c5285b24a7f 100644 --- a/src/scripts/i18n/lib/workflows/sanitization.ts +++ b/src/scripts/i18n/lib/workflows/sanitization.ts @@ -4,7 +4,7 @@ import { runSanitizer } from "../../post_import_sanitize" import { putCommitFile } from "../github/commits" import type { CommittedFile } from "./types" -import { logSection } from "./utils" +import { debugLog, logSection } from "./utils" export interface SanitizationResult { /** Files that were modified by the sanitizer */ @@ -19,8 +19,7 @@ export interface SanitizationResult { */ export async function runPostImportSanitization( committedFiles: CommittedFile[], - branch: string, - verbose: boolean + branch: string ): Promise { logSection("Running Post-Import Sanitizer") @@ -37,9 +36,7 @@ export async function runPostImportSanitization( try { const buf = Buffer.from(file.content, "utf8") await putCommitFile(buf, relPath, branch) - if (verbose) { - console.log(`[DEBUG] Committed sanitized file: ${relPath}`) - } + debugLog(`Committed sanitized file: ${relPath}`) // Update committedFiles with sanitized content for validation const existingFile = committedFiles.find((f) => f.path === relPath) diff --git a/src/scripts/i18n/lib/workflows/translation-download.ts b/src/scripts/i18n/lib/workflows/translation-download.ts index ef2ea8f559a..16dd51e8c23 100644 --- a/src/scripts/i18n/lib/workflows/translation-download.ts +++ b/src/scripts/i18n/lib/workflows/translation-download.ts @@ -13,7 +13,7 @@ import type { TranslationDownloadResult, WorkflowContext, } from "./types" -import { logSection, logSubsection } from "./utils" +import { debugLog, logSection, logSubsection } from "./utils" /** * Build language pair mappings from Crowdin IDs to internal codes @@ -32,7 +32,6 @@ export async function downloadAndCommitTranslations( preTranslateResult: PreTranslationResult, context: WorkflowContext ): Promise { - const { verbose } = config const { englishBuffers } = context const { 
response, fileIdToPathMapping } = preTranslateResult @@ -63,9 +62,7 @@ export async function downloadAndCommitTranslations( for (const fileId of fileIds) { const crowdinPath = fileIdToPathMapping[fileId] - if (verbose) { - console.log(`[DEBUG] Processing fileId: ${fileId} (${crowdinPath})`) - } + debugLog(`Processing fileId: ${fileId} (${crowdinPath})`) // 1- Build translation const { url: downloadUrl } = await postBuildProjectFileTranslation( @@ -76,19 +73,14 @@ export async function downloadAndCommitTranslations( // 2- Download const { buffer } = await getBuiltFile(downloadUrl) - - if (verbose) { - console.log(`[DEBUG] Downloaded ${buffer.length} bytes`) - } + debugLog(`Downloaded ${buffer.length} bytes`) // Check if translation differs from English const originalEnglish = englishBuffers[fileId] if (originalEnglish && originalEnglish.compare(buffer) === 0) { - if (verbose) { - console.warn( - `[DEBUG] Skipping commit - content identical to English (no translation)` - ) - } + debugLog( + `Skipping commit - content identical to English (no translation)` + ) continue } @@ -97,10 +89,7 @@ export async function downloadAndCommitTranslations( crowdinPath, internalLanguageCode ) - - if (verbose) { - console.log(`[DEBUG] Committing to: ${destinationPath}`) - } + debugLog(`Committing to: ${destinationPath}`) await putCommitFile(buffer, destinationPath, branch) diff --git a/src/scripts/i18n/lib/workflows/utils.ts b/src/scripts/i18n/lib/workflows/utils.ts index e6776676704..bd81c73819a 100644 --- a/src/scripts/i18n/lib/workflows/utils.ts +++ b/src/scripts/i18n/lib/workflows/utils.ts @@ -1,11 +1,21 @@ // Common utilities for i18n workflows +import { config } from "../../config" + /** * Delay execution for specified milliseconds */ export const delay = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)) +/** + * Log debug message (only when verbose mode is enabled) + */ +export function debugLog(message: string): void { + if (!config.verbose) return + 
console.log(`[DEBUG] ${message}`) +} + /** * Log a section header with consistent formatting */ diff --git a/src/scripts/i18n/lib/workflows/validation.ts b/src/scripts/i18n/lib/workflows/validation.ts index 21ffc20528f..17241baf6ab 100644 --- a/src/scripts/i18n/lib/workflows/validation.ts +++ b/src/scripts/i18n/lib/workflows/validation.ts @@ -1,6 +1,5 @@ // Syntax tree validation workflow phase -import { config } from "../../config" import { postPullRequestComment } from "../github/pull-requests" import { formatValidationComment, @@ -10,7 +9,7 @@ import { } from "../validation/syntax-tree" import type { CommittedFile, PullRequest } from "./types" -import { logSection } from "./utils" +import { debugLog, logSection } from "./utils" /** Default threshold for JSX attribute untranslated percentage */ const DEFAULT_JSX_THRESHOLD = 5 @@ -24,8 +23,6 @@ export async function runSyntaxValidation( englishBuffers: Record, fileIdToPathMapping: Record ): Promise { - const { verbose } = config - logSection("Running Syntax Tree Validation") const validationResults: Parameters[0] = [] @@ -73,9 +70,7 @@ export async function runSyntaxValidation( } if (!englishContent) { - if (verbose) { - console.warn(`[DEBUG] Could not find English source for ${file.path}`) - } + debugLog(`Could not find English source for ${file.path}`) continue } @@ -87,8 +82,8 @@ export async function runSyntaxValidation( type: "json", result, }) - if (!result.isValid && verbose) { - console.log(`[DEBUG] JSON validation failed for ${file.path}`) + if (!result.isValid) { + debugLog(`JSON validation failed for ${file.path}`) } } else if (isMarkdown) { const result = validateMarkdownStructure(englishContent, file.content) @@ -97,8 +92,8 @@ export async function runSyntaxValidation( type: "markdown", result, }) - if (!result.isValid && verbose) { - console.log(`[DEBUG] Markdown validation failed for ${file.path}`) + if (!result.isValid) { + debugLog(`Markdown validation failed for ${file.path}`) } // Also validate 
JSX attributes for markdown files (compare against English) @@ -115,11 +110,9 @@ export async function runSyntaxValidation( type: "jsx-attributes", result: jsxResult, }) - if (verbose) { - console.log( - `[DEBUG] JSX attribute validation flagged ${file.path}: ${jsxResult.untranslatedPercentage.toFixed(1)}% untranslated` - ) - } + debugLog( + `JSX attribute validation flagged ${file.path}: ${jsxResult.untranslatedPercentage.toFixed(1)}% untranslated` + ) } } } diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 177937ea77a..9faade3c640 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -14,7 +14,7 @@ import { config } from "./config" * Main orchestration function */ async function main() { - const { verbose, existingPreTranslationId } = config + const { existingPreTranslationId } = config // Phase 1: Initialize workflow const context = await initializeWorkflow() @@ -38,15 +38,13 @@ async function main() { translationResult.committedFiles, translationResult.languagePairs, translationResult.branch, - context.glossary, - verbose + context.glossary ) // Phase 6: Run post-import sanitizer const sanitizeResult = await runPostImportSanitization( translationResult.committedFiles, - translationResult.branch, - verbose + translationResult.branch ) // Check if PR creation should be skipped From fa5f9ae8f40e07999d8a7b801a180ee7c631f30f Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 14:42:34 -0300 Subject: [PATCH 77/99] fix(i18n): clean up PR body metadata - Remove prompt ID (ephemeral prompts are deleted after use) - Format workflow URL as readable markdown link --- src/scripts/i18n/lib/workflows/pr-creation.ts | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/scripts/i18n/lib/workflows/pr-creation.ts b/src/scripts/i18n/lib/workflows/pr-creation.ts index 333b5aa5b49..147fd1794b2 100644 --- a/src/scripts/i18n/lib/workflows/pr-creation.ts +++ 
b/src/scripts/i18n/lib/workflows/pr-creation.ts @@ -33,7 +33,6 @@ export function generatePRTitle( /** Options for PR body generation */ export interface PRBodyOptions { geminiSkipped?: boolean - promptId?: number workflowRunUrl?: string } @@ -66,14 +65,8 @@ export function generatePRBody( let prBody = `## Description\n\n` prBody += `This PR contains automated ${aiModelName} translations from Crowdin.\n\n` - if (options.promptId) { - prBody += `**Prompt ID:** ${options.promptId}\n` - } if (options.workflowRunUrl) { - prBody += `**Workflow Run:** ${options.workflowRunUrl}\n` - } - if (options.promptId || options.workflowRunUrl) { - prBody += `\n` + prBody += `[🔗 View workflow run](${options.workflowRunUrl})\n\n` } // Language section @@ -176,7 +169,6 @@ export async function createTranslationPR( // Add workflow metadata to options const fullOptions: PRBodyOptions = { ...options, - promptId: config.preTranslatePromptId, workflowRunUrl: getWorkflowRunUrl(), } From 2c6ad1143cf15b625dbdf8cdc48a10dfb745be73 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sat, 20 Dec 2025 14:55:55 -0300 Subject: [PATCH 78/99] fix(i18n): copy AI provider settings to ephemeral prompts The ephemeral prompt must have aiProviderId and aiModelId set, otherwise Crowdin returns 422 "Specified AI prompt provider is not enabled". Fetches these settings from the static prompt and copies them. 
--- .../i18n/lib/workflows/file-preparation.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts index fe4afa7fced..1191f5c18b6 100644 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -11,6 +11,7 @@ import { postFileToStorage, unhideStringsInFile, } from "../crowdin/files" +import { getPromptInfo } from "../crowdin/prompt" import { getCurrentUser } from "../crowdin/user" import { downloadGitHubFile, @@ -138,6 +139,16 @@ export async function prepareEnglishFiles( // Create ephemeral prompt with glossary terms baked in const currentUser = await getCurrentUser() + + // Get AI provider/model settings from the static prompt + const staticPromptInfo = await getPromptInfo( + currentUser.id, + config.preTranslatePromptId + ) + debugLog( + `Static prompt AI settings: provider=${staticPromptInfo.aiProviderId}, model=${staticPromptInfo.aiModelId}` + ) + const promptPath = path.join( process.cwd(), "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" @@ -159,12 +170,14 @@ export async function prepareEnglishFiles( ) } - // Create ephemeral prompt for this job + // Create ephemeral prompt for this job (copy AI provider from static prompt) const { promptId: ephemeralPromptId } = await createEphemeralPrompt({ userId: currentUser.id, languageCode: targetLang, promptKey: "glossary", promptText: fullPrompt, + aiProviderId: staticPromptInfo.aiProviderId ?? undefined, + aiModelId: staticPromptInfo.aiModelId ?? 
undefined, }) // Store ephemeral prompt ID and user ID in context for pre-translation and cleanup From 289b3a1c0ec91d6ebddaf34c12d78202925191fb Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Sun, 21 Dec 2025 11:41:48 -0300 Subject: [PATCH 79/99] feat(i18n): enhance post_import_sanitize with additional fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add comprehensive protected frontmatter fields list (published, author, skill, etc.) - Fix ASCII guillemets (<< >>) to Unicode (« ») while preserving code blocks - Add fixMergedClosingTags for ButtonLink/Link components - Fix escaped backticks (\`) to regular backticks - Skip YAML arrays and dash-prefixed items in quote wrapping - Improve collapseInlineHtmlFromEnglish to handle nested tags --- src/scripts/i18n/post_import_sanitize.ts | 479 ++++++++++++++++++++++- 1 file changed, 478 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index d67aa087399..0d3b15032fb 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -327,6 +327,414 @@ function fixBlockComponentLineBreaks(md: string): { return { content, fixCount } } +/** + * Collapse inline HTML tags to single line when English source has them on one line. + * Fixes MDX paragraph wrapping issues:

content\n
content
+ */ +function collapseInlineHtmlFromEnglish( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const inlineTags = ["div", "span", "p", "strong", "em"] + let content = translatedMd + let fixCount = 0 + + // Build a set of lines in English where tag opens and closes on same line + const englishLines = englishMd.split("\n") + + for (const tag of inlineTags) { + // Find English lines that have ... all on one line + // (content can include nested tags like ,
, etc.) + const singleLinePattern = new RegExp(`<${tag}[^>]*>.*`) + const englishSingleLineSet = new Set() + + for (const line of englishLines) { + if (singleLinePattern.test(line)) { + // Extract just the opening tag to use as a key + const openTagMatch = line.match(new RegExp(`<${tag}[^>]*>`)) + if (openTagMatch) { + englishSingleLineSet.add(openTagMatch[0]) + } + } + } + + // In translated content, find cases where: + // - Opening tag + content is on one line (content may include nested tags) + // - Newline follows + // - Closing tag is on the next line (possibly with leading whitespace) + // Pattern: content-with-possible-nested-tags\n + const translatedMultiLineRe = new RegExp( + `(<${tag}[^>]*>)([^\\n]+)\\n(\\s*)`, + "g" + ) + + content = content.replace( + translatedMultiLineRe, + (fullMatch, openTag, innerContent, closeTagLine) => { + // Check if this opening tag should be single-line per English + if (englishSingleLineSet.has(openTag)) { + fixCount++ + // Collapse: opening tag + trimmed content + closing tag (no newline) + return `${openTag}${innerContent.trim()}${closeTagLine.trim()}` + } + return fullMatch + } + ) + } + + return { content, fixCount } +} + +/** + * Fix JSX component closing tags that are merged with content. + * English format: + * + * Content + * + * Spanish (broken): + * + * Content + * This function splits the closing tag to its own line when English has it that way. 
+ */ +function fixMergedClosingTags( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const componentTags = ["ButtonLink", "Link"] + let content = translatedMd + let fixCount = 0 + + for (const tag of componentTags) { + // Find patterns in English where the closing tag is on its own line + // Pattern: \n content\n or \n content\n + const englishMultiLineRe = new RegExp( + `<${tag}[^>]*>\\n[\\s\\S]*?\\n\\s*`, + "g" + ) + + // Check if English uses multi-line format for this component + if (!englishMultiLineRe.test(englishMd)) continue + + // In translated content, find cases where closing tag is merged with content on same line + // Pattern: \n content (content and closing tag on same line) + const mergedPattern = new RegExp( + `(<${tag}[^>]*>\\n)(\\s*)([^\\n]+)()`, + "g" + ) + + content = content.replace( + mergedPattern, + (match, openTagLine, indent, innerContent, closeTag) => { + // Only fix if the inner content doesn't end with just whitespace + // and the closing tag is directly after content (not on its own line) + const trimmedContent = innerContent.trimEnd() + if (trimmedContent.length > 0 && !innerContent.includes("\n")) { + fixCount++ + // Split: put closing tag on its own line with same indentation + return `${openTagLine}${indent}${trimmedContent}\n${indent}${closeTag}` + } + return match + } + ) + } + + return { content, fixCount } +} + +/** + * Repair unclosed backticks by comparing with English source. + * Detects lines with odd backtick counts containing < and attempts repair. 
+ */ +function repairUnclosedBackticks( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const translatedLines = translatedMd.split("\n") + const englishLines = englishMd.split("\n") + let fixCount = 0 + + for (let i = 0; i < translatedLines.length; i++) { + const line = translatedLines[i] + const backtickCount = (line.match(/`/g) || []).length + + // Odd number of backticks and contains < means potentially unclosed code with HTML-like content + if ( + backtickCount % 2 === 1 && + line.includes("<") && + !line.includes("```") + ) { + // Try to find a matching English line with similar structure + for (const engLine of englishLines) { + // Look for English lines with balanced backticks containing similar patterns + const engBackticks = (engLine.match(/`/g) || []).length + if ( + engBackticks % 2 === 0 && + engBackticks > 0 && + engLine.includes("<") + ) { + // Extract inline code blocks from English + const codeBlockRe = /`([^`]+)`/g + let engMatch + while ((engMatch = codeBlockRe.exec(engLine))) { + const engCode = engMatch[1] + // Check if the translated line contains this code pattern without closing backtick + const unbalancedPattern = new RegExp( + "`" + + engCode + .replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + .replace(/\s+/g, "\\s*") + ) + if ( + unbalancedPattern.test(line) && + !line.includes("`" + engCode + "`") + ) { + // Found a match - add the missing closing backtick + translatedLines[i] = line.replace( + new RegExp( + "`" + + engCode + .replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + .replace(/\s+/g, "\\s*") + ), + "`" + engCode + "`" + ) + fixCount++ + break + } + } + if (fixCount > 0) break + } + } + } + } + + return { content: translatedLines.join("\n"), fixCount } +} + +/** + * Normalize frontmatter dates from localized format (DD-MM-YYYY) back to ISO (YYYY-MM-DD). 
+ */ +function normalizeFrontmatterDates(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Match frontmatter block + const frontmatterRe = /^---\n([\s\S]*?)\n---/ + const match = content.match(frontmatterRe) + if (!match) return { content, fixCount } + + let frontmatter = match[1] + const originalFrontmatter = frontmatter + + // Fix published: dates in DD-MM-YYYY or DD/MM/YYYY format + frontmatter = frontmatter.replace( + /^(published:\s*)(\d{1,2})[-/](\d{1,2})[-/](\d{4})$/gm, + (_, prefix, day, month, year) => { + fixCount++ + // Pad day and month with leading zeros if needed + const paddedDay = day.padStart(2, "0") + const paddedMonth = month.padStart(2, "0") + return `${prefix}${year}-${paddedMonth}-${paddedDay}` + } + ) + + if (frontmatter !== originalFrontmatter) { + content = content.replace(frontmatterRe, `---\n${frontmatter}\n---`) + } + + return { content, fixCount } +} + +/** + * Sync protected frontmatter fields from English source. + * These fields should never be translated (e.g., template, sidebar). 
+ */ +function syncProtectedFrontmatterFields( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + // Fields that should never be translated - sync from English canonical + // Note: 'buttons' array needs special handling (content translatable, toId/isSecondary not) + // Note: 'lang' must NOT be protected - it must remain as target language code + const protectedFields = [ + "template", + "sidebar", + "sidebarDepth", + "published", + "author", + "source", + "sourceUrl", + "address", + "emoji", + "skill", + "isOutdated", + "incomplete", + "hideEditButton", + "showDropdown", + "image", + "blurDataURL", + ] + let fixCount = 0 + + // Extract frontmatter from both + const frontmatterRe = /^---\n([\s\S]*?)\n---/ + const transMatch = translatedMd.match(frontmatterRe) + const engMatch = englishMd.match(frontmatterRe) + + if (!transMatch || !engMatch) return { content: translatedMd, fixCount } + + let transFrontmatter = transMatch[1] + const engFrontmatter = engMatch[1] + + for (const field of protectedFields) { + // Get English value + const engFieldRe = new RegExp(`^${field}:\\s*(.+)$`, "m") + const engFieldMatch = engFrontmatter.match(engFieldRe) + if (!engFieldMatch) continue + + const englishValue = engFieldMatch[1].trim() + + // Check if translated value differs + const transFieldRe = new RegExp(`^${field}:\\s*(.+)$`, "m") + const transFieldMatch = transFrontmatter.match(transFieldRe) + + if (transFieldMatch) { + const translatedValue = transFieldMatch[1].trim() + // Remove quotes for comparison + const cleanTranslated = translatedValue.replace(/^["']|["']$/g, "") + const cleanEnglish = englishValue.replace(/^["']|["']$/g, "") + + if (cleanTranslated !== cleanEnglish) { + // Replace with English value + transFrontmatter = transFrontmatter.replace( + transFieldRe, + `${field}: ${englishValue}` + ) + fixCount++ + } + } + } + + if (fixCount > 0) { + return { + content: translatedMd.replace( + frontmatterRe, + 
`---\n${transFrontmatter}\n---` + ), + fixCount, + } + } + + return { content: translatedMd, fixCount } +} + +/** + * Fix ASCII guillemets (<< and >>) to proper Unicode guillemets (« and »). + * Prevents MDX parsing errors from malformed angle bracket sequences. + * IMPORTANT: Skips code blocks where << and >> are valid bit-shift operators. + */ +function fixAsciiGuillemets(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Split content to preserve code blocks (both fenced and inline) + // Fenced: ```...``` or ~~~...~~~ + // Inline: `...` + const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g + const parts = content.split(codeBlockPattern) + + for (let i = 0; i < parts.length; i++) { + // Skip code blocks (odd indices after split with capturing group) + if (i % 2 === 1) continue + + // Count and replace in non-code parts only + const leftMatches = parts[i].match(/<>/g) + + if (leftMatches) { + fixCount += leftMatches.length + parts[i] = parts[i].replace(/<>/g, "»") + } + } + + return { content: parts.join(""), fixCount } +} + +/** + * Wrap frontmatter string values containing non-ASCII characters in double quotes. + * Prevents YAML parsing issues with accented characters. 
+ */ +function quoteFrontmatterNonAscii(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Match frontmatter block + const frontmatterRe = /^---\n([\s\S]*?)\n---/ + const match = content.match(frontmatterRe) + if (!match) return { content, fixCount } + + let frontmatter = match[1] + const originalFrontmatter = frontmatter + + // Find lines with unquoted values containing non-ASCII + const lines = frontmatter.split("\n") + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + // Match key: value pattern + const keyValueRe = /^(\s*\w+:\s*)(.+)$/ + const kvMatch = line.match(keyValueRe) + if (kvMatch) { + const [, prefix, value] = kvMatch + const trimmedValue = value.trim() + + // Skip if already quoted (starts and ends with matching quotes) + if ( + (trimmedValue.startsWith('"') && trimmedValue.endsWith('"')) || + (trimmedValue.startsWith("'") && trimmedValue.endsWith("'")) + ) { + continue + } + + // Skip YAML arrays - they handle their own internal quoting + // Inline arrays: tags: [ "value1", "value2" ] + if (trimmedValue.startsWith("[") && trimmedValue.endsWith("]")) { + continue + } + // Multi-line array items with - prefix won't match our key:value regex, + // but check explicitly for robustness (e.g., `key: - value` edge case) + if (trimmedValue.startsWith("-")) { + continue + } + + // Check if value contains non-ASCII characters + // eslint-disable-next-line no-control-regex + if (/[^\x00-\x7F]/.test(value)) { + // Escape any existing double quotes in the value + const escapedValue = trimmedValue.replace(/"/g, '\\"') + lines[i] = `${prefix}"${escapedValue}"` + fixCount++ + } + } + } + + frontmatter = lines.join("\n") + if (frontmatter !== originalFrontmatter) { + content = content.replace(frontmatterRe, `---\n${frontmatter}\n---`) + } + + return { content, fixCount } +} + function processMarkdownFile( mdPath: string, providedContent?: string @@ -346,7 +754,9 @@ function processMarkdownFile( if (idx === -1 || 
idx + 2 >= parts.length) { issues.push("No translations segment found; skipping formatting sync") } else { - const englishPath = path.join( + // Use path.resolve to preserve absolute paths (path.join loses leading /) + const englishPath = path.resolve( + path.sep, ...parts.slice(0, idx), ...parts.slice(idx + 2) // drop translations/ ) @@ -360,6 +770,39 @@ function processMarkdownFile( const before = content + // Fix frontmatter issues (don't need English source) + const dateResult = normalizeFrontmatterDates(content) + content = dateResult.content + if (dateResult.fixCount > 0) { + issues.push( + `Normalized ${dateResult.fixCount} frontmatter dates to ISO format` + ) + } + + const quoteResult = quoteFrontmatterNonAscii(content) + content = quoteResult.content + if (quoteResult.fixCount > 0) { + issues.push( + `Quoted ${quoteResult.fixCount} frontmatter values with non-ASCII chars` + ) + } + + const guillemetResult = fixAsciiGuillemets(content) + content = guillemetResult.content + if (guillemetResult.fixCount > 0) { + issues.push( + `Fixed ${guillemetResult.fixCount} ASCII guillemets (<< >>) to Unicode (« »)` + ) + } + + // Fix escaped backticks (\`) to regular backticks (`) + // Crowdin sometimes escapes backticks unnecessarily + const escapedBacktickCount = (content.match(/\\`/g) || []).length + if (escapedBacktickCount > 0) { + content = content.replace(/\\`/g, "`") + issues.push(`Unescaped ${escapedBacktickCount} backslash-escaped backticks`) + } + // Fix block component line breaks (critical for MDX parser) const blockResult = fixBlockComponentLineBreaks(content) content = blockResult.content @@ -371,6 +814,33 @@ function processMarkdownFile( // Normalize inline components and restore blank lines from English source if (englishMd) { + // Sync protected frontmatter fields (template, sidebar, etc.) 
+ const protectedResult = syncProtectedFrontmatterFields(content, englishMd) + content = protectedResult.content + if (protectedResult.fixCount > 0) { + issues.push( + `Synced ${protectedResult.fixCount} protected frontmatter fields from English` + ) + } + + // Collapse inline HTML tags to match English single-line format + const inlineHtmlResult = collapseInlineHtmlFromEnglish(content, englishMd) + content = inlineHtmlResult.content + if (inlineHtmlResult.fixCount > 0) { + issues.push( + `Collapsed ${inlineHtmlResult.fixCount} inline HTML tags to match English` + ) + } + + // Fix JSX component closing tags merged with content (split to own line) + const mergedTagResult = fixMergedClosingTags(content, englishMd) + content = mergedTagResult.content + if (mergedTagResult.fixCount > 0) { + issues.push( + `Split ${mergedTagResult.fixCount} merged closing tags to own lines` + ) + } + // Collapse inline component line breaks to match English format const inlineResult = normalizeInlineComponentsFromEnglish( content, @@ -383,6 +853,13 @@ function processMarkdownFile( ) } + // Repair unclosed backticks in inline code + const backtickResult = repairUnclosedBackticks(content, englishMd) + content = backtickResult.content + if (backtickResult.fixCount > 0) { + issues.push(`Repaired ${backtickResult.fixCount} unclosed backticks`) + } + const blankLineResult = restoreBlankLinesFromEnglish(content, englishMd) content = blankLineResult.content if (blankLineResult.fixCount > 0) { From b19a1a6a430c3a208342212423626235f81853e5 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 22 Dec 2025 13:53:49 -0300 Subject: [PATCH 80/99] refactor: do-not-translate path list --- src/scripts/i18n/config.ts | 27 +++++++-------------- src/scripts/i18n/config/excluded-paths.json | 1 - src/scripts/i18n/lib/github/files.ts | 13 +++++----- 3 files changed, 16 insertions(+), 25 deletions(-) delete mode 100644 src/scripts/i18n/config/excluded-paths.json 
diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index d9a79e61266..ab3cd5b1fd5 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -1,6 +1,3 @@ -import * as fs from "fs" -import * as path from "path" - import * as dotenv from "dotenv" import i18nConfig from "../../../i18n.config.json" @@ -160,19 +157,14 @@ export const config = { verbose, } -// Load excluded paths from canonical config file -export function loadExcludedPaths(): string[] { - try { - const excludedPathsFile = path.join( - process.cwd(), - "src/scripts/i18n/config/excluded-paths.json" - ) - const raw = fs.readFileSync(excludedPathsFile, "utf8") - return JSON.parse(raw) as string[] - } catch { - return [] - } -} +// Do not translate list - Declare paths that should never be translated +export const doNotTranslatePaths = [ + "/cookie-policy/", + "/privacy-policy/", + "/terms-of-use/", + "/terms-and-conditions/", + "/style-guide/", +] // Validation for target path export function validateTargetPath(targetPath: string): void { @@ -203,8 +195,7 @@ export function validateTargetPath(targetPath: string): void { } // Disallowed: explicitly excluded paths from config file - const excludedPaths = loadExcludedPaths() - for (const excluded of excludedPaths) { + for (const excluded of doNotTranslatePaths) { if (targetPath.includes(excluded)) { throw new Error( `[ERROR] Invalid target path: "${targetPath}"\n` + diff --git a/src/scripts/i18n/config/excluded-paths.json b/src/scripts/i18n/config/excluded-paths.json deleted file mode 100644 index e179216a205..00000000000 --- a/src/scripts/i18n/config/excluded-paths.json +++ /dev/null @@ -1 +0,0 @@ -["/cookie-policy/", "/privacy-policy/", "/terms-of-use/", "/style-guide/"] diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts index b10120d3cec..64ee90c933b 100644 --- a/src/scripts/i18n/lib/github/files.ts +++ b/src/scripts/i18n/lib/github/files.ts @@ -1,6 +1,6 @@ // GitHub file operations 
-import { config, gitHubBearerHeaders, loadExcludedPaths } from "../../config" +import { config, doNotTranslatePaths, gitHubBearerHeaders } from "../../config" import type { ContentType, GitHubCrowdinFileMetadata, @@ -33,16 +33,17 @@ export const getAllEnglishFiles = async (): Promise< GitHubQueryResponseItem[] > => { const { targetPath, excludePath } = config - const excludedPaths = loadExcludedPaths() // Add runtime exclusion if specified const allExcludedPaths = excludePath - ? [...excludedPaths, excludePath] - : excludedPaths + ? [...doNotTranslatePaths, excludePath] + : doNotTranslatePaths - debugLog(`Excluded paths loaded: ${excludedPaths.length} entries`) + debugLog( + `Do-not-translate paths loaded: ${doNotTranslatePaths.length} entries` + ) if (excludePath) { - debugLog(`Runtime exclude path: ${excludePath}`) + debugLog(`Runtime path exclusions: ${excludePath}`) } // Determine if targetPath is a file or directory From 7f9356b2bf1684dcb32a1e94edd55c4b65e71585 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 8 Jan 2026 14:51:22 -0800 Subject: [PATCH 81/99] feat(i18n): add auto-fix for common translation issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sanitizer improvements: - Brand name detection: warns when protected brands (Solidity, Alchemy, MetaMask, etc.) appear in English but are missing from translation - Duplicated headings: fixes "## Text? Text? {#id}" → "## Text? {#id}" - Broken markdown links: fixes "] (https://" → "](https://" Updated pre-translate prompt to clarify that brand names should not be translated even when they have common translations in target languages. 
--- .../i18n/lib/crowdin/pre-translate-prompt.txt | 2 +- src/scripts/i18n/post_import_sanitize.ts | 115 ++++++++++++++++++ 2 files changed, 116 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt index fe02bc2ada6..d35cf850695 100644 --- a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt +++ b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt @@ -10,7 +10,7 @@ CRITICAL DO-NOT-BREAK RULES (must follow exactly): - Custom header IDs: If a Markdown heading includes a custom anchor like `{#custom-id}`, the ID MUST remain identical to the English source, ASCII-only (no accents or special characters). Do NOT alter, translate, add, or remove braces. Keep the exact ID string. - HTML/MDX tag line placement: If an opening HTML tag appears on its own line, the matching closing tag MUST also be on its own line. Preserve line breaks around paired block-level tags. - JSX/MDX attributes: Translate human-readable text found inside attribute values (e.g., `title="..."`, `aria-label="..."`, `alt="..."`) while preserving placeholders, variables, and code. Do NOT translate attribute names or change quoting/escaping. -- Protected names: Do NOT translate obvious proper names, brands, or team names (e.g., "Ethereum", "ETH", "Solidity", "MetaMask", "GitHub", "Crowdin", "ethereum.org"). Leave these as in the source unless a community-approved localized form exists. +- Protected names: Do NOT translate obvious proper names, brands, or team names. This includes programming languages (e.g., "Solidity", "Vyper"), company/product names (e.g., "Alchemy", "Infura", "MetaMask", "Consensys", "Chainlink", "Uniswap", "OpenSea", "OpenZeppelin"), protocol/network names (e.g., "Ethereum", "ETH"), and tools/platforms (e.g., "GitHub", "Crowdin", "ethereum.org"). Leave these as in the source unless a community-approved localized form exists. 
IMPORTANT: Even when a word has a common translation in the target language (e.g., "Alchemy" meaning the historical practice, or "Solidity" meaning firmness), keep the English term when it refers to a brand, product, or technology name. - URL/path destinations MUST be preserved character-for-character: keep exact case, hyphens, slashes, fragments (`#...`), and query parameters (`?...`). Do NOT change, normalize, or localize any part of a link destination. This rule also applies to any links contained within JSON string values used in React/MDX pages. Maintain Clarity and Professionalism: Ensure the translated text is clear, accurate, and professional in tone, just like the source. Match the tone and register of the English content – if the source is explanatory and formal, the translation should mirror that style. Remember that Ethereum’s content serves both experts and complete beginners, so the translation should be accessible to technical and non-technical readers alike. diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index d67aa087399..c933dfc5459 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -74,6 +74,103 @@ const COMMON_SPELLING_MISTAKES = [ ] const CASE_SENSITIVE_SPELLING_MISTAKES = ["Metamask", "Github"] +/** + * Brand names that should NEVER be translated in ANY language. + * These are proper nouns - programming languages, companies, products. + */ +const PROTECTED_BRAND_NAMES = [ + // Programming languages + "Solidity", + "Vyper", + // Companies/Products + "Alchemy", + "Infura", + "MetaMask", + "Consensys", + "Chainlink", + "OpenZeppelin", +] + +/** + * Check if protected brand names from English source are preserved in translation. + * Returns warnings for any brand names that appear in English but not in translation. 
+ */ +function checkProtectedBrandNames( + translatedContent: string, + englishContent: string +): string[] { + const warnings: string[] = [] + + for (const brand of PROTECTED_BRAND_NAMES) { + // Check if brand exists in English source (case-insensitive search, case-sensitive match) + const brandRegex = new RegExp(`\\b${brand}\\b`, "g") + const inEnglish = englishContent.match(brandRegex) + + if (inEnglish && inEnglish.length > 0) { + // Brand is in English, check if it's preserved in translation + const inTranslation = translatedContent.match(brandRegex) + const englishCount = inEnglish.length + const translationCount = inTranslation?.length ?? 0 + + if (translationCount < englishCount) { + warnings.push( + `Protected brand "${brand}" appears ${englishCount}x in English but ${translationCount}x in translation - may have been mistranslated` + ) + } + } + } + + return warnings +} + +/** + * Fix duplicated headings where the text is repeated. + * Pattern: ## Text? Text? {#id} → ## Text? {#id} + * This happens when translators accidentally duplicate question headings. + */ +function fixDuplicatedHeadings(content: string): { + content: string + fixCount: number +} { + let result = content + let fixCount = 0 + + // Match headings where text is duplicated: ## Text Text {#id} or ## Text? Text? {#id} + // Captures: (hashes) (text including punctuation) (same text) (custom id) + const duplicatedHeadingRe = + /^(#{1,6})\s+(.+?[?!.]?)\s+\2\s*(\{#[^}]+\})\s*$/gm + + result = result.replace(duplicatedHeadingRe, (match, hashes, text, id) => { + fixCount++ + return `${hashes} ${text} ${id}` + }) + + return { content: result, fixCount } +} + +/** + * Fix broken markdown links where there's a space between ] and (. + * Pattern: ] (https://... → ](https://... + * This is a common translation artifact from Crowdin. 
+ */ +function fixBrokenMarkdownLinks(content: string): { + content: string + fixCount: number +} { + let result = content + let fixCount = 0 + + // Match ] followed by space(s) then ( - this breaks markdown links + const brokenLinkRe = /\]\s+\(/g + const matches = result.match(brokenLinkRe) + if (matches) { + fixCount = matches.length + result = result.replace(brokenLinkRe, "](") + } + + return { content: result, fixCount } +} + function lineAt(file: string, index: number): string { const fileSubstring = file.substring(0, index) const lines = fileSubstring.split("\n") @@ -360,6 +457,20 @@ function processMarkdownFile( const before = content + // Fix duplicated headings (e.g., ## Text? Text? {#id} → ## Text? {#id}) + const duplicatedResult = fixDuplicatedHeadings(content) + content = duplicatedResult.content + if (duplicatedResult.fixCount > 0) { + issues.push(`Fixed ${duplicatedResult.fixCount} duplicated headings`) + } + + // Fix broken markdown links (] (https:// → ](https://) + const brokenLinksResult = fixBrokenMarkdownLinks(content) + content = brokenLinksResult.content + if (brokenLinksResult.fixCount > 0) { + issues.push(`Fixed ${brokenLinksResult.fixCount} broken markdown links`) + } + // Fix block component line breaks (critical for MDX parser) const blockResult = fixBlockComponentLineBreaks(content) content = blockResult.content @@ -390,6 +501,10 @@ function processMarkdownFile( `Restored ${blankLineResult.fixCount} blank lines from English` ) } + + // Check for mistranslated brand names (report-only) + const brandWarnings = checkProtectedBrandNames(content, englishMd) + issues.push(...brandWarnings) } const fixed = before !== content From 811a43fd3b81325f7199ff1d69bd181c443a714e Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 8 Jan 2026 14:51:41 -0800 Subject: [PATCH 82/99] update: excluded-paths to include terms-and-conditions --- src/scripts/i18n/config/excluded-paths.json | 8 +++++++- 1 file 
changed, 7 insertions(+), 1 deletion(-) diff --git a/src/scripts/i18n/config/excluded-paths.json b/src/scripts/i18n/config/excluded-paths.json index e179216a205..84b5eb49bec 100644 --- a/src/scripts/i18n/config/excluded-paths.json +++ b/src/scripts/i18n/config/excluded-paths.json @@ -1 +1,7 @@ -["/cookie-policy/", "/privacy-policy/", "/terms-of-use/", "/style-guide/"] +[ + "/cookie-policy/", + "/privacy-policy/", + "/terms-of-use/", + "/style-guide/", + "/terms-and-conditions/" +] \ No newline at end of file From beb8885f042347c49999d24679b6dd972678a9aa Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 8 Jan 2026 21:35:33 -0800 Subject: [PATCH 83/99] feat(i18n): add href and component line break fixes Add fixTranslatedHrefs() to detect and auto-fix incorrectly translated internal hrefs using set comparison. Only fixes unambiguous cases (1 wrong + 1 missing); warns otherwise. Add fixCollapsedComponentLineBreaks() to restore line breaks between consecutive MDX components when translators collapse them onto single lines. Extract BLOCK_MDX_COMPONENTS constant to DRY up component lists. Add escapeRegex() and isInternalHref() helpers. --- src/scripts/i18n/post_import_sanitize.ts | 215 ++++++++++++++++++++--- 1 file changed, 189 insertions(+), 26 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index c933dfc5459..b7db8c33040 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -18,7 +18,6 @@ import * as path from "path" const ROOT = process.cwd() const CONTENT_ROOT = path.join(ROOT, "public", "content") -// const INTL_ROOT = path.join(ROOT, "src", "intl") // Not currently used const BLOCK_HTML_TAGS = [ "section", @@ -29,6 +28,24 @@ const BLOCK_HTML_TAGS = [ "footer", ] +/** + * MDX block components that need opening/closing tags on separate lines. 
+ * ButtonLink is intentionally excluded - it's an inline component. + */ +const BLOCK_MDX_COMPONENTS = [ + "Card", + "ExpandableCard", + "Alert", + "AlertEmoji", + "AlertContent", + "AlertDescription", + "CardGrid", + "InfoGrid", + "InfoBanner", + "Tabs", + "TabItem", +] + function listFiles( dir: string, predicate: (file: string) => boolean @@ -171,6 +188,154 @@ function fixBrokenMarkdownLinks(content: string): { return { content: result, fixCount } } +/** + * Fix collapsed line breaks between consecutive MDX components. + * Pattern:
\n + * This happens when translators collapse multiple components onto one line. + */ +function fixCollapsedComponentLineBreaks( + translatedContent: string, + englishContent: string +): { content: string; fixCount: number } { + let result = translatedContent + let fixCount = 0 + + // Find components that appear consecutively in English (on separate lines) + // and restore line breaks in translation if they were collapsed + const consecutiveComponentRe = + /<\/([A-Z][A-Za-z]*)[^>]*>\s*<([A-Z][A-Za-z]*)/g + + // Check English for line break patterns between components + const englishMatches = [...englishContent.matchAll(consecutiveComponentRe)] + for (const match of englishMatches) { + const fullMatch = match[0] + // If English has a newline between these components + if (fullMatch.includes("\n")) { + // Find same pattern in translation (possibly without newline) + const closingTag = match[1] + const openingTag = match[2] + const collapsedRe = new RegExp( + `[ \\t]+<${openingTag}`, + "g" + ) + const collapsedMatches = result.match(collapsedRe) + if (collapsedMatches) { + fixCount += collapsedMatches.length + result = result.replace(collapsedRe, `\n<${openingTag}`) + } + } + } + + return { content: result, fixCount } +} + +/** + * Extract all href values from content (both markdown links and JSX/HTML attributes). + */ +function extractHrefs(content: string): Set { + const hrefs = new Set() + + // Markdown links: [text](href) + const markdownLinkRe = /\[[^\]]*\]\(([^)]+)\)/g + let match + while ((match = markdownLinkRe.exec(content))) { + hrefs.add(match[1]) + } + + // JSX/HTML href attributes: href="..." or href='...' + const hrefAttrRe = /href=["']([^"']+)["']/g + while ((match = hrefAttrRe.exec(content))) { + hrefs.add(match[1]) + } + + return hrefs +} + +/** + * Fix translated hrefs by comparing against English source. + * Uses set comparison to find hrefs that were incorrectly translated. 
+ * Only auto-fixes when there's exactly 1 wrong and 1 missing (unambiguous). + * Warns for multiple mismatches without attempting to guess. + */ +function fixTranslatedHrefs( + translatedContent: string, + englishContent: string +): { content: string; fixCount: number; fixes: string[]; warnings: string[] } { + const englishHrefs = extractHrefs(englishContent) + const translatedHrefs = extractHrefs(translatedContent) + + // Find internal hrefs that differ between English and translation + const wrongHrefs: string[] = [] // In translation but not English + const missingHrefs: string[] = [] // In English but not translation + + for (const href of translatedHrefs) { + if (isInternalHref(href) && !englishHrefs.has(href)) { + wrongHrefs.push(href) + } + } + + for (const href of englishHrefs) { + if (isInternalHref(href) && !translatedHrefs.has(href)) { + missingHrefs.push(href) + } + } + + // No issues found + if (wrongHrefs.length === 0 && missingHrefs.length === 0) { + return { content: translatedContent, fixCount: 0, fixes: [], warnings: [] } + } + + // Multiple mismatches - warn but don't try to guess + if (wrongHrefs.length !== 1 || missingHrefs.length !== 1) { + const warnings: string[] = [] + for (const href of wrongHrefs) { + warnings.push(`Possibly translated href "${href}" - not found in English`) + } + for (const href of missingHrefs) { + warnings.push(`Missing href "${href}" - present in English but not translation`) + } + return { content: translatedContent, fixCount: 0, fixes: [], warnings } + } + + // Exactly 1 wrong and 1 missing - safe to fix + const wrong = wrongHrefs[0] + const correct = missingHrefs[0] + + let result = translatedContent + + // Replace in markdown links: [text](wrong) → [text](correct) + const markdownRe = new RegExp( + `(\\[[^\\]]*\\]\\()${escapeRegex(wrong)}(\\))`, + "g" + ) + result = result.replace(markdownRe, `$1${correct}$2`) + + // Replace in href attributes: href="wrong" → href="correct" + const hrefRe = new 
RegExp(`(href=["'])${escapeRegex(wrong)}(["'])`, "g") + result = result.replace(hrefRe, `$1${correct}$2`) + + return { + content: result, + fixCount: 1, + fixes: [`${wrong} → ${correct}`], + warnings: [], + } +} + +/** + * Escape special regex characters in a string. + */ +function escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") +} + +/** + * Check if href is an internal link (starts with / but not //). + */ +function isInternalHref(href: string): boolean { + return href.startsWith("/") && !href.startsWith("//") +} + function lineAt(file: string, index: number): string { const fileSubstring = file.substring(0, index) const lines = fileSubstring.split("\n") @@ -270,9 +435,9 @@ function restoreBlankLinesFromEnglish( // Patterns that should have blank lines after them const headerPattern = /^#{1,6}\s+/ - // NOTE: ButtonLink is excluded - children should remain inline - const blockComponentClosePattern = - /<\/(Alert|AlertContent|AlertDescription|Card|ExpandableCard|CardGrid|InfoGrid|Tabs|TabItem|InfoBanner)>/ + const blockComponentClosePattern = new RegExp( + `` + ) for (let i = 0; i < translatedLines.length; i++) { const line = translatedLines[i] @@ -318,11 +483,6 @@ function restoreBlankLinesFromEnglish( return { content: result.join("\n"), fixCount } } -/** - * Fix block-level React components that have opening/closing tags inline with content. - * MDX parser requires these tags to be on separate lines. - * Returns number of fixes applied. - */ /** * Normalize inline component formatting to match English source. * If English has the component on one line, collapse translated version too. 
@@ -385,26 +545,10 @@ function fixBlockComponentLineBreaks(md: string): { content: string fixCount: number } { - // Block components that need opening/closing tags on separate lines - // NOTE: ButtonLink is intentionally excluded - it's an inline component - const blockComponents = [ - "Card", - "ExpandableCard", - "Alert", - "AlertEmoji", - "AlertContent", - "AlertDescription", - "CardGrid", - "InfoGrid", - "InfoBanner", - "Tabs", - "TabItem", - ] - let content = md let fixCount = 0 - for (const component of blockComponents) { + for (const component of BLOCK_MDX_COMPONENTS) { // Fix inline closing tags: content → content\n
const inlineCloseRe = new RegExp(`([^\\n])\\s*`, "g") content = content.replace(inlineCloseRe, (_, before) => { @@ -502,9 +646,28 @@ function processMarkdownFile( ) } + // Fix collapsed line breaks between consecutive components + const collapsedResult = fixCollapsedComponentLineBreaks(content, englishMd) + content = collapsedResult.content + if (collapsedResult.fixCount > 0) { + issues.push( + `Fixed ${collapsedResult.fixCount} collapsed component line breaks` + ) + } + // Check for mistranslated brand names (report-only) const brandWarnings = checkProtectedBrandNames(content, englishMd) issues.push(...brandWarnings) + + // Fix translated hrefs using set comparison + const hrefResult = fixTranslatedHrefs(content, englishMd) + content = hrefResult.content + if (hrefResult.fixCount > 0) { + issues.push( + `Fixed ${hrefResult.fixCount} translated hrefs: ${hrefResult.fixes.join(", ")}` + ) + } + issues.push(...hrefResult.warnings) } const fixed = before !== content From 5be8450a6200ecd2b4ea585fca9f76d96b24e63d Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 8 Jan 2026 21:35:50 -0800 Subject: [PATCH 84/99] refactor(i18n): improve regex safety and code style - Use escapeRegex() in checkProtectedBrandNames to handle special characters in brand names - Simplify fixBrokenMarkdownLinks using replace callback - Change type HeaderInfo to interface per TS conventions --- src/scripts/i18n/post_import_sanitize.ts | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index b7db8c33040..061f53994de 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -119,8 +119,8 @@ function checkProtectedBrandNames( const warnings: string[] = [] for (const brand of PROTECTED_BRAND_NAMES) { - // Check if brand exists in English source (case-insensitive search, case-sensitive match) 
- const brandRegex = new RegExp(`\\b${brand}\\b`, "g") + // Check if brand exists in English source (case-sensitive match with word boundaries) + const brandRegex = new RegExp(`\\b${escapeRegex(brand)}\\b`, "g") const inEnglish = englishContent.match(brandRegex) if (inEnglish && inEnglish.length > 0) { @@ -174,16 +174,13 @@ function fixBrokenMarkdownLinks(content: string): { content: string fixCount: number } { - let result = content let fixCount = 0 // Match ] followed by space(s) then ( - this breaks markdown links - const brokenLinkRe = /\]\s+\(/g - const matches = result.match(brokenLinkRe) - if (matches) { - fixCount = matches.length - result = result.replace(brokenLinkRe, "](") - } + const result = content.replace(/\]\s+\(/g, () => { + fixCount++ + return "](" + }) return { content: result, fixCount } } @@ -344,7 +341,7 @@ function lineAt(file: string, index: number): string { const lineNumber = `${linePosition}:${charPosition}` return lineNumber } -type HeaderInfo = { +interface HeaderInfo { level: number // Number of # symbols text: string // Header text (translated or English) id: string // Custom ID from {#id} From 5ce97dd2bc6c3b7275a3453697e0f31c57ff9716 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 12 Jan 2026 14:20:51 -0800 Subject: [PATCH 85/99] feat(i18n): use paragraph-scoped href matching for safer fixes Rewrite fixTranslatedHrefs() to compare hrefs within paragraph blocks instead of globally. Handles grammatical reordering in non-English languages. Only auto-fixes 1:1 mismatches within blocks; warns otherwise. 
--- src/scripts/i18n/post_import_sanitize.ts | 171 +++++++++++++++++------ 1 file changed, 126 insertions(+), 45 deletions(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 061f53994de..d1b61fad511 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -248,74 +248,155 @@ function extractHrefs(content: string): Set { return hrefs } +/** + * Extract hrefs from a single text block (paragraph/section). + * Returns array to preserve duplicates within the block. + */ +function extractHrefsFromBlock(block: string): string[] { + const hrefs: string[] = [] + + // Markdown links: [text](href) + const markdownLinkRe = /\[[^\]]*\]\(([^)]+)\)/g + let match + while ((match = markdownLinkRe.exec(block))) { + hrefs.push(match[1]) + } + + // JSX/HTML href attributes: href="..." or href='...' + const hrefAttrRe = /href=["']([^"']+)["']/g + while ((match = hrefAttrRe.exec(block))) { + hrefs.push(match[1]) + } + + return hrefs +} + +/** + * Split markdown content into logical blocks (paragraphs/sections). + * Blocks are separated by blank lines. + */ +function splitIntoBlocks(content: string): string[] { + // Split on one or more blank lines + return content.split(/\n\s*\n/).filter((block) => block.trim().length > 0) +} + /** * Fix translated hrefs by comparing against English source. - * Uses set comparison to find hrefs that were incorrectly translated. - * Only auto-fixes when there's exactly 1 wrong and 1 missing (unambiguous). - * Warns for multiple mismatches without attempting to guess. + * Uses paragraph-scoped set comparison for robust matching across languages. + * + * Strategy: + * 1. Split both documents into blocks (paragraphs separated by blank lines) + * 2. For each block pair, compare internal href sets + * 3. Within a block: if invalid href count equals missing href count, we can match + * 4. 
This handles grammatical reordering within sentences (common in non-English) + * + * Only auto-fixes unambiguous cases; warns for complex mismatches. */ function fixTranslatedHrefs( translatedContent: string, englishContent: string ): { content: string; fixCount: number; fixes: string[]; warnings: string[] } { - const englishHrefs = extractHrefs(englishContent) - const translatedHrefs = extractHrefs(translatedContent) + const englishBlocks = splitIntoBlocks(englishContent) + const translatedBlocks = splitIntoBlocks(translatedContent) - // Find internal hrefs that differ between English and translation - const wrongHrefs: string[] = [] // In translation but not English - const missingHrefs: string[] = [] // In English but not translation + // Collect all English internal hrefs as the "valid" set + const allEnglishHrefs = extractHrefs(englishContent) - for (const href of translatedHrefs) { - if (isInternalHref(href) && !englishHrefs.has(href)) { - wrongHrefs.push(href) - } - } + const allFixes: Array<[string, string]> = [] // [wrong, correct] + const allWarnings: string[] = [] - for (const href of englishHrefs) { - if (isInternalHref(href) && !translatedHrefs.has(href)) { - missingHrefs.push(href) - } - } + // Process block by block + const blockCount = Math.min(englishBlocks.length, translatedBlocks.length) - // No issues found - if (wrongHrefs.length === 0 && missingHrefs.length === 0) { - return { content: translatedContent, fixCount: 0, fixes: [], warnings: [] } - } + for (let i = 0; i < blockCount; i++) { + const engBlock = englishBlocks[i] + const transBlock = translatedBlocks[i] + + const engHrefs = extractHrefsFromBlock(engBlock).filter(isInternalHref) + const transHrefs = extractHrefsFromBlock(transBlock).filter(isInternalHref) + + // Skip blocks with no internal hrefs + if (engHrefs.length === 0 && transHrefs.length === 0) continue + + // Find hrefs in translation that don't exist in English (invalid) + const transHrefSet = new Set(transHrefs) + + const 
invalidInTrans: string[] = [] // In translation but not in any English href + const missingFromTrans: string[] = [] // In English block but not in translation - // Multiple mismatches - warn but don't try to guess - if (wrongHrefs.length !== 1 || missingHrefs.length !== 1) { - const warnings: string[] = [] - for (const href of wrongHrefs) { - warnings.push(`Possibly translated href "${href}" - not found in English`) + for (const href of transHrefs) { + if (!allEnglishHrefs.has(href)) { + invalidInTrans.push(href) + } } - for (const href of missingHrefs) { - warnings.push(`Missing href "${href}" - present in English but not translation`) + + for (const href of engHrefs) { + if (!transHrefSet.has(href)) { + missingFromTrans.push(href) + } + } + + // No issues in this block + if (invalidInTrans.length === 0 && missingFromTrans.length === 0) continue + + // Deduplicate for set comparison + const uniqueInvalid = [...new Set(invalidInTrans)] + const uniqueMissing = [...new Set(missingFromTrans)] + + // Only auto-fix when there's exactly 1 invalid and 1 missing in block + // Multiple mismatches within same block could be reordered - don't guess + if (uniqueInvalid.length === 1 && uniqueMissing.length === 1) { + allFixes.push([uniqueInvalid[0], uniqueMissing[0]]) + } else if (uniqueInvalid.length > 0 || uniqueMissing.length > 0) { + // Count mismatch - can't safely fix, warn instead + for (const href of uniqueInvalid) { + allWarnings.push( + `Block ${i + 1}: Invalid href "${href}" - not a valid English path` + ) + } + for (const href of uniqueMissing) { + allWarnings.push( + `Block ${i + 1}: Missing href "${href}" - present in English but not translation` + ) + } } - return { content: translatedContent, fixCount: 0, fixes: [], warnings } } - // Exactly 1 wrong and 1 missing - safe to fix - const wrong = wrongHrefs[0] - const correct = missingHrefs[0] + // Warn about block count mismatch + if (englishBlocks.length !== translatedBlocks.length) { + allWarnings.push( + `Block 
count mismatch: English has ${englishBlocks.length}, translation has ${translatedBlocks.length}` + ) + } + // Apply all fixes let result = translatedContent + const appliedFixes: string[] = [] - // Replace in markdown links: [text](wrong) → [text](correct) - const markdownRe = new RegExp( - `(\\[[^\\]]*\\]\\()${escapeRegex(wrong)}(\\))`, - "g" - ) - result = result.replace(markdownRe, `$1${correct}$2`) + for (const [wrong, correct] of allFixes) { + // Replace in markdown links: [text](wrong) → [text](correct) + const markdownRe = new RegExp( + `(\\[[^\\]]*\\]\\()${escapeRegex(wrong)}(\\))`, + "g" + ) + const beforeMd = result + result = result.replace(markdownRe, `$1${correct}$2`) + + // Replace in href attributes: href="wrong" → href="correct" + const hrefRe = new RegExp(`(href=["'])${escapeRegex(wrong)}(["'])`, "g") + const beforeAttr = result + result = result.replace(hrefRe, `$1${correct}$2`) - // Replace in href attributes: href="wrong" → href="correct" - const hrefRe = new RegExp(`(href=["'])${escapeRegex(wrong)}(["'])`, "g") - result = result.replace(hrefRe, `$1${correct}$2`) + if (result !== beforeMd || result !== beforeAttr) { + appliedFixes.push(`${wrong} → ${correct}`) + } + } return { content: result, - fixCount: 1, - fixes: [`${wrong} → ${correct}`], - warnings: [], + fixCount: appliedFixes.length, + fixes: appliedFixes, + warnings: allWarnings, } } From c8363253c48e1e386a2ff6bc53a87ebba29a6d4b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 12 Jan 2026 16:28:02 -0800 Subject: [PATCH 86/99] fix: unused arg --- src/scripts/i18n/post_import_sanitize.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index d1b61fad511..abc9291fe0d 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -157,7 +157,7 @@ function fixDuplicatedHeadings(content: string): { 
const duplicatedHeadingRe = /^(#{1,6})\s+(.+?[?!.]?)\s+\2\s*(\{#[^}]+\})\s*$/gm - result = result.replace(duplicatedHeadingRe, (match, hashes, text, id) => { + result = result.replace(duplicatedHeadingRe, (_, hashes, text, id) => { fixCount++ return `${hashes} ${text} ${id}` }) From dd91f7150f9e4e62ce8c7741450119414415e2c0 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 12 Jan 2026 17:22:31 -0800 Subject: [PATCH 87/99] refactor(i18n): remove dead code and production-harden workflow - Remove Spanish default from target_languages input - Remove exposed timeout/poll workflow inputs (use code defaults) - Delete unused scripts: check-translation-status, unhide-strings - Delete dead code: prompt-model.ts, pr-review-comments.ts - Delete standalone translate-jsx-attributes.yml workflow - Clean up verbose logging and stale workflow references Co-Authored-By: Claude Opus 4.5 --- .github/workflows/crowdin-ai-import.yml | 13 -- .../workflows/translate-jsx-attributes.yml | 74 ----------- src/scripts/i18n/check-translation-status.ts | 118 ------------------ src/scripts/i18n/config.ts | 2 - src/scripts/i18n/lib/crowdin/prompt-model.ts | 22 ---- .../i18n/lib/github/pr-review-comments.ts | 97 -------------- src/scripts/i18n/lib/workflows/pr-creation.ts | 3 +- src/scripts/i18n/unhide-strings.ts | 72 ----------- 8 files changed, 1 insertion(+), 400 deletions(-) delete mode 100644 .github/workflows/translate-jsx-attributes.yml delete mode 100644 src/scripts/i18n/check-translation-status.ts delete mode 100644 src/scripts/i18n/lib/crowdin/prompt-model.ts delete mode 100644 src/scripts/i18n/lib/github/pr-review-comments.ts delete mode 100644 src/scripts/i18n/unhide-strings.ts diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 3734f54c3be..3e6d6c99754 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -14,7 +14,6 @@ on: target_languages: 
description: "Comma-separated internal language codes (blank for all locales)" required: false - default: "es" type: string use_legacy_languages: description: "Use legacy locales i18n.config.json (else uses canonical-llm-language-list.json)" @@ -30,16 +29,6 @@ on: description: "Pre-translation ID to resume from (leave empty to start new)" required: false type: string - pretranslate_timeout_ms: - description: "Max ms to wait for pre-translate (default: 21600000 ~6h)" - required: false - default: "21600000" - type: string - pretranslate_poll_base_ms: - description: "Base poll interval ms (default: 30000)" - required: false - default: "30000" - type: string pre_translate_prompt_id: description: "AI prompt ID for pre_translate (default: 326942)" required: false @@ -94,8 +83,6 @@ jobs: TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} USE_LEGACY_LANGUAGES: ${{ github.event.inputs.use_legacy_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} - PRETRANSLATE_TIMEOUT_MS: ${{ github.event.inputs.pretranslate_timeout_ms }} - PRETRANSLATE_POLL_BASE_MS: ${{ github.event.inputs.pretranslate_poll_base_ms }} PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} VERBOSE: ${{ github.event.inputs.verbose }} SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }} diff --git a/.github/workflows/translate-jsx-attributes.yml b/.github/workflows/translate-jsx-attributes.yml deleted file mode 100644 index 45927c84a04..00000000000 --- a/.github/workflows/translate-jsx-attributes.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Translate JSX Attributes - -on: - workflow_dispatch: - inputs: - branch: - description: "Branch name to process (e.g., translations/es-2024-12-17)" - required: true - type: string - target_language: - description: "Target language code (e.g., es, fr, de)" - required: true - type: string - file_pattern: - description: "Glob pattern for files to process (default: all markdown in translations folder)" - required: false - default: 
"public/content/translations/**/*.md" - type: string - verbose: - description: "Enable verbose logging?" - required: false - default: false - type: boolean - -jobs: - translate_attributes: - runs-on: ubuntu-latest - steps: - - name: Check out branch - uses: actions/checkout@v4 - with: - ref: ${{ github.event.inputs.branch }} - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - cache: "pnpm" - - - name: Install dependencies - run: pnpm install - - - name: Find markdown files - id: find-files - run: | - FILES=$(find ${{ github.event.inputs.file_pattern }} -type f 2>/dev/null | head -500 | tr '\n' ',') - echo "files=${FILES%,}" >> $GITHUB_OUTPUT - echo "Found files: ${FILES%,}" - - - name: Translate JSX attributes - if: steps.find-files.outputs.files != '' - run: | - npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/translate-jsx-attributes.ts \ - --language ${{ github.event.inputs.target_language }} \ - --files "${{ steps.find-files.outputs.files }}" - env: - GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} - - - name: Commit changes - if: steps.find-files.outputs.files != '' - run: | - git config --local user.email "github-actions[bot]@users.noreply.github.com" - git config --local user.name "github-actions[bot]" - git add -A - if git diff --staged --quiet; then - echo "No changes to commit" - else - git commit -m "chore: translate JSX attributes (${{ github.event.inputs.target_language }})" - git push - echo "✓ Committed and pushed JSX attribute translations" - fi diff --git a/src/scripts/i18n/check-translation-status.ts b/src/scripts/i18n/check-translation-status.ts deleted file mode 100644 index 5913e5457f1..00000000000 --- a/src/scripts/i18n/check-translation-status.ts +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Quick script to check translation status of a specific file in Crowdin - */ - -const CROWDIN_API_KEY = process.env.CROWDIN_TOKEN! 
-const PROJECT_ID = 834930 -const FILE_ID = 17434 // organizing/index.md -const LANGUAGE_ID = "es-EM" - -const headers = { - Authorization: `Bearer ${CROWDIN_API_KEY}`, - "Content-Type": "application/json", -} - -async function checkTranslationProgress() { - console.log("\n=== Checking Translation Progress ===") - console.log(`File ID: ${FILE_ID}`) - console.log(`Language: ${LANGUAGE_ID}`) - - // Get translation progress for the file - const url = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/languages/${LANGUAGE_ID}/progress?fileIds=${FILE_ID}` - - try { - const res = await fetch(url, { headers }) - if (!res.ok) { - const text = await res.text() - throw new Error(`Failed to get progress (${res.status}): ${text}`) - } - - const json = await res.json() - console.log("\nTranslation Progress:") - console.log(JSON.stringify(json, null, 2)) - } catch (error) { - console.error("Error:", error) - } -} - -async function listStrings() { - console.log("\n=== Listing Strings in File ===") - - // Get strings from the file - const url = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/strings?fileId=${FILE_ID}&limit=10` - - try { - const res = await fetch(url, { headers }) - if (!res.ok) { - const text = await res.text() - throw new Error(`Failed to list strings (${res.status}): ${text}`) - } - - const json = await res.json() - console.log(`\nFound ${json.data.length} strings (showing first 10):`) - for (const item of json.data) { - console.log(`\nString ID: ${item.data.id}`) - console.log(`Text: "${item.data.text.substring(0, 100)}..."`) - console.log(`Context: ${item.data.context || "none"}`) - } - } catch (error) { - console.error("Error:", error) - } -} - -async function checkStringTranslations() { - console.log("\n=== Checking String Translations ===") - - // First get a string ID - const stringsUrl = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/strings?fileId=${FILE_ID}&limit=1` - - try { - const stringsRes = await fetch(stringsUrl, { headers }) - 
if (!stringsRes.ok) { - throw new Error(`Failed to get strings: ${stringsRes.status}`) - } - - const stringsJson = await stringsRes.json() - if (stringsJson.data.length === 0) { - console.log("❌ No strings found in file!") - return - } - - const stringId = stringsJson.data[0].data.id - console.log(`\nChecking translations for string ID: ${stringId}`) - console.log( - `String text: "${stringsJson.data[0].data.text.substring(0, 100)}..."` - ) - - // Get translations for this string - const translationsUrl = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/translations?stringId=${stringId}&languageId=${LANGUAGE_ID}` - const transRes = await fetch(translationsUrl, { headers }) - - if (!transRes.ok) { - const text = await transRes.text() - console.log( - `\n⚠️ No translations found or error (${transRes.status}): ${text}` - ) - return - } - - const transJson = await transRes.json() - console.log(`\nTranslations found: ${transJson.data.length}`) - if (transJson.data.length > 0) { - console.log("First translation:") - console.log(JSON.stringify(transJson.data[0].data, null, 2)) - } else { - console.log("❌ String has NO translations in Spanish!") - } - } catch (error) { - console.error("Error:", error) - } -} - -async function main() { - await checkTranslationProgress() - await listStrings() - await checkStringTranslations() -} - -main() diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index ab3cd5b1fd5..70ff8c125eb 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -121,8 +121,6 @@ if (verbose) { console.log(`[DEBUG] - Exclude path: ${excludePath || "none"}`) console.log(`[DEBUG] - Skip await: ${skipAwait}`) console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) - console.log(`[DEBUG] - Pretranslate timeout ms: ${pretranslateTimeoutMs}`) - console.log(`[DEBUG] - Pretranslate poll base ms: ${pretranslatePollBaseMs}`) if (existingPreTranslationId) { console.log( `[DEBUG] - Resuming from pre-translation ID: 
${existingPreTranslationId}` diff --git a/src/scripts/i18n/lib/crowdin/prompt-model.ts b/src/scripts/i18n/lib/crowdin/prompt-model.ts deleted file mode 100644 index 0b3bb581ce6..00000000000 --- a/src/scripts/i18n/lib/crowdin/prompt-model.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { crowdinBearerHeaders } from "../../config" - -import type { PromptResource } from "./prompt" - -export async function getPromptModelKey( - userId: number, - promptId: number -): Promise { - const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` - const resp = await fetch(url, { headers: crowdinBearerHeaders }) - if (!resp.ok) { - throw new Error( - `Failed to fetch prompt metadata: ${resp.status} ${await resp.text()}` - ) - } - const json: { data?: PromptResource } = await resp.json() - const data: PromptResource = json.data ?? ({} as PromptResource) - const provider = data.aiProviderId ?? "provider" - const model = data.model ?? "model" - const version = data.version ?? "version" - return `${provider}:${model}:${version}` -} diff --git a/src/scripts/i18n/lib/github/pr-review-comments.ts b/src/scripts/i18n/lib/github/pr-review-comments.ts deleted file mode 100644 index 168724183e9..00000000000 --- a/src/scripts/i18n/lib/github/pr-review-comments.ts +++ /dev/null @@ -1,97 +0,0 @@ -// GitHub PR review comment helper with scoped @mentions -import { config, gitHubBearerHeaders } from "../../config" -import { fetchWithRetry } from "../utils/fetch" - -// QA level for AI review routing (planned for v0.2.0) -export type QaLevel = "copilot" | "copilot+claude" - -/** - * Post a follow-up comment on a PR with AI reviewer mentions and clear scope - * @param prNumber The PR number - * @param qaPlan The QA plan mapping languages to review levels - */ -export async function postPrReviewComment( - prNumber: number, - qaPlan: Record -): Promise { - const copilotLangs: string[] = [] - const claudeLangs: string[] = [] - - for (const [lang, level] of Object.entries(qaPlan)) { - 
if (level === "copilot") { - copilotLangs.push(lang) - } else if (level === "copilot+claude") { - copilotLangs.push(lang) - claudeLangs.push(lang) - } - } - - if (copilotLangs.length === 0 && claudeLangs.length === 0) { - console.log("[PR-COMMENT] No AI review needed, skipping comment") - return - } - - let comment = "## AI Translation Review Request\n\n" - comment += - "This PR contains automated translations that need quality review.\n\n" - - // Compact snapshot of canonical prompt rules and glossary/TM awareness - comment += "### Prompt Rules Snapshot\n\n" - comment += "Key non-negotiables for review:\n" - comment += "- Protected names include `ethereum.org`; do not change casing.\n" - comment += "- Header IDs `{#...}` must remain identical to English.\n" - comment += - "- URL/path destinations must be preserved character-for-character (case, hyphens, slashes, fragments, query params). This also applies to links inside JSON strings.\n" - comment += - '- JSON escaping: inside JSON values, escape quotes ("), backslashes (\\), newlines (\\n), tabs (\\t).\n\n' - comment += - "Canonical prompt source: `src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt` (synced to Crowdin before pre-translation).\n\n" - comment += - "Glossary/TM note: Community glossary/TM is synced from Supabase into Crowdin at the start of the run to guide terminology consistency.\n\n" - - if (copilotLangs.length > 0) { - comment += "### @copilot\n\n" - comment += - "@copilot Please review the translations for the following languages and check for:\n" - comment += "- Accuracy and natural phrasing\n" - comment += "- Consistent use of technical terminology\n" - comment += "- Proper handling of Markdown/code syntax\n" - comment += "- Appropriate tone and formality\n\n" - comment += `**Languages:** ${copilotLangs.join(", ")}\n\n` - } - - if (claudeLangs.length > 0) { - comment += "### @claude\n\n" - comment += - "@claude Please provide a thorough review of translations for the following languages, 
focusing on:\n" - comment += "- Semantic accuracy and cultural appropriateness\n" - comment += "- Technical term consistency\n" - comment += "- Grammar and idiomatic expressions\n" - comment += "- Any potential ambiguities or mistranslations\n\n" - comment += `**Languages:** ${claudeLangs.join(", ")}\n\n` - } - - comment += - "---\n*This review request was automatically generated based on language quality trust scores.*" - - const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/issues/${prNumber}/comments` - - const response = await fetchWithRetry(url, { - method: "POST", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ body: comment }), - }) - - if (!response.ok) { - const errorText = await response.text() - console.warn( - `[PR-COMMENT] Failed to post review comment (${response.status}): ${errorText}` - ) - return - } - - console.log(`[PR-COMMENT] Posted AI review comment on PR #${prNumber}`) -} diff --git a/src/scripts/i18n/lib/workflows/pr-creation.ts b/src/scripts/i18n/lib/workflows/pr-creation.ts index 147fd1794b2..07d2c7462ff 100644 --- a/src/scripts/i18n/lib/workflows/pr-creation.ts +++ b/src/scripts/i18n/lib/workflows/pr-creation.ts @@ -103,8 +103,7 @@ export function generatePRBody( prBody += `---\n\n` prBody += `> ⚠️ **Note:** GEMINI_API_KEY was not available during this run. ` prBody += `JSX component attributes (e.g., \`title="..."\`, \`description="..."\`) ` - prBody += `may remain untranslated. 
You can run the \`translate-jsx-attributes\` ` - prBody += `workflow on this branch to translate them separately.\n\n` + prBody += `may remain untranslated.\n\n` } return prBody diff --git a/src/scripts/i18n/unhide-strings.ts b/src/scripts/i18n/unhide-strings.ts deleted file mode 100644 index 40aed1b9298..00000000000 --- a/src/scripts/i18n/unhide-strings.ts +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Unhide all hidden/duplicate strings in a Crowdin file - */ - -import dotenv from "dotenv" - -dotenv.config({ path: ".env.local" }) - -const API_KEY = process.env.I18N_CROWDIN_API_KEY! -const PROJ_ID = 834930 -const TARGET_FILE_ID = 17434 // organizing/index.md - -const requestHeaders = { - Authorization: `Bearer ${API_KEY}`, - "Content-Type": "application/json", -} - -async function unhideAllStrings() { - console.log(`\n=== Unhiding strings in file ${TARGET_FILE_ID} ===`) - - // Get all strings from the file - const listUrl = `https://api.crowdin.com/api/v2/projects/${PROJ_ID}/strings?fileId=${TARGET_FILE_ID}&limit=500` - - const listRes = await fetch(listUrl, { headers: requestHeaders }) - if (!listRes.ok) { - throw new Error(`Failed to list strings: ${listRes.status}`) - } - - const listJson = await listRes.json() - console.log(`Found ${listJson.data.length} strings`) - - let unhiddenCount = 0 - - for (const item of listJson.data) { - const stringId = item.data.id - const isHidden = item.data.isHidden - - if (!isHidden) { - continue - } - - // Unhide the string using PATCH - const patchUrl = `https://api.crowdin.com/api/v2/projects/${PROJ_ID}/strings/${stringId}` - - const patchRes = await fetch(patchUrl, { - method: "PATCH", - headers: requestHeaders, - body: JSON.stringify([ - { - op: "replace", - path: "/isHidden", - value: false, - }, - ]), - }) - - if (!patchRes.ok) { - const text = await patchRes.text() - console.error(`Failed to unhide string ${stringId}: ${text}`) - continue - } - - unhiddenCount++ - if (unhiddenCount % 10 === 0) { - console.log(`Unhidden 
${unhiddenCount} strings...`) - } - } - - console.log(`\n✅ Successfully unhidden ${unhiddenCount} strings!`) -} - -unhideAllStrings().catch(console.error) From e6d4518ca2c4dd789b9ec77f473cdfdbfcb50b28 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:45:16 -0800 Subject: [PATCH 88/99] patch: logs and pr body details --- src/scripts/i18n/lib/crowdin/files.ts | 3 -- .../i18n/lib/workflows/file-preparation.ts | 8 ++--- src/scripts/i18n/lib/workflows/pr-creation.ts | 35 +++++++++++-------- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts index bcd615e64bc..e0096d6c350 100644 --- a/src/scripts/i18n/lib/crowdin/files.ts +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -151,8 +151,6 @@ export const unhideStringsInFile = async (fileId: number): Promise => { console.log( `[UNHIDE] ✓ Unhidden ${unhiddenCount} strings in fileId=${fileId}` ) - } else { - console.log(`[UNHIDE] No hidden strings found in fileId=${fileId}`) } return unhiddenCount @@ -346,7 +344,6 @@ export const postFileToStorage = async ( } } const json: JsonResponse = await res.json() - console.log("Uploaded storage:", json.data) return json.data } catch (error) { console.error("postFileToStorage error:", error) diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts index 1191f5c18b6..44111a7c1a5 100644 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -35,10 +35,6 @@ async function updateCrowdinFile( }, foundFile: CrowdinFileData ): Promise<{ fileId: number; path: string; buffer: Buffer }> { - console.log( - `Updating existing file in Crowdin: ${file.filePath} (ID: ${foundFile.id})` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) const storageInfo = await postFileToStorage( fileBuffer, @@ -65,7 +61,9 @@ async 
function updateCrowdinFile( ) } - console.log(`✓ Updated Crowdin file (ID: ${foundFile.id})`) + console.log( + `✓ Updated Crowdin file: ${file.filePath} (fileId: ${foundFile.id}, storageId: ${storageInfo.id})` + ) // Wait for file parsing after update const delayMs = 10000 diff --git a/src/scripts/i18n/lib/workflows/pr-creation.ts b/src/scripts/i18n/lib/workflows/pr-creation.ts index 07d2c7462ff..5ed6db04db0 100644 --- a/src/scripts/i18n/lib/workflows/pr-creation.ts +++ b/src/scripts/i18n/lib/workflows/pr-creation.ts @@ -24,7 +24,7 @@ export function generatePRTitle( } else if (isAllLanguages) { prTitle += ` (all languages)` } else { - prTitle += ` (many languages)` + prTitle += ` (multiple languages)` } return prTitle @@ -61,6 +61,20 @@ export function generatePRBody( path.toLowerCase().endsWith(".md") ) + // Dedupe paths after stripping locale prefix (same content path across languages) + const uniqueJsonPaths = [ + ...new Set( + jsonFiles.map((path) => path.replace(/^src\/intl\/[^/]+\//, "")) + ), + ].sort() + const uniqueMarkdownPaths = [ + ...new Set( + markdownFiles.map((path) => + path.replace(/^public\/content\/translations\/[^/]+\//, "") + ) + ), + ].sort() + // Build PR body let prBody = `## Description\n\n` prBody += `This PR contains automated ${aiModelName} translations from Crowdin.\n\n` @@ -74,26 +88,19 @@ export function generatePRBody( prBody += `${langCodes.join(", ")}\n\n` // Files section - JSON - if (jsonFiles.length > 0) { + if (uniqueJsonPaths.length > 0) { prBody += `### JSON changes (\`src/intl/{locale}/\`)\n\n` - for (const path of jsonFiles) { - // Remove src/intl/{locale}/ prefix - const simplifiedPath = path.replace(/^src\/intl\/[^/]+\//, "") - prBody += `- ${simplifiedPath}\n` + for (const path of uniqueJsonPaths) { + prBody += `- ${path}\n` } prBody += `\n` } // Files section - Markdown - if (markdownFiles.length > 0) { + if (uniqueMarkdownPaths.length > 0) { prBody += `### Markdown changes 
(\`public/content/translations/{locale}/\`)\n\n` - for (const path of markdownFiles) { - // Remove public/content/translations/{locale}/ prefix - const simplifiedPath = path.replace( - /^public\/content\/translations\/[^/]+\//, - "" - ) - prBody += `- ${simplifiedPath}\n` + for (const path of uniqueMarkdownPaths) { + prBody += `- ${path}\n` } prBody += `\n` } From c004ee1d21d1d338741f20624134d39548f1ed5b Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 14 Jan 2026 21:14:09 -0800 Subject: [PATCH 89/99] fix(i18n): handle "created" status for queued jobs Crowdin returns "created" when a job is queued behind other jobs. Previously this would cause polling to fail. Now we continue polling for both "created" and "in_progress" states. Co-Authored-By: Claude Opus 4.5 --- src/scripts/i18n/lib/crowdin/pre-translate.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/scripts/i18n/lib/crowdin/pre-translate.ts b/src/scripts/i18n/lib/crowdin/pre-translate.ts index 149d2a42891..69380ab9ca0 100644 --- a/src/scripts/i18n/lib/crowdin/pre-translate.ts +++ b/src/scripts/i18n/lib/crowdin/pre-translate.ts @@ -138,7 +138,10 @@ export const awaitPreTranslationCompleted = async ( await delay(nextWait) continue } - if (res.status !== "in_progress") { + // "created" means job is queued (e.g., another large job is running) + // "in_progress" means job is actively translating + // Both are valid states to keep polling + if (res.status !== "in_progress" && res.status !== "created") { if (res.status === "finished") { console.log( `[PRE-TRANSLATE][POLL] Completed after ${attempt} attempts; elapsed ${Math.round( @@ -153,8 +156,9 @@ export const awaitPreTranslationCompleted = async ( } const nextWait = computeInterval(elapsed) const progressPct = res.progress ?? 0 + const statusNote = res.status === "created" ? 
" (queued)" : "" console.log( - `[PRE-TRANSLATE][POLL] attempt=${attempt} progress=${progressPct}% elapsed=${Math.round( + `[PRE-TRANSLATE][POLL] attempt=${attempt} status=${res.status}${statusNote} progress=${progressPct}% elapsed=${Math.round( elapsed / 60000 )}m nextWait=${nextWait}ms` ) From e0a8d768217585e0ce86328127d6e8068d071054 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 14 Jan 2026 21:19:36 -0800 Subject: [PATCH 90/99] refactor(i18n): per-language pre-translation jobs - Add LanguageJobInfo type for tracking per-language jobs - Move prompt creation from file-preparation to pre-translation phase - Create one ephemeral prompt per language with language-specific glossary - Poll all jobs in parallel with continue-on-error - Log comma-separated job IDs for easy resume copy-paste Co-Authored-By: Claude Opus 4.5 --- .../i18n/lib/workflows/file-preparation.ts | 56 +--- src/scripts/i18n/lib/workflows/initialize.ts | 1 + .../i18n/lib/workflows/pre-translation.ts | 276 +++++++++++++++--- src/scripts/i18n/lib/workflows/types.ts | 26 +- 4 files changed, 256 insertions(+), 103 deletions(-) diff --git a/src/scripts/i18n/lib/workflows/file-preparation.ts b/src/scripts/i18n/lib/workflows/file-preparation.ts index 44111a7c1a5..e8a3810fd61 100644 --- a/src/scripts/i18n/lib/workflows/file-preparation.ts +++ b/src/scripts/i18n/lib/workflows/file-preparation.ts @@ -1,24 +1,18 @@ // File preparation workflow phase -import * as fs from "fs" -import * as path from "path" - import { config, crowdinBearerHeaders } from "../../config" -import { createEphemeralPrompt } from "../crowdin/ephemeral-prompts" import { findCrowdinFile, postCrowdinFile, postFileToStorage, unhideStringsInFile, } from "../crowdin/files" -import { getPromptInfo } from "../crowdin/prompt" import { getCurrentUser } from "../crowdin/user" import { downloadGitHubFile, getAllEnglishFiles, getFileMetadata, } from "../github/files" -import { 
formatGlossaryForPrompt, getGlossaryForLanguage } from "../supabase" import type { CrowdinFileData } from "../types" import type { FilePreparationResult, WorkflowContext } from "./types" @@ -124,64 +118,18 @@ async function createCrowdinFile(file: { export async function prepareEnglishFiles( context: WorkflowContext ): Promise { - const { allInternalCodes } = config const { crowdinProjectFiles, fileIdsSet, processedFileIdToPath, englishBuffers, - glossary, } = context - logSection("Starting New Pre-Translation") + logSection("Preparing English Files") - // Create ephemeral prompt with glossary terms baked in + // Get current user ID for ephemeral prompt cleanup later const currentUser = await getCurrentUser() - - // Get AI provider/model settings from the static prompt - const staticPromptInfo = await getPromptInfo( - currentUser.id, - config.preTranslatePromptId - ) - debugLog( - `Static prompt AI settings: provider=${staticPromptInfo.aiProviderId}, model=${staticPromptInfo.aiModelId}` - ) - - const promptPath = path.join( - process.cwd(), - "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" - ) - const basePrompt = fs.readFileSync(promptPath, "utf8") - - // Get glossary for target language and append to prompt - const targetLang = allInternalCodes[0] - const glossaryTerms = getGlossaryForLanguage(glossary, targetLang) - const glossarySection = formatGlossaryForPrompt(glossaryTerms, "informal") - - const fullPrompt = glossarySection - ? `${basePrompt}\n\n---\n\n${glossarySection}` - : basePrompt - - if (glossaryTerms.size > 0) { - console.log( - `[GLOSSARY] Injecting ${glossaryTerms.size} terms for ${targetLang} into prompt` - ) - } - - // Create ephemeral prompt for this job (copy AI provider from static prompt) - const { promptId: ephemeralPromptId } = await createEphemeralPrompt({ - userId: currentUser.id, - languageCode: targetLang, - promptKey: "glossary", - promptText: fullPrompt, - aiProviderId: staticPromptInfo.aiProviderId ?? 
undefined, - aiModelId: staticPromptInfo.aiModelId ?? undefined, - }) - - // Store ephemeral prompt ID and user ID in context for pre-translation and cleanup - context.ephemeralPromptId = ephemeralPromptId context.crowdinUserId = currentUser.id - console.log(`✓ Created ephemeral prompt (ID: ${ephemeralPromptId})`) // Fetch English files const allEnglishFiles = await getAllEnglishFiles() diff --git a/src/scripts/i18n/lib/workflows/initialize.ts b/src/scripts/i18n/lib/workflows/initialize.ts index a4cd4fb9ffc..fb04c9574a3 100644 --- a/src/scripts/i18n/lib/workflows/initialize.ts +++ b/src/scripts/i18n/lib/workflows/initialize.ts @@ -48,5 +48,6 @@ export async function initializeWorkflow(): Promise { processedFileIdToPath: {}, englishBuffers: {}, glossary, + languageJobs: [], } } diff --git a/src/scripts/i18n/lib/workflows/pre-translation.ts b/src/scripts/i18n/lib/workflows/pre-translation.ts index 5407b46fdcb..40971708cb2 100644 --- a/src/scripts/i18n/lib/workflows/pre-translation.ts +++ b/src/scripts/i18n/lib/workflows/pre-translation.ts @@ -1,15 +1,21 @@ // Pre-translation workflow phase +import * as fs from "fs" +import * as path from "path" + import { config } from "../../config" +import { createEphemeralPrompt } from "../crowdin/ephemeral-prompts" import { awaitPreTranslationCompleted, getPreTranslationStatus, postApplyPreTranslation, } from "../crowdin/pre-translate" +import { getPromptInfo } from "../crowdin/prompt" +import { formatGlossaryForPrompt, getGlossaryForLanguage } from "../supabase" import type { CrowdinPreTranslateResponse } from "../types" import type { PreTranslationResult, WorkflowContext } from "./types" -import { logSection } from "./utils" +import { debugLog, logSection } from "./utils" /** * Resume existing pre-translation job @@ -21,10 +27,12 @@ async function resumePreTranslation( const statusResp = await getPreTranslationStatus(preTranslationId) - if (statusResp.status === "in_progress") { - console.log( - `Pre-translation in progress 
(${statusResp.progress}%), waiting for completion...` - ) + if (statusResp.status === "in_progress" || statusResp.status === "created") { + const statusMsg = + statusResp.status === "created" + ? "Pre-translation queued (waiting for other jobs)" + : `Pre-translation in progress (${statusResp.progress}%)` + console.log(`${statusMsg}, waiting for completion...`) return await awaitPreTranslationCompleted(preTranslationId) } else if (statusResp.status === "finished") { console.log(`Pre-translation already finished, proceeding to download...`) @@ -37,79 +45,256 @@ async function resumePreTranslation( } /** - * Start new pre-translation job + * Create ephemeral prompt with language-specific glossary */ -async function startNewPreTranslation( - fileIdsSet: Set, - ephemeralPromptId?: number -): Promise { - logSection("Requesting AI Pre-Translation") +async function createLanguagePrompt( + userId: number, + internalCode: string, + glossary: WorkflowContext["glossary"], + basePrompt: string, + aiProviderId?: number, + aiModelId?: string +): Promise { + const glossaryTerms = getGlossaryForLanguage(glossary, internalCode) + const glossarySection = formatGlossaryForPrompt(glossaryTerms, "informal") + + const fullPrompt = glossarySection + ? 
`${basePrompt}\n\n---\n\n${glossarySection}` + : basePrompt + + if (glossaryTerms.size > 0) { + console.log( + `[GLOSSARY] Injecting ${glossaryTerms.size} terms for ${internalCode} into prompt` + ) + } + + const { promptId } = await createEphemeralPrompt({ + userId, + languageCode: internalCode, + promptKey: "glossary", + promptText: fullPrompt, + aiProviderId, + aiModelId, + }) + + return promptId +} + +/** + * Start pre-translation jobs for all target languages + * Creates one ephemeral prompt and one job per language + */ +async function startPerLanguagePreTranslation( + context: WorkflowContext +): Promise { + const { allCrowdinCodes, allInternalCodes } = config + const { fileIdsSet, crowdinUserId, glossary, languageJobs } = context + + if (!crowdinUserId) { + throw new Error("Missing crowdinUserId in context") + } + + logSection("Requesting AI Pre-Translation (Per-Language)") console.log(`Files to translate: ${fileIdsSet.size}`) - console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`) + console.log(`Target languages: ${allCrowdinCodes.join(", ")}`) - // Use ephemeral prompt if available, otherwise fall back to static prompt - const promptId = ephemeralPromptId ?? config.preTranslatePromptId - console.log( - `AI Prompt ID: ${promptId}${ephemeralPromptId ? 
" (ephemeral)" : ""}` + // Load base prompt template + const promptPath = path.join( + process.cwd(), + "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" ) + const basePrompt = fs.readFileSync(promptPath, "utf8") - const applyPreTranslationResponse = await postApplyPreTranslation( - Array.from(fileIdsSet), - config.allCrowdinCodes, - promptId + // Get AI provider/model settings from the static prompt + const staticPromptInfo = await getPromptInfo( + crowdinUserId, + config.preTranslatePromptId ) - - console.log( - `✓ Pre-translation job created (ID: ${applyPreTranslationResponse.identifier})` + debugLog( + `Static prompt AI settings: provider=${staticPromptInfo.aiProviderId}, model=${staticPromptInfo.aiModelId}` ) + const fileIds = Array.from(fileIdsSet) + + // Process each language: create prompt, start job + for (let i = 0; i < allInternalCodes.length; i++) { + const internalCode = allInternalCodes[i] + const crowdinCode = allCrowdinCodes[i] + + console.log(`\n[${internalCode}] Creating ephemeral prompt...`) + + // Create language-specific prompt with glossary + const ephemeralPromptId = await createLanguagePrompt( + crowdinUserId, + internalCode, + glossary, + basePrompt, + staticPromptInfo.aiProviderId ?? undefined, + staticPromptInfo.aiModelId ?? 
undefined + ) + + console.log(`[${internalCode}] ✓ Created prompt (ID: ${ephemeralPromptId})`) + console.log(`[${internalCode}] Submitting pre-translation job...`) + + // Submit pre-translation for this single language + const response = await postApplyPreTranslation( + fileIds, + [crowdinCode], + ephemeralPromptId + ) + + console.log(`[${internalCode}] ✓ Job created (ID: ${response.identifier})`) + + // Track job info for polling and cleanup + languageJobs.push({ + internalCode, + crowdinCode, + ephemeralPromptId, + preTranslationId: response.identifier, + }) + } + + // Log all job IDs for potential manual resume (comma-separated for easy copy-paste) + const allJobIds = languageJobs.map((j) => j.preTranslationId).join(",") + logSection("Pre-Translation Jobs Summary") + console.log(`Created ${languageJobs.length} pre-translation jobs:`) + for (const job of languageJobs) { + console.log(` ${job.internalCode}: ${job.preTranslationId}`) + } + console.log(`\n📋 Copy for resume: ${allJobIds}`) + // Exit early if skipAwait is set or if full translation mode (no targetPath) if (config.skipAwait || !config.targetPath) { const reason = config.skipAwait ? 
"skip_await option enabled" : "full translation job" logSection(`Exiting for Manual Resume (${reason})`) - console.log(`Pre-translation ID: ${applyPreTranslationResponse.identifier}`) - console.log(`\nTo resume later, dispatch workflow with:`) - console.log( - ` pretranslation_id: ${applyPreTranslationResponse.identifier}` - ) + console.log(`\nTo resume, use PRETRANSLATION_ID:`) + console.log(` ${allJobIds}`) console.log(`\nCheck progress: https://crowdin.com/project/ethereum-org`) process.exit(0) } - // For file/directory mode without skipAwait, wait for completion - console.log(`\nWaiting for pre-translation to complete...`) - const completedResponse = await awaitPreTranslationCompleted( - applyPreTranslationResponse.identifier + // Wait for all jobs to complete in parallel with continue-on-error + logSection("Waiting for Pre-Translation Completion") + + const results = await Promise.all( + languageJobs.map(async (job) => { + console.log(`[${job.internalCode}] Waiting for completion...`) + try { + const completed = await awaitPreTranslationCompleted( + job.preTranslationId + ) + if (completed.status !== "finished") { + throw new Error(`Unexpected status: ${completed.status}`) + } + console.log(`[${job.internalCode}] ✓ Completed!`) + return { success: true as const, job, response: completed } + } catch (err) { + console.error( + `[${job.internalCode}] ✗ Failed:`, + err instanceof Error ? 
err.message : err + ) + return { success: false as const, job, error: err } + } + }) ) - if (completedResponse.status !== "finished") { - throw new Error( - `Pre-translation ended with unexpected status: ${completedResponse.status}` + const successes = results.filter((r) => r.success) + const failures = results.filter((r) => !r.success) + + if (failures.length > 0) { + console.warn( + `\n[WARN] ${failures.length}/${languageJobs.length} jobs failed:` + ) + for (const f of failures) { + console.warn(` - ${f.job.internalCode}: ${f.job.preTranslationId}`) + } + } + + if (successes.length === 0) { + throw new Error("All pre-translation jobs failed") + } + + console.log( + `\n✓ ${successes.length}/${languageJobs.length} pre-translation jobs completed!` + ) + return successes.map((s) => s.response) +} + +/** + * Resume multiple pre-translation jobs in parallel with continue-on-error + */ +async function resumeMultiplePreTranslations( + preTranslationIds: string[] +): Promise { + logSection(`Resuming ${preTranslationIds.length} Pre-Translation Jobs`) + console.log(`IDs: ${preTranslationIds.join(", ")}`) + + const results = await Promise.all( + preTranslationIds.map(async (id) => { + try { + const response = await resumePreTranslation(id) + return { success: true as const, id, response } + } catch (err) { + console.error( + `[ERROR] Job ${id} failed:`, + err instanceof Error ? 
err.message : err + ) + return { success: false as const, id, error: err } + } + }) + ) + + // Separate successes and failures + const successes = results.filter((r) => r.success) + const failures = results.filter((r) => !r.success) + + if (failures.length > 0) { + console.warn( + `\n[WARN] ${failures.length}/${preTranslationIds.length} jobs failed:` ) + for (const f of failures) { + console.warn(` - ${f.id}`) + } + } + + if (successes.length === 0) { + throw new Error("All pre-translation jobs failed") } - console.log(`✓ Pre-translation completed successfully!`) - return completedResponse + console.log( + `\n✓ ${successes.length}/${preTranslationIds.length} jobs completed successfully` + ) + return successes.map((s) => s.response) } /** - * Handle pre-translation: resume existing or start new + * Handle pre-translation: resume existing or start new per-language jobs */ export async function handlePreTranslation( context: WorkflowContext ): Promise { - const { existingPreTranslationId, verbose } = config + const { existingPreTranslationIds, verbose } = config const { fileIdsSet, processedFileIdToPath, crowdinProjectFiles } = context - // Resume existing or start new - const preTranslateResponse = existingPreTranslationId - ? 
await resumePreTranslation(existingPreTranslationId) - : await startNewPreTranslation(fileIdsSet, context.ephemeralPromptId) + // Resume existing jobs or start new per-language jobs + let responses: CrowdinPreTranslateResponse[] + let fileIds: number[] + + if (existingPreTranslationIds.length > 0) { + // Resume mode: one or more existing jobs + responses = await resumeMultiplePreTranslations(existingPreTranslationIds) + // Collect all fileIds from all responses + fileIds = [...new Set(responses.flatMap((r) => r.attributes.fileIds))] + } else { + // New mode: per-language jobs + responses = await startPerLanguagePreTranslation(context) + // All jobs translate the same files, so just use the first response's fileIds + fileIds = responses[0]?.attributes.fileIds ?? Array.from(fileIdsSet) + } // Build mapping for commit phase - const { fileIds } = preTranslateResponse.attributes const fileIdToPathMapping: Record = {} for (const fid of fileIds) { @@ -125,7 +310,8 @@ export async function handlePreTranslation( } return { - response: preTranslateResponse, + responses, fileIdToPathMapping, + fileIds, } } diff --git a/src/scripts/i18n/lib/workflows/types.ts b/src/scripts/i18n/lib/workflows/types.ts index e14f2f8a273..68c23030198 100644 --- a/src/scripts/i18n/lib/workflows/types.ts +++ b/src/scripts/i18n/lib/workflows/types.ts @@ -3,6 +3,20 @@ import type { GlossaryByLanguage } from "../supabase" import type { CrowdinFileData, CrowdinPreTranslateResponse } from "../types" +/** + * Per-language job tracking data + */ +export interface LanguageJobInfo { + /** Internal language code (e.g., "es", "zh") */ + internalCode: string + /** Crowdin language code (e.g., "es-EM", "zh-CN") */ + crowdinCode: string + /** Ephemeral prompt ID created for this language */ + ephemeralPromptId: number + /** Pre-translation job ID */ + preTranslationId: string +} + /** * Shared context passed between workflow phases */ @@ -12,8 +26,8 @@ export interface WorkflowContext { processedFileIdToPath: 
Record englishBuffers: Record glossary: GlossaryByLanguage - /** Ephemeral prompt ID created for this job (to be cleaned up after) */ - ephemeralPromptId?: number + /** Per-language job info (populated during pre-translation phase) */ + languageJobs: LanguageJobInfo[] /** Crowdin user ID (needed for ephemeral prompt cleanup) */ crowdinUserId?: number } @@ -62,9 +76,13 @@ export interface PullRequest { } /** - * Pre-translation job result + * Pre-translation job result (supports multiple per-language jobs) */ export interface PreTranslationResult { - response: CrowdinPreTranslateResponse + /** All pre-translation responses (one per language) */ + responses: CrowdinPreTranslateResponse[] + /** File ID to path mapping */ fileIdToPathMapping: Record + /** File IDs that were translated */ + fileIds: number[] } From c498f8a9e690a676a652931a7a3ebffd1f4cacda Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 14 Jan 2026 21:19:56 -0800 Subject: [PATCH 91/99] feat(i18n): support comma-separated resume IDs PRETRANSLATION_ID now accepts comma-separated values (e.g., "abc123,def456") for resuming multiple per-language jobs. Resume polls all jobs in parallel with continue-on-error. Co-Authored-By: Claude Opus 4.5 --- src/scripts/i18n/config.ts | 12 ++++++++---- src/scripts/i18n/main.ts | 38 ++++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 70ff8c125eb..b68ce45ec7a 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -96,7 +96,11 @@ const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS ? 
Math.max(5000, parseInt(process.env.PRETRANSLATE_POLL_BASE_MS, 10)) : 30_000 // default 30s base (min clamped to 5s) -const existingPreTranslationId = process.env.PRETRANSLATION_ID || "" +// Parse comma-separated pre-translation IDs (for resuming multiple per-language jobs) +const existingPreTranslationIds = (process.env.PRETRANSLATION_ID || "") + .split(",") + .map((id) => id.trim()) + .filter(Boolean) const verbose = process.env.VERBOSE === "true" @@ -121,9 +125,9 @@ if (verbose) { console.log(`[DEBUG] - Exclude path: ${excludePath || "none"}`) console.log(`[DEBUG] - Skip await: ${skipAwait}`) console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) - if (existingPreTranslationId) { + if (existingPreTranslationIds.length > 0) { console.log( - `[DEBUG] - Resuming from pre-translation ID: ${existingPreTranslationId}` + `[DEBUG] - Resuming from pre-translation IDs: ${existingPreTranslationIds.join(", ")}` ) } } @@ -151,7 +155,7 @@ export const config = { skipAwait, pretranslateTimeoutMs, pretranslatePollBaseMs, - existingPreTranslationId, + existingPreTranslationIds, verbose, } diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 9faade3c640..17b2edc5403 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -14,13 +14,13 @@ import { config } from "./config" * Main orchestration function */ async function main() { - const { existingPreTranslationId } = config + const { existingPreTranslationIds } = config // Phase 1: Initialize workflow const context = await initializeWorkflow() - // Phase 2: Prepare English files (skip if resuming existing job) - if (!existingPreTranslationId) { + // Phase 2: Prepare English files (skip if resuming existing jobs) + if (existingPreTranslationIds.length === 0) { await prepareEnglishFiles(context) } @@ -85,20 +85,26 @@ async function main() { console.log( `Languages: ${translationResult.languagePairs.map((p) => p.internalLanguageCode).join(", ")}` ) - console.log(`Files: 
${preTranslateResult.response.attributes.fileIds.length}`) + console.log(`Files: ${preTranslateResult.fileIds.length}`) - // Cleanup ephemeral prompt (best effort - don't fail the workflow if cleanup fails) - if (context.ephemeralPromptId && context.crowdinUserId) { - try { - await deleteEphemeralPrompt( - context.crowdinUserId, - context.ephemeralPromptId - ) - } catch (err) { - console.warn( - `[WARN] Failed to cleanup ephemeral prompt ${context.ephemeralPromptId}:`, - err instanceof Error ? err.message : err - ) + // Cleanup all ephemeral prompts (best effort - don't fail the workflow if cleanup fails) + if (context.languageJobs.length > 0 && context.crowdinUserId) { + logSection("Cleaning Up Ephemeral Prompts") + for (const job of context.languageJobs) { + try { + await deleteEphemeralPrompt( + context.crowdinUserId, + job.ephemeralPromptId + ) + console.log( + `✓ Deleted prompt for ${job.internalCode} (ID: ${job.ephemeralPromptId})` + ) + } catch (err) { + console.warn( + `[WARN] Failed to cleanup ephemeral prompt ${job.ephemeralPromptId} (${job.internalCode}):`, + err instanceof Error ? err.message : err + ) + } } } } From 938c750403255d7cc42e05db24645fce9a0a9f79 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 14 Jan 2026 21:20:09 -0800 Subject: [PATCH 92/99] refactor(i18n): adapt download for multi-response results Collect languageIds from all responses instead of single response. 
Co-Authored-By: Claude Opus 4.5 --- src/scripts/i18n/lib/workflows/translation-download.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scripts/i18n/lib/workflows/translation-download.ts b/src/scripts/i18n/lib/workflows/translation-download.ts index 16dd51e8c23..5a62509439f 100644 --- a/src/scripts/i18n/lib/workflows/translation-download.ts +++ b/src/scripts/i18n/lib/workflows/translation-download.ts @@ -33,9 +33,10 @@ export async function downloadAndCommitTranslations( context: WorkflowContext ): Promise { const { englishBuffers } = context - const { response, fileIdToPathMapping } = preTranslateResult + const { responses, fileIdToPathMapping, fileIds } = preTranslateResult - const { languageIds, fileIds } = response.attributes + // Collect all language IDs from all responses (each response has one language) + const languageIds = responses.flatMap((r) => r.attributes.languageIds) // Build language pair mappings const languagePairs = buildLanguageMappings(languageIds) From a52be9ddd9bba8f1fdaee46f003d42675d0698d8 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 15 Jan 2026 08:16:14 -0800 Subject: [PATCH 93/99] feat(i18n): add SPLIT_PRS option for per-language PRs When SPLIT_PRS=true, creates one PR per language instead of a single combined PR. Useful for large translation batches where individual PRs are easier to review. 
- Each language gets its own branch: i18n/import/{ts}-{langCode} - Continue-on-error: failed languages don't block others - Summary printed at end with PR URLs and failures Co-Authored-By: Claude Opus 4.5 --- src/scripts/i18n/config.ts | 2 + .../lib/workflows/translation-download.ts | 11 +- src/scripts/i18n/lib/workflows/types.ts | 10 + src/scripts/i18n/main.ts | 208 +++++++++++++----- 4 files changed, 175 insertions(+), 56 deletions(-) diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index b68ce45ec7a..243e461b0c3 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -103,6 +103,7 @@ const existingPreTranslationIds = (process.env.PRETRANSLATION_ID || "") .filter(Boolean) const verbose = process.env.VERBOSE === "true" +const splitPrs = process.env.SPLIT_PRS === "true" // Parse GitHub repository from env (format: "owner/repo") const githubRepo = @@ -157,6 +158,7 @@ export const config = { pretranslatePollBaseMs, existingPreTranslationIds, verbose, + splitPrs, } // Do not translate list - Declare paths that should never be translated diff --git a/src/scripts/i18n/lib/workflows/translation-download.ts b/src/scripts/i18n/lib/workflows/translation-download.ts index 5a62509439f..58b254c13c2 100644 --- a/src/scripts/i18n/lib/workflows/translation-download.ts +++ b/src/scripts/i18n/lib/workflows/translation-download.ts @@ -43,11 +43,12 @@ export async function downloadAndCommitTranslations( logSection("Creating Translation PR") - // Create GitHub branch - const { branch } = await postCreateBranchFrom( - config.baseBranch, - "crowdin-translations" - ) + // Create GitHub branch (use language code as suffix for single-language PRs) + const branchSuffix = + languagePairs.length === 1 + ? 
languagePairs[0].internalLanguageCode + : "crowdin-translations" + const { branch } = await postCreateBranchFrom(config.baseBranch, branchSuffix) console.log(`✓ Created branch: ${branch}`) // Track all committed files with their content for sanitizer/validation diff --git a/src/scripts/i18n/lib/workflows/types.ts b/src/scripts/i18n/lib/workflows/types.ts index 68c23030198..8620c268696 100644 --- a/src/scripts/i18n/lib/workflows/types.ts +++ b/src/scripts/i18n/lib/workflows/types.ts @@ -86,3 +86,13 @@ export interface PreTranslationResult { /** File IDs that were translated */ fileIds: number[] } + +/** + * Result of processing a single language in split-PR mode + */ +export interface SplitPRResult { + language: string + status: "success" | "failed" + prUrl?: string + error?: string +} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 17b2edc5403..08e9eb3daac 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -5,7 +5,11 @@ import { runJsxTranslation } from "./lib/workflows/jsx-translation" import { createTranslationPR } from "./lib/workflows/pr-creation" import { handlePreTranslation } from "./lib/workflows/pre-translation" import { runPostImportSanitization } from "./lib/workflows/sanitization" -import { downloadAndCommitTranslations } from "./lib/workflows/translation-download" +import { + buildLanguageMappings, + downloadAndCommitTranslations, +} from "./lib/workflows/translation-download" +import type { PreTranslationResult, SplitPRResult } from "./lib/workflows/types" import { logSection } from "./lib/workflows/utils" import { runSyntaxValidation } from "./lib/workflows/validation" import { config } from "./config" @@ -27,65 +31,167 @@ async function main() { // Phase 3: Handle pre-translation (resume or start new) const preTranslateResult = await handlePreTranslation(context) - // Phase 4: Download and commit translations - const translationResult = await downloadAndCommitTranslations( - preTranslateResult, - context - 
) - - // Phase 5: Translate JSX attributes via Gemini (before sanitizer) - const jsxTranslationResult = await runJsxTranslation( - translationResult.committedFiles, - translationResult.languagePairs, - translationResult.branch, - context.glossary - ) - - // Phase 6: Run post-import sanitizer - const sanitizeResult = await runPostImportSanitization( - translationResult.committedFiles, - translationResult.branch - ) - // Check if PR creation should be skipped const skipPrCreation = ["1", "true", "yes", "on"].includes( (process.env.SKIP_PR_CREATION || "").toLowerCase() ) - if (skipPrCreation) { - logSection("Skipping PR Creation") - console.log( - `Files have been committed to branch: ${translationResult.branch}. No PR will be opened.` + + // Split PR mode: create one PR per language + if (config.splitPrs) { + const results: SplitPRResult[] = [] + + for (const response of preTranslateResult.responses) { + const langId = response.attributes.languageIds[0] + const langCode = buildLanguageMappings([langId])[0].internalLanguageCode + + logSection(`Processing Language: ${langCode}`) + + // Create single-response PreTranslationResult for this language + const singleLangResult: PreTranslationResult = { + responses: [response], + fileIdToPathMapping: preTranslateResult.fileIdToPathMapping, + fileIds: preTranslateResult.fileIds, + } + + try { + // Phase 4: Download and commit translations + const translationResult = await downloadAndCommitTranslations( + singleLangResult, + context + ) + + // Phase 5: Translate JSX attributes via Gemini + const jsxTranslationResult = await runJsxTranslation( + translationResult.committedFiles, + translationResult.languagePairs, + translationResult.branch, + context.glossary + ) + + // Phase 6: Run post-import sanitizer + const sanitizeResult = await runPostImportSanitization( + translationResult.committedFiles, + translationResult.branch + ) + + if (skipPrCreation) { + console.log( + `[${langCode}] Branch created: ${translationResult.branch}` 
+ ) + results.push({ language: langCode, status: "success" }) + continue + } + + // Phase 7: Create PR + const pr = await createTranslationPR( + translationResult.branch, + translationResult.committedFiles, + sanitizeResult.changedFiles, + translationResult.languagePairs, + { geminiSkipped: jsxTranslationResult.geminiSkipped } + ) + + // Phase 8: Run syntax tree validation + await runSyntaxValidation( + pr, + translationResult.committedFiles, + context.englishBuffers, + translationResult.fileIdToPathMapping + ) + + console.log(`[${langCode}] ✓ PR created: ${pr.html_url}`) + results.push({ + language: langCode, + status: "success", + prUrl: pr.html_url, + }) + } catch (err) { + const errorMsg = err instanceof Error ? err.message : String(err) + console.error(`[${langCode}] ✗ Failed: ${errorMsg}`) + results.push({ language: langCode, status: "failed", error: errorMsg }) + } + } + + // Print summary + logSection("SPLIT PR SUMMARY") + const successes = results.filter((r) => r.status === "success") + const failures = results.filter((r) => r.status === "failed") + + console.log(`Created: ${successes.length}/${results.length}`) + if (successes.length > 0) { + console.log(`\nSuccessful:`) + for (const r of successes) { + console.log(` ${r.language}: ${r.prUrl ?? 
"(branch only)"}`) + } + } + if (failures.length > 0) { + console.log(`\nFailed:`) + for (const r of failures) { + console.log(` ${r.language}: ${r.error}`) + } + } + + if (successes.length === 0) { + throw new Error("All language PRs failed") + } + } else { + // Single PR mode (default): all languages in one PR + // Phase 4: Download and commit translations + const translationResult = await downloadAndCommitTranslations( + preTranslateResult, + context ) - console.log( - `Set SKIP_PR_CREATION=false to enable automatic PR creation in the workflow.` + + // Phase 5: Translate JSX attributes via Gemini (before sanitizer) + const jsxTranslationResult = await runJsxTranslation( + translationResult.committedFiles, + translationResult.languagePairs, + translationResult.branch, + context.glossary ) - return - } - // Phase 7: Create PR - const pr = await createTranslationPR( - translationResult.branch, - translationResult.committedFiles, - sanitizeResult.changedFiles, - translationResult.languagePairs, - { geminiSkipped: jsxTranslationResult.geminiSkipped } - ) + // Phase 6: Run post-import sanitizer + const sanitizeResult = await runPostImportSanitization( + translationResult.committedFiles, + translationResult.branch + ) - // Phase 8: Run syntax tree validation - await runSyntaxValidation( - pr, - translationResult.committedFiles, - context.englishBuffers, - translationResult.fileIdToPathMapping - ) + if (skipPrCreation) { + logSection("Skipping PR Creation") + console.log( + `Files have been committed to branch: ${translationResult.branch}. No PR will be opened.` + ) + console.log( + `Set SKIP_PR_CREATION=false to enable automatic PR creation in the workflow.` + ) + return + } - // Success! 
- logSection("SUCCESS") - console.log(`Pull Request: ${pr.html_url}`) - console.log( - `Languages: ${translationResult.languagePairs.map((p) => p.internalLanguageCode).join(", ")}` - ) - console.log(`Files: ${preTranslateResult.fileIds.length}`) + // Phase 7: Create PR + const pr = await createTranslationPR( + translationResult.branch, + translationResult.committedFiles, + sanitizeResult.changedFiles, + translationResult.languagePairs, + { geminiSkipped: jsxTranslationResult.geminiSkipped } + ) + + // Phase 8: Run syntax tree validation + await runSyntaxValidation( + pr, + translationResult.committedFiles, + context.englishBuffers, + translationResult.fileIdToPathMapping + ) + + // Success! + logSection("SUCCESS") + console.log(`Pull Request: ${pr.html_url}`) + console.log( + `Languages: ${translationResult.languagePairs.map((p) => p.internalLanguageCode).join(", ")}` + ) + console.log(`Files: ${preTranslateResult.fileIds.length}`) + } // Cleanup all ephemeral prompts (best effort - don't fail the workflow if cleanup fails) if (context.languageJobs.length > 0 && context.crowdinUserId) { From c374cbebe1b6d6230cf3f9cf42024bccb88e9b4e Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 15 Jan 2026 08:26:41 -0800 Subject: [PATCH 94/99] fix: add SKIP_PRS boolean as action input --- .github/workflows/crowdin-ai-import.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index 3e6d6c99754..c644f66c372 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -26,9 +26,14 @@ on: default: "dev" type: string pretranslation_id: - description: "Pre-translation ID to resume from (leave empty to start new)" + description: "Pre-translation ID(s) to resume from, comma-separated for multiple (leave empty to start new)" required: false type: string + split_prs: + description: "Create one 
PR per language instead of one combined PR?" + required: false + default: false + type: boolean pre_translate_prompt_id: description: "AI prompt ID for pre_translate (default: 326942)" required: false @@ -87,4 +92,5 @@ jobs: VERBOSE: ${{ github.event.inputs.verbose }} SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }} SKIP_AWAIT: ${{ github.event.inputs.skip_await }} + SPLIT_PRS: ${{ github.event.inputs.split_prs }} GITHUB_REPOSITORY: ${{ github.repository }} From 865f5c0139e790cd1ea64988e323eca0b9b562a0 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 15 Jan 2026 14:39:07 -0800 Subject: [PATCH 95/99] refactor(i18n): batch commits per phase via Git Data API Replace per-file commits with batch commits using GitHub's Git Data API. Each workflow phase now creates a single commit instead of one per file, reducing commit noise from hundreds to ~3 commits per language. Co-Authored-By: Claude Opus 4.5 --- src/scripts/i18n/lib/github/commits.ts | 127 ++++++++++++++++++ .../i18n/lib/workflows/jsx-translation.ts | 56 ++++---- .../i18n/lib/workflows/sanitization.ts | 36 +++-- .../lib/workflows/translation-download.ts | 31 ++++- 4 files changed, 206 insertions(+), 44 deletions(-) diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/i18n/lib/github/commits.ts index 4603afa1ec3..6fe8778a204 100644 --- a/src/scripts/i18n/lib/github/commits.ts +++ b/src/scripts/i18n/lib/github/commits.ts @@ -4,6 +4,133 @@ import { config, gitHubBearerHeaders } from "../../config" import { fetchWithRetry } from "../utils/fetch" import { debugLog, delay } from "../workflows/utils" +/** File to be committed in a batch */ +export interface BatchFile { + path: string + content: Buffer +} + +/** + * Commit multiple files in a single commit using GitHub's Git Data API. + * This avoids creating one commit per file. 
+ * + * @param files - Array of files to commit + * @param branch - Target branch name + * @param message - Commit message + */ +export async function batchCommitFiles( + files: BatchFile[], + branch: string, + message: string +): Promise { + if (files.length === 0) { + debugLog("batchCommitFiles: No files to commit, skipping") + return + } + + const baseUrl = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}` + + // 1. Get current branch ref + const refRes = await fetchWithRetry(`${baseUrl}/git/ref/heads/${branch}`, { + headers: gitHubBearerHeaders, + }) + if (!refRes.ok) { + const body = await refRes.text().catch(() => "") + throw new Error(`Failed to get branch ref (${refRes.status}): ${body}`) + } + const refData: { object: { sha: string } } = await refRes.json() + const latestCommitSha = refData.object.sha + + // 2. Get the commit to find base tree + const commitRes = await fetchWithRetry( + `${baseUrl}/git/commits/${latestCommitSha}`, + { headers: gitHubBearerHeaders } + ) + if (!commitRes.ok) { + const body = await commitRes.text().catch(() => "") + throw new Error(`Failed to get commit (${commitRes.status}): ${body}`) + } + const commitData: { tree: { sha: string } } = await commitRes.json() + const baseTreeSha = commitData.tree.sha + + // 3. 
Create blobs for each file + const treeItems: { path: string; mode: string; type: string; sha: string }[] = + [] + + for (const file of files) { + const blobRes = await fetchWithRetry(`${baseUrl}/git/blobs`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + content: file.content.toString("base64"), + encoding: "base64", + }), + }) + if (!blobRes.ok) { + const body = await blobRes.text().catch(() => "") + throw new Error( + `Failed to create blob for ${file.path} (${blobRes.status}): ${body}` + ) + } + const blobData: { sha: string } = await blobRes.json() + treeItems.push({ + path: file.path, + mode: "100644", + type: "blob", + sha: blobData.sha, + }) + } + + // 4. Create new tree + const treeRes = await fetchWithRetry(`${baseUrl}/git/trees`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + base_tree: baseTreeSha, + tree: treeItems, + }), + }) + if (!treeRes.ok) { + const body = await treeRes.text().catch(() => "") + throw new Error(`Failed to create tree (${treeRes.status}): ${body}`) + } + const treeData: { sha: string } = await treeRes.json() + + // 5. Create commit + const newCommitRes = await fetchWithRetry(`${baseUrl}/git/commits`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + message, + tree: treeData.sha, + parents: [latestCommitSha], + }), + }) + if (!newCommitRes.ok) { + const body = await newCommitRes.text().catch(() => "") + throw new Error(`Failed to create commit (${newCommitRes.status}): ${body}`) + } + const newCommitData: { sha: string } = await newCommitRes.json() + + // 6. 
Update branch ref + const updateRefRes = await fetchWithRetry( + `${baseUrl}/git/refs/heads/${branch}`, + { + method: "PATCH", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ sha: newCommitData.sha }), + } + ) + if (!updateRefRes.ok) { + const body = await updateRefRes.text().catch(() => "") + throw new Error(`Failed to update ref (${updateRefRes.status}): ${body}`) + } + + debugLog( + `batchCommitFiles: Committed ${files.length} files in single commit ${newCommitData.sha}` + ) +} + /** * Get the destination path for a translated file * diff --git a/src/scripts/i18n/lib/workflows/jsx-translation.ts b/src/scripts/i18n/lib/workflows/jsx-translation.ts index 8f3821d2581..c0bad47561a 100644 --- a/src/scripts/i18n/lib/workflows/jsx-translation.ts +++ b/src/scripts/i18n/lib/workflows/jsx-translation.ts @@ -3,7 +3,7 @@ import { config } from "../../config" import { translateJsxAttributes } from "../../translate-jsx-attributes" import { isGeminiAvailable } from "../ai" -import { putCommitFile } from "../github/commits" +import { batchCommitFiles, BatchFile } from "../github/commits" import type { GlossaryByLanguage } from "../supabase" import { getGlossaryForLanguage } from "../supabase" @@ -72,33 +72,41 @@ export async function runJsxTranslation( verbose: config.verbose, }) - // Commit updated files + // Batch commit updated files if (jsxResult.updatedFiles.length > 0) { + const filesToCommit: BatchFile[] = [] + for (const updated of jsxResult.updatedFiles) { - try { - const buf = Buffer.from(updated.updatedContent, "utf8") - await putCommitFile(buf, updated.filePath, branch) - debugLog(`JSX-TRANSLATE: Committed ${updated.filePath}`) - - // Update the committedFiles array with new content for sanitizer - const existingFile = committedFiles.find( - (f) => f.path === updated.filePath - ) - if (existingFile) { - existingFile.content = updated.updatedContent - } - } catch (e) { - console.warn( - `[JSX-TRANSLATE] Failed to 
commit ${updated.filePath}:`, - e - ) + const buf = Buffer.from(updated.updatedContent, "utf8") + filesToCommit.push({ path: updated.filePath, content: buf }) + debugLog(`JSX-TRANSLATE: Will commit ${updated.filePath}`) + + // Update the committedFiles array with new content for sanitizer + const existingFile = committedFiles.find( + (f) => f.path === updated.filePath + ) + if (existingFile) { + existingFile.content = updated.updatedContent } } - console.log( - `[JSX-TRANSLATE] ✓ Committed ${jsxResult.updatedFiles.length} files for ${langCode}` - ) - totalFilesUpdated += jsxResult.updatedFiles.length - totalAttributesTranslated += jsxResult.attributesTranslated + + try { + await batchCommitFiles( + filesToCommit, + branch, + `i18n(${langCode}): JSX attribute translations` + ) + console.log( + `[JSX-TRANSLATE] ✓ Committed ${jsxResult.updatedFiles.length} files for ${langCode}` + ) + totalFilesUpdated += jsxResult.updatedFiles.length + totalAttributesTranslated += jsxResult.attributesTranslated + } catch (e) { + console.warn( + `[JSX-TRANSLATE] Failed to commit files for ${langCode}:`, + e + ) + } } } diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts b/src/scripts/i18n/lib/workflows/sanitization.ts index c5285b24a7f..17f59c90ff9 100644 --- a/src/scripts/i18n/lib/workflows/sanitization.ts +++ b/src/scripts/i18n/lib/workflows/sanitization.ts @@ -1,7 +1,7 @@ // Post-import sanitization workflow phase import { runSanitizer } from "../../post_import_sanitize" -import { putCommitFile } from "../github/commits" +import { batchCommitFiles, BatchFile } from "../github/commits" import type { CommittedFile } from "./types" import { debugLog, logSection } from "./utils" @@ -31,23 +31,31 @@ export async function runPostImportSanitization( if (changedFiles.length) { console.log(`Sanitizer modified ${changedFiles.length} files`) + const filesToCommit: BatchFile[] = [] + for (const file of changedFiles) { const relPath = file.path - try { - const buf = 
Buffer.from(file.content, "utf8") - await putCommitFile(buf, relPath, branch) - debugLog(`Committed sanitized file: ${relPath}`) - - // Update committedFiles with sanitized content for validation - const existingFile = committedFiles.find((f) => f.path === relPath) - if (existingFile) { - existingFile.content = file.content - } - } catch (e) { - console.warn(`Failed to commit sanitized file ${relPath}:`, e) + const buf = Buffer.from(file.content, "utf8") + filesToCommit.push({ path: relPath, content: buf }) + debugLog(`Will commit sanitized file: ${relPath}`) + + // Update committedFiles with sanitized content for validation + const existingFile = committedFiles.find((f) => f.path === relPath) + if (existingFile) { + existingFile.content = file.content } } - console.log(`✓ Committed ${changedFiles.length} sanitized files`) + + try { + await batchCommitFiles( + filesToCommit, + branch, + `i18n: post-import sanitization` + ) + console.log(`✓ Committed ${changedFiles.length} sanitized files`) + } catch (e) { + console.warn(`Failed to commit sanitized files:`, e) + } } else { console.log("No sanitization changes needed") } diff --git a/src/scripts/i18n/lib/workflows/translation-download.ts b/src/scripts/i18n/lib/workflows/translation-download.ts index 58b254c13c2..db9f8047b7c 100644 --- a/src/scripts/i18n/lib/workflows/translation-download.ts +++ b/src/scripts/i18n/lib/workflows/translation-download.ts @@ -3,7 +3,11 @@ import { config } from "../../config" import { getBuiltFile, postBuildProjectFileTranslation } from "../crowdin/build" import { postCreateBranchFrom } from "../github/branches" -import { getDestinationFromPath, putCommitFile } from "../github/commits" +import { + batchCommitFiles, + BatchFile, + getDestinationFromPath, +} from "../github/commits" import { mapCrowdinCodeToInternal } from "../utils/mapping" import type { @@ -60,7 +64,10 @@ export async function downloadAndCommitTranslations( `Building translations for ${crowdinId} 
(${internalLanguageCode})` ) - // Build, download and commit each file + // Collect files for batch commit + const filesToCommit: BatchFile[] = [] + + // Build and download each file for (const fileId of fileIds) { const crowdinPath = fileIdToPathMapping[fileId] @@ -86,14 +93,14 @@ export async function downloadAndCommitTranslations( continue } - // 3- Get destination path and commit + // 3- Get destination path and collect for batch commit const destinationPath = getDestinationFromPath( crowdinPath, internalLanguageCode ) - debugLog(`Committing to: ${destinationPath}`) + debugLog(`Will commit to: ${destinationPath}`) - await putCommitFile(buffer, destinationPath, branch) + filesToCommit.push({ path: destinationPath, content: buffer }) // Track this file's path and content for sanitizer/validation committedFiles.push({ @@ -102,7 +109,19 @@ export async function downloadAndCommitTranslations( }) } - console.log(`✓ Committed translations for ${internalLanguageCode}`) + // Batch commit all files for this language + if (filesToCommit.length > 0) { + await batchCommitFiles( + filesToCommit, + branch, + `i18n(${internalLanguageCode}): Crowdin translations` + ) + console.log( + `✓ Committed ${filesToCommit.length} translations for ${internalLanguageCode}` + ) + } else { + console.log(`No new translations for ${internalLanguageCode}`) + } } return { From 23c325f0ece19cc2de8443f41ab03e4391848c72 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 15 Jan 2026 14:45:06 -0800 Subject: [PATCH 96/99] update(i18n): canonical llm language list --- .../config/canonical-llm-language-list.json | 49 +++++++++---------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/src/scripts/i18n/config/canonical-llm-language-list.json b/src/scripts/i18n/config/canonical-llm-language-list.json index fca8cb14d29..53be456973f 100644 --- a/src/scripts/i18n/config/canonical-llm-language-list.json +++ 
b/src/scripts/i18n/config/canonical-llm-language-list.json @@ -1,27 +1,26 @@ [ - { "code": "zh", "language": "Chinese (Simplified)", "coverageRank": 1 }, - { "code": "es", "language": "Spanish", "coverageRank": 2 }, - { "code": "hi", "language": "Hindi", "coverageRank": 3 }, - { "code": "ar", "language": "Arabic", "coverageRank": 4 }, - { "code": "pt-br", "language": "Portuguese (Brazil)", "coverageRank": 5 }, - { "code": "bn", "language": "Bengali", "coverageRank": 6 }, - { "code": "ru", "language": "Russian", "coverageRank": 7 }, - { "code": "id", "language": "Indonesian", "coverageRank": 8 }, - { "code": "fr", "language": "French", "coverageRank": 9 }, - { "code": "ja", "language": "Japanese", "coverageRank": 10 }, - { "code": "de", "language": "German", "coverageRank": 11 }, - { "code": "ur", "language": "Urdu", "coverageRank": 12 }, - { "code": "zh-tw", "language": "Chinese (Traditional)", "coverageRank": 13 }, - { "code": "tr", "language": "Turkish", "coverageRank": 14 }, - { "code": "vi", "language": "Vietnamese", "coverageRank": 15 }, - { "code": "ko", "language": "Korean", "coverageRank": 16 }, - { "code": "te", "language": "Telugu", "coverageRank": 17 }, - { "code": "mr", "language": "Marathi", "coverageRank": 18 }, - { "code": "ta", "language": "Tamil", "coverageRank": 19 }, - { "code": "it", "language": "Italian", "coverageRank": 20 }, - { "code": "pt", "language": "Portuguese (Euro)", "coverageRank": 21 }, - { "code": "pl", "language": "Polish", "coverageRank": 22 }, - { "code": "uk", "language": "Ukrainian", "coverageRank": 23 }, - { "code": "sw", "language": "Swahili", "coverageRank": 24 }, - { "code": "cs", "language": "Czech", "coverageRank": 25 } + { "code": "zh", "language": "Chinese (Simplified)" }, + { "code": "es", "language": "Spanish" }, + { "code": "hi", "language": "Hindi" }, + { "code": "ar", "language": "Arabic" }, + { "code": "pt-br", "language": "Portuguese (Brazil)" }, + { "code": "fr", "language": "French" }, + { "code": "id", 
"language": "Indonesian" }, + { "code": "ru", "language": "Russian" }, + { "code": "ja", "language": "Japanese" }, + { "code": "de", "language": "German" }, + { "code": "ko", "language": "Korean" }, + { "code": "vi", "language": "Vietnamese" }, + { "code": "tr", "language": "Turkish" }, + { "code": "bn", "language": "Bengali" }, + { "code": "it", "language": "Italian" }, + { "code": "pl", "language": "Polish" }, + { "code": "ur", "language": "Urdu" }, + { "code": "uk", "language": "Ukrainian" }, + { "code": "ta", "language": "Tamil" }, + { "code": "te", "language": "Telugu" }, + { "code": "mr", "language": "Marathi" }, + { "code": "sw", "language": "Swahili" }, + { "code": "zh-tw", "language": "Chinese (Traditional)" }, + { "code": "cs", "language": "Czech" } ] \ No newline at end of file From fdd3c96f16aa56c65182f919ab805cd3a2648509 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Mon, 19 Jan 2026 19:53:24 -0800 Subject: [PATCH 97/99] fix(i18n): add rate limiting to prevent GitHub API abuse --- src/scripts/i18n/lib/github/commits.ts | 16 +++++++++++++++- src/scripts/i18n/lib/utils/fetch.ts | 26 ++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/i18n/lib/github/commits.ts index 6fe8778a204..3a351185ae9 100644 --- a/src/scripts/i18n/lib/github/commits.ts +++ b/src/scripts/i18n/lib/github/commits.ts @@ -54,10 +54,19 @@ export async function batchCommitFiles( const baseTreeSha = commitData.tree.sha // 3. 
Create blobs for each file + // Add delay between requests to avoid hitting GitHub's secondary rate limits + const BLOB_CREATION_DELAY_MS = 200 // 200ms between blob creations const treeItems: { path: string; mode: string; type: string; sha: string }[] = [] - for (const file of files) { + for (let i = 0; i < files.length; i++) { + const file = files[i] + + // Add delay before each request (except the first one) + if (i > 0) { + await delay(BLOB_CREATION_DELAY_MS) + } + const blobRes = await fetchWithRetry(`${baseUrl}/git/blobs`, { method: "POST", headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, @@ -79,6 +88,11 @@ export async function batchCommitFiles( type: "blob", sha: blobData.sha, }) + + // Log progress for large batches + if (files.length > 10 && (i + 1) % 10 === 0) { + debugLog(`Created ${i + 1}/${files.length} blobs...`) + } } // 4. Create new tree diff --git a/src/scripts/i18n/lib/utils/fetch.ts b/src/scripts/i18n/lib/utils/fetch.ts index 2295311b2e9..41af8c601e7 100644 --- a/src/scripts/i18n/lib/utils/fetch.ts +++ b/src/scripts/i18n/lib/utils/fetch.ts @@ -18,6 +18,7 @@ export const fetchWithRetry = async ( const timeoutMs = options?.timeoutMs ?? 30000 const backoffMs = options?.backoffMs ?? 1000 const retryOnStatuses = options?.retryOnStatuses ?? 
[ + 403, // GitHub secondary rate limits 408, 429, 500, 502, 503, 504, ] @@ -35,9 +36,30 @@ export const fetchWithRetry = async ( retryOnStatuses.includes(res.status) && attempt < retries ) { - const wait = backoffMs * Math.pow(2, attempt) + // Check if this is a rate limit error and use longer backoff + let wait = backoffMs * Math.pow(2, attempt) + let isRateLimit = false + + if (res.status === 403 || res.status === 429) { + try { + const bodyText = await res.clone().text() + if ( + bodyText.includes("rate limit") || + bodyText.includes("Rate limit") + ) { + isRateLimit = true + // Use much longer backoff for rate limits (60s, 120s, 240s) + wait = 60000 * Math.pow(2, attempt) + } + } catch { + // If we can't read the body, treat 403/429 as rate limits + isRateLimit = true + wait = 60000 * Math.pow(2, attempt) + } + } + console.warn( - `[RETRY] ${url} -> ${res.status}. Attempt ${attempt + 1}/${retries}. Waiting ${wait}ms.` + `[${isRateLimit ? "RATE LIMIT" : "RETRY"}] ${url} -> ${res.status}. Attempt ${attempt + 1}/${retries}. 
Waiting ${wait}ms.` ) await delay(wait) continue From 03dce0c00a6ebfbdae0b048935176d249dad68cc Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Thu, 22 Jan 2026 08:21:29 -0800 Subject: [PATCH 98/99] refactor: use i18n.config.json as canonical language list - Deprecates canonical-llm-language-list.json --- .github/workflows/crowdin-ai-import.yml | 6 --- src/scripts/i18n/config.ts | 39 +++++-------------- .../config/canonical-llm-language-list.json | 26 ------------- 3 files changed, 9 insertions(+), 62 deletions(-) delete mode 100644 src/scripts/i18n/config/canonical-llm-language-list.json diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index c644f66c372..a6f617a826a 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -15,11 +15,6 @@ on: description: "Comma-separated internal language codes (blank for all locales)" required: false type: string - use_legacy_languages: - description: "Use legacy locales i18n.config.json (else uses canonical-llm-language-list.json)" - required: false - default: false - type: boolean base_branch: description: "Base branch to create PR against" required: false @@ -86,7 +81,6 @@ jobs: TARGET_PATH: ${{ github.event.inputs.target_path }} EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} - USE_LEGACY_LANGUAGES: ${{ github.event.inputs.use_legacy_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} VERBOSE: ${{ github.event.inputs.verbose }} diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts index 243e461b0c3..baf35f6519b 100644 --- a/src/scripts/i18n/config.ts +++ b/src/scripts/i18n/config.ts @@ -2,7 +2,6 @@ import * as dotenv from "dotenv" import i18nConfig from "../../../i18n.config.json" -import canonicalLanguageList from 
"./config/canonical-llm-language-list.json" import { mapInternalCodeToCrowdin } from "./lib/utils/mapping" dotenv.config({ path: ".env.local" }) @@ -46,36 +45,20 @@ export const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } // Parse environment variables with defaults // Accept internal codes (e.g., "es") and convert to Crowdin codes (e.g., "es-EM") -const useLegacyLanguages = ["1", "true", "yes", "on"].includes( - (process.env.USE_LEGACY_LANGUAGES || "").toLowerCase() -) - const targetLanguagesInput = process.env.TARGET_LANGUAGES ? process.env.TARGET_LANGUAGES.split(",") .map((lang) => lang.trim()) .filter(Boolean) : [] -// If no target languages specified, use all languages from appropriate config -let targetLanguages: string[] -if (targetLanguagesInput.length === 0) { - if (useLegacyLanguages) { - // Use i18n.config.json, excluding 'en' - targetLanguages = i18nConfig - .map(({ code }) => code) - .filter((code) => code !== "en") - .map((code) => mapInternalCodeToCrowdin(code)) - } else { - // Use canonical-llm-language-list.json - targetLanguages = canonicalLanguageList - .map(({ code }) => code) - .map((code) => mapInternalCodeToCrowdin(code)) - } -} else { - targetLanguages = targetLanguagesInput.map((code) => - mapInternalCodeToCrowdin(code) - ) -} +// If no target languages specified, use all languages from i18n.config.json, excluding 'en' +const targetLanguages: string[] = + targetLanguagesInput.length === 0 + ? 
i18nConfig + .map(({ code }) => code) + .filter((code) => code !== "en") + .map((code) => mapInternalCodeToCrowdin(code)) + : targetLanguagesInput.map((code) => mapInternalCodeToCrowdin(code)) const baseBranch = process.env.BASE_BRANCH || "dev" @@ -118,7 +101,6 @@ if (verbose) { console.log( `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}` ) - console.log(`[DEBUG] - Use legacy languages: ${useLegacyLanguages}`) console.log(`[DEBUG] - Base branch: ${baseBranch}`) console.log( `[DEBUG] - Target path: ${targetPath || "none (full translation)"}` @@ -146,10 +128,7 @@ export const config = { allCrowdinCodes: targetLanguages, allInternalCodes: targetLanguagesInput.length ? targetLanguagesInput - : useLegacyLanguages - ? i18nConfig.map(({ code }) => code).filter((code) => code !== "en") - : canonicalLanguageList.map(({ code }) => code), - useLegacyLanguages, + : i18nConfig.map(({ code }) => code).filter((code) => code !== "en"), baseBranch, targetPath, excludePath, diff --git a/src/scripts/i18n/config/canonical-llm-language-list.json b/src/scripts/i18n/config/canonical-llm-language-list.json deleted file mode 100644 index 53be456973f..00000000000 --- a/src/scripts/i18n/config/canonical-llm-language-list.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { "code": "zh", "language": "Chinese (Simplified)" }, - { "code": "es", "language": "Spanish" }, - { "code": "hi", "language": "Hindi" }, - { "code": "ar", "language": "Arabic" }, - { "code": "pt-br", "language": "Portuguese (Brazil)" }, - { "code": "fr", "language": "French" }, - { "code": "id", "language": "Indonesian" }, - { "code": "ru", "language": "Russian" }, - { "code": "ja", "language": "Japanese" }, - { "code": "de", "language": "German" }, - { "code": "ko", "language": "Korean" }, - { "code": "vi", "language": "Vietnamese" }, - { "code": "tr", "language": "Turkish" }, - { "code": "bn", "language": "Bengali" }, - { "code": "it", "language": "Italian" }, - { "code": "pl", "language": "Polish" }, - { 
"code": "ur", "language": "Urdu" }, - { "code": "uk", "language": "Ukrainian" }, - { "code": "ta", "language": "Tamil" }, - { "code": "te", "language": "Telugu" }, - { "code": "mr", "language": "Marathi" }, - { "code": "sw", "language": "Swahili" }, - { "code": "zh-tw", "language": "Chinese (Traditional)" }, - { "code": "cs", "language": "Czech" } -] \ No newline at end of file From f982de6fe46a19c90ab6d6cb40096083a8147654 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Wed, 4 Feb 2026 17:31:49 -0500 Subject: [PATCH 99/99] revert: .gitignore artifacts addition --- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index 81eb6e4e5dd..bd0197d2f42 100644 --- a/.gitignore +++ b/.gitignore @@ -68,9 +68,6 @@ build-storybook.log build-archive.log storybook-static -# I18n translation artifacts -artifacts/ - # Trigger .trigger