diff --git a/.github/workflows/crowdin-ai-import.yml b/.github/workflows/crowdin-ai-import.yml index f2a8a31979b..a6f617a826a 100644 --- a/.github/workflows/crowdin-ai-import.yml +++ b/.github/workflows/crowdin-ai-import.yml @@ -3,21 +3,52 @@ name: Import Crowdin AI Translations on: workflow_dispatch: inputs: - file_limit: - description: "Number of files to process (default: 100, use 1-10 for testing)" + target_path: + description: "File or directory path to translate (e.g., public/content/developers/index.md or public/content/developers or blank for all files)" + required: false + type: string + exclude_path: + description: "Path to exclude from this job (e.g., public/content/developers/tutorials)" required: false - default: "100" type: string target_languages: - description: "Comma-separated Crowdin language codes (default: es-EM)" + description: "Comma-separated internal language codes (blank for all locales)" required: false - default: "es-EM" type: string base_branch: - description: "Base branch to create PR against (default: dev)" + description: "Base branch to create PR against" required: false default: "dev" type: string + pretranslation_id: + description: "Pre-translation ID(s) to resume from, comma-separated for multiple (leave empty to start new)" + required: false + type: string + split_prs: + description: "Create one PR per language instead of one combined PR?" + required: false + default: false + type: boolean + pre_translate_prompt_id: + description: "AI prompt ID for pre_translate (default: 326942)" + required: false + default: "326942" + type: string + skip_pr: + description: "Skip PR creation?" + required: false + default: false + type: boolean + skip_await: + description: "Exit after dispatching pre-translation (resume later with ID)" + required: false + default: false + type: boolean + verbose: + description: "Enable verbose logging?" 
+ required: false + default: "false" + type: boolean jobs: import_translations: @@ -41,9 +72,19 @@ jobs: - name: Run Crowdin AI translation import run: npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/main.ts env: - I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_API_KEY }} + I18N_CROWDIN_API_KEY: ${{ secrets.CROWDIN_WORKFLOW_API_KEY }} I18N_GITHUB_API_KEY: ${{ secrets.I18N_GITHUB_TOKEN }} - FILE_LIMIT: ${{ github.event.inputs.file_limit }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} + PRETRANSLATION_ID: ${{ github.event.inputs.pretranslation_id }} + TARGET_PATH: ${{ github.event.inputs.target_path }} + EXCLUDE_PATH: ${{ github.event.inputs.exclude_path }} TARGET_LANGUAGES: ${{ github.event.inputs.target_languages }} BASE_BRANCH: ${{ github.event.inputs.base_branch }} + PRE_TRANSLATE_PROMPT_ID: ${{ github.event.inputs.pre_translate_prompt_id }} + VERBOSE: ${{ github.event.inputs.verbose }} + SKIP_PR_CREATION: ${{ github.event.inputs.skip_pr }} + SKIP_AWAIT: ${{ github.event.inputs.skip_await }} + SPLIT_PRS: ${{ github.event.inputs.split_prs }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/package.json b/package.json index f6f0de4162b..842ea56228a 100644 --- a/package.json +++ b/package.json @@ -109,6 +109,7 @@ "devDependencies": { "@chromatic-com/playwright": "^0.12.4", "@chromatic-com/storybook": "1.5.0", + "@google/generative-ai": "^0.24.1", "@netlify/plugin-nextjs": "^5.15.5", "@playwright/test": "^1.52.0", "@storybook/addon-essentials": "8.6.14", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6d89a30cb08..64c9de3bf0c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -239,6 +239,9 @@ importers: '@chromatic-com/storybook': specifier: 1.5.0 version: 
1.5.0(@chromatic-com/playwright@0.12.5(@playwright/test@1.53.1)(@types/react@18.2.57)(bufferutil@4.0.9)(esbuild@0.25.12)(prettier@3.5.3)(typescript@5.8.3)(utf-8-validate@5.0.10))(react@18.3.1) + '@google/generative-ai': + specifier: ^0.24.1 + version: 0.24.1 '@netlify/plugin-nextjs': specifier: ^5.15.5 version: 5.15.5 @@ -1666,6 +1669,10 @@ packages: resolution: {integrity: sha512-1TUx3KdaU3cN7nfCdNf+UVqA/PSX29Cjcox3fZZBtINlRrXVTmUkQnCKv2MbBUbCopbK4olAT1IHl76uZyCiVA==} engines: {node: '>=14.0.0'} + '@google/generative-ai@0.24.1': + resolution: {integrity: sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q==} + engines: {node: '>=18.0.0'} + '@hookform/resolvers@3.10.0': resolution: {integrity: sha512-79Dv+3mDF7i+2ajj7SkypSKHhl1cbln1OGavqrsF7p6mbUv11xpqpacPsGDCTRvCSjEEIez2ef1NveSVL3b0Ag==} peerDependencies: @@ -11412,6 +11419,8 @@ snapshots: '@google-cloud/precise-date@4.0.0': {} + '@google/generative-ai@0.24.1': {} + '@hookform/resolvers@3.10.0(react-hook-form@7.57.0(react@18.3.1))': dependencies: react-hook-form: 7.57.0(react@18.3.1) diff --git a/src/scripts/i18n/check-translation-status.ts b/src/scripts/i18n/check-translation-status.ts deleted file mode 100644 index 5913e5457f1..00000000000 --- a/src/scripts/i18n/check-translation-status.ts +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Quick script to check translation status of a specific file in Crowdin - */ - -const CROWDIN_API_KEY = process.env.CROWDIN_TOKEN! 
-const PROJECT_ID = 834930 -const FILE_ID = 17434 // organizing/index.md -const LANGUAGE_ID = "es-EM" - -const headers = { - Authorization: `Bearer ${CROWDIN_API_KEY}`, - "Content-Type": "application/json", -} - -async function checkTranslationProgress() { - console.log("\n=== Checking Translation Progress ===") - console.log(`File ID: ${FILE_ID}`) - console.log(`Language: ${LANGUAGE_ID}`) - - // Get translation progress for the file - const url = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/languages/${LANGUAGE_ID}/progress?fileIds=${FILE_ID}` - - try { - const res = await fetch(url, { headers }) - if (!res.ok) { - const text = await res.text() - throw new Error(`Failed to get progress (${res.status}): ${text}`) - } - - const json = await res.json() - console.log("\nTranslation Progress:") - console.log(JSON.stringify(json, null, 2)) - } catch (error) { - console.error("Error:", error) - } -} - -async function listStrings() { - console.log("\n=== Listing Strings in File ===") - - // Get strings from the file - const url = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/strings?fileId=${FILE_ID}&limit=10` - - try { - const res = await fetch(url, { headers }) - if (!res.ok) { - const text = await res.text() - throw new Error(`Failed to list strings (${res.status}): ${text}`) - } - - const json = await res.json() - console.log(`\nFound ${json.data.length} strings (showing first 10):`) - for (const item of json.data) { - console.log(`\nString ID: ${item.data.id}`) - console.log(`Text: "${item.data.text.substring(0, 100)}..."`) - console.log(`Context: ${item.data.context || "none"}`) - } - } catch (error) { - console.error("Error:", error) - } -} - -async function checkStringTranslations() { - console.log("\n=== Checking String Translations ===") - - // First get a string ID - const stringsUrl = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/strings?fileId=${FILE_ID}&limit=1` - - try { - const stringsRes = await fetch(stringsUrl, { headers }) - 
if (!stringsRes.ok) { - throw new Error(`Failed to get strings: ${stringsRes.status}`) - } - - const stringsJson = await stringsRes.json() - if (stringsJson.data.length === 0) { - console.log("❌ No strings found in file!") - return - } - - const stringId = stringsJson.data[0].data.id - console.log(`\nChecking translations for string ID: ${stringId}`) - console.log( - `String text: "${stringsJson.data[0].data.text.substring(0, 100)}..."` - ) - - // Get translations for this string - const translationsUrl = `https://api.crowdin.com/api/v2/projects/${PROJECT_ID}/translations?stringId=${stringId}&languageId=${LANGUAGE_ID}` - const transRes = await fetch(translationsUrl, { headers }) - - if (!transRes.ok) { - const text = await transRes.text() - console.log( - `\n⚠️ No translations found or error (${transRes.status}): ${text}` - ) - return - } - - const transJson = await transRes.json() - console.log(`\nTranslations found: ${transJson.data.length}`) - if (transJson.data.length > 0) { - console.log("First translation:") - console.log(JSON.stringify(transJson.data[0].data, null, 2)) - } else { - console.log("❌ String has NO translations in Spanish!") - } - } catch (error) { - console.error("Error:", error) - } -} - -async function main() { - await checkTranslationProgress() - await listStrings() - await checkStringTranslations() -} - -main() diff --git a/src/scripts/i18n/config.ts b/src/scripts/i18n/config.ts new file mode 100644 index 00000000000..baf35f6519b --- /dev/null +++ b/src/scripts/i18n/config.ts @@ -0,0 +1,193 @@ +import * as dotenv from "dotenv" + +import i18nConfig from "../../../i18n.config.json" + +import { mapInternalCodeToCrowdin } from "./lib/utils/mapping" + +dotenv.config({ path: ".env.local" }) + +// Language code mapping +export const crowdinToInternalCodeMapping: Record = + i18nConfig.reduce( + (acc, { crowdinCode, code }) => { + acc[crowdinCode] = code + return acc + }, + {} as Record + ) + +// GitHub API configuration +const gitHubApiKey = 
process.env.I18N_GITHUB_API_KEY || "" +if (!gitHubApiKey) { + console.error("[ERROR] Missing I18N_GITHUB_API_KEY environment variable") + console.error( + "[ERROR] Please set I18N_GITHUB_API_KEY in your .env.local file" + ) + throw new Error("No GitHub API Key found (I18N_GITHUB_API_KEY)") +} + +export const gitHubBearerHeaders = { + Authorization: `Bearer ${gitHubApiKey}`, + Accept: "application/vnd.github.v3+json", +} + +// Crowdin API configuration +const crowdinApiKey = process.env.I18N_CROWDIN_API_KEY || "" +if (!crowdinApiKey) { + console.error("[ERROR] Missing I18N_CROWDIN_API_KEY environment variable") + console.error( + "[ERROR] Please set I18N_CROWDIN_API_KEY in your .env.local file" + ) + throw new Error("No Crowdin API Key found (I18N_CROWDIN_API_KEY)") +} + +export const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } + +// Parse environment variables with defaults +// Accept internal codes (e.g., "es") and convert to Crowdin codes (e.g., "es-EM") +const targetLanguagesInput = process.env.TARGET_LANGUAGES + ? process.env.TARGET_LANGUAGES.split(",") + .map((lang) => lang.trim()) + .filter(Boolean) + : [] + +// If no target languages specified, use all languages from i18n.config.json, excluding 'en' +const targetLanguages: string[] = + targetLanguagesInput.length === 0 + ? 
i18nConfig + .map(({ code }) => code) + .filter((code) => code !== "en") + .map((code) => mapInternalCodeToCrowdin(code)) + : targetLanguagesInput.map((code) => mapInternalCodeToCrowdin(code)) + +const baseBranch = process.env.BASE_BRANCH || "dev" + +const targetPath = process.env.TARGET_PATH || "" +const excludePath = process.env.EXCLUDE_PATH?.trim() || "" + +// Skip awaiting pre-translation completion (exit early with ID for manual resume) +const skipAwait = ["1", "true", "yes", "on"].includes( + (process.env.SKIP_AWAIT || "").toLowerCase() +) + +// Adaptive polling / timeout configuration (milliseconds) +const pretranslateTimeoutMs = process.env.PRETRANSLATE_TIMEOUT_MS + ? parseInt(process.env.PRETRANSLATE_TIMEOUT_MS, 10) + : 6 * 60 * 60 * 1000 // default 6h + +const pretranslatePollBaseMs = process.env.PRETRANSLATE_POLL_BASE_MS + ? Math.max(5000, parseInt(process.env.PRETRANSLATE_POLL_BASE_MS, 10)) + : 30_000 // default 30s base (min clamped to 5s) + +// Parse comma-separated pre-translation IDs (for resuming multiple per-language jobs) +const existingPreTranslationIds = (process.env.PRETRANSLATION_ID || "") + .split(",") + .map((id) => id.trim()) + .filter(Boolean) + +const verbose = process.env.VERBOSE === "true" +const splitPrs = process.env.SPLIT_PRS === "true" + +// Parse GitHub repository from env (format: "owner/repo") +const githubRepo = + process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website" +const [ghOrganization, ghRepo] = githubRepo.split("/") + +if (verbose) { + console.log("[DEBUG] Configuration:") + console.log( + `[DEBUG] - Target languages (internal): ${targetLanguagesInput.length ? 
targetLanguagesInput.join(", ") : "ALL"}` + ) + console.log( + `[DEBUG] - Target languages (Crowdin): ${targetLanguages.join(", ")}` + ) + console.log(`[DEBUG] - Base branch: ${baseBranch}`) + console.log( + `[DEBUG] - Target path: ${targetPath || "none (full translation)"}` + ) + console.log(`[DEBUG] - Exclude path: ${excludePath || "none"}`) + console.log(`[DEBUG] - Skip await: ${skipAwait}`) + console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) + if (existingPreTranslationIds.length > 0) { + console.log( + `[DEBUG] - Resuming from pre-translation IDs: ${existingPreTranslationIds.join(", ")}` + ) + } +} + +// Main configuration object +export const config = { + projectId: 834930, + ghOrganization, + ghRepo, + jsonRoot: "src/intl/en", + mdRoot: "public/content", + preTranslatePromptId: Number.parseInt( + process.env.PRE_TRANSLATE_PROMPT_ID || "326942" + ), + allCrowdinCodes: targetLanguages, + allInternalCodes: targetLanguagesInput.length + ? targetLanguagesInput + : i18nConfig.map(({ code }) => code).filter((code) => code !== "en"), + baseBranch, + targetPath, + excludePath, + skipAwait, + pretranslateTimeoutMs, + pretranslatePollBaseMs, + existingPreTranslationIds, + verbose, + splitPrs, +} + +// Do not translate list - Declare paths that should never be translated +export const doNotTranslatePaths = [ + "/cookie-policy/", + "/privacy-policy/", + "/terms-of-use/", + "/terms-and-conditions/", + "/style-guide/", +] + +// Validation for target path +export function validateTargetPath(targetPath: string): void { + if (!targetPath) { + // Full translation mode is allowed + return + } + + // Disallowed: paths under public/content/translations (translated content) + if (targetPath.includes("public/content/translations")) { + throw new Error( + `[ERROR] Invalid target path: "${targetPath}"\n` + + `Target path cannot be under "public/content/translations" (this is translated content)\n` + + `Did you mean to target a file under "public/content" instead?` + ) 
+ } + + // Disallowed: paths under src/intl other than src/intl/en + if ( + targetPath.startsWith("src/intl/") && + !targetPath.startsWith("src/intl/en") + ) { + throw new Error( + `[ERROR] Invalid target path: "${targetPath}"\n` + + `Target path under "src/intl/" can only be "src/intl/en" (English source)\n` + + `Other src/intl directories contain translated content` + ) + } + + // Disallowed: explicitly excluded paths from config file + for (const excluded of doNotTranslatePaths) { + if (targetPath.includes(excluded)) { + throw new Error( + `[ERROR] Invalid target path: "${targetPath}"\n` + + `This path is in the excluded paths list (${excluded})` + ) + } + } +} + +// Constants +export const CROWDIN_API_BASE_URL = "https://api.crowdin.com/api/v2" +export const MAX_STRINGS_PER_REQUEST = 500 diff --git a/src/scripts/i18n/docs/v0.2.0-roadmap.md b/src/scripts/i18n/docs/v0.2.0-roadmap.md new file mode 100644 index 00000000000..c86e4f57447 --- /dev/null +++ b/src/scripts/i18n/docs/v0.2.0-roadmap.md @@ -0,0 +1,146 @@ +# v0.2.0 Roadmap: Glossary & Consistency Validation + +This document outlines planned features for the next major iteration of the i18n automation system. + +## Overview + +v0.1.0 focused on: +- JSX attribute translation via Gemini API (fallback for Crowdin) +- Build-breaking syntax validation +- Modular architecture for standalone workflow execution + +v0.2.0 will focus on **translation quality and consistency** through glossary enforcement and term validation. + +--- + +## Planned Features + +### 1. Glossary Supabase Sync (Separate Cron) + +**Goal:** Keep Crowdin glossary synchronized with community-curated terms in Supabase. 
+ +**Implementation:** +- Dedicated GitHub Action running on cron schedule (e.g., daily at midnight UTC) +- Fetches glossary terms from Supabase `glossary` table +- Uploads/updates terms in Crowdin project glossary via API +- Logs sync status and any conflicts + +**Files to create:** +- `src/scripts/i18n/sync-glossary.ts` - Main sync orchestrator +- `src/scripts/i18n/lib/supabase/glossary.ts` - Supabase client for glossary queries +- `.github/workflows/sync-glossary.yml` - Cron workflow + +**Environment variables needed:** +- `SUPABASE_URL` - Supabase project URL +- `SUPABASE_KEY` - Supabase anon/service key +- `CROWDIN_PROJECT_ID`, `CROWDIN_API_KEY` (existing) + +--- + +### 2. Term/Phrase Consistency Validation + +**Goal:** Validate that translated files use glossary terms consistently. + +**Implementation:** +- Post-translation validation step in main workflow +- Extract glossary terms from Crowdin (or local cache from sync) +- Scan translated files for source terms that should have been translated +- Flag inconsistencies in PR validation comment + +**Validation rules:** +- Source term appears in translation → likely missed (should be target term) +- Target term varies within same file → inconsistent usage +- Protected terms (ethereum.org, Ethereum, etc.) → should remain unchanged + +**Files to create:** +- `src/scripts/i18n/lib/validation/glossary.ts` - Glossary term validation +- Updates to `lib/workflows/validation.ts` - Integrate glossary checks + +--- + +### 3. Confidence Scoring + +**Goal:** Provide per-file and per-language confidence scores based on validation results. 
+ +**Scoring factors:** +- JSX attribute untranslated percentage (from v0.1.0) +- Glossary term consistency rate +- Syntax validation pass/fail +- Source file complexity (length, technical density) + +**Output:** +- Confidence score (0-100) per file in PR comment +- Aggregate confidence per language +- Suggested review priority based on low-confidence files + +**Files to create:** +- `src/scripts/i18n/lib/validation/confidence.ts` - Scoring algorithm +- Updates to PR comment formatting + +--- + +## Architecture Considerations + +### Supabase Schema (Proposed) + +```sql +-- Glossary terms table +CREATE TABLE glossary ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + source_term TEXT NOT NULL, + language_code TEXT NOT NULL, + target_term TEXT NOT NULL, + context TEXT, -- e.g., "technical", "UI", "marketing" + notes TEXT, + created_at TIMESTAMPTZ DEFAULT now(), + updated_at TIMESTAMPTZ DEFAULT now(), + UNIQUE(source_term, language_code) +); + +-- Translation memory (future) +CREATE TABLE translation_memory ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + source_text TEXT NOT NULL, + language_code TEXT NOT NULL, + target_text TEXT NOT NULL, + source_file TEXT, + created_at TIMESTAMPTZ DEFAULT now() +); +``` + +### Crowdin API Endpoints + +- `POST /projects/{projectId}/glossaries/{glossaryId}/terms` - Add/update terms +- `GET /projects/{projectId}/glossaries/{glossaryId}/terms` - List terms for validation + +--- + +## Timeline (Tentative) + +| Feature | Estimated Effort | Priority | +|---------|------------------|----------| +| Glossary Supabase sync | 2-3 days | High | +| Term consistency validation | 2-3 days | High | +| Confidence scoring | 1-2 days | Medium | +| Documentation & testing | 1-2 days | High | + +--- + +## Dependencies + +- Supabase project setup with glossary table +- Crowdin glossary ID configuration +- Community glossary data migration (if existing) + +--- + +## Open Questions + +1. Should glossary sync be bidirectional (Supabase ↔ Crowdin)? 
+2. What threshold for glossary inconsistency should trigger a warning vs error? +3. Should confidence scores block PR merge below a certain threshold? +4. How to handle language-specific glossary exceptions? + +--- + +*This roadmap was created as part of the v0.1.0 development cycle. Updates will be made as requirements evolve.* diff --git a/src/scripts/i18n/lib/ai/gemini.ts b/src/scripts/i18n/lib/ai/gemini.ts new file mode 100644 index 00000000000..1e978ea192c --- /dev/null +++ b/src/scripts/i18n/lib/ai/gemini.ts @@ -0,0 +1,242 @@ +/** + * Gemini AI translation wrapper for JSX attribute translation + */ + +import { GoogleGenerativeAI } from "@google/generative-ai" + +import i18nConfig from "../../../../../i18n.config.json" +import type { ExtractedAttribute, TranslatedAttribute } from "../jsx-attributes" +import { delay } from "../workflows/utils" + +/** Gemini API configuration */ +const GEMINI_MODEL = "gemini-2.5-pro" + +/** Language names parsed from i18n.config.json */ +const LANGUAGE_NAMES: Record = Object.fromEntries( + i18nConfig.map(({ code, name }) => [code, name]) +) + +/** + * Check if Gemini API is available (API key present) + */ +export function isGeminiAvailable(): boolean { + return Boolean(process.env.GEMINI_API_KEY) +} + +/** + * Get the Gemini API client + */ +function getGeminiClient(): GoogleGenerativeAI { + const apiKey = process.env.GEMINI_API_KEY + if (!apiKey) { + throw new Error("GEMINI_API_KEY environment variable is not set") + } + return new GoogleGenerativeAI(apiKey) +} + +/** + * Get human-readable language name from code + */ +function getLanguageName(code: string): string { + return LANGUAGE_NAMES[code] || code.toUpperCase() +} + +/** + * Build translation prompt for a batch of attributes + */ +function buildTranslationPrompt( + attributes: ExtractedAttribute[], + targetLanguage: string, + glossaryTerms?: Map +): string { + const langName = getLanguageName(targetLanguage) + + const attributeList = attributes + .map( + (attr, i) => 
+ `${i + 1}. [${attr.componentName}.${attr.attributeName}] "${attr.originalValue}" + Context: ${attr.context}` + ) + .join("\n\n") + + // Build glossary section if terms provided + let glossarySection = "" + if (glossaryTerms && glossaryTerms.size > 0) { + const termsList = Array.from(glossaryTerms.entries()) + .map(([term, translation]) => `- "${term}" → "${translation}"`) + .join("\n") + glossarySection = ` + +REQUIRED TERMINOLOGY (use these exact translations): +${termsList} +` + } + + return `You are translating UI component attributes for the Ethereum.org website into ${langName}. + +These are JSX component attributes that contain human-readable text. Translate each value naturally and accurately while: +- Preserving technical Ethereum terminology appropriately for ${langName} +- Keeping the translation concise (similar length to original) +- Maintaining any placeholders like {variable} or {{variable}} unchanged +- Using region-neutral ${langName} that most speakers would understand +- Using informal, friendly register${glossarySection} + +Attributes to translate: + +${attributeList} + +Respond with ONLY a JSON array of translated strings in the same order, like: +["translated text 1", "translated text 2", ...] 
+ +Do not include any explanation, just the JSON array.` +} + +/** + * Parse Gemini response to extract translated strings + */ +function parseTranslationResponse(response: string): string[] { + // Clean up response - remove markdown code blocks if present + let cleaned = response.trim() + if (cleaned.startsWith("```json")) { + cleaned = cleaned.slice(7) + } else if (cleaned.startsWith("```")) { + cleaned = cleaned.slice(3) + } + if (cleaned.endsWith("```")) { + cleaned = cleaned.slice(0, -3) + } + cleaned = cleaned.trim() + + try { + const parsed = JSON.parse(cleaned) + if (!Array.isArray(parsed)) { + throw new Error("Response is not an array") + } + return parsed.map((item) => String(item)) + } catch (error) { + console.error("[GEMINI] Failed to parse response:", cleaned) + throw new Error(`Failed to parse Gemini response: ${error}`) + } +} + +/** + * Translate a batch of attributes for a single language. + * Returns translated attributes with their values filled in. + */ +export async function translateAttributes( + attributes: ExtractedAttribute[], + targetLanguage: string, + glossaryTerms?: Map +): Promise { + if (attributes.length === 0) { + return [] + } + + if (!isGeminiAvailable()) { + console.warn( + "[GEMINI] API key not available, skipping attribute translation" + ) + return [] + } + + const client = getGeminiClient() + const model = client.getGenerativeModel({ model: GEMINI_MODEL }) + + const prompt = buildTranslationPrompt( + attributes, + targetLanguage, + glossaryTerms + ) + + console.log( + `[GEMINI] Translating ${attributes.length} attributes to ${getLanguageName(targetLanguage)}` + ) + + try { + const result = await model.generateContent(prompt) + const response = result.response.text() + const translations = parseTranslationResponse(response) + + if (translations.length !== attributes.length) { + console.warn( + `[GEMINI] Translation count mismatch: expected ${attributes.length}, got ${translations.length}` + ) + } + + // Map translations back 
to attributes + return attributes.map((attr, i) => ({ + ...attr, + translatedValue: translations[i] || attr.originalValue, + })) + } catch (error) { + console.error("[GEMINI] Translation failed:", error) + throw error + } +} + +/** + * Translate attributes with retry logic + */ +export async function translateAttributesWithRetry( + attributes: ExtractedAttribute[], + targetLanguage: string, + glossaryTerms?: Map, + maxRetries = 3 +): Promise { + let lastError: Error | null = null + + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + return await translateAttributes( + attributes, + targetLanguage, + glossaryTerms + ) + } catch (error) { + lastError = error instanceof Error ? error : new Error(String(error)) + console.warn( + `[GEMINI] Attempt ${attempt}/${maxRetries} failed: ${lastError.message}` + ) + + if (attempt < maxRetries) { + // Exponential backoff + const backoff = Math.min(1000 * Math.pow(2, attempt - 1), 10000) + await delay(backoff) + } + } + } + + throw lastError || new Error("Translation failed after retries") +} + +/** + * Translate attributes grouped by file, processing each file's batch sequentially + * to avoid rate limits while maximizing context per request. 
+ */ +export async function translateAttributesByFile( + attributesByFile: Map, + targetLanguage: string, + glossaryTerms?: Map +): Promise> { + const results = new Map() + + for (const [filePath, attributes] of attributesByFile) { + try { + const translated = await translateAttributesWithRetry( + attributes, + targetLanguage, + glossaryTerms + ) + results.set(filePath, translated) + console.log( + `[GEMINI] ✓ Translated ${translated.length} attributes in ${filePath}` + ) + } catch (error) { + console.error(`[GEMINI] ✗ Failed to translate ${filePath}:`, error) + // Continue with other files even if one fails + results.set(filePath, []) + } + } + + return results +} diff --git a/src/scripts/i18n/lib/ai/index.ts b/src/scripts/i18n/lib/ai/index.ts new file mode 100644 index 00000000000..e2f75fd59d1 --- /dev/null +++ b/src/scripts/i18n/lib/ai/index.ts @@ -0,0 +1,10 @@ +/** + * AI translation module + */ + +export { + isGeminiAvailable, + translateAttributes, + translateAttributesByFile, + translateAttributesWithRetry, +} from "./gemini" diff --git a/src/scripts/i18n/lib/crowdin/build.ts b/src/scripts/i18n/lib/crowdin/build.ts new file mode 100644 index 00000000000..cdd49d35fff --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/build.ts @@ -0,0 +1,76 @@ +// Crowdin build and download operations + +import { + config, + CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" +import type { BuildProjectFileTranslationResponse } from "../types" + +/** + * Build a project file translation for a specific language + * + * @param fileId - The Crowdin file ID + * @param targetLanguageId - The target language ID + * @param projectId - The Crowdin project ID (defaults to config) + * @returns Build response with download URL + */ +export const postBuildProjectFileTranslation = async ( + fileId: number, + targetLanguageId: string, + projectId = config.projectId +): Promise => { + const url = new URL( + 
`${CROWDIN_API_BASE_URL}/projects/${projectId}/translations/builds/files/${fileId}` + ) + + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify({ targetLanguageId }), + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error( + `Crowdin postBuildProjectFileTranslation failed (${res.status}): ${body}` + ) + } + + type JsonResponse = { data: BuildProjectFileTranslationResponse } + const json: JsonResponse = await res.json() + console.log("Built file:", json.data) + return json.data +} + +/** + * Download a built file from Crowdin + * + * @param downloadUrl - The download URL from the build response + * @returns Buffer containing the file contents + */ +export const getBuiltFile = async ( + downloadUrl: string +): Promise<{ buffer: Buffer }> => { + try { + const res = await fetch(downloadUrl) + + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`Failed to download built file (${res.status}): ${body}`) + } + + const arrayBuffer = await res.arrayBuffer() + const buffer = Buffer.from(arrayBuffer) + + return { buffer } + } catch (error) { + console.error("getBuiltFile error:", error) + throw error + } +} diff --git a/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts b/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts new file mode 100644 index 00000000000..985979bca9c --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/ephemeral-prompts.ts @@ -0,0 +1,151 @@ +/** + * Ephemeral Prompts + * + * Manages Crowdin AI prompts that are created per-job and cleaned up after use. + * Each prompt is uniquely named with language, key, and timestamp to avoid conflicts. 
+ * + * Naming convention: eth-org-{lang}-{key}-{timestamp} + * Example: eth-org-es-glossary-1702987200 + */ + +import { crowdinBearerHeaders } from "../../config" + +import type { PromptResource } from "./prompt" + +/** Parameters for creating an ephemeral prompt */ +export interface CreateEphemeralPromptParams { + /** Crowdin user ID (owner of the prompt) */ + userId: number + /** Language code (e.g., "es", "fr", "de") */ + languageCode: string + /** Prompt key (e.g., "glossary", "formal") */ + promptKey: string + /** The full prompt text */ + promptText: string + /** AI provider ID (optional, uses default if not specified) */ + aiProviderId?: number + /** AI model ID (optional, uses default if not specified) */ + aiModelId?: string +} + +/** Result of creating an ephemeral prompt */ +export interface EphemeralPromptResult { + /** The created prompt's ID */ + promptId: number + /** The prompt's unique name */ + promptName: string +} + +/** Crowdin API response for prompt creation */ +interface CrowdinCreatePromptResponse { + data: PromptResource +} + +/** Prefix for all ephemeral prompt names */ +const EPHEMERAL_PREFIX = "eth-org" + +/** Crowdin action type for pre-translation prompts */ +const PRE_TRANSLATE_ACTION = "pre_translate" + +/** + * Generate a unique name for an ephemeral prompt + */ +export function generateEphemeralPromptName( + languageCode: string, + promptKey: string +): string { + const timestamp = Math.floor(Date.now() / 1000) + return `${EPHEMERAL_PREFIX}-${languageCode}-${promptKey}-${timestamp}` +} + +/** + * Create an ephemeral AI prompt in Crowdin + * + * Uses Crowdin API v2: POST /users/{userId}/ai/prompts + */ +export async function createEphemeralPrompt( + params: CreateEphemeralPromptParams +): Promise { + const { + userId, + languageCode, + promptKey, + promptText, + aiProviderId, + aiModelId, + } = params + + const promptName = generateEphemeralPromptName(languageCode, promptKey) + console.log(`[EPHEMERAL-PROMPT] Creating prompt: 
${promptName}`) + + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts` + + const body: Record = { + name: promptName, + action: PRE_TRANSLATE_ACTION, + config: { + mode: "advanced", + prompt: promptText, + glossaryTerms: true, + tmSuggestions: true, + }, + } + + if (aiProviderId !== undefined) { + body.aiProviderId = aiProviderId + } + if (aiModelId !== undefined) { + body.aiModelId = aiModelId + } + + const response = await fetch(url, { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }) + + if (!response.ok) { + const text = await response.text().catch(() => "") + throw new Error( + `Failed to create ephemeral prompt "${promptName}" (${response.status}): ${text}` + ) + } + + const json = (await response.json()) as CrowdinCreatePromptResponse + const promptId = json.data.id + + console.log( + `[EPHEMERAL-PROMPT] Created prompt: ${promptName} (ID: ${promptId})` + ) + return { promptId, promptName } +} + +/** + * Delete an ephemeral AI prompt from Crowdin + */ +export async function deleteEphemeralPrompt( + userId: number, + promptId: number +): Promise { + console.log(`[EPHEMERAL-PROMPT] Deleting prompt ID: ${promptId}`) + + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` + + const response = await fetch(url, { + method: "DELETE", + headers: crowdinBearerHeaders, + }) + + // 204 No Content is success, 404 is also acceptable (already deleted) + if (!response.ok && response.status !== 404) { + const text = await response.text().catch(() => "") + throw new Error( + `Failed to delete ephemeral prompt ${promptId} (${response.status}): ${text}` + ) + } + + console.log(`[EPHEMERAL-PROMPT] Deleted prompt ID: ${promptId}`) +} diff --git a/src/scripts/i18n/lib/crowdin/files.ts b/src/scripts/i18n/lib/crowdin/files.ts new file mode 100644 index 00000000000..e0096d6c350 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/files.ts @@ -0,0 
+1,396 @@ +// Crowdin file operations + +import { + config, + CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" +import type { + CrowdinAddFileResponse, + CrowdinFileData, + GitHubCrowdinFileMetadata, +} from "../types" +import { debugLog } from "../workflows/utils" + +/** + * JSX component attributes that should be translated in markdown files. + * These contain human-readable strings, as opposed to technical attributes + * like emoji, eventCategory, href, etc. + * + * Note: Crowdin's PATCH API only accepts a boolean flag (translateAttributes: true) + * to enable attribute translation. The actual whitelist may need to be configured + * separately via the Crowdin UI or a different API endpoint. + * + * See TRANSLATABLE_ATTRIBUTES in jsx-attributes/types.ts for the canonical list. + */ + +/** + * Get all files in the Crowdin project + */ +export const getCrowdinProjectFiles = async (): Promise<CrowdinFileData[]> => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` + ) + url.searchParams.set("limit", "500") + + debugLog(`Fetching Crowdin project files from: ${url.toString()}`) + + try { + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error( + `Crowdin getCrowdinProjectFiles failed (${res.status}): ${body}` + ) + } + + type JsonResponse = { data: { data: CrowdinFileData }[] } + const json: JsonResponse = await res.json() + + const mappedData = json.data.map(({ data }) => data) + debugLog(`Successfully fetched ${mappedData.length} Crowdin files`) + return mappedData + } catch (error) { + console.error(`[ERROR] Failed to fetch Crowdin project files:`, error) + process.exit(1) + } +} + +/** + * Find a Crowdin file matching a GitHub file. + * Returns null if file not found (indicating it's new and needs to be uploaded). 
+ */ +export const findCrowdinFile = ( + targetFile: GitHubCrowdinFileMetadata, + crowdinFiles: CrowdinFileData[] +): CrowdinFileData | null => { + debugLog(`Looking for Crowdin file matching: ${targetFile.filePath}`) + + const found = crowdinFiles.find(({ path }) => + path.endsWith(targetFile.filePath) + ) + + if (!found) { + // Not an error - file is new and will be uploaded + console.log( + `[INFO] File not in Crowdin (will upload): ${targetFile.filePath}` + ) + return null + } + + debugLog( + `Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` + ) + return found +} + +/** + * Unhides all hidden strings in a Crowdin file. + * Hidden strings (often marked as duplicates) cannot be translated. + * This function makes them visible so they can be processed by pre-translation. + */ +export const unhideStringsInFile = async (fileId: number): Promise<number> => { + debugLog(`Checking for hidden strings in fileId=${fileId}`) + + // Get all strings from the file + const listUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings?fileId=${fileId}&limit=500` + + try { + const listRes = await fetch(listUrl, { headers: crowdinBearerHeaders }) + if (!listRes.ok) { + const text = await listRes.text().catch(() => "") + console.warn( + `[UNHIDE] Failed to list strings for fileId=${fileId}: ${text}` + ) + return 0 + } + + const listJson = await listRes.json() + const strings = listJson.data || [] + + let unhiddenCount = 0 + + for (const item of strings) { + const stringId = item.data.id + const isHidden = item.data.isHidden + + if (!isHidden) continue + + // Unhide the string using PATCH + const patchUrl = `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/strings/${stringId}` + + try { + const patchRes = await fetch(patchUrl, { + method: "PATCH", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify([ + { + op: "replace", + path: "/isHidden", + value: false, + }, + ]), + }) + + if (patchRes.ok) { + 
unhiddenCount++ + } else { + const text = await patchRes.text().catch(() => "") + console.warn(`[UNHIDE] Failed to unhide string ${stringId}: ${text}`) + } + } catch (err) { + console.warn(`[UNHIDE] Error unhiding string ${stringId}:`, err) + } + } + + if (unhiddenCount > 0) { + console.log( + `[UNHIDE] ✓ Unhidden ${unhiddenCount} strings in fileId=${fileId}` + ) + } + + return unhiddenCount + } catch (error) { + console.error(`[UNHIDE] Error processing fileId=${fileId}:`, error) + return 0 + } +} + +/** + * Lists all Crowdin directories in the project. + */ +export const getCrowdinProjectDirectories = async (): Promise< + { id: number; name: string; directoryId?: number }[] +> => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/directories` + ) + url.searchParams.set("limit", "500") + + debugLog(`Fetching Crowdin directories: ${url.toString()}`) + + try { + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error( + `Crowdin getCrowdinProjectDirectories failed (${res.status}): ${body}` + ) + } + type DirJson = { + data: { data: { id: number; name: string; directoryId?: number } }[] + } + const json: DirJson = await res.json() + const dirs = json.data.map(({ data }) => data) + debugLog(`Loaded ${dirs.length} directories`) + return dirs + } catch (error) { + console.error("[ERROR] getCrowdinProjectDirectories:", error) + throw error + } +} + +/** + * Creates a single Crowdin directory (one segment). Parent may be undefined for root. + */ +export const postCrowdinDirectory = async ( + name: string, + parentDirectoryId?: number +): Promise<number> => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/directories` + ) + + const body: Record<string, unknown> = { name } + if (parentDirectoryId) body.directoryId = parentDirectoryId + + debugLog( + `Creating directory segment "${name}" parent=${parentDirectoryId ?? 
"ROOT"}` + ) + + try { + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify(body), + }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + // 409 = already exists race condition + throw new Error( + `Crowdin postCrowdinDirectory failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { data: { id: number } } + const json: JsonResponse = await res.json() + debugLog(`Created directory id=${json.data.id} name="${name}"`) + return json.data.id + } catch (error) { + console.error("[ERROR] postCrowdinDirectory:", error) + throw error + } +} + +/** + * Ensures a nested path of directories exists. + * Example path: "public/content/community/events/organizing" + * Returns the final (deepest) directory id. + * + * - Splits path on "/" ignoring empty segments. + * - Reuses existing segments (matched by name + parent). + * - Creates missing segments sequentially. 
+ */ +export const createCrowdinDirectory = async ( + fullPath: string +): Promise<number> => { + if (!fullPath || typeof fullPath !== "string") { + throw new Error("createCrowdinDirectory: path must be a non-empty string") + } + debugLog(`Ensuring Crowdin directory path: "${fullPath}"`) + + const segments = fullPath + .split("/") + .map((s) => s.trim()) + .filter(Boolean) + if (!segments.length) throw new Error("No valid path segments") + + const invalidChars = /[\\:*?"<>|]/ // Disallowed per Crowdin docs for directory name (exclude forward slash which is path separator) + for (const segment of segments) { + if (invalidChars.test(segment)) { + throw new Error( + `createCrowdinDirectory: segment "${segment}" contains invalid characters in path "${fullPath}"` + ) + } + } + + // Load existing directories once + const existing = await getCrowdinProjectDirectories() + + // Build quick lookup: parentId|name -> id (root parentId = 0 sentinel) + const key = (parentId: number | undefined, name: string) => + `${parentId || 0}|${name}` + + const directoryIndex = new Map<string, number>() + for (const dir of existing) { + directoryIndex.set(key(dir.directoryId, dir.name), dir.id) + } + + let currentParentId: number | undefined + for (const segment of segments) { + const k = key(currentParentId, segment) + let dirId = directoryIndex.get(k) + if (dirId) { + debugLog( + `Reusing existing directory "${segment}" id=${dirId} parent=${currentParentId ?? 
"ROOT"}` + ) + currentParentId = dirId + continue + } + // Create + dirId = await postCrowdinDirectory(segment, currentParentId) + directoryIndex.set(k, dirId) + currentParentId = dirId + } + + if (!currentParentId) + throw new Error("Failed to resolve final directory id (unexpected)") + + debugLog(`Final directory id for path "${fullPath}" = ${currentParentId}`) + return currentParentId +} + +/** + * Upload a file to Crowdin storage + */ +export const postFileToStorage = async ( + fileBuffer: Buffer, + fileName: string +) => { + const url = new URL(`${CROWDIN_API_BASE_URL}/storages`) + + try { + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + // Crowdin expects raw bytes for storages endpoint; use octet-stream. + "Content-Type": "application/octet-stream", + "Crowdin-API-FileName": fileName, + }, + body: fileBuffer, + }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error( + `Crowdin postFileToStorage failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { + data: { + id: number + fileName: string + } + } + const json: JsonResponse = await res.json() + return json.data + } catch (error) { + console.error("postFileToStorage error:", error) + throw error + } +} + +/** + * Add a file to Crowdin project + */ +export const postCrowdinFile = async ( + storageId: number, + name: string, + dir: string +): Promise<CrowdinAddFileResponse> => { + const directoryId = await createCrowdinDirectory(dir) + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/files` + ) + + const requestBody = { + storageId, + name, + directoryId, + } + + // Create the file (errors propagate to caller for graceful handling) + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + Accept: "application/json", + }, + body: JSON.stringify(requestBody), + }) + + if (!res.ok) { + const body = await res.text().catch(() => 
"") + throw new Error(`Crowdin postCrowdinFile failed (${res.status}): ${body}`) + } + + type JsonResponse = { data: CrowdinAddFileResponse } + const json: JsonResponse = await res.json() + console.log("Created file:", json.data) + + // Note: parser options are managed in Crowdin UI. No PATCH here. + + return json.data +} diff --git a/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt new file mode 100644 index 00000000000..d35cf850695 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt @@ -0,0 +1,71 @@ +You are a professional translator with native-level fluency in both English and all the target languages %targetLanguages% in the project, and expertise in Ethereum, blockchain, cryptocurrency, and decentralized technologies. +You have deep familiarity with open-source communities and technical documentation, enabling you to handle domain-specific terminology accurately. Your task is to produce high-quality translations of ethereum.org content from English into the target language, following the guidelines below. +The source content is content from the ethereum.org website, segmented in the source files as %strings%. +Translate content from English into the target language %targetLanguage% specified for each individual pre-translation project run. +The target language is automatically defined in the project configuration and the pre-translation process — never guess or switch languages. +Always output translations in the target language only. + +CRITICAL DO-NOT-BREAK RULES (must follow exactly): +- JSON escaping: When translating JSON files, ALL double quotes (") inside string values MUST be escaped as \" to maintain valid JSON. Similarly, escape backslashes (\) as \\, newlines as \n, tabs as \t. The output MUST be parseable JSON—invalid JSON will break the build. Example: translate "Learn about "Ethereum"" as "Aprenda sobre \"Ethereum\"", NOT "Aprenda sobre "Ethereum"". 
+- Custom header IDs: If a Markdown heading includes a custom anchor like `{#custom-id}`, the ID MUST remain identical to the English source, ASCII-only (no accents or special characters). Do NOT alter, translate, add, or remove braces. Keep the exact ID string. +- HTML/MDX tag line placement: If an opening HTML tag appears on its own line, the matching closing tag MUST also be on its own line. Preserve line breaks around paired block-level tags. +- JSX/MDX attributes: Translate human-readable text found inside attribute values (e.g., `title="..."`, `aria-label="..."`, `alt="..."`) while preserving placeholders, variables, and code. Do NOT translate attribute names or change quoting/escaping. +- Protected names: Do NOT translate obvious proper names, brands, or team names. This includes programming languages (e.g., "Solidity", "Vyper"), company/product names (e.g., "Alchemy", "Infura", "MetaMask", "Consensys", "Chainlink", "Uniswap", "OpenSea", "OpenZeppelin"), protocol/network names (e.g., "Ethereum", "ETH"), and tools/platforms (e.g., "GitHub", "Crowdin", "ethereum.org"). Leave these as in the source unless a community-approved localized form exists. IMPORTANT: Even when a word has a common translation in the target language (e.g., "Alchemy" meaning the historical practice, or "Solidity" meaning firmness), keep the English term when it refers to a brand, product, or technology name. + - URL/path destinations MUST be preserved character-for-character: keep exact case, hyphens, slashes, fragments (`#...`), and query parameters (`?...`). Do NOT change, normalize, or localize any part of a link destination. This rule also applies to any links contained within JSON string values used in React/MDX pages. + +Maintain Clarity and Professionalism: Ensure the translated text is clear, accurate, and professional in tone, just like the source. 
Match the tone and register of the English content – if the source is explanatory and formal, the translation should mirror that style. Remember that Ethereum’s content serves both experts and complete beginners, so the translation should be accessible to technical and non-technical readers alike. +Consistency with Source Tone: Use a tone that is neither too casual nor overly stiff, unless the source text itself has a specific tone. For example, if the English text uses a friendly and encouraging tone, reflect that in the translation while maintaining professionalism. +Formal Address: In languages that have formal and informal address forms, use the formal form to address the reader. This ensures the content remains respectful and appropriate for all users, and often helps maintain gender-neutrality. Only use an informal tone if the English source explicitly does so. +Idioms and Cultural Nuances: If the source uses idiomatic expressions or culturally specific references, preserve their intent. Replace an idiom only with an equivalent well-understood expression; otherwise keep a direct translation that preserves meaning. + +Certain elements of the source text must be handled with special care during translation: + +Technical Terms: Do not translate highly specific blockchain terms such as "smart contract", "gas", "dapp", or other Ethereum jargon unless there is a widely accepted equivalent. When in doubt, leave the term in English. +Code, Commands, and Output: Retain code snippets, configuration commands, outputs, function names, and anything in backticks or code blocks exactly. Do not translate placeholders (e.g. {value}, %s, <0>...), variables, or braces. Translate English comments inside code (e.g., lines or blocks starting with //, #, or /* ... */) while leaving all code tokens unchanged. +URLs, File Paths, and Domain Names: Never translate or alter these. Preserve exactly, including case and slashes. 
+Markdown, HTML, and JSX/MDX Syntax: Preserve all formatting symbols, tags, and structure. Do not add/remove markers. Keep tag order identical. Translate only human-readable text outside tags. +Punctuation in Code/Text: Do not alter punctuation that is part of code/syntax (e.g., {}, <>, (), []). + +Match Source Capitalization: Preserve capitalization of terms, acronyms, proper nouns (e.g., "Ethereum", "Solidity", "NFT"). Maintain ALL CAPS where used. +Follow Target Language Conventions: Apply normal punctuation/grammar rules of the target language except where code syntax would break. +Sentence Structure: Reorder or split/join sentences only to achieve natural grammar; avoid ambiguity changes. +End Punctuation: Mirror source intent; headings without periods usually remain without periods. + +Use Consistent Terminology: Reuse prior translations for repeated terms unless context demands a change. +Ethereum Glossary and Termbase: If provided, follow those preferred translations strictly. +External Translation Memory (TM): Use exact matches from TM if context fits. +No Glossary or TM? Pick a clear translation and keep it consistent thereafter. + +(If translation memory/termbase resources are available to Crowdin AI, they should be applied to maintain consistency.) + +Preserve Tags and Placeholders: Keep tags/placeholders exactly ordered. Do not duplicate, omit, or reorder them. +Do Not Break Variables: Leave placeholders such as {userName} unchanged; adapt surrounding punctuation only if required. +Avoid Tag Duplication or Omission: Every opening tag must have its closing counterpart. Never remove tags. +Maintain Markdown Structure: Lists, tables, headings remain structurally identical. Custom IDs stay identical to English. +Line Breaks and Whitespace: Avoid introducing/removing line breaks. Keep opening/closing block HTML tags on their own lines when the source does. 
+ +Consistency Reminders (non-strict, but preferred): +- Headings: Keep section and subsection heading choices consistent with the English source across the document. If the source uses a particular heading term (e.g., "Overview", "Examples", "Resources"), choose a single clear localized equivalent and reuse it throughout the page. +- Example Arrays and Lists: When the source contains example items (lists of technologies, wallets, tools, etc.), translate common nouns/adjectives to the target language. Retain English only for proper names and brands. Do not revert entire lists to English unless items are proper nouns. +- Stable Canonical Terms: Prefer previously used localized headings/labels for recurring sections when known (e.g., consistent translation for "Learn", "Developers"). If unsure, pick the most natural single term and stick to it within the page. + +Inclusive Language: Use gender-neutral constructions where possible. +Localize Examples and Units Where Appropriate: Localize date formats, basic punctuation as customary without altering meaning. Do not convert currencies. +Cultural References: Prefer clarity over forced local analogies. Keep original if unsure. +Avoid Slang and Colloquialisms: Maintain professional, accessible tone. + +Untranslatable Strings: Keep product names, trademarks, protocol names, abbreviations ("ETH", "NFT", "HTML", "PoW", "PoS", "EIP-1559") unless widely accepted localized form exists. +Placeholders and Dummy Text: Do not translate placeholder tokens or dummy values (e.g., "Lorem ipsum", "user@example.com"). +Flagging Issues: If a string is ambiguous and unsafe to translate confidently, produce a literal translation or leave it for review (do not guess). When consistency conflicts arise (e.g., competing heading variants), prefer the most widely used term in the target language or the termbase entry if available. 
+No Guessing for Missing Context: Choose neutral wording when context is unclear; retain English term where ambiguity could mislead. + +Preserve Intended Meaning: Prioritize accurate meaning over literal wording. +Literal vs. Free Translation: Avoid overly literal output if unnatural; adjust for clarity. +Clarity Over Literalness: Prefer clear, idiomatic phrasing that conveys the source meaning. +Avoid Adding Extra Information: Do not introduce new content or explanations. + +High-Quality Output: Output should need minimal post-editing: correct spelling, grammar, style. +Avoiding Errors: Do not omit content or invert meaning; retain numeric values precisely. +Consistency in Style: Maintain uniform formal, explanatory tone throughout. +Minimal Creativity: Rephrase only when necessary for clarity/grammar. +Post-Editing Ready: Deliver translation suitable for quick human approval. \ No newline at end of file diff --git a/src/scripts/i18n/lib/crowdin/pre-translate.ts b/src/scripts/i18n/lib/crowdin/pre-translate.ts new file mode 100644 index 00000000000..69380ab9ca0 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/pre-translate.ts @@ -0,0 +1,173 @@ +// Crowdin pre-translation operations + +import { + config, + CROWDIN_API_BASE_URL, + crowdinBearerHeaders, +} from "../../config" +import type { CrowdinPreTranslateResponse } from "../types" +import { delay } from "../workflows/utils" + +/** + * Apply pre-translation to files + */ +export const postApplyPreTranslation = async ( + fileIds: number[], + languageIds?: string[], + aiPromptIdOverride?: number +): Promise<CrowdinPreTranslateResponse> => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/pre-translations` + ) + try { + const res = await fetch(url.toString(), { + method: "POST", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + languageIds: languageIds || config.allCrowdinCodes, + fileIds, + method: "ai", + aiPromptId: + typeof aiPromptIdOverride === 
"number" + ? aiPromptIdOverride + : config.preTranslatePromptId, + }), + }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error( + `Crowdin postApplyPreTranslation failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { + data: CrowdinPreTranslateResponse + } + const json: JsonResponse = await res.json() + + return json.data + } catch (error) { + console.error("postApplyPreTranslation error:", error) + throw error + } +} + +/** + * Get pre-translation status + */ +export const getPreTranslationStatus = async ( + preTranslationId: string +): Promise<CrowdinPreTranslateResponse> => { + const url = new URL( + `${CROWDIN_API_BASE_URL}/projects/${config.projectId}/pre-translations/${preTranslationId}` + ) + try { + const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) + + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error( + `Crowdin getPreTranslationStatus failed (${res.status}): ${text}` + ) + } + + type JsonResponse = { + data: CrowdinPreTranslateResponse + } + const json: JsonResponse = await res.json() + + return json.data + } catch (error) { + console.error("getPreTranslationStatus error:", error) + throw error + } +} + +/** + * Polls Crowdin for the status of a pre-translation job and resolves when it finishes. + * + * This function repeatedly calls `getPreTranslationStatus` for the given + * pre-translation ID until the job is no longer in progress. It uses adaptive + * polling intervals based on elapsed time and will abort with an error if the operation + * does not complete within the configured timeout. + * + * @param preTranslationId - The identifier of the Crowdin pre-translation job to monitor. + * @param opts - Optional configuration for timeout and base polling interval + * + * @returns A promise that resolves with the final CrowdinPreTranslateResponse when the + * job status becomes "finished". 
+ * + * @throws {Error} If the wait times out + * @throws {Error} If the pre-translation completes with an unexpected status + * @throws {Error} If an error is thrown while fetching the pre-translation status + */ +export const awaitPreTranslationCompleted = async ( + preTranslationId: string, + opts?: { timeoutMs?: number; baseIntervalMs?: number } +): Promise<CrowdinPreTranslateResponse> => { + const timeoutMs = opts?.timeoutMs ?? config.pretranslateTimeoutMs + const baseInterval = opts?.baseIntervalMs ?? config.pretranslatePollBaseMs + const start = Date.now() + let attempt = 0 + + const computeInterval = (elapsedMs: number): number => { + const minutes = elapsedMs / 60000 + if (minutes < 10) return baseInterval + if (minutes < 30) return Math.max(baseInterval * 2, 60_000) + if (minutes < 60) return Math.max(baseInterval * 4, 180_000) + return Math.max(baseInterval * 10, 300_000) // at least 5 min + } + + // Bounded loop: terminates once elapsed exceeds timeoutMs + while (Date.now() - start <= timeoutMs) { + const elapsed = Date.now() - start + attempt++ + let res: CrowdinPreTranslateResponse + try { + res = await getPreTranslationStatus(preTranslationId) + } catch (e) { + // transient fetch errors: log + continue within timeout window + const nextWait = computeInterval(elapsed) + console.warn( + `[PRE-TRANSLATE][POLL] Error on attempt ${attempt}: ${(e as Error).message}. 
Retrying in ${nextWait}ms.` + ) + await delay(nextWait) + continue + } + // "created" means job is queued (e.g., another large job is running) + // "in_progress" means job is actively translating + // Both are valid states to keep polling + if (res.status !== "in_progress" && res.status !== "created") { + if (res.status === "finished") { + console.log( + `[PRE-TRANSLATE][POLL] Completed after ${attempt} attempts; elapsed ${Math.round( + (Date.now() - start) / 60000 + )}m.` + ) + return res + } + throw new Error( + `Pre-translation ended with unexpected status: ${res.status}` + ) + } + const nextWait = computeInterval(elapsed) + const progressPct = res.progress ?? 0 + const statusNote = res.status === "created" ? " (queued)" : "" + console.log( + `[PRE-TRANSLATE][POLL] attempt=${attempt} status=${res.status}${statusNote} progress=${progressPct}% elapsed=${Math.round( + elapsed / 60000 + )}m nextWait=${nextWait}ms` + ) + await delay(nextWait) + } + const finalElapsed = Date.now() - start + throw new Error( + `Timed out waiting for pre-translation (elapsed ${Math.round( + finalElapsed / 60000 + )}m)` + ) +} diff --git a/src/scripts/i18n/lib/crowdin/prompt.ts b/src/scripts/i18n/lib/crowdin/prompt.ts new file mode 100644 index 00000000000..7f0c43a923e --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/prompt.ts @@ -0,0 +1,72 @@ +import * as fs from "fs" + +import { crowdinBearerHeaders } from "../../config" + +/** Crowdin AI prompt resource type */ +export type PromptResource = { + id: number + name: string + action: string + aiProviderId?: number | null + aiModelId?: string | null + model?: string | null + version?: string | null +} + +/** + * Get information about a Crowdin AI prompt including the model being used. 
+ * Uses Crowdin API v2: GET /users/{userId}/ai/prompts/{promptId} + */ +export async function getPromptInfo( + userId: number, + promptId: number +): Promise<PromptResource> { + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` + const resp = await fetch(url, { + method: "GET", + headers: crowdinBearerHeaders, + }) + if (!resp.ok) { + const text = await resp.text().catch(() => "") + throw new Error(`Failed to get prompt info (${resp.status}): ${text}`) + } + const json = await resp.json() + return json.data as PromptResource +} + +/** + * Update a Crowdin AI prompt's content from a local file. + * Uses Crowdin API v2: PATCH /users/{userId}/ai/prompts/{promptId} + */ +export async function updatePromptFromFile( + userId: number, + promptId: number, + filePath: string +): Promise<void> { + const content = await fs.promises.readFile(filePath, "utf8") + await updatePromptContent(userId, promptId, content) +} + +/** + * Update a Crowdin AI prompt with provided content. + * Uses Crowdin API v2: PATCH /users/{userId}/ai/prompts/{promptId} + */ +export async function updatePromptContent( + userId: number, + promptId: number, + content: string +): Promise<void> { + const url = `https://api.crowdin.com/api/v2/users/${userId}/ai/prompts/${promptId}` + const resp = await fetch(url, { + method: "PATCH", + headers: { + ...crowdinBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ content }), + }) + if (!resp.ok) { + const text = await resp.text().catch(() => "") + throw new Error(`Failed to update prompt (${resp.status}): ${text}`) + } +} diff --git a/src/scripts/i18n/lib/crowdin/user.ts b/src/scripts/i18n/lib/crowdin/user.ts new file mode 100644 index 00000000000..954630b6e40 --- /dev/null +++ b/src/scripts/i18n/lib/crowdin/user.ts @@ -0,0 +1,41 @@ +import { crowdinBearerHeaders } from "../../config" + +interface CrowdinUser { + id: number + username: string + email: string + emailVerified: boolean + fullName: string + avatarUrl: string + 
createdAt: string + lastSeen: string + twoFactor: string + timezone: string +} + +interface CrowdinUserResponse { + data: CrowdinUser +} + +/** + * Get the authenticated Crowdin user's information + * @returns The authenticated user's data + */ +export async function getCurrentUser(): Promise<CrowdinUser> { + const url = "https://api.crowdin.com/api/v2/user" + + const response = await fetch(url, { + method: "GET", + headers: crowdinBearerHeaders, + }) + + if (!response.ok) { + const text = await response.text().catch(() => "") + throw new Error( + `Failed to fetch current user (${response.status}): ${text}` + ) + } + + const json = (await response.json()) as CrowdinUserResponse + return json.data +} diff --git a/src/scripts/i18n/lib/github/branches.ts b/src/scripts/i18n/lib/github/branches.ts new file mode 100644 index 00000000000..e523a14f846 --- /dev/null +++ b/src/scripts/i18n/lib/github/branches.ts @@ -0,0 +1,88 @@ +// GitHub branch operations + +import { config, gitHubBearerHeaders } from "../../config" +import type { BranchDetailsResponse, BranchObject } from "../types" +import { fetchWithRetry } from "../utils/fetch" +import { debugLog } from "../workflows/utils" + +/** + * Retrieves the Git object for a branch from the GitHub API + * + * @param branch - The branch name to look up (e.g., "main" or "dev") + * @returns A promise that resolves to the BranchObject containing sha, type, and url + */ +export const getBranchObject = async ( + branch: string +): Promise<BranchObject> => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/git/ref/heads/${branch}` + ) + + const res = await fetchWithRetry(url.toString(), { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error(`GitHub getBranchObject (${res.status}): ${body}`) + } + + type JsonResponse = BranchDetailsResponse + const json: JsonResponse = await res.json() + return json.object +} + +/** + * 
Generate a branch name based on current timestamp + */ +export const createBranchName = (suffix?: string) => { + const ts = new Date().toISOString().replace(/\..*$/, "").replace(/[:]/g, "-") + return "i18n/import/" + ts + (suffix ? `-${suffix}` : "") +} + +/** + * Create a new branch from a base branch + * + * @param ref - The base branch reference (defaults to config.baseBranch) + * @returns Object containing the new branch name and SHA + */ +export const postCreateBranchFrom = async ( + ref = config.baseBranch, + suffix?: string +) => { + const { sha } = await getBranchObject(ref) + const branch = createBranchName(suffix) + + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/git/refs` + ) + + try { + debugLog( + `Creating branch from base="${ref}" sha=${sha} -> new branch="${branch}"` + ) + const res = await fetchWithRetry(url.toString(), { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ ref: `refs/heads/${branch}`, sha }), + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + console.error( + `[ERROR] Failed to create branch. 
URL=${url.toString()} status=${res.status}` + ) + throw new Error(`GitHub createBranchFrom (${res.status}): ${body}`) + } + + return { branch, sha } + } catch (error) { + console.error(error) + process.exit(1) + } +} diff --git a/src/scripts/i18n/lib/github/commits.ts b/src/scripts/i18n/lib/github/commits.ts new file mode 100644 index 00000000000..3a351185ae9 --- /dev/null +++ b/src/scripts/i18n/lib/github/commits.ts @@ -0,0 +1,313 @@ +// GitHub commit operations + +import { config, gitHubBearerHeaders } from "../../config" +import { fetchWithRetry } from "../utils/fetch" +import { debugLog, delay } from "../workflows/utils" + +/** File to be committed in a batch */ +export interface BatchFile { + path: string + content: Buffer +} + +/** + * Commit multiple files in a single commit using GitHub's Git Data API. + * This avoids creating one commit per file. + * + * @param files - Array of files to commit + * @param branch - Target branch name + * @param message - Commit message + */ +export async function batchCommitFiles( + files: BatchFile[], + branch: string, + message: string +): Promise { + if (files.length === 0) { + debugLog("batchCommitFiles: No files to commit, skipping") + return + } + + const baseUrl = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}` + + // 1. Get current branch ref + const refRes = await fetchWithRetry(`${baseUrl}/git/ref/heads/${branch}`, { + headers: gitHubBearerHeaders, + }) + if (!refRes.ok) { + const body = await refRes.text().catch(() => "") + throw new Error(`Failed to get branch ref (${refRes.status}): ${body}`) + } + const refData: { object: { sha: string } } = await refRes.json() + const latestCommitSha = refData.object.sha + + // 2. 
Get the commit to find base tree + const commitRes = await fetchWithRetry( + `${baseUrl}/git/commits/${latestCommitSha}`, + { headers: gitHubBearerHeaders } + ) + if (!commitRes.ok) { + const body = await commitRes.text().catch(() => "") + throw new Error(`Failed to get commit (${commitRes.status}): ${body}`) + } + const commitData: { tree: { sha: string } } = await commitRes.json() + const baseTreeSha = commitData.tree.sha + + // 3. Create blobs for each file + // Add delay between requests to avoid hitting GitHub's secondary rate limits + const BLOB_CREATION_DELAY_MS = 200 // 200ms between blob creations + const treeItems: { path: string; mode: string; type: string; sha: string }[] = + [] + + for (let i = 0; i < files.length; i++) { + const file = files[i] + + // Add delay before each request (except the first one) + if (i > 0) { + await delay(BLOB_CREATION_DELAY_MS) + } + + const blobRes = await fetchWithRetry(`${baseUrl}/git/blobs`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + content: file.content.toString("base64"), + encoding: "base64", + }), + }) + if (!blobRes.ok) { + const body = await blobRes.text().catch(() => "") + throw new Error( + `Failed to create blob for ${file.path} (${blobRes.status}): ${body}` + ) + } + const blobData: { sha: string } = await blobRes.json() + treeItems.push({ + path: file.path, + mode: "100644", + type: "blob", + sha: blobData.sha, + }) + + // Log progress for large batches + if (files.length > 10 && (i + 1) % 10 === 0) { + debugLog(`Created ${i + 1}/${files.length} blobs...`) + } + } + + // 4. 
Create new tree + const treeRes = await fetchWithRetry(`${baseUrl}/git/trees`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + base_tree: baseTreeSha, + tree: treeItems, + }), + }) + if (!treeRes.ok) { + const body = await treeRes.text().catch(() => "") + throw new Error(`Failed to create tree (${treeRes.status}): ${body}`) + } + const treeData: { sha: string } = await treeRes.json() + + // 5. Create commit + const newCommitRes = await fetchWithRetry(`${baseUrl}/git/commits`, { + method: "POST", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ + message, + tree: treeData.sha, + parents: [latestCommitSha], + }), + }) + if (!newCommitRes.ok) { + const body = await newCommitRes.text().catch(() => "") + throw new Error(`Failed to create commit (${newCommitRes.status}): ${body}`) + } + const newCommitData: { sha: string } = await newCommitRes.json() + + // 6. Update branch ref + const updateRefRes = await fetchWithRetry( + `${baseUrl}/git/refs/heads/${branch}`, + { + method: "PATCH", + headers: { ...gitHubBearerHeaders, "Content-Type": "application/json" }, + body: JSON.stringify({ sha: newCommitData.sha }), + } + ) + if (!updateRefRes.ok) { + const body = await updateRefRes.text().catch(() => "") + throw new Error(`Failed to update ref (${updateRefRes.status}): ${body}`) + } + + debugLog( + `batchCommitFiles: Committed ${files.length} files in single commit ${newCommitData.sha}` + ) +} + +/** + * Get the destination path for a translated file + * + * @param crowdinFilePath - The Crowdin file path (e.g., src/intl/en/page-foo.json) + * @param internalLanguageCode - The internal language code + * @returns The destination path in the repository + */ +export const getDestinationFromPath = ( + crowdinFilePath: string, + internalLanguageCode: string +) => { + const normalized = crowdinFilePath.replace(/^\//, "") + const isJson = 
normalized.toLowerCase().endsWith(".json") + const isMarkdown = normalized.toLowerCase().endsWith(".md") + + let destinationPath = normalized + + if (isJson) { + // JSON: src/intl/en/*.json -> src/intl//*.json + if (normalized.startsWith("src/intl/en/")) { + destinationPath = normalized.replace( + /^src\/intl\/en\//, + `src/intl/${internalLanguageCode}/` + ) + } else if (normalized.startsWith("src/intl/")) { + // Fallback: if for some reason "en" segment is missing, inject lang after src/intl/ + const parts = normalized.split("/") + // parts: [src, intl, ...] + parts.splice(2, 0, internalLanguageCode) + destinationPath = parts.join("/") + } + } else if (isMarkdown) { + // Markdown: public/content//index.md -> public/content/translations///index.md + if (normalized.startsWith("public/content/")) { + const rel = normalized.replace(/^public\/content\//, "") + // If already inside translations/, avoid duplicating; rewrite to current lang + const relParts = rel.split("/").filter(Boolean) + if (relParts[0] === "translations") { + // Drop existing translations// + const rest = relParts.slice(2).join("/") + destinationPath = `public/content/translations/${internalLanguageCode}/${rest}` + } else { + destinationPath = `public/content/translations/${internalLanguageCode}/${rel}` + } + } + } + + debugLog( + `Destination mapping: ${crowdinFilePath} -> ${destinationPath} (lang=${internalLanguageCode})` + ) + return destinationPath +} + +/** + * Get the SHA of a file at a specific path + * + * @param path - The file path in the repository + * @param branch - The branch name + * @returns Object containing the file SHA + */ +export const getPathSha = async (path: string, branch: string) => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${path}?ref=${branch}` + ) + + const res = await fetchWithRetry(url.toString(), { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await 
res.text().catch(() => "") + throw new Error(`GitHub getPathSha (${res.status}): ${body}`) + } + + type JsonResponse = { sha: string } + const { sha }: JsonResponse = await res.json() + + return { sha } +} + +/** + * Commit a file to a GitHub branch with retry logic for conflicts + * + * @param buffer - The file contents as a Buffer + * @param destinationPath - The path in the repository + * @param branch - The branch name + * @param sha - Optional SHA for updating existing files + * @param attempt - Current retry attempt number + */ +export const putCommitFile = async ( + buffer: Buffer, + destinationPath: string, + branch: string, + sha?: string, + attempt = 0 +): Promise => { + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${destinationPath}` + + try { + // Use the buffer contents as base64-encoded content for the commit + const contentBase64 = buffer.toString("base64") + + const body = { + message: `update(i18n): ${destinationPath}`, + content: contentBase64, + branch, + } + + if (sha) body["sha"] = sha + + const res = await fetchWithRetry(url.toString(), { + method: "PUT", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }) + + if (res.status === 422) { + const { sha: fileSha } = await getPathSha(destinationPath, branch) + console.warn( + `[RETRY] 422 Unprocessable for ${destinationPath}. Retrying with existing SHA ${fileSha}` + ) + return await putCommitFile( + buffer, + destinationPath, + branch, + fileSha, + attempt + ) + } + + if (res.status === 409) { + if (attempt >= 5) { + const bodyText = await res.text().catch(() => "") + throw new Error( + `GitHub putCommitFile conflict persists after ${attempt} retries (${res.status}): ${bodyText}` + ) + } + const backoff = 500 * Math.pow(2, attempt) // 500ms, 1s, 2s, 4s, 8s + console.warn( + `[RETRY] 409 Conflict for ${destinationPath}. Attempt ${attempt + 1}. 
Waiting ${backoff}ms before retry.` + ) + await delay(backoff) + const { sha: latestSha } = await getPathSha(destinationPath, branch) + return await putCommitFile( + buffer, + destinationPath, + branch, + latestSha, + attempt + 1 + ) + } + + if (!res.ok) { + console.warn("Res not OK") + const body = await res.text().catch(() => "") + throw new Error(`GitHub putCommitFile (${res.status}): ${body}`) + } + } catch (error) { + console.error(error) + process.exit(1) + } +} diff --git a/src/scripts/i18n/lib/github/files.ts b/src/scripts/i18n/lib/github/files.ts new file mode 100644 index 00000000000..64ee90c933b --- /dev/null +++ b/src/scripts/i18n/lib/github/files.ts @@ -0,0 +1,257 @@ +// GitHub file operations + +import { config, doNotTranslatePaths, gitHubBearerHeaders } from "../../config" +import type { + ContentType, + GitHubCrowdinFileMetadata, + GitHubQueryResponseItem, +} from "../types" +import { fetchWithRetry } from "../utils/fetch" +import { debugLog } from "../workflows/utils" + +/** + * Check if a path should be excluded + */ +function isPathExcluded(filePath: string, excludedPaths: string[]): boolean { + return excludedPaths.some((excluded) => filePath.includes(excluded)) +} + +/** + * Check if a path is a file (has .md or .json extension) or directory + */ +function isFilePath(targetPath: string): boolean { + return targetPath.endsWith(".md") || targetPath.endsWith(".json") +} + +/** + * Get English files with optional file/directory filtering and excluded paths. + * If targetPath is a file (ends with .md or .json), returns only that file. + * If targetPath is a directory, returns all files recursively within that directory. + * Otherwise, returns all English content files. + */ +export const getAllEnglishFiles = async (): Promise< + GitHubQueryResponseItem[] +> => { + const { targetPath, excludePath } = config + + // Add runtime exclusion if specified + const allExcludedPaths = excludePath + ? 
[...doNotTranslatePaths, excludePath] + : doNotTranslatePaths + + debugLog( + `Do-not-translate paths loaded: ${doNotTranslatePaths.length} entries` + ) + if (excludePath) { + debugLog(`Runtime path exclusions: ${excludePath}`) + } + + // Determine if targetPath is a file or directory + if (targetPath) { + if (isPathExcluded(targetPath, allExcludedPaths)) { + console.log(`[INFO] Path ${targetPath} is in excluded paths, skipping`) + return [] + } + + if (isFilePath(targetPath)) { + // Single file mode + console.log(`[INFO] Fetching single file: ${targetPath}`) + return await fetchSingleFile(targetPath) + } else { + // Directory mode + console.log(`[INFO] Fetching files from directory: ${targetPath}`) + } + } + + // Directory mode or full translation + const ghSearchEndpointBase = "https://api.github.com/search/code" + let query: string + + if (targetPath && !isFilePath(targetPath)) { + // Search within specific directory + query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${targetPath}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${targetPath}"` + } else { + // Search all content files + query = `repo:${config.ghOrganization}/${config.ghRepo} extension:md path:"${config.mdRoot}" -path:"${config.mdRoot}/translations" OR repo:${config.ghOrganization}/${config.ghRepo} extension:json path:"${config.jsonRoot}"` + if (!targetPath) { + console.log(`[INFO] Fetching all English content files`) + } + } + + debugLog(`GitHub search query: ${query}`) + + const perPage = 100 + const collected: GitHubQueryResponseItem[] = [] + + let page = 1 + let hasMorePages = true + while (hasMorePages) { + const url = new URL(ghSearchEndpointBase) + url.searchParams.set("q", query) + url.searchParams.set("per_page", perPage.toString()) + url.searchParams.set("page", page.toString()) + + debugLog(`Fetching search page ${page}...`) + + try { + const res = await fetchWithRetry(url.toString(), { + headers: 
gitHubBearerHeaders, + }) + + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) + } + + type JsonResponse = { items: GitHubQueryResponseItem[] } + const json: JsonResponse = await res.json() + + if (!json.items.length) { + debugLog(`No more results at page ${page}`) + hasMorePages = false + break + } + + collected.push(...json.items) + debugLog(`Collected ${collected.length} items so far`) + + page += 1 + if (page > 10) { + console.warn(`[WARN] Reached pagination safety cap at page ${page - 1}`) + hasMorePages = false + break + } + } catch (error) { + console.error(`[ERROR] Failed to get English files from GitHub:`, error) + process.exit(1) + } + } + + // Filter out excluded paths (static + runtime) + const filtered = collected.filter( + (item) => !isPathExcluded(item.path, allExcludedPaths) + ) + + const excludedCount = collected.length - filtered.length + if (excludedCount > 0) { + console.log(`[INFO] Filtered out ${excludedCount} excluded files`) + } + + console.log(`[INFO] Total files to translate: ${filtered.length}`) + + return filtered +} + +/** + * Fetch a single file by path from GitHub + */ +async function fetchSingleFile( + filePath: string +): Promise { + const url = `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/contents/${filePath}?ref=${config.baseBranch}` + + try { + const res = await fetchWithRetry(url, { + headers: gitHubBearerHeaders, + }) + + if (!res.ok) { + throw new Error(`Failed to fetch file ${filePath}: ${res.status}`) + } + + const data = await res.json() + + // Convert to GitHubQueryResponseItem format + return [ + { + name: data.name, + path: data.path, + sha: data.sha, + url: data.url, + git_url: data.git_url, + html_url: data.html_url, + repository: { + id: 0, + name: config.ghRepo, + full_name: `${config.ghOrganization}/${config.ghRepo}`, + owner: { + login: config.ghOrganization, + id: 0, + node_id: "", + avatar_url: "", + 
gravatar_id: "", + url: "", + html_url: "", + followers_url: "", + following_url: "", + gists_url: "", + starred_url: "", + subscriptions_url: "", + organizations_url: "", + repos_url: "", + events_url: "", + received_events_url: "", + type: "Organization", + user_view_type: "", + site_admin: false, + }, + } as GitHubQueryResponseItem["repository"], + score: 1, + }, + ] + } catch (error) { + console.error(`[ERROR] Failed to fetch single file ${filePath}:`, error) + throw error + } +} + +/** + * Convert GitHub items to Crowdin file metadata + */ +export const getFileMetadata = async ( + items: GitHubQueryResponseItem[] +): Promise => { + if (!items.length) return [] + + const owner = items[0].repository.owner.login + const repo = items[0].repository.name + + const englishFileMetadata = items.map((item) => { + // https://raw.githubusercontent.com/:owner/:repo/:ref/:path + const download_url = `https://raw.githubusercontent.com/${owner}/${repo}/${config.baseBranch}/${item.path}` + const filePath = item.path + const filePathSplit = filePath.split("/") + const fileName = filePathSplit[filePathSplit.length - 1] + const contentType: ContentType = fileName?.endsWith(".json") + ? 
"application/json" + : "text/markdown" + + return { + "Crowdin-API-FileName": fileName, + filePath: filePath, + download_url: download_url, + "Content-Type": contentType, + } + }) + return englishFileMetadata +} + +/** + * Download a file from GitHub + */ +export const downloadGitHubFile = async ( + download_url: string +): Promise => { + try { + const res = await fetch(download_url) + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`Failed to download from GitHub (${res.status}): ${body}`) + } + const arrayBuffer = await res.arrayBuffer() + return Buffer.from(arrayBuffer) + } catch (error) { + console.error("downloadGitHubFile error:", error) + throw error + } +} diff --git a/src/scripts/i18n/lib/github/pull-requests.ts b/src/scripts/i18n/lib/github/pull-requests.ts new file mode 100644 index 00000000000..18be0aa47d1 --- /dev/null +++ b/src/scripts/i18n/lib/github/pull-requests.ts @@ -0,0 +1,82 @@ +// GitHub pull request operations + +import { config, gitHubBearerHeaders } from "../../config" +import { fetchWithRetry } from "../utils/fetch" + +/** + * Create a pull request + * + * @param head - The head branch (source of changes) + * @param base - The base branch (target for merge, defaults to config.baseBranch) + * @param title - PR title + * @param bodyText - Optional PR description text + * @returns The created pull request object + */ +export const postPullRequest = async ( + head: string, + base = config.baseBranch, + title: string, + bodyText?: string +) => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/pulls` + ) + + const body = { + title, + head, + base, + body: bodyText || "Automated Crowdin translation import", + } + + const res = await fetchWithRetry(url.toString(), { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }) + + if (!res.ok) { + console.warn("Res not OK") + const body = await 
res.text().catch(() => "") + throw new Error(`Crowdin postPullRequest failed (${res.status}): ${body}`) + } + + const json = await res.json() + return json +} + +/** + * Post a comment on a pull request + * + * @param prNumber - The PR number + * @param commentBody - The comment body text + * @returns The created comment object + */ +export const postPullRequestComment = async ( + prNumber: number, + commentBody: string +) => { + const url = new URL( + `https://api.github.com/repos/${config.ghOrganization}/${config.ghRepo}/issues/${prNumber}/comments` + ) + + const res = await fetchWithRetry(url.toString(), { + method: "POST", + headers: { + ...gitHubBearerHeaders, + "Content-Type": "application/json", + }, + body: JSON.stringify({ body: commentBody }), + }) + + if (!res.ok) { + const body = await res.text().catch(() => "") + throw new Error(`Failed to post PR comment (${res.status}): ${body}`) + } + + const json = await res.json() + return json +} diff --git a/src/scripts/i18n/lib/jsx-attributes/extract.ts b/src/scripts/i18n/lib/jsx-attributes/extract.ts new file mode 100644 index 00000000000..f74c1b7a1b3 --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/extract.ts @@ -0,0 +1,169 @@ +/** + * Extract translatable JSX attributes from markdown files + */ + +import type { + ExtractedAttribute, + FileExtractionResult, + TranslatableAttribute, +} from "./types" +import { + JSX_ATTRIBUTE_REGEX, + JSX_COMPONENT_REGEX, + TRANSLATABLE_ATTRIBUTES, +} from "./types" + +/** + * Check if a string appears to be English text (not a variable, URL, or code). + * Uses heuristics: contains spaces, common English words, or sentence structure. 
+ */ +function isLikelyEnglishText(value: string): boolean { + // Skip empty or very short values + if (!value || value.length < 3) return false + + // Skip URLs + if (/^https?:\/\//.test(value)) return false + + // Skip paths + if (/^[/.]/.test(value) || /\.(png|jpg|svg|gif|json|md)$/i.test(value)) + return false + + // Skip variables/placeholders like {variable} or {{variable}} + if (/^\{.*\}$/.test(value)) return false + + // Skip CSS classes or technical identifiers (camelCase/kebab-case only) + if (/^[a-z][a-zA-Z0-9-]*$/.test(value) && !value.includes(" ")) return false + + // Skip emoji-only values + if (/^[\p{Emoji}\s]+$/u.test(value)) return false + + // Skip numbers-only + if (/^[\d.,\s%$€£]+$/.test(value)) return false + + // Likely English if it contains spaces (multi-word) or common English patterns + if (value.includes(" ")) return true + + // Single words that look like natural language (capitalized, common endings) + if (/^[A-Z][a-z]+(?:ing|ed|er|est|ly|tion|ness)?$/.test(value)) return true + + return false +} + +/** + * Extract surrounding context (lines before/after) for translation accuracy. + */ +function extractContext( + content: string, + lineNumber: number, + contextLines = 2 +): string { + const lines = content.split("\n") + const startLine = Math.max(0, lineNumber - 1 - contextLines) + const endLine = Math.min(lines.length, lineNumber + contextLines) + + return lines + .slice(startLine, endLine) + .map((line) => line.trim()) + .filter((line) => line.length > 0) + .join(" ") + .slice(0, 500) // Limit context length +} + +/** + * Extract translatable attributes from a single file's content. 
+ */ +export function extractAttributesFromContent( + content: string, + filePath: string +): ExtractedAttribute[] { + const attributes: ExtractedAttribute[] = [] + const lines = content.split("\n") + + // Track line numbers for each match + let currentLine = 0 + let currentPos = 0 + + // Process each JSX component + let componentMatch: RegExpExecArray | null + JSX_COMPONENT_REGEX.lastIndex = 0 + + while ((componentMatch = JSX_COMPONENT_REGEX.exec(content)) !== null) { + const componentName = componentMatch[1] + const attributeString = componentMatch[2] + const componentStartPos = componentMatch.index + + // Calculate line number for this component + while (currentPos < componentStartPos && currentLine < lines.length) { + currentPos += lines[currentLine].length + 1 // +1 for newline + currentLine++ + } + const componentLine = currentLine + 1 // 1-indexed + + // Extract attributes from this component + let attrMatch: RegExpExecArray | null + JSX_ATTRIBUTE_REGEX.lastIndex = 0 + + while ((attrMatch = JSX_ATTRIBUTE_REGEX.exec(attributeString)) !== null) { + const attrName = attrMatch[1] + const attrValue = attrMatch[2] || attrMatch[3] // double or single quotes + + // Check if this is a translatable attribute + if ( + !TRANSLATABLE_ATTRIBUTES.includes(attrName as TranslatableAttribute) + ) { + continue + } + + // Check if the value looks like English text needing translation + if (!isLikelyEnglishText(attrValue)) { + continue + } + + attributes.push({ + filePath, + line: componentLine, + column: attrMatch.index, + attributeName: attrName as TranslatableAttribute, + componentName, + originalValue: attrValue, + context: extractContext(content, componentLine), + }) + } + } + + return attributes +} + +/** + * Extract translatable attributes from a file, returning the extraction result. 
+ */ +export function extractAttributesFromFile( + content: string, + filePath: string +): FileExtractionResult { + const attributes = extractAttributesFromContent(content, filePath) + + return { + filePath, + attributes, + content, + } +} + +/** + * Extract attributes from multiple files. + */ +export function extractAttributesFromFiles( + files: { path: string; content: string }[] +): FileExtractionResult[] { + return files.map((file) => extractAttributesFromFile(file.content, file.path)) +} + +/** + * Count total attributes across multiple extraction results. + */ +export function countExtractedAttributes( + results: FileExtractionResult[] +): number { + return results.reduce((sum, result) => sum + result.attributes.length, 0) +} diff --git a/src/scripts/i18n/lib/jsx-attributes/index.ts b/src/scripts/i18n/lib/jsx-attributes/index.ts new file mode 100644 index 00000000000..517481472c3 --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/index.ts @@ -0,0 +1,23 @@ +/** + * JSX attribute extraction and translation module + */ + +export { + countExtractedAttributes, + extractAttributesFromContent, + extractAttributesFromFile, + extractAttributesFromFiles, +} from "./extract" +export { + reinsertTranslatedAttributes, + reinsertTranslationsForFiles, +} from "./reinsert" +export { + type ExtractedAttribute, + type FileExtractionResult, + type FileTranslationResult, + type JsxTranslationSummary, + TRANSLATABLE_ATTRIBUTES, + type TranslatableAttribute, + type TranslatedAttribute, +} from "./types" diff --git a/src/scripts/i18n/lib/jsx-attributes/reinsert.ts b/src/scripts/i18n/lib/jsx-attributes/reinsert.ts new file mode 100644 index 00000000000..d4a3db75290 --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/reinsert.ts @@ -0,0 +1,81 @@ +/** + * Re-insert translated attribute values into file content + */ + +import type { + FileExtractionResult, + FileTranslationResult, + TranslatedAttribute, +} from "./types" + +/** + * Escape special regex characters in a string 
+ */ +function escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") +} + +/** + * Replace a single attribute value in content. + * Handles both double and single quoted attributes. + */ +function replaceAttributeValue( + content: string, + attr: TranslatedAttribute +): string { + // Build regex to find this specific attribute with its original value + // Match: attributeName="originalValue" or attributeName='originalValue' + const escapedOriginal = escapeRegex(attr.originalValue) + const pattern = new RegExp( + `(\\b${attr.attributeName}\\s*=\\s*)(?:"${escapedOriginal}"|'${escapedOriginal}')`, + "g" + ) + + // Replace with translated value, preserving quote style (default to double quotes) + return content.replace(pattern, `$1"${attr.translatedValue}"`) +} + +/** + * Re-insert all translated attributes into a file's content. + */ +export function reinsertTranslatedAttributes( + extraction: FileExtractionResult, + translatedAttributes: TranslatedAttribute[] +): FileTranslationResult { + let updatedContent = extraction.content + let successCount = 0 + + // Sort by position (reverse order) to avoid offset issues when replacing + const sortedAttrs = [...translatedAttributes].sort( + (a, b) => b.line - a.line || b.column - a.column + ) + + for (const attr of sortedAttrs) { + const beforeReplace = updatedContent + updatedContent = replaceAttributeValue(updatedContent, attr) + + if (updatedContent !== beforeReplace) { + successCount++ + } + } + + return { + filePath: extraction.filePath, + translatedAttributes, + updatedContent, + hasChanges: successCount > 0, + } +} + +/** + * Process multiple files with their translated attributes. 
+ */ +export function reinsertTranslationsForFiles( + extractions: FileExtractionResult[], + translationsByFile: Map +): FileTranslationResult[] { + return extractions.map((extraction) => { + const translations = translationsByFile.get(extraction.filePath) || [] + return reinsertTranslatedAttributes(extraction, translations) + }) +} diff --git a/src/scripts/i18n/lib/jsx-attributes/types.ts b/src/scripts/i18n/lib/jsx-attributes/types.ts new file mode 100644 index 00000000000..4d4823de6fc --- /dev/null +++ b/src/scripts/i18n/lib/jsx-attributes/types.ts @@ -0,0 +1,84 @@ +/** + * Types for JSX attribute extraction and translation + */ + +/** Regex to match JSX/HTML-style attributes with quoted values */ +export const JSX_ATTRIBUTE_REGEX = + /\b([a-zA-Z][\w-]*)\s*=\s*(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)')/g + +/** Regex to identify JSX component opening tags */ +export const JSX_COMPONENT_REGEX = /<([A-Z][a-zA-Z0-9]*)\s+([^>]*?)(?:\/>|>)/g + +/** Attributes that contain human-readable text requiring translation */ +export const TRANSLATABLE_ATTRIBUTES = [ + "title", + "description", + "alt", + "label", + "aria-label", + "placeholder", + "buttonLabel", + "name", + "caption", + "contentPreview", + "location", +] as const + +export type TranslatableAttribute = (typeof TRANSLATABLE_ATTRIBUTES)[number] + +/** A single extracted attribute from a JSX component */ +export interface ExtractedAttribute { + /** File path the attribute was found in */ + filePath: string + /** Line number (1-indexed) where the attribute appears */ + line: number + /** Column position where the attribute value starts */ + column: number + /** The attribute name (e.g., "title", "description") */ + attributeName: TranslatableAttribute + /** The component name (e.g., "Card", "ExpandableCard") */ + componentName: string + /** The original English attribute value */ + originalValue: string + /** Surrounding context (1-2 sentences before/after) for translation accuracy */ + context: 
string +} + +/** Result of extracting attributes from a single file */ +export interface FileExtractionResult { + filePath: string + attributes: ExtractedAttribute[] + /** Original file content for re-insertion */ + content: string +} + +/** A translated attribute ready for re-insertion */ +export interface TranslatedAttribute extends ExtractedAttribute { + translatedValue: string +} + +/** Result of translating attributes for a file */ +export interface FileTranslationResult { + filePath: string + translatedAttributes: TranslatedAttribute[] + /** Updated file content with translations inserted */ + updatedContent: string + /** Whether any attributes were translated */ + hasChanges: boolean +} + +/** Summary of JSX attribute translation for a batch of files */ +export interface JsxTranslationSummary { + /** Total files processed */ + filesProcessed: number + /** Files that had attributes translated */ + filesWithChanges: number + /** Total attributes translated */ + attributesTranslated: number + /** Attributes that failed translation */ + attributesFailed: number + /** Whether Gemini API was available */ + geminiAvailable: boolean + /** Files with updated content */ + updatedFiles: FileTranslationResult[] +} diff --git a/src/scripts/i18n/lib/supabase/glossary.ts b/src/scripts/i18n/lib/supabase/glossary.ts new file mode 100644 index 00000000000..2eee0a0ef74 --- /dev/null +++ b/src/scripts/i18n/lib/supabase/glossary.ts @@ -0,0 +1,117 @@ +/** + * Supabase glossary client for fetching community-approved translations + * + * Fetches from the `top_translations` view which contains the highest-voted + * translation for each term/language pair. 
/** Glossary entry from Supabase top_translations view */
export interface GlossaryEntry {
  string_term: string
  translation_text: string
  language_code: string
  total_votes: number
}

/** Glossary grouped by language code: language_code -> (term -> translation) */
export type GlossaryByLanguage = Map<string, Map<string, string>>

/** Tone for translation register */
export type Tone = "informal" | "formal"

/**
 * Fetch all glossary entries from Supabase.
 *
 * Reads SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY from the environment.
 * This is best-effort: missing credentials, a non-2xx response, a network
 * error, or a payload that is not a JSON array all log a warning and
 * resolve to an empty list instead of throwing.
 *
 * @returns The rows returned by the `top_translations` view, or `[]`.
 */
export async function fetchGlossaryEntries(): Promise<GlossaryEntry[]> {
  const supabaseUrl = process.env.SUPABASE_URL
  const supabaseKey = process.env.SUPABASE_SERVICE_ROLE_KEY

  if (!supabaseUrl || !supabaseKey) {
    console.warn(
      "[GLOSSARY] Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY, skipping glossary fetch"
    )
    return []
  }

  // Tolerate a configured base URL that ends in "/" so we never request "//rest/..."
  const baseUrl = supabaseUrl.replace(/\/+$/, "")
  const url = `${baseUrl}/rest/v1/top_translations?select=string_term,translation_text,language_code,total_votes`

  try {
    const response = await fetch(url, {
      headers: {
        apikey: supabaseKey,
        Authorization: `Bearer ${supabaseKey}`,
        "Content-Type": "application/json",
      },
    })

    if (!response.ok) {
      const text = await response.text().catch(() => "")
      throw new Error(`Supabase API error (${response.status}): ${text}`)
    }

    // The body is external input: make sure it is a list before treating it as one
    const payload: unknown = await response.json()
    if (!Array.isArray(payload)) {
      throw new Error(
        `Unexpected glossary payload (expected an array, got ${typeof payload})`
      )
    }

    const entries = payload as GlossaryEntry[]
    console.log(`[GLOSSARY] Fetched ${entries.length} glossary entries`)
    return entries
  } catch (error) {
    console.warn("[GLOSSARY] Failed to fetch glossary:", error)
    return []
  }
}

/**
 * Group glossary entries by language code for efficient lookup.
 *
 * When the same term appears more than once for a language, the last entry
 * in `entries` wins.
 *
 * @returns Map of language_code -> (string_term -> translation_text)
 */
export function groupGlossaryByLanguage(
  entries: GlossaryEntry[]
): GlossaryByLanguage {
  const byLanguage: GlossaryByLanguage = new Map()

  for (const entry of entries) {
    let terms = byLanguage.get(entry.language_code)
    if (!terms) {
      terms = new Map()
      byLanguage.set(entry.language_code, terms)
    }
    terms.set(entry.string_term, entry.translation_text)
  }

  return byLanguage
}

/**
 * Get glossary terms for a specific language code.
 *
 * @returns The term map for `languageCode`, or a new empty map if the
 *          language has no glossary.
 */
export function getGlossaryForLanguage(
  glossary: GlossaryByLanguage,
  languageCode: string
): Map<string, string> {
  return glossary.get(languageCode) ?? new Map()
}

/**
 * Format glossary as a string for inclusion in AI prompts.
 *
 * @param glossaryTerms Term -> translation pairs, listed in map order.
 * @param tone Register instruction added to the section (default "informal").
 * @returns A "REQUIRED TERMINOLOGY" section, or "" when there are no terms.
 */
export function formatGlossaryForPrompt(
  glossaryTerms: Map<string, string>,
  tone: Tone = "informal"
): string {
  if (glossaryTerms.size === 0) return ""

  const toneInstruction =
    tone === "formal"
      ? "Use formal register."
      : "Use informal, friendly register."

  const terms = Array.from(glossaryTerms.entries())
    .map(([term, translation]) => `- "${term}" → "${translation}"`)
    .join("\n")

  return `## REQUIRED TERMINOLOGY

Use these exact translations. Do not substitute synonyms.
${toneInstruction}

${terms}`
}
export type RetryOptions = {
  /** Number of retries after the first attempt (default 3) */
  retries?: number
  /** Per-attempt timeout until the response headers arrive (default 30000) */
  timeoutMs?: number
  /** Base backoff, doubled on every attempt (default 1000) */
  backoffMs?: number
  /** HTTP statuses that trigger a retry */
  retryOnStatuses?: number[]
}

/** Base wait for responses identified as rate limits (60s, 120s, 240s, ...) */
const RATE_LIMIT_BACKOFF_MS = 60000

/**
 * Read a string error code from an error or from its `cause`.
 * Node's fetch rejects with `TypeError: fetch failed` and puts the
 * underlying undici error (which carries `code`) on `cause`.
 */
function getErrorCode(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null) return undefined
  const { code, cause } = err as { code?: unknown; cause?: unknown }
  if (typeof code === "string") return code
  if (typeof cause === "object" && cause !== null) {
    const causeCode = (cause as { code?: unknown }).code
    if (typeof causeCode === "string") return causeCode
  }
  return undefined
}

/**
 * Parse a Retry-After header (delta-seconds or HTTP-date) into milliseconds.
 * Returns undefined when the header is absent or unparseable.
 */
function parseRetryAfterMs(header: string | null): number | undefined {
  if (!header) return undefined
  const value = header.trim()
  if (/^\d+$/.test(value)) return Number(value) * 1000
  const when = Date.parse(value)
  return Number.isNaN(when) ? undefined : Math.max(0, when - Date.now())
}

/**
 * fetch() with a per-attempt timeout and exponential-backoff retries.
 *
 * Retries when the response status is in `retryOnStatuses`, when the attempt
 * times out (AbortError), or when the connection times out
 * (UND_ERR_CONNECT_TIMEOUT). 403/429 responses whose body mentions
 * "rate limit" (any casing) use the longer rate-limit backoff, and a
 * Retry-After header, when present, can only lengthen the wait.
 *
 * @param url Request URL
 * @param init Standard fetch init. `init.signal` is honoured: aborting it
 *             cancels the in-flight attempt and is never retried.
 * @param options Retry tuning, see RetryOptions
 * @returns The first response that is OK, not retryable, or the last one
 *          received once retries are used up
 * @throws The underlying fetch error when it is not retryable or retries
 *         are exhausted
 */
export const fetchWithRetry = async (
  url: string,
  init?: RequestInit,
  options?: RetryOptions
) => {
  const retries = options?.retries ?? 3
  const timeoutMs = options?.timeoutMs ?? 30000
  const backoffMs = options?.backoffMs ?? 1000
  const retryOnStatuses = options?.retryOnStatuses ?? [
    403, // GitHub secondary rate limits
    408, 429, 500, 502, 503, 504,
  ]
  const callerSignal = init?.signal ?? undefined

  for (let attempt = 0; attempt <= retries; attempt++) {
    const controller = new AbortController()
    // The spread below replaces init.signal, so forward caller aborts by hand
    const forwardAbort = () => controller.abort()
    if (callerSignal?.aborted) controller.abort()
    callerSignal?.addEventListener("abort", forwardAbort)
    const id = setTimeout(() => controller.abort(), timeoutMs)
    const release = () => {
      clearTimeout(id)
      callerSignal?.removeEventListener("abort", forwardAbort)
    }

    try {
      const res = await fetch(url, {
        ...(init || {}),
        signal: controller.signal,
      })
      release()
      if (
        !res.ok &&
        retryOnStatuses.includes(res.status) &&
        attempt < retries
      ) {
        // Check if this is a rate limit error and use longer backoff
        let wait = backoffMs * Math.pow(2, attempt)
        let isRateLimit = false

        if (res.status === 403 || res.status === 429) {
          try {
            const bodyText = await res.clone().text()
            if (/rate limit/i.test(bodyText)) {
              isRateLimit = true
              wait = RATE_LIMIT_BACKOFF_MS * Math.pow(2, attempt)
            }
          } catch {
            // If we can't read the body, treat 403/429 as rate limits
            isRateLimit = true
            wait = RATE_LIMIT_BACKOFF_MS * Math.pow(2, attempt)
          }
        }

        // Never retry sooner than the server asked us to
        const retryAfterMs = parseRetryAfterMs(res.headers.get("retry-after"))
        if (retryAfterMs !== undefined) {
          wait = Math.max(wait, retryAfterMs)
        }

        console.warn(
          `[${isRateLimit ? "RATE LIMIT" : "RETRY"}] ${url} -> ${res.status}. Attempt ${attempt + 1}/${retries}. Waiting ${wait}ms.`
        )
        await delay(wait)
        continue
      }
      return res
    } catch (err: unknown) {
      release()
      // A cancellation requested by the caller must not be retried
      if (callerSignal?.aborted) throw err

      const errorName =
        typeof err === "object" && err !== null
          ? (err as { name?: string }).name
          : undefined
      const errorCode = getErrorCode(err)
      const isAbort = errorName === "AbortError"
      const isConnectTimeout = errorCode === "UND_ERR_CONNECT_TIMEOUT"
      if ((isAbort || isConnectTimeout) && attempt < retries) {
        const wait = backoffMs * Math.pow(2, attempt)
        console.warn(
          `[RETRY] ${url} -> ${isAbort ? "AbortError" : errorCode}. Attempt ${
            attempt + 1
          }/${retries}. Waiting ${wait}ms.`
        )
        await delay(wait)
        continue
      }
      throw err
    }
  }
  // Unreachable, but TS wants a return
  throw new Error("fetchWithRetry: exhausted retries")
}
export interface JsonValidationResult {
  isValid: boolean
  expectedKeyCount: number
  actualKeyCount: number
  missingKeys: string[]
  extraKeys: string[]
  orderMatches: boolean
}

export interface MarkdownValidationResult {
  isValid: boolean
  expectedHeadingCount: number
  actualHeadingCount: number
  mismatchedHeadings: Array<{
    level: number
    expectedId: string
    actualId: string | null
    line: number
  }>
}

export interface JsxAttributeValidationResult {
  isValid: boolean
  untranslatedCount: number
  totalCount: number
  untranslatedPercentage: number
  untranslatedAttributes: Array<{
    attributeName: string
    componentName: string
    englishValue: string
    translatedValue: string
    line: number
  }>
}

/**
 * Extract the top-level JSON keys, in order, from a JSON string.
 * Returns [] when the content does not parse or is not a plain object.
 */
function extractJsonKeys(jsonContent: string): string[] {
  try {
    const parsed: unknown = JSON.parse(jsonContent)
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      return []
    }
    return Object.keys(parsed)
  } catch {
    return []
  }
}

/** Whether the content is syntactically valid JSON (of any shape) */
function isParseableJson(content: string): boolean {
  try {
    JSON.parse(content)
    return true
  } catch {
    return false
  }
}

/**
 * Validate JSON file structure against English source.
 *
 * Compares top-level keys only. The result is valid when the translation
 * parses as JSON and has exactly the English key set; key order is reported
 * through `orderMatches` but does not affect validity.
 */
export function validateJsonStructure(
  englishContent: string,
  translatedContent: string
): JsonValidationResult {
  const englishKeys = extractJsonKeys(englishContent)
  const translatedKeys = extractJsonKeys(translatedContent)

  const englishKeySet = new Set(englishKeys)
  const translatedKeySet = new Set(translatedKeys)

  const missingKeys = englishKeys.filter((key) => !translatedKeySet.has(key))
  const extraKeys = translatedKeys.filter((key) => !englishKeySet.has(key))

  const orderMatches =
    JSON.stringify(englishKeys) === JSON.stringify(translatedKeys)

  return {
    // A translation that is not JSON at all must never pass, even when the
    // English file has no top-level keys to miss (e.g. `{}` or an array)
    isValid:
      missingKeys.length === 0 &&
      extraKeys.length === 0 &&
      isParseableJson(translatedContent),
    expectedKeyCount: englishKeys.length,
    actualKeyCount: translatedKeys.length,
    missingKeys,
    extraKeys,
    orderMatches,
  }
}

/**
 * Extract ATX markdown headings (`#` to `######`) with their custom IDs.
 *
 * Accepts LF and CRLF line endings. Lines inside fenced code blocks
 * (``` or ~~~) are ignored so that code comments such as `# install` are
 * not mistaken for headings. `line` is 1-based.
 */
function extractMarkdownHeadings(
  content: string
): Array<{ level: number; id: string | null; line: number }> {
  const lines = content.split(/\r?\n/)
  const headings: Array<{ level: number; id: string | null; line: number }> = []

  // Fence marker that opened the current code block, or null outside of one
  let openFence: string | null = null

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})/)

    if (openFence !== null) {
      // A closing fence uses the same character, is at least as long as the
      // opening one and has nothing else on the line
      if (
        fenceMatch &&
        fenceMatch[1][0] === openFence[0] &&
        fenceMatch[1].length >= openFence.length &&
        line.trim() === fenceMatch[1]
      ) {
        openFence = null
      }
      continue
    }

    if (fenceMatch) {
      openFence = fenceMatch[1]
      continue
    }

    const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
    if (!headingMatch) continue

    // Extract custom ID if present (e.g., "Heading text {#custom-id}")
    const idMatch = headingMatch[2].match(/\{#([^}]+)\}\s*$/)

    headings.push({
      level: headingMatch[1].length,
      id: idMatch ? idMatch[1] : null,
      line: i + 1,
    })
  }

  return headings
}

/**
 * Validate markdown heading structure against English source.
 *
 * Headings are compared pairwise in document order. A heading is reported
 * when its level differs from the English one, or when the English heading
 * has a custom ID and the translated heading's ID differs. If the heading
 * counts differ the result is invalid and no per-heading comparison is made.
 */
export function validateMarkdownStructure(
  englishContent: string,
  translatedContent: string
): MarkdownValidationResult {
  const englishHeadings = extractMarkdownHeadings(englishContent)
  const translatedHeadings = extractMarkdownHeadings(translatedContent)

  const mismatchedHeadings: MarkdownValidationResult["mismatchedHeadings"] = []

  // Check if heading counts match
  if (englishHeadings.length !== translatedHeadings.length) {
    return {
      isValid: false,
      expectedHeadingCount: englishHeadings.length,
      actualHeadingCount: translatedHeadings.length,
      mismatchedHeadings: [],
    }
  }

  // Compare each heading
  for (let i = 0; i < englishHeadings.length; i++) {
    const englishHeading = englishHeadings[i]
    const translatedHeading = translatedHeadings[i]

    // Check if level matches
    if (englishHeading.level !== translatedHeading.level) {
      mismatchedHeadings.push({
        level: translatedHeading.level,
        expectedId: englishHeading.id || "(no id)",
        actualId: translatedHeading.id,
        line: translatedHeading.line,
      })
      continue
    }

    // Check if custom IDs match (if present in English)
    if (englishHeading.id && englishHeading.id !== translatedHeading.id) {
      mismatchedHeadings.push({
        level: translatedHeading.level,
        expectedId: englishHeading.id,
        actualId: translatedHeading.id,
        line: translatedHeading.line,
      })
    }
  }

  return {
    isValid: mismatchedHeadings.length === 0,
    expectedHeadingCount: englishHeadings.length,
    actualHeadingCount: translatedHeadings.length,
    mismatchedHeadings,
  }
}
function extractJsxAttributes(
  content: string
): Map<string, { value: string; line: number; componentName: string }> {
  const attributes = new Map<
    string,
    { value: string; line: number; componentName: string }
  >()

  // Matches arrive in increasing index order, so the line number can be
  // advanced incrementally by counting the newlines passed so far
  let scannedPos = 0
  let lineNumber = 1

  let componentMatch: RegExpExecArray | null
  JSX_COMPONENT_REGEX.lastIndex = 0

  while ((componentMatch = JSX_COMPONENT_REGEX.exec(content)) !== null) {
    const componentName = componentMatch[1]
    const attributeString = componentMatch[2] ?? ""
    const componentStartPos = componentMatch.index

    // 1-based line on which the component's opening tag starts
    for (; scannedPos < componentStartPos; scannedPos++) {
      if (content.charCodeAt(scannedPos) === 10) lineNumber++
    }
    const componentLine = lineNumber

    let attrMatch: RegExpExecArray | null
    JSX_ATTRIBUTE_REGEX.lastIndex = 0

    while ((attrMatch = JSX_ATTRIBUTE_REGEX.exec(attributeString)) !== null) {
      const attrName = attrMatch[1]
      // Groups 2 and 3 are alternative captures of the value; `??` keeps a
      // legitimately empty value ("") instead of turning it into undefined
      const attrValue = attrMatch[2] ?? attrMatch[3] ?? ""

      if (!TRANSLATABLE_ATTRIBUTES.includes(attrName as TranslatableAttribute))
        continue

      // Use component position + attribute name as key for matching
      // This allows us to match attributes even if component names differ slightly
      // NOTE(review): this assumes English and translated files keep components
      // on the same line numbers - confirm against how translations are produced
      const key = `${componentLine}:${attrName}`
      attributes.set(key, {
        value: attrValue,
        line: componentLine,
        componentName,
      })
    }
  }

  return attributes
}
/**
 * Validate JSX attributes by comparing translated content against English source.
 * An attribute is considered untranslated if its value is IDENTICAL to the English source.
 *
 * Only attributes found in both files (same extraction key) are counted;
 * attributes missing from the translation are skipped.
 *
 * @param threshold Maximum tolerated percentage (0-100) of untranslated
 *                  attributes for the result to be valid (default 5).
 */
export function validateJsxAttributes(
  englishContent: string,
  translatedContent: string,
  threshold = 5
): JsxAttributeValidationResult {
  const englishAttrs = extractJsxAttributes(englishContent)
  const translatedAttrs = extractJsxAttributes(translatedContent)

  const untranslatedAttributes: JsxAttributeValidationResult["untranslatedAttributes"] =
    []
  let totalCount = 0

  // Compare each English attribute with its translated counterpart
  for (const [key, englishAttr] of englishAttrs) {
    const translatedAttr = translatedAttrs.get(key)

    // Skip if attribute doesn't exist in translation (structural difference)
    if (!translatedAttr) continue

    totalCount++

    // Check if the translated value is IDENTICAL to English (i.e., not translated)
    if (translatedAttr.value === englishAttr.value) {
      untranslatedAttributes.push({
        // Keys look like "<line>:<attrName>"; take everything after the first colon
        attributeName: key.slice(key.indexOf(":") + 1),
        componentName: translatedAttr.componentName,
        englishValue: englishAttr.value,
        translatedValue: translatedAttr.value,
        line: translatedAttr.line,
      })
    }
  }

  const untranslatedPercentage =
    totalCount > 0 ? (untranslatedAttributes.length / totalCount) * 100 : 0

  return {
    isValid: untranslatedPercentage <= threshold,
    untranslatedCount: untranslatedAttributes.length,
    totalCount,
    untranslatedPercentage,
    untranslatedAttributes,
  }
}

/**
 * Prepare an attribute value for display inside a Markdown inline-code span:
 * backticks and line breaks would end or break the span, so they are
 * replaced, and long values are shortened to `maxLength` characters
 * (counted by code point so surrogate pairs are never split).
 */
function toInlineCodePreview(value: string | undefined, maxLength = 50): string {
  const flattened = (value ?? "")
    .replace(/`/g, "'")
    .replace(/\s*[\r\n]+\s*/g, " ")
  const chars = Array.from(flattened)
  return chars.length > maxLength
    ? chars.slice(0, maxLength - 3).join("") + "..."
    : flattened
}

/**
 * Format validation results into a markdown comment.
 *
 * @returns The comment text, or null when every result is valid.
 */
export function formatValidationComment(
  validationResults: Array<{
    path: string
    type: "json" | "markdown" | "jsx-attributes"
    result:
      | JsonValidationResult
      | MarkdownValidationResult
      | JsxAttributeValidationResult
  }>
): string | null {
  const issues = validationResults.filter((v) => !v.result.isValid)

  if (issues.length === 0) {
    return null
  }

  let comment = "## ⚠️ Syntax Tree Validation Issues\n\n"
  comment +=
    "The following files have structural differences from their English source:\n\n"

  for (const issue of issues) {
    comment += `### \`${issue.path}\`\n\n`

    if (issue.type === "json") {
      const result = issue.result as JsonValidationResult
      comment += `**JSON Structure Issues:**\n`
      comment += `- Expected keys: ${result.expectedKeyCount}\n`
      comment += `- Actual keys: ${result.actualKeyCount}\n`

      if (result.missingKeys.length > 0) {
        comment += `- Missing keys: ${result.missingKeys.map((k) => `\`${k}\``).join(", ")}\n`
      }

      if (result.extraKeys.length > 0) {
        comment += `- Extra keys: ${result.extraKeys.map((k) => `\`${k}\``).join(", ")}\n`
      }

      if (
        !result.orderMatches &&
        result.missingKeys.length === 0 &&
        result.extraKeys.length === 0
      ) {
        comment += `- ⚠️ Key order differs from English version\n`
      }
    } else if (issue.type === "markdown") {
      const result = issue.result as MarkdownValidationResult
      comment += `**Markdown Structure Issues:**\n`
      comment += `- Expected headings: ${result.expectedHeadingCount}\n`
      comment += `- Actual headings: ${result.actualHeadingCount}\n`

      if (result.mismatchedHeadings.length > 0) {
        comment += `\n**Mismatched Headings:**\n`
        for (const mismatch of result.mismatchedHeadings) {
          comment += `- Line ${mismatch.line}: Expected ID \`${mismatch.expectedId}\`, found \`${mismatch.actualId || "(none)"}\`\n`
        }
      }
    } else if (issue.type === "jsx-attributes") {
      const result = issue.result as JsxAttributeValidationResult
      comment += `**Untranslated JSX Attributes (identical to English):**\n`
      comment += `- Untranslated: ${result.untranslatedCount} / ${result.totalCount} (${result.untranslatedPercentage.toFixed(1)}%)\n`

      if (result.untranslatedAttributes.length > 0) {
        comment += `\n**Attributes that need translation:**\n`
        // Show up to 10 examples
        const examples = result.untranslatedAttributes.slice(0, 10)
        for (const attr of examples) {
          const truncatedValue = toInlineCodePreview(attr.englishValue)
          comment += `- Line ${attr.line}: \`<${attr.componentName} ${attr.attributeName}="${truncatedValue}">\`\n`
        }
        if (result.untranslatedAttributes.length > 10) {
          comment += `- ... and ${result.untranslatedAttributes.length - 10} more\n`
        }
      }
    }

    comment += `\n`
  }

  comment += `\n---\n`
  comment += `*This validation check ensures translated files maintain the same structure as the English source.*`

  return comment
}
/**
 * Upload an English file that Crowdin does not know yet.
 *
 * Downloads the file from GitHub, pushes it to Crowdin storage, registers it
 * under its parent directory and then pauses so Crowdin can parse it.
 *
 * @returns The new Crowdin file id and path plus the downloaded English buffer
 */
async function createCrowdinFile(file: {
  filePath: string
  download_url: string
  "Crowdin-API-FileName": string
}): Promise<{ fileId: number; path: string; buffer: Buffer }> {
  const crowdinFileName = file["Crowdin-API-FileName"]
  console.log(`Creating new file in Crowdin: ${file.filePath}`)

  const buffer = await downloadGitHubFile(file.download_url)
  const storageInfo = await postFileToStorage(buffer, crowdinFileName)

  // Parent directory = every path segment except the trailing file name
  const segments = file.filePath.split("/").filter(Boolean)
  const parentDirPath = segments.slice(0, -1).join("/") || "/"

  const created = await postCrowdinFile(
    storageInfo.id,
    crowdinFileName,
    parentDirPath
  )

  console.log(`✓ Created new Crowdin file (ID: ${created.id})`)

  // Give Crowdin time to parse the freshly created file
  const delayMs = 10000
  debugLog(`Waiting ${delayMs / 1000}s for Crowdin to parse new file...`)
  await delay(delayMs)

  return { fileId: created.id, path: created.path, buffer }
}

/**
 * Upload/update English files to Crowdin and prepare for translation.
 *
 * Fills the shared collections on `context` (file ids, id -> path map,
 * English buffers), stores the current Crowdin user id on it, and finally
 * unhides strings in every processed file. A failure on a single file is
 * logged and skipped. The process exits with 0 when there is nothing to
 * translate and with 1 when every file failed.
 *
 * @returns The same collections that were populated on `context`
 */
export async function prepareEnglishFiles(
  context: WorkflowContext
): Promise<FilePreparationResult> {
  const {
    crowdinProjectFiles,
    fileIdsSet,
    processedFileIdToPath,
    englishBuffers,
  } = context

  logSection("Preparing English Files")

  // Needed later to clean up ephemeral prompts
  const currentUser = await getCurrentUser()
  context.crowdinUserId = currentUser.id

  const englishFiles = await getAllEnglishFiles()
  if (englishFiles.length === 0) {
    console.log("No files to translate, exiting")
    process.exit(0)
  }

  debugLog(`Found ${englishFiles.length} English files`)
  debugLog(`Found ${crowdinProjectFiles.length} files in Crowdin project`)

  const metadata = await getFileMetadata(englishFiles)

  const skipped: Array<{ path: string; error: string }> = []
  let uploaded = 0

  for (const file of metadata) {
    debugLog(`Processing file: ${file.filePath}`)

    try {
      // A missing Crowdin counterpart means the file has to be created
      const existing = findCrowdinFile(file, crowdinProjectFiles)
      const { fileId, path, buffer } = existing
        ? await updateCrowdinFile(file, existing)
        : await createCrowdinFile(file)

      fileIdsSet.add(fileId)
      if (path) {
        processedFileIdToPath[fileId] = path
      }
      englishBuffers[fileId] = buffer
      uploaded++
    } catch (error) {
      // One bad file must not abort the whole job
      const message = error instanceof Error ? error.message : String(error)
      skipped.push({ path: file.filePath, error: message })
      console.warn(`[WARN] Skipping ${file.filePath}: ${message}`)
    }
  }

  const skippedCount = skipped.length

  if (skippedCount > 0) {
    console.log(`\n[SUMMARY] ${skippedCount} files skipped:`)
    for (const { path } of skipped) {
      console.log(` - ${path}`)
    }
  }

  // Only a total failure is fatal
  if (uploaded === 0 && skippedCount > 0) {
    console.error("[ERROR] All files failed to process")
    process.exit(1)
  }

  const skippedSuffix = skippedCount > 0 ? `, ${skippedCount} skipped` : ""
  console.log(
    `\n[INFO] Processed ${uploaded} files successfully${skippedSuffix}`
  )

  // Hidden/duplicate strings have to be visible before pre-translation
  logSection(`Unhiding Strings in ${fileIdsSet.size} Files`)
  for (const fileId of Array.from(fileIdsSet)) {
    await unhideStringsInFile(fileId)
  }

  return { fileIdsSet, processedFileIdToPath, englishBuffers }
}
/**
 * Set up a workflow run.
 *
 * Logs the target languages and mode, validates `config.targetPath` when one
 * is configured (exiting with code 1 if validation throws), then loads the
 * Crowdin project files and the Supabase glossary.
 *
 * @returns A fresh WorkflowContext with empty per-run collections
 */
export async function initializeWorkflow(): Promise<WorkflowContext> {
  const { targetPath } = config

  logSection("Crowdin AI Translation Import")
  console.log(`Target languages: ${config.allCrowdinCodes.join(", ")}`)

  if (!targetPath) {
    console.log(`Mode: Full translation (all files)`)
  } else {
    const looksLikeFile = [".md", ".json"].some((ext) =>
      targetPath.endsWith(ext)
    )
    const mode = looksLikeFile ? "Single file" : "Directory"
    console.log(`Mode: ${mode} (${targetPath})`)

    // Refuse to run against a path outside the allowed locations
    try {
      validateTargetPath(targetPath)
    } catch (e) {
      console.error(e instanceof Error ? e.message : String(e))
      process.exit(1)
    }
  }

  // Current state of the Crowdin project
  const crowdinProjectFiles = await getCrowdinProjectFiles()

  // The glossary fetch is fed straight into grouping; an empty list is fine
  const glossaryEntries = await fetchGlossaryEntries()
  const glossary = groupGlossaryByLanguage(glossaryEntries)
  console.log(
    `[INIT] Loaded glossary: ${glossaryEntries.length} terms across ${glossary.size} languages`
  )

  // Shared state handed to the later workflow phases
  return {
    crowdinProjectFiles,
    fileIdsSet: new Set(),
    processedFileIdToPath: {},
    englishBuffers: {},
    glossary,
    languageJobs: [],
  }
}
/**
 * Translate JSX attributes in markdown files via Gemini.
 * Updates committedFiles in-place with translated content.
 *
 * Languages are processed one at a time: the `.md`/`.mdx` files under
 * `/translations/<lang>/` are sent to the translator together with that
 * language's glossary, and the changed files are committed to `branch` in a
 * single batch. A failed commit is logged and does not stop the other
 * languages; its files are not counted in the totals.
 *
 * @returns Totals for committed work, or `geminiSkipped: true` with zero
 *          totals when Gemini is not available
 */
export async function runJsxTranslation(
  committedFiles: CommittedFile[],
  languagePairs: LanguagePair[],
  branch: string,
  glossary: GlossaryByLanguage
): Promise<JsxTranslationResult> {
  logSection("JSX Attribute Translation")

  if (!isGeminiAvailable()) {
    console.warn(
      `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not set - JSX attributes may remain untranslated`
    )
    return {
      geminiSkipped: true,
      totalAttributesTranslated: 0,
      totalFilesUpdated: 0,
    }
  }

  // Index committed files by path once instead of scanning the whole array
  // for every updated file. The first entry for a path wins, like Array.find.
  const committedByPath = new Map<string, CommittedFile>()
  for (const file of committedFiles) {
    if (!committedByPath.has(file.path)) {
      committedByPath.set(file.path, file)
    }
  }

  let totalAttributesTranslated = 0
  let totalFilesUpdated = 0

  // Process each language separately
  for (const langPair of languagePairs) {
    const langCode = langPair.internalLanguageCode
    const localeSegment = `/translations/${langCode}/`

    // Filter files for this language (markdown only)
    const langFiles = committedFiles
      .filter(
        (f) =>
          f.path.includes(localeSegment) &&
          (f.path.endsWith(".md") || f.path.endsWith(".mdx"))
      )
      .map((f) => ({ path: f.path, content: f.content }))

    if (langFiles.length === 0) {
      console.log(`[JSX-TRANSLATE] No markdown files for ${langCode}`)
      continue
    }

    console.log(
      `[JSX-TRANSLATE] Processing ${langFiles.length} files for ${langCode}`
    )

    const glossaryTerms = getGlossaryForLanguage(glossary, langCode)
    const jsxResult = await translateJsxAttributes({
      targetLanguage: langCode,
      files: langFiles,
      glossaryTerms,
      verbose: config.verbose,
    })

    if (jsxResult.updatedFiles.length === 0) continue

    // Batch commit updated files
    const filesToCommit: BatchFile[] = []

    for (const updated of jsxResult.updatedFiles) {
      filesToCommit.push({
        path: updated.filePath,
        content: Buffer.from(updated.updatedContent, "utf8"),
      })
      debugLog(`JSX-TRANSLATE: Will commit ${updated.filePath}`)

      // Update the committedFiles array with new content for sanitizer
      const existingFile = committedByPath.get(updated.filePath)
      if (existingFile) {
        existingFile.content = updated.updatedContent
      }
    }

    try {
      await batchCommitFiles(
        filesToCommit,
        branch,
        `i18n(${langCode}): JSX attribute translations`
      )
      console.log(
        `[JSX-TRANSLATE] ✓ Committed ${jsxResult.updatedFiles.length} files for ${langCode}`
      )
      totalFilesUpdated += jsxResult.updatedFiles.length
      totalAttributesTranslated += jsxResult.attributesTranslated
    } catch (e) {
      console.warn(`[JSX-TRANSLATE] Failed to commit files for ${langCode}:`, e)
    }
  }

  return {
    geminiSkipped: false,
    totalAttributesTranslated,
    totalFilesUpdated,
  }
}
/**
 * Generate dynamic PR title based on language count.
 *
 * Up to three languages are listed by code; more than three become
 * "all languages" when the count equals the number of possible languages,
 * otherwise "multiple languages". With no languages the base title is
 * returned without a suffix.
 */
export function generatePRTitle(
  langCodes: string[],
  allPossibleLanguages: string[]
): string {
  const isAllLanguages = langCodes.length === allPossibleLanguages.length

  let prTitle = "i18n: automated Crowdin translation import"

  if (langCodes.length === 0) {
    // Nothing to list: avoid a dangling " ()" suffix
    return prTitle
  }

  if (langCodes.length <= 3) {
    prTitle += ` (${langCodes.join(", ")})`
  } else if (isAllLanguages) {
    prTitle += ` (all languages)`
  } else {
    prTitle += ` (multiple languages)`
  }

  return prTitle
}

/** Options for PR body generation */
export interface PRBodyOptions {
  /** Adds a note that JSX attributes may be untranslated */
  geminiSkipped?: boolean
  /** Link to the workflow run that produced the PR */
  workflowRunUrl?: string
}

/**
 * Generate PR body with organized file listings.
 *
 * Paths from both file lists are merged, split into JSON and Markdown
 * (`.md` / `.mdx`), stripped of their locale prefix and de-duplicated, so a
 * file translated into several languages is listed once.
 *
 * @param aiModelName Name shown in the description line
 * @param langCodes Languages listed in the "Languages translated" section
 */
export function generatePRBody(
  aiModelName: string,
  langCodes: string[],
  committedFiles: CommittedFile[],
  sanitizedFiles: CommittedFile[],
  options: PRBodyOptions = {}
): string {
  // Include both sanitized files and original committed files
  const allChangedPaths = Array.from(
    new Set([
      ...sanitizedFiles.map(({ path }) => path),
      ...committedFiles.map(({ path }) => path),
    ])
  )

  // Separate JSON and Markdown files
  const jsonFiles = allChangedPaths.filter((path) =>
    path.toLowerCase().endsWith(".json")
  )
  const markdownFiles = allChangedPaths.filter((path) => {
    const lower = path.toLowerCase()
    // .mdx is included because the JSX translation phase processes it too
    return lower.endsWith(".md") || lower.endsWith(".mdx")
  })

  // Dedupe paths after stripping locale prefix (same content path across languages)
  const uniqueJsonPaths = [
    ...new Set(
      jsonFiles.map((path) => path.replace(/^src\/intl\/[^/]+\//, ""))
    ),
  ].sort()
  const uniqueMarkdownPaths = [
    ...new Set(
      markdownFiles.map((path) =>
        path.replace(/^public\/content\/translations\/[^/]+\//, "")
      )
    ),
  ].sort()

  // Build PR body
  let prBody = `## Description\n\n`
  prBody += `This PR contains automated ${aiModelName} translations from Crowdin.\n\n`

  if (options.workflowRunUrl) {
    prBody += `[🔗 View workflow run](${options.workflowRunUrl})\n\n`
  }

  // Language section
  prBody += `### Languages translated\n\n`
  prBody += `${langCodes.join(", ")}\n\n`

  // Files section - JSON
  if (uniqueJsonPaths.length > 0) {
    prBody += `### JSON changes (\`src/intl/{locale}/\`)\n\n`
    for (const path of uniqueJsonPaths) {
      prBody += `- ${path}\n`
    }
    prBody += `\n`
  }

  // Files section - Markdown
  if (uniqueMarkdownPaths.length > 0) {
    prBody += `### Markdown changes (\`public/content/translations/{locale}/\`)\n\n`
    for (const path of uniqueMarkdownPaths) {
      prBody += `- ${path}\n`
    }
    prBody += `\n`
  }

  // Add warning if Gemini was skipped
  if (options.geminiSkipped) {
    prBody += `---\n\n`
    prBody += `> ⚠️ **Note:** GEMINI_API_KEY was not available during this run. `
    prBody += `JSX component attributes (e.g., \`title="..."\`, \`description="..."\`) `
    prBody += `may remain untranslated.\n\n`
  }

  return prBody
}

/**
 * Fetch AI model name from Crowdin.
 *
 * Looks up the configured pre-translate prompt for the current user and
 * returns its `aiModelId`. Falls back to "LLM" when the id is missing or the
 * lookup throws.
 */
async function fetchAIModelName(): Promise<string> {
  try {
    const currentUser = await getCurrentUser()
    const promptInfo = await getPromptInfo(
      currentUser.id,
      config.preTranslatePromptId
    )

    if (promptInfo?.aiModelId) {
      console.log(`✓ Fetched AI model: ${promptInfo.aiModelId}`)
      return promptInfo.aiModelId
    } else {
      console.warn("Prompt info missing aiModelId, using default")
      return "LLM"
    }
  } catch (e) {
    console.warn("Could not fetch AI model name from Crowdin:", e)
    return "LLM"
  }
}

/**
 * Build workflow run URL from GitHub environment variables
 * (GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID).
 *
 * @returns The run URL, or undefined when any of the variables is unset
 */
function getWorkflowRunUrl(): string | undefined {
  const serverUrl = process.env.GITHUB_SERVER_URL
  const repository = process.env.GITHUB_REPOSITORY
  const runId = process.env.GITHUB_RUN_ID

  if (serverUrl && repository && runId) {
    return `${serverUrl}/${repository}/actions/runs/${runId}`
  }
  return undefined
}

/**
 * Create pull request with formatted title and body.
 *
 * The PR is opened from `branch` against `config.baseBranch`. Any
 * `workflowRunUrl` passed in `options` is replaced by the URL derived from
 * the environment.
 */
export async function createTranslationPR(
  branch: string,
  committedFiles: CommittedFile[],
  sanitizedFiles: CommittedFile[],
  languagePairs: LanguagePair[],
  options: PRBodyOptions = {}
): Promise<PullRequest> {
  logSection("Creating Pull Request")

  // Fetch AI model name dynamically
  const aiModelName = await fetchAIModelName()

  // Extract language codes
  const langCodes = languagePairs.map((p) => p.internalLanguageCode)

  // Add workflow metadata to options
  const fullOptions: PRBodyOptions = {
    ...options,
    workflowRunUrl: getWorkflowRunUrl(),
  }

  // Generate PR title and body
  const prTitle = generatePRTitle(langCodes, config.allInternalCodes)
  const prBody = generatePRBody(
    aiModelName,
    langCodes,
    committedFiles,
    sanitizedFiles,
    fullOptions
  )

  // Create PR
  const pr = await postPullRequest(branch, config.baseBranch, prTitle, prBody)

  console.log(`\n✓ Pull Request created: ${pr.html_url}`)
  console.log(`PR Number: #${pr.number}`)

  return pr
}
await postPullRequest(branch, config.baseBranch, prTitle, prBody) + + console.log(`\n✓ Pull Request created: ${pr.html_url}`) + console.log(`PR Number: #${pr.number}`) + + return pr +} diff --git a/src/scripts/i18n/lib/workflows/pre-translation.ts b/src/scripts/i18n/lib/workflows/pre-translation.ts new file mode 100644 index 00000000000..40971708cb2 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/pre-translation.ts @@ -0,0 +1,317 @@ +// Pre-translation workflow phase + +import * as fs from "fs" +import * as path from "path" + +import { config } from "../../config" +import { createEphemeralPrompt } from "../crowdin/ephemeral-prompts" +import { + awaitPreTranslationCompleted, + getPreTranslationStatus, + postApplyPreTranslation, +} from "../crowdin/pre-translate" +import { getPromptInfo } from "../crowdin/prompt" +import { formatGlossaryForPrompt, getGlossaryForLanguage } from "../supabase" +import type { CrowdinPreTranslateResponse } from "../types" + +import type { PreTranslationResult, WorkflowContext } from "./types" +import { debugLog, logSection } from "./utils" + +/** + * Resume existing pre-translation job + */ +async function resumePreTranslation( + preTranslationId: string +): Promise { + logSection(`Resuming Pre-Translation ${preTranslationId}`) + + const statusResp = await getPreTranslationStatus(preTranslationId) + + if (statusResp.status === "in_progress" || statusResp.status === "created") { + const statusMsg = + statusResp.status === "created" + ? 
"Pre-translation queued (waiting for other jobs)" + : `Pre-translation in progress (${statusResp.progress}%)` + console.log(`${statusMsg}, waiting for completion...`) + return await awaitPreTranslationCompleted(preTranslationId) + } else if (statusResp.status === "finished") { + console.log(`Pre-translation already finished, proceeding to download...`) + return statusResp + } else { + throw new Error( + `Pre-translation ${preTranslationId} has unexpected status: ${statusResp.status}` + ) + } +} + +/** + * Create ephemeral prompt with language-specific glossary + */ +async function createLanguagePrompt( + userId: number, + internalCode: string, + glossary: WorkflowContext["glossary"], + basePrompt: string, + aiProviderId?: number, + aiModelId?: string +): Promise { + const glossaryTerms = getGlossaryForLanguage(glossary, internalCode) + const glossarySection = formatGlossaryForPrompt(glossaryTerms, "informal") + + const fullPrompt = glossarySection + ? `${basePrompt}\n\n---\n\n${glossarySection}` + : basePrompt + + if (glossaryTerms.size > 0) { + console.log( + `[GLOSSARY] Injecting ${glossaryTerms.size} terms for ${internalCode} into prompt` + ) + } + + const { promptId } = await createEphemeralPrompt({ + userId, + languageCode: internalCode, + promptKey: "glossary", + promptText: fullPrompt, + aiProviderId, + aiModelId, + }) + + return promptId +} + +/** + * Start pre-translation jobs for all target languages + * Creates one ephemeral prompt and one job per language + */ +async function startPerLanguagePreTranslation( + context: WorkflowContext +): Promise { + const { allCrowdinCodes, allInternalCodes } = config + const { fileIdsSet, crowdinUserId, glossary, languageJobs } = context + + if (!crowdinUserId) { + throw new Error("Missing crowdinUserId in context") + } + + logSection("Requesting AI Pre-Translation (Per-Language)") + console.log(`Files to translate: ${fileIdsSet.size}`) + console.log(`Target languages: ${allCrowdinCodes.join(", ")}`) + + // Load base 
prompt template + const promptPath = path.join( + process.cwd(), + "src/scripts/i18n/lib/crowdin/pre-translate-prompt.txt" + ) + const basePrompt = fs.readFileSync(promptPath, "utf8") + + // Get AI provider/model settings from the static prompt + const staticPromptInfo = await getPromptInfo( + crowdinUserId, + config.preTranslatePromptId + ) + debugLog( + `Static prompt AI settings: provider=${staticPromptInfo.aiProviderId}, model=${staticPromptInfo.aiModelId}` + ) + + const fileIds = Array.from(fileIdsSet) + + // Process each language: create prompt, start job + for (let i = 0; i < allInternalCodes.length; i++) { + const internalCode = allInternalCodes[i] + const crowdinCode = allCrowdinCodes[i] + + console.log(`\n[${internalCode}] Creating ephemeral prompt...`) + + // Create language-specific prompt with glossary + const ephemeralPromptId = await createLanguagePrompt( + crowdinUserId, + internalCode, + glossary, + basePrompt, + staticPromptInfo.aiProviderId ?? undefined, + staticPromptInfo.aiModelId ?? 
undefined + ) + + console.log(`[${internalCode}] ✓ Created prompt (ID: ${ephemeralPromptId})`) + console.log(`[${internalCode}] Submitting pre-translation job...`) + + // Submit pre-translation for this single language + const response = await postApplyPreTranslation( + fileIds, + [crowdinCode], + ephemeralPromptId + ) + + console.log(`[${internalCode}] ✓ Job created (ID: ${response.identifier})`) + + // Track job info for polling and cleanup + languageJobs.push({ + internalCode, + crowdinCode, + ephemeralPromptId, + preTranslationId: response.identifier, + }) + } + + // Log all job IDs for potential manual resume (comma-separated for easy copy-paste) + const allJobIds = languageJobs.map((j) => j.preTranslationId).join(",") + logSection("Pre-Translation Jobs Summary") + console.log(`Created ${languageJobs.length} pre-translation jobs:`) + for (const job of languageJobs) { + console.log(` ${job.internalCode}: ${job.preTranslationId}`) + } + console.log(`\n📋 Copy for resume: ${allJobIds}`) + + // Exit early if skipAwait is set or if full translation mode (no targetPath) + if (config.skipAwait || !config.targetPath) { + const reason = config.skipAwait + ? 
"skip_await option enabled" + : "full translation job" + logSection(`Exiting for Manual Resume (${reason})`) + console.log(`\nTo resume, use PRETRANSLATION_ID:`) + console.log(` ${allJobIds}`) + console.log(`\nCheck progress: https://crowdin.com/project/ethereum-org`) + process.exit(0) + } + + // Wait for all jobs to complete in parallel with continue-on-error + logSection("Waiting for Pre-Translation Completion") + + const results = await Promise.all( + languageJobs.map(async (job) => { + console.log(`[${job.internalCode}] Waiting for completion...`) + try { + const completed = await awaitPreTranslationCompleted( + job.preTranslationId + ) + if (completed.status !== "finished") { + throw new Error(`Unexpected status: ${completed.status}`) + } + console.log(`[${job.internalCode}] ✓ Completed!`) + return { success: true as const, job, response: completed } + } catch (err) { + console.error( + `[${job.internalCode}] ✗ Failed:`, + err instanceof Error ? err.message : err + ) + return { success: false as const, job, error: err } + } + }) + ) + + const successes = results.filter((r) => r.success) + const failures = results.filter((r) => !r.success) + + if (failures.length > 0) { + console.warn( + `\n[WARN] ${failures.length}/${languageJobs.length} jobs failed:` + ) + for (const f of failures) { + console.warn(` - ${f.job.internalCode}: ${f.job.preTranslationId}`) + } + } + + if (successes.length === 0) { + throw new Error("All pre-translation jobs failed") + } + + console.log( + `\n✓ ${successes.length}/${languageJobs.length} pre-translation jobs completed!` + ) + return successes.map((s) => s.response) +} + +/** + * Resume multiple pre-translation jobs in parallel with continue-on-error + */ +async function resumeMultiplePreTranslations( + preTranslationIds: string[] +): Promise { + logSection(`Resuming ${preTranslationIds.length} Pre-Translation Jobs`) + console.log(`IDs: ${preTranslationIds.join(", ")}`) + + const results = await Promise.all( + 
preTranslationIds.map(async (id) => { + try { + const response = await resumePreTranslation(id) + return { success: true as const, id, response } + } catch (err) { + console.error( + `[ERROR] Job ${id} failed:`, + err instanceof Error ? err.message : err + ) + return { success: false as const, id, error: err } + } + }) + ) + + // Separate successes and failures + const successes = results.filter((r) => r.success) + const failures = results.filter((r) => !r.success) + + if (failures.length > 0) { + console.warn( + `\n[WARN] ${failures.length}/${preTranslationIds.length} jobs failed:` + ) + for (const f of failures) { + console.warn(` - ${f.id}`) + } + } + + if (successes.length === 0) { + throw new Error("All pre-translation jobs failed") + } + + console.log( + `\n✓ ${successes.length}/${preTranslationIds.length} jobs completed successfully` + ) + return successes.map((s) => s.response) +} + +/** + * Handle pre-translation: resume existing or start new per-language jobs + */ +export async function handlePreTranslation( + context: WorkflowContext +): Promise { + const { existingPreTranslationIds, verbose } = config + const { fileIdsSet, processedFileIdToPath, crowdinProjectFiles } = context + + // Resume existing jobs or start new per-language jobs + let responses: CrowdinPreTranslateResponse[] + let fileIds: number[] + + if (existingPreTranslationIds.length > 0) { + // Resume mode: one or more existing jobs + responses = await resumeMultiplePreTranslations(existingPreTranslationIds) + // Collect all fileIds from all responses + fileIds = [...new Set(responses.flatMap((r) => r.attributes.fileIds))] + } else { + // New mode: per-language jobs + responses = await startPerLanguagePreTranslation(context) + // All jobs translate the same files, so just use the first response's fileIds + fileIds = responses[0]?.attributes.fileIds ?? 
Array.from(fileIdsSet) + } + + // Build mapping for commit phase + const fileIdToPathMapping: Record = {} + + for (const fid of fileIds) { + if (processedFileIdToPath[fid]) { + fileIdToPathMapping[fid] = processedFileIdToPath[fid] + } else { + const existing = crowdinProjectFiles.find((f) => f.id === fid) + if (existing) fileIdToPathMapping[fid] = existing.path + } + if (!fileIdToPathMapping[fid] && verbose) { + console.warn(`[WARN] Missing path mapping for fileId=${fid}`) + } + } + + return { + responses, + fileIdToPathMapping, + fileIds, + } +} diff --git a/src/scripts/i18n/lib/workflows/sanitization.ts b/src/scripts/i18n/lib/workflows/sanitization.ts new file mode 100644 index 00000000000..17f59c90ff9 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/sanitization.ts @@ -0,0 +1,67 @@ +// Post-import sanitization workflow phase + +import { runSanitizer } from "../../post_import_sanitize" +import { batchCommitFiles, BatchFile } from "../github/commits" + +import type { CommittedFile } from "./types" +import { debugLog, logSection } from "./utils" + +export interface SanitizationResult { + /** Files that were modified by the sanitizer */ + changedFiles: CommittedFile[] + /** Total files processed */ + totalProcessed: number +} + +/** + * Run post-import sanitizer on committed files. + * Updates committedFiles in-place with sanitized content. 
+ */ +export async function runPostImportSanitization( + committedFiles: CommittedFile[], + branch: string +): Promise { + logSection("Running Post-Import Sanitizer") + + console.log(`[SANITIZE] Processing ${committedFiles.length} committed files`) + + const sanitizeResult = runSanitizer(committedFiles) + const changedFiles = sanitizeResult.changedFiles || [] + + if (changedFiles.length) { + console.log(`Sanitizer modified ${changedFiles.length} files`) + + const filesToCommit: BatchFile[] = [] + + for (const file of changedFiles) { + const relPath = file.path + const buf = Buffer.from(file.content, "utf8") + filesToCommit.push({ path: relPath, content: buf }) + debugLog(`Will commit sanitized file: ${relPath}`) + + // Update committedFiles with sanitized content for validation + const existingFile = committedFiles.find((f) => f.path === relPath) + if (existingFile) { + existingFile.content = file.content + } + } + + try { + await batchCommitFiles( + filesToCommit, + branch, + `i18n: post-import sanitization` + ) + console.log(`✓ Committed ${changedFiles.length} sanitized files`) + } catch (e) { + console.warn(`Failed to commit sanitized files:`, e) + } + } else { + console.log("No sanitization changes needed") + } + + return { + changedFiles, + totalProcessed: committedFiles.length, + } +} diff --git a/src/scripts/i18n/lib/workflows/translation-download.ts b/src/scripts/i18n/lib/workflows/translation-download.ts new file mode 100644 index 00000000000..db9f8047b7c --- /dev/null +++ b/src/scripts/i18n/lib/workflows/translation-download.ts @@ -0,0 +1,133 @@ +// Translation download workflow phase + +import { config } from "../../config" +import { getBuiltFile, postBuildProjectFileTranslation } from "../crowdin/build" +import { postCreateBranchFrom } from "../github/branches" +import { + batchCommitFiles, + BatchFile, + getDestinationFromPath, +} from "../github/commits" +import { mapCrowdinCodeToInternal } from "../utils/mapping" + +import type { + CommittedFile, + 
LanguagePair, + PreTranslationResult, + TranslationDownloadResult, + WorkflowContext, +} from "./types" +import { debugLog, logSection, logSubsection } from "./utils" + +/** + * Build language pair mappings from Crowdin IDs to internal codes + */ +export function buildLanguageMappings(languageIds: string[]): LanguagePair[] { + return languageIds.map((crowdinId) => ({ + crowdinId, + internalLanguageCode: mapCrowdinCodeToInternal(crowdinId), + })) +} + +/** + * Download translations from Crowdin and commit to GitHub branch + */ +export async function downloadAndCommitTranslations( + preTranslateResult: PreTranslationResult, + context: WorkflowContext +): Promise { + const { englishBuffers } = context + const { responses, fileIdToPathMapping, fileIds } = preTranslateResult + + // Collect all language IDs from all responses (each response has one language) + const languageIds = responses.flatMap((r) => r.attributes.languageIds) + + // Build language pair mappings + const languagePairs = buildLanguageMappings(languageIds) + + logSection("Creating Translation PR") + + // Create GitHub branch (use language code as suffix for single-language PRs) + const branchSuffix = + languagePairs.length === 1 + ? 
languagePairs[0].internalLanguageCode + : "crowdin-translations" + const { branch } = await postCreateBranchFrom(config.baseBranch, branchSuffix) + console.log(`✓ Created branch: ${branch}`) + + // Track all committed files with their content for sanitizer/validation + const committedFiles: CommittedFile[] = [] + + // For each language, download and commit translations + for (const { crowdinId, internalLanguageCode } of languagePairs) { + logSubsection( + `Building translations for ${crowdinId} (${internalLanguageCode})` + ) + + // Collect files for batch commit + const filesToCommit: BatchFile[] = [] + + // Build and download each file + for (const fileId of fileIds) { + const crowdinPath = fileIdToPathMapping[fileId] + + debugLog(`Processing fileId: ${fileId} (${crowdinPath})`) + + // 1- Build translation + const { url: downloadUrl } = await postBuildProjectFileTranslation( + fileId, + crowdinId, + config.projectId + ) + + // 2- Download + const { buffer } = await getBuiltFile(downloadUrl) + debugLog(`Downloaded ${buffer.length} bytes`) + + // Check if translation differs from English + const originalEnglish = englishBuffers[fileId] + if (originalEnglish && originalEnglish.compare(buffer) === 0) { + debugLog( + `Skipping commit - content identical to English (no translation)` + ) + continue + } + + // 3- Get destination path and collect for batch commit + const destinationPath = getDestinationFromPath( + crowdinPath, + internalLanguageCode + ) + debugLog(`Will commit to: ${destinationPath}`) + + filesToCommit.push({ path: destinationPath, content: buffer }) + + // Track this file's path and content for sanitizer/validation + committedFiles.push({ + path: destinationPath, + content: buffer.toString("utf8"), + }) + } + + // Batch commit all files for this language + if (filesToCommit.length > 0) { + await batchCommitFiles( + filesToCommit, + branch, + `i18n(${internalLanguageCode}): Crowdin translations` + ) + console.log( + `✓ Committed ${filesToCommit.length} 
translations for ${internalLanguageCode}` + ) + } else { + console.log(`No new translations for ${internalLanguageCode}`) + } + } + + return { + branch, + committedFiles, + languagePairs, + fileIdToPathMapping, + } +} diff --git a/src/scripts/i18n/lib/workflows/types.ts b/src/scripts/i18n/lib/workflows/types.ts new file mode 100644 index 00000000000..8620c268696 --- /dev/null +++ b/src/scripts/i18n/lib/workflows/types.ts @@ -0,0 +1,98 @@ +// Types for i18n workflow phases + +import type { GlossaryByLanguage } from "../supabase" +import type { CrowdinFileData, CrowdinPreTranslateResponse } from "../types" + +/** + * Per-language job tracking data + */ +export interface LanguageJobInfo { + /** Internal language code (e.g., "es", "zh") */ + internalCode: string + /** Crowdin language code (e.g., "es-EM", "zh-CN") */ + crowdinCode: string + /** Ephemeral prompt ID created for this language */ + ephemeralPromptId: number + /** Pre-translation job ID */ + preTranslationId: string +} + +/** + * Shared context passed between workflow phases + */ +export interface WorkflowContext { + crowdinProjectFiles: CrowdinFileData[] + fileIdsSet: Set + processedFileIdToPath: Record + englishBuffers: Record + glossary: GlossaryByLanguage + /** Per-language job info (populated during pre-translation phase) */ + languageJobs: LanguageJobInfo[] + /** Crowdin user ID (needed for ephemeral prompt cleanup) */ + crowdinUserId?: number +} + +/** + * Result of file preparation phase + */ +export interface FilePreparationResult { + fileIdsSet: Set + processedFileIdToPath: Record + englishBuffers: Record +} + +/** + * File committed to GitHub branch + */ +export interface CommittedFile { + path: string + content: string +} + +/** + * Language pair mapping + */ +export interface LanguagePair { + crowdinId: string + internalLanguageCode: string +} + +/** + * Result of translation download phase + */ +export interface TranslationDownloadResult { + branch: string + committedFiles: CommittedFile[] + 
languagePairs: LanguagePair[] + fileIdToPathMapping: Record +} + +/** + * Pull request data + */ +export interface PullRequest { + html_url: string + number: number +} + +/** + * Pre-translation job result (supports multiple per-language jobs) + */ +export interface PreTranslationResult { + /** All pre-translation responses (one per language) */ + responses: CrowdinPreTranslateResponse[] + /** File ID to path mapping */ + fileIdToPathMapping: Record + /** File IDs that were translated */ + fileIds: number[] +} + +/** + * Result of processing a single language in split-PR mode + */ +export interface SplitPRResult { + language: string + status: "success" | "failed" + prUrl?: string + error?: string +} diff --git a/src/scripts/i18n/lib/workflows/utils.ts b/src/scripts/i18n/lib/workflows/utils.ts new file mode 100644 index 00000000000..bd81c73819a --- /dev/null +++ b/src/scripts/i18n/lib/workflows/utils.ts @@ -0,0 +1,31 @@ +// Common utilities for i18n workflows + +import { config } from "../../config" + +/** + * Delay execution for specified milliseconds + */ +export const delay = (ms: number): Promise => + new Promise((resolve) => setTimeout(resolve, ms)) + +/** + * Log debug message (only when verbose mode is enabled) + */ +export function debugLog(message: string): void { + if (!config.verbose) return + console.log(`[DEBUG] ${message}`) +} + +/** + * Log a section header with consistent formatting + */ +export function logSection(title: string): void { + console.log(`\n========== ${title} ==========`) +} + +/** + * Log a subsection with lighter formatting + */ +export function logSubsection(title: string): void { + console.log(`\n--- ${title} ---`) +} diff --git a/src/scripts/i18n/lib/workflows/validation.ts b/src/scripts/i18n/lib/workflows/validation.ts new file mode 100644 index 00000000000..17241baf6ab --- /dev/null +++ b/src/scripts/i18n/lib/workflows/validation.ts @@ -0,0 +1,133 @@ +// Syntax tree validation workflow phase + +import { postPullRequestComment } 
from "../github/pull-requests" +import { + formatValidationComment, + validateJsonStructure, + validateJsxAttributes, + validateMarkdownStructure, +} from "../validation/syntax-tree" + +import type { CommittedFile, PullRequest } from "./types" +import { debugLog, logSection } from "./utils" + +/** Default threshold for JSX attribute untranslated percentage */ +const DEFAULT_JSX_THRESHOLD = 5 + +/** + * Run syntax tree validation and post comment if issues found + */ +export async function runSyntaxValidation( + pr: PullRequest, + committedFiles: CommittedFile[], + englishBuffers: Record, + fileIdToPathMapping: Record +): Promise { + logSection("Running Syntax Tree Validation") + + const validationResults: Parameters[0] = [] + + for (const file of committedFiles) { + const isJson = file.path.toLowerCase().endsWith(".json") + const isMarkdown = file.path.toLowerCase().endsWith(".md") + + if (!isJson && !isMarkdown) continue + + // Find the corresponding English file + let englishContent: string | null = null + + // Determine the English source path + if (isJson) { + // Extract the file name from the destination path + const match = file.path.match(/src\/intl\/[^/]+\/(.+)$/) + if (match) { + const fileName = match[1] + // Find the English buffer from our tracked files + for (const [fileId, buffer] of Object.entries(englishBuffers)) { + const crowdinPath = fileIdToPathMapping[Number(fileId)] + if (crowdinPath && crowdinPath.includes(fileName)) { + englishContent = buffer.toString("utf8") + break + } + } + } + } else if (isMarkdown) { + // Extract the relative path from translations + const match = file.path.match( + /public\/content\/translations\/[^/]+\/(.+)$/ + ) + if (match) { + const relPath = match[1] + // Find the English buffer + for (const [fileId, buffer] of Object.entries(englishBuffers)) { + const crowdinPath = fileIdToPathMapping[Number(fileId)] + if (crowdinPath && crowdinPath.includes(relPath)) { + englishContent = buffer.toString("utf8") + break + } + } 
+ } + } + + if (!englishContent) { + debugLog(`Could not find English source for ${file.path}`) + continue + } + + // Validate structure + if (isJson) { + const result = validateJsonStructure(englishContent, file.content) + validationResults.push({ + path: file.path, + type: "json", + result, + }) + if (!result.isValid) { + debugLog(`JSON validation failed for ${file.path}`) + } + } else if (isMarkdown) { + const result = validateMarkdownStructure(englishContent, file.content) + validationResults.push({ + path: file.path, + type: "markdown", + result, + }) + if (!result.isValid) { + debugLog(`Markdown validation failed for ${file.path}`) + } + + // Also validate JSX attributes for markdown files (compare against English) + const jsxThreshold = + Number(process.env.JSX_UNTRANSLATED_THRESHOLD) || DEFAULT_JSX_THRESHOLD + const jsxResult = validateJsxAttributes( + englishContent, + file.content, + jsxThreshold + ) + if (!jsxResult.isValid) { + validationResults.push({ + path: file.path, + type: "jsx-attributes", + result: jsxResult, + }) + debugLog( + `JSX attribute validation flagged ${file.path}: ${jsxResult.untranslatedPercentage.toFixed(1)}% untranslated` + ) + } + } + } + + // Post validation comment if there are issues + const validationComment = formatValidationComment(validationResults) + if (validationComment) { + console.log(`\n⚠️ Syntax validation issues found, posting comment...`) + try { + await postPullRequestComment(pr.number, validationComment) + console.log(`✓ Posted validation comment to PR`) + } catch (e) { + console.warn(`Failed to post validation comment:`, e) + } + } else { + console.log(`✓ All files passed syntax tree validation`) + } +} diff --git a/src/scripts/i18n/main.ts b/src/scripts/i18n/main.ts index 27cfab83e25..08e9eb3daac 100644 --- a/src/scripts/i18n/main.ts +++ b/src/scripts/i18n/main.ts @@ -1,1349 +1,222 @@ -import dotenv from "dotenv" - -import i18nConfig from "../../../i18n.config.json" - -import type { - BranchDetailsResponse, - 
BranchObject, - BuildProjectFileTranslationResponse, - ContentType, - CrowdinAddFileResponse, - CrowdinFileData, - CrowdinPreTranslateResponse, - GitHubCrowdinFileMetadata, - GitHubQueryResponseItem, -} from "./types" - -dotenv.config({ path: ".env.local" }) - -const crowdinToInternalCodeMapping: Record = i18nConfig.reduce( - (acc, { crowdinCode, code }) => { - acc[crowdinCode] = code - return acc - }, - {} as Record -) - -const gitHubApiKey = process.env.I18N_GITHUB_API_KEY || "" -if (!gitHubApiKey) { - console.error("[ERROR] Missing I18N_GITHUB_API_KEY environment variable") - console.error( - "[ERROR] Please set I18N_GITHUB_API_KEY in your .env.local file" - ) - throw new Error("No GitHub API Key found (I18N_GITHUB_API_KEY)") -} -console.log("[DEBUG] GitHub API key found ✓") -const gitHubBearerHeaders = { - Authorization: `Bearer ${gitHubApiKey}`, - Accept: "application/vnd.github.v3+json", -} - -const crowdinApiKey = process.env.I18N_CROWDIN_API_KEY || "" -if (!crowdinApiKey) { - console.error("[ERROR] Missing I18N_CROWDIN_API_KEY environment variable") - console.error( - "[ERROR] Please set I18N_CROWDIN_API_KEY in your .env.local file" - ) - throw new Error("No Crowdin API Key found (I18N_CROWDIN_API_KEY)") -} -console.log("[DEBUG] Crowdin API key found ✓") -const crowdinBearerHeaders = { Authorization: `Bearer ${crowdinApiKey}` } - -// Parse environment variables with defaults -const targetLanguages = process.env.TARGET_LANGUAGES - ? process.env.TARGET_LANGUAGES.split(",").map((lang) => lang.trim()) - : ["es-EM"] - -const baseBranch = process.env.BASE_BRANCH || "dev" - -const fileLimit = process.env.FILE_LIMIT - ? 
parseInt(process.env.FILE_LIMIT, 10) - : 100 - -// Parse GitHub repository from env (format: "owner/repo") -const githubRepo = - process.env.GITHUB_REPOSITORY || "ethereum/ethereum-org-website" -const [ghOrganization, ghRepo] = githubRepo.split("/") - -console.log("[DEBUG] Configuration:") -console.log(`[DEBUG] - Target languages: ${targetLanguages.join(", ")}`) -console.log(`[DEBUG] - Base branch: ${baseBranch}`) -console.log(`[DEBUG] - File limit: ${fileLimit}`) -console.log(`[DEBUG] - GitHub repo: ${ghOrganization}/${ghRepo}`) - -const env = { - projectId: 834930, - ghOrganization, - ghRepo, - jsonRoot: "src/intl/en", - mdRoot: "public/content", - preTranslatePromptId: 168584, - allCrowdinCodes: targetLanguages, - baseBranch, -} - -// --- Utilities: resilient fetch for GitHub calls --- -const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)) - -type RetryOptions = { - retries?: number - timeoutMs?: number - backoffMs?: number - retryOnStatuses?: number[] -} - -const fetchWithRetry = async ( - url: string, - init?: RequestInit, - options?: RetryOptions -) => { - const retries = options?.retries ?? 3 - const timeoutMs = options?.timeoutMs ?? 30000 - const backoffMs = options?.backoffMs ?? 1000 - const retryOnStatuses = options?.retryOnStatuses ?? [ - 408, 429, 500, 502, 503, 504, - ] - - for (let attempt = 0; attempt <= retries; attempt++) { - const controller = new AbortController() - const id = setTimeout(() => controller.abort(), timeoutMs) - try { - const res = await fetch(url, { - ...(init || {}), - signal: controller.signal, - }) - clearTimeout(id) - if ( - !res.ok && - retryOnStatuses.includes(res.status) && - attempt < retries - ) { - const wait = backoffMs * Math.pow(2, attempt) - console.warn( - `[RETRY] ${url} -> ${res.status}. Attempt ${attempt + 1}/${retries}. 
Waiting ${wait}ms.` - ) - await delay(wait) - continue - } - return res - } catch (err: unknown) { - clearTimeout(id) - const errObj = err as { name?: string; code?: string } - const isAbort = errObj?.name === "AbortError" - const isConnectTimeout = errObj?.code === "UND_ERR_CONNECT_TIMEOUT" - if ((isAbort || isConnectTimeout) && attempt < retries) { - const wait = backoffMs * Math.pow(2, attempt) - console.warn( - `[RETRY] ${url} -> ${isAbort ? "AbortError" : errObj?.code}. Attempt ${ - attempt + 1 - }/${retries}. Waiting ${wait}ms.` - ) - await delay(wait) - continue - } - throw err - } - } - // Unreachable, but TS wants a return - throw new Error("fetchWithRetry: exhausted retries") -} +import { deleteEphemeralPrompt } from "./lib/crowdin/ephemeral-prompts" +import { prepareEnglishFiles } from "./lib/workflows/file-preparation" +import { initializeWorkflow } from "./lib/workflows/initialize" +import { runJsxTranslation } from "./lib/workflows/jsx-translation" +import { createTranslationPR } from "./lib/workflows/pr-creation" +import { handlePreTranslation } from "./lib/workflows/pre-translation" +import { runPostImportSanitization } from "./lib/workflows/sanitization" +import { + buildLanguageMappings, + downloadAndCommitTranslations, +} from "./lib/workflows/translation-download" +import type { PreTranslationResult, SplitPRResult } from "./lib/workflows/types" +import { logSection } from "./lib/workflows/utils" +import { runSyntaxValidation } from "./lib/workflows/validation" +import { config } from "./config" /** - * Get all files, using perPage to limit amount fetched + * Main orchestration function */ -const getAllEnglishFiles = async ( - perPage = 100 -): Promise => { - const ghSearchEndpointBase = "https://api.github.com/search/code" - const query = `repo:${env.ghOrganization}/${env.ghRepo} extension:md path:"${env.mdRoot}" -path:"${env.mdRoot}/translations" OR repo:${env.ghOrganization}/${env.ghRepo} extension:json path:"${env.jsonRoot}"` - - const url = 
new URL(ghSearchEndpointBase) - url.searchParams.set("q", query) - url.searchParams.set("per_page", perPage.toString()) - url.searchParams.set("page", "1") - - console.log(`[DEBUG] GitHub search query: ${query}`) - console.log(`[DEBUG] GitHub search URL: ${url.toString()}`) - - try { - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - console.warn(`[ERROR] GitHub API response not OK: ${res.status}`) - const body = await res.text().catch(() => "") - console.error(`[ERROR] Response body:`, body) - throw new Error(`GitHub getAllEnglishFiles (${res.status}): ${body}`) - } - - type JsonResponse = { items: GitHubQueryResponseItem[] } - const json: JsonResponse = await res.json() - - console.log(`[DEBUG] Found ${json.items.length} files from GitHub`) - console.log(`[DEBUG] First GitHub file:`, json.items[0]) - return json.items - } catch (error) { - console.error(`[ERROR] Failed to get English files from GitHub:`, error) - process.exit(1) - } -} - -const getFileMetadata = async ( - items: GitHubQueryResponseItem[] -): Promise => { - if (!items.length) return [] - - const owner = items[0].repository.owner.login - const repo = items[0].repository.name - - const englishFileMetadata = items.map((item) => { - // https://raw.githubusercontent.com/:owner/:repo/:ref/:path - const download_url = `https://raw.githubusercontent.com/${owner}/${repo}/${env.baseBranch}/${item.path}` - const filePath = item.path - const filePathSplit = filePath.split("/") - const fileName = filePathSplit[filePathSplit.length - 1] - const contentType: ContentType = fileName?.endsWith(".json") - ? 
"application/json" - : "text/markdown" +async function main() { + const { existingPreTranslationIds } = config - return { - "Crowdin-API-FileName": fileName, - filePath: filePath, - download_url: download_url, - "Content-Type": contentType, - } - }) - return englishFileMetadata -} - -const getCrowdinProjectFiles = async (): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/files` - ) - url.searchParams.set("limit", "500") - - console.log(`[DEBUG] Fetching Crowdin project files from: ${url.toString()}`) - - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - - if (!res.ok) { - console.warn(`[ERROR] Crowdin API response not OK: ${res.status}`) - const body = await res.text().catch(() => "") - console.error(`[ERROR] Response body:`, body) - throw new Error( - `Crowdin getCrowdinProjectFiles failed (${res.status}): ${body}` - ) - } - - type JsonResponse = { data: { data: CrowdinFileData }[] } - const json: JsonResponse = await res.json() - - const mappedData = json.data.map(({ data }) => data) + // Phase 1: Initialize workflow + const context = await initializeWorkflow() - console.log( - `[DEBUG] Successfully fetched ${mappedData.length} Crowdin files` - ) - console.log(`[DEBUG] First Crowdin file:`, mappedData[0]) - return mappedData - } catch (error) { - console.error(`[ERROR] Failed to fetch Crowdin project files:`, error) - process.exit(1) + // Phase 2: Prepare English files (skip if resuming existing jobs) + if (existingPreTranslationIds.length === 0) { + await prepareEnglishFiles(context) } -} -const findCrowdinFile = ( - targetFile: GitHubCrowdinFileMetadata, - crowdinFiles: CrowdinFileData[] -): CrowdinFileData => { - console.log( - `[DEBUG] Looking for Crowdin file matching: ${targetFile.filePath}` - ) - console.log(`[DEBUG] Target file name: ${targetFile["Crowdin-API-FileName"]}`) + // Phase 3: Handle pre-translation (resume or start new) + const preTranslateResult = await 
handlePreTranslation(context) - // Log first few Crowdin files for comparison - console.log(`[DEBUG] Total Crowdin files found: ${crowdinFiles.length}`) - console.log( - `[DEBUG] First 3 Crowdin file paths:`, - crowdinFiles.slice(0, 3).map((f) => f.path) + // Check if PR creation should be skipped + const skipPrCreation = ["1", "true", "yes", "on"].includes( + (process.env.SKIP_PR_CREATION || "").toLowerCase() ) - const found = crowdinFiles.find(({ path }) => - path.endsWith(targetFile.filePath) - ) + // Split PR mode: create one PR per language + if (config.splitPrs) { + const results: SplitPRResult[] = [] - if (!found) { - console.error( - `[ERROR] No matching Crowdin project file found for: ${targetFile.filePath}` - ) - console.error( - `[ERROR] Available Crowdin file paths:`, - crowdinFiles.map((f) => f.path) - ) - throw new Error( - `No matching Crowdin project file found for: ${targetFile.filePath}` - ) - } - - console.log( - `[DEBUG] Successfully matched with Crowdin file: ${found.path} (ID: ${found.id})` - ) - return found -} + for (const response of preTranslateResult.responses) { + const langId = response.attributes.languageIds[0] + const langCode = buildLanguageMappings([langId])[0].internalLanguageCode -/** - * Unhides all hidden strings in a Crowdin file. - * Hidden strings (often marked as duplicates) cannot be translated. - * This function makes them visible so they can be processed by pre-translation. 
- */ -const unhideStringsInFile = async (fileId: number): Promise => { - console.log(`[UNHIDE] Checking for hidden strings in fileId=${fileId}`) + logSection(`Processing Language: ${langCode}`) - // Get all strings from the file - const listUrl = `https://api.crowdin.com/api/v2/projects/${env.projectId}/strings?fileId=${fileId}&limit=500` + // Create single-response PreTranslationResult for this language + const singleLangResult: PreTranslationResult = { + responses: [response], + fileIdToPathMapping: preTranslateResult.fileIdToPathMapping, + fileIds: preTranslateResult.fileIds, + } - try { - const listRes = await fetch(listUrl, { headers: crowdinBearerHeaders }) - if (!listRes.ok) { - const text = await listRes.text().catch(() => "") - console.warn( - `[UNHIDE] Failed to list strings for fileId=${fileId}: ${text}` - ) - return 0 - } + try { + // Phase 4: Download and commit translations + const translationResult = await downloadAndCommitTranslations( + singleLangResult, + context + ) - const listJson = await listRes.json() - const strings = listJson.data || [] + // Phase 5: Translate JSX attributes via Gemini + const jsxTranslationResult = await runJsxTranslation( + translationResult.committedFiles, + translationResult.languagePairs, + translationResult.branch, + context.glossary + ) - let unhiddenCount = 0 + // Phase 6: Run post-import sanitizer + const sanitizeResult = await runPostImportSanitization( + translationResult.committedFiles, + translationResult.branch + ) - for (const item of strings) { - const stringId = item.data.id - const isHidden = item.data.isHidden + if (skipPrCreation) { + console.log( + `[${langCode}] Branch created: ${translationResult.branch}` + ) + results.push({ language: langCode, status: "success" }) + continue + } - if (!isHidden) continue + // Phase 7: Create PR + const pr = await createTranslationPR( + translationResult.branch, + translationResult.committedFiles, + sanitizeResult.changedFiles, + translationResult.languagePairs, + { 
geminiSkipped: jsxTranslationResult.geminiSkipped } + ) - // Unhide the string using PATCH - const patchUrl = `https://api.crowdin.com/api/v2/projects/${env.projectId}/strings/${stringId}` + // Phase 8: Run syntax tree validation + await runSyntaxValidation( + pr, + translationResult.committedFiles, + context.englishBuffers, + translationResult.fileIdToPathMapping + ) - try { - const patchRes = await fetch(patchUrl, { - method: "PATCH", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify([ - { - op: "replace", - path: "/isHidden", - value: false, - }, - ]), + console.log(`[${langCode}] ✓ PR created: ${pr.html_url}`) + results.push({ + language: langCode, + status: "success", + prUrl: pr.html_url, }) - - if (patchRes.ok) { - unhiddenCount++ - } else { - const text = await patchRes.text().catch(() => "") - console.warn(`[UNHIDE] Failed to unhide string ${stringId}: ${text}`) - } } catch (err) { - console.warn(`[UNHIDE] Error unhiding string ${stringId}:`, err) + const errorMsg = err instanceof Error ? err.message : String(err) + console.error(`[${langCode}] ✗ Failed: ${errorMsg}`) + results.push({ language: langCode, status: "failed", error: errorMsg }) } } - if (unhiddenCount > 0) { - console.log( - `[UNHIDE] ✓ Unhidden ${unhiddenCount} strings in fileId=${fileId}` - ) - } else { - console.log(`[UNHIDE] No hidden strings found in fileId=${fileId}`) - } + // Print summary + logSection("SPLIT PR SUMMARY") + const successes = results.filter((r) => r.status === "success") + const failures = results.filter((r) => r.status === "failed") - return unhiddenCount - } catch (error) { - console.error(`[UNHIDE] Error processing fileId=${fileId}:`, error) - return 0 - } -} - -/** - * Lists all Crowdin directories in the project. 
- */ -const getCrowdinProjectDirectories = async (): Promise< - { id: number; name: string; directoryId?: number }[] -> => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/directories` - ) - url.searchParams.set("limit", "500") - - console.log(`[DEBUG] Fetching Crowdin directories: ${url.toString()}`) - - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error( - `Crowdin getCrowdinProjectDirectories failed (${res.status}): ${body}` - ) - } - type DirJson = { - data: { data: { id: number; name: string; directoryId?: number } }[] + console.log(`Created: ${successes.length}/${results.length}`) + if (successes.length > 0) { + console.log(`\nSuccessful:`) + for (const r of successes) { + console.log(` ${r.language}: ${r.prUrl ?? "(branch only)"}`) + } } - const json: DirJson = await res.json() - const dirs = json.data.map(({ data }) => data) - console.log(`[DEBUG] Loaded ${dirs.length} directories`) - return dirs - } catch (error) { - console.error("[ERROR] getCrowdinProjectDirectories:", error) - throw error - } -} - -/** - * Creates a single Crowdin directory (one segment). Parent may be undefined for root. - */ -const postCrowdinDirectory = async ( - name: string, - parentDirectoryId?: number -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/directories` - ) - - const body: Record = { name } - if (parentDirectoryId) body.directoryId = parentDirectoryId - - console.log( - `[DEBUG] Creating directory segment "${name}" parent=${parentDirectoryId ?? 
"ROOT"}` - ) - - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify(body), - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - // 409 = already exists race condition - throw new Error( - `Crowdin postCrowdinDirectory failed (${res.status}): ${text}` - ) + if (failures.length > 0) { + console.log(`\nFailed:`) + for (const r of failures) { + console.log(` ${r.language}: ${r.error}`) + } } - type JsonResponse = { data: { id: number } } - const json: JsonResponse = await res.json() - console.log(`[DEBUG] Created directory id=${json.data.id} name="${name}"`) - return json.data.id - } catch (error) { - console.error("[ERROR] postCrowdinDirectory:", error) - throw error - } -} - -/** - * Ensures a nested path of directories exists. - * Example path: "public/content/community/events/organizing" - * Returns the final (deepest) directory id. - * - * - Splits path on "/" ignoring empty segments. - * - Reuses existing segments (matched by name + parent). - * - Creates missing segments sequentially. 
- */ -const createCrowdinDirectory = async (fullPath: string): Promise => { - if (!fullPath || typeof fullPath !== "string") { - throw new Error("createCrowdinDirectory: path must be a non-empty string") - } - console.log(`[DEBUG] Ensuring Crowdin directory path: "${fullPath}"`) - - const segments = fullPath - .split("/") - .map((s) => s.trim()) - .filter(Boolean) - if (!segments.length) throw new Error("No valid path segments") - - const invalidChars = /[\\:*?"<>|]/ // Disallowed per Crowdin docs for directory name (exclude forward slash which is path separator) - for (const segment of segments) { - if (invalidChars.test(segment)) { - throw new Error( - `createCrowdinDirectory: segment "${segment}" contains invalid characters in path "${fullPath}"` - ) + if (successes.length === 0) { + throw new Error("All language PRs failed") } - } - - // Load existing directories once - const existing = await getCrowdinProjectDirectories() + } else { + // Single PR mode (default): all languages in one PR + // Phase 4: Download and commit translations + const translationResult = await downloadAndCommitTranslations( + preTranslateResult, + context + ) - // Build quick lookup: parentId|name -> id (root parentId = 0 sentinel) - const key = (parentId: number | undefined, name: string) => - `${parentId || 0}|${name}` + // Phase 5: Translate JSX attributes via Gemini (before sanitizer) + const jsxTranslationResult = await runJsxTranslation( + translationResult.committedFiles, + translationResult.languagePairs, + translationResult.branch, + context.glossary + ) - const directoryIndex = new Map() - for (const dir of existing) { - directoryIndex.set(key(dir.directoryId, dir.name), dir.id) - } + // Phase 6: Run post-import sanitizer + const sanitizeResult = await runPostImportSanitization( + translationResult.committedFiles, + translationResult.branch + ) - let currentParentId: number | undefined - for (const segment of segments) { - const k = key(currentParentId, segment) - let dirId = 
directoryIndex.get(k) - if (dirId) { + if (skipPrCreation) { + logSection("Skipping PR Creation") console.log( - `[DEBUG] Reusing existing directory "${segment}" id=${dirId} parent=${currentParentId ?? "ROOT"}` + `Files have been committed to branch: ${translationResult.branch}. No PR will be opened.` ) - currentParentId = dirId - continue - } - // Create - dirId = await postCrowdinDirectory(segment, currentParentId) - directoryIndex.set(k, dirId) - currentParentId = dirId - } - - if (!currentParentId) - throw new Error("Failed to resolve final directory id (unexpected)") - - console.log( - `[DEBUG] Final directory id for path "${fullPath}" = ${currentParentId}` - ) - return currentParentId -} - -const postCrowdinFile = async ( - storageId: number, - name: string, - dir: string -): Promise => { - const directoryId = await createCrowdinDirectory(dir) - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/files` - ) - - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify({ storageId, name, directoryId }), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`Crowdin postCrowdinFile failed (${res.status}): ${body}`) - } - - type JsonResponse = { data: CrowdinAddFileResponse } - const json: JsonResponse = await res.json() - console.log("Updated file:", json.data) - return json.data - } catch (error) { - console.error(error) - process.exit(1) - } -} - -const downloadGitHubFile = async (download_url: string): Promise => { - try { - // const res = await fetch(download_url, { headers: gitHubBearerHeaders }) - const res = await fetch(download_url) - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error(`Failed to download from GitHub (${res.status}): ${body}`) - } - const arrayBuffer = await 
res.arrayBuffer() - return Buffer.from(arrayBuffer) - } catch (error) { - console.error("downloadGitHubFile error:", error) - throw error - } -} - -const postFileToStorage = async (fileBuffer: Buffer, fileName: string) => { - const url = new URL("https://api.crowdin.com/api/v2/storages") - - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - // Crowdin expects raw bytes for storages endpoint; use octet-stream. - "Content-Type": "application/octet-stream", - "Crowdin-API-FileName": fileName, - }, - body: fileBuffer, - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin postFileToStorage failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: { - id: number - fileName: string - } - } - const json: JsonResponse = await res.json() - console.log("Uploaded storage:", json.data) - return json.data - } catch (error) { - console.error("postFileToStorage error:", error) - throw error - } -} - -const postApplyPreTranslation = async ( - fileIds: number[], - languageIds?: string[] -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/pre-translations` - ) - try { - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - languageIds: languageIds || env.allCrowdinCodes, // ["es-EM"], // TODO: All languages - fileIds, - method: "ai", - aiPromptId: env.preTranslatePromptId, - }), - }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin postApplyPreTranslation failed (${res.status}): ${text}` - ) - } - - type JsonResponse = { - data: CrowdinPreTranslateResponse - } - const json: JsonResponse = await res.json() - - return json.data - } catch (error) { - console.error("postApplyPreTranslation error:", error) - throw error - } -} - -const 
getPreTranslationStatus = async ( - preTranslationId: string -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${env.projectId}/pre-translations/${preTranslationId}` - ) - try { - const res = await fetch(url.toString(), { headers: crowdinBearerHeaders }) - - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error( - `Crowdin getPreTranslationStatus failed (${res.status}): ${text}` + console.log( + `Set SKIP_PR_CREATION=false to enable automatic PR creation in the workflow.` ) + return } - type JsonResponse = { - data: CrowdinPreTranslateResponse - } - const json: JsonResponse = await res.json() - - return json.data - } catch (error) { - console.error("postApplyPreTranslation error:", error) - throw error - } -} - -/** - * Polls Crowdin for the status of a pre-translation job and resolves when it finishes. - * - * This function repeatedly calls `getPreTranslationStatus` for the given - * pre-translation ID until the job is no longer in progress. It polls at a - * fixed interval (10 seconds) and will abort with an error if the operation - * does not complete within the configured timeout (30 minutes). - * - * @param preTranslationId - The identifier of the Crowdin pre-translation job to monitor. - * - * @returns A promise that resolves with the final CrowdinPreTranslateResponse when the - * job status becomes "finished". - * - * @throws {Error} If the wait times out (after 30 minutes). - * @throws {Error} If the pre-translation completes with an unexpected status - * (i.e., any status other than "finished"). - * @throws {Error} If an error is thrown while fetching the pre-translation status - * (errors from `getPreTranslationStatus` are propagated). - * - * @remarks - * - Polling interval: 10,000 ms (10 seconds). - * - Timeout: 30 minutes. 
- * - * @example - * // Wait for a pre-translation to complete - * const result = await awaitPreTranslationCompleted("abc123") - */ -const awaitPreTranslationCompleted = async ( - preTranslationId: string, - options?: { intervalMs?: number; timeoutMs?: number } -): Promise => { - const intervalMs = options?.intervalMs ?? 10_000 - const timeoutMs = options?.timeoutMs ?? 30 /* min */ * 60 * 1000 - - return await new Promise((resolve, reject) => { - const timeout = setTimeout(() => { - reject(new Error("Timed out waiting for pre-translation to finish")) - }, timeoutMs) - - const poll = async () => { - try { - const res = await getPreTranslationStatus(preTranslationId) - if (res.status !== "in_progress") { - clearTimeout(timeout) - if (res.status === "finished") { - resolve(res) - } else { - reject( - new Error( - `Pre-translation ended with unexpected status: ${res.status}` - ) - ) - } - } else { - setTimeout(poll, intervalMs) - } - } catch (err) { - clearTimeout(timeout) - reject(err) - } - } - - void poll() - }) -} - -/** - * Method: POST - * https://support.crowdin.com/developer/api/v2/#tag/Translations/operation/api.projects.translations.builds.directories.post - * @param fileId - * @param targetLanguageId - * @param projectId - * @returns { url: string; expireIn: string; etag: string; } - */ -const postBuildProjectFileTranslation = async ( - fileId: number, - targetLanguageId: string, - projectId = env.projectId -): Promise => { - const url = new URL( - `https://api.crowdin.com/api/v2/projects/${projectId}/translations/builds/files/${fileId}` - ) - - const res = await fetch(url.toString(), { - method: "POST", - headers: { - ...crowdinBearerHeaders, - "Content-Type": "application/json", - Accept: "application/json", - }, - body: JSON.stringify({ targetLanguageId }), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error( - `Crowdin postBuildProjectFileTranslation failed (${res.status}): ${body}` + // 
Phase 7: Create PR + const pr = await createTranslationPR( + translationResult.branch, + translationResult.committedFiles, + sanitizeResult.changedFiles, + translationResult.languagePairs, + { geminiSkipped: jsxTranslationResult.geminiSkipped } ) - } - - type JsonResponse = { data: BuildProjectFileTranslationResponse } - const json: JsonResponse = await res.json() - console.log("Built file:", json.data) - return json.data -} - -/** - * method: GET - * @param downloadUrl - * @returns { buffer: Buffer } - */ -const getBuiltFile = async ( - downloadUrl: string - // ): Promise<{ buffer: Buffer; fileName: string; contentType: string }> => { -): Promise<{ buffer: Buffer }> => { - try { - const res = await fetch(downloadUrl) - - if (!res.ok) { - const body = await res.text().catch(() => "") - throw new Error(`Failed to download built file (${res.status}): ${body}`) - } - - const arrayBuffer = await res.arrayBuffer() - const buffer = Buffer.from(arrayBuffer) - - return { buffer } - } catch (error) { - console.error("getBuiltFile error:", error) - throw error - } -} - -/** - * Retrieves the Git object for a branch from the GitHub API and returns its underlying BranchObject. - * - * Fetches the ref for the given branch name from: - * https://api.github.com/repos/{env.ghOrganization}/{env.ghRepo}/git/ref/heads/{branch} - * using the preconfigured `gitHubBearerHeaders`. - * - * @param branch - The branch name to look up (for example "main" or "dev"). - * @returns A promise that resolves to the BranchObject extracted from the GitHub API response. - * - * @throws {Error} If the HTTP response is not OK (non-2xx). The thrown error includes the HTTP status - * and the response body text (when available). - * @throws {SyntaxError} If the response body cannot be parsed as JSON. - * - * @remarks - * - This function expects `env.ghOrganization`, `env.ghRepo`, and `gitHubBearerHeaders` to be available - * in the enclosing scope and correctly configured. 
- * - The function returns the `.object` property of the BranchDetailsResponse returned by GitHub. - * - Network errors (e.g. connectivity issues) will propagate as rejected promises from `fetch`. - * - * @example - * ```ts - * // resolves to the branch's object (sha, type, url) - * const obj = await getBranchObject("dev"); - * ``` - */ -const getBranchObject = async (branch: string): Promise => { - // https://api.github.com/repos/{{ $('env').item.json.ghOrganization }}/{{ $('env').item.json.ghRepo }}/git/ref/heads/dev - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/git/ref/heads/${branch}` - ) - - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub getBranchObject (${res.status}): ${body}`) - } - - type JsonResponse = BranchDetailsResponse - const json: JsonResponse = await res.json() - // console.log("getBranchDetails results", json) - return json.object -} - -const createBranchName = () => { - const ts = new Date().toISOString().replace(/\..*$/, "").replace(/[:]/g, "-") // e.g., 2025-11-10T04-20-13 - return "i18n/import/" + ts -} - -const getDestinationFromPath = ( - crowdinFilePath: string, // e.g. 
src/intl/en/page-foo.json OR public/content/.../index.md - internalLanguageCode: string -) => { - const normalized = crowdinFilePath.replace(/^\//, "") - const isJson = normalized.toLowerCase().endsWith(".json") - const isMarkdown = normalized.toLowerCase().endsWith(".md") - - let destinationPath = normalized - - if (isJson) { - // JSON: src/intl/en/*.json -> src/intl//*.json - if (normalized.startsWith("src/intl/en/")) { - destinationPath = normalized.replace( - /^src\/intl\/en\//, - `src/intl/${internalLanguageCode}/` - ) - } else if (normalized.startsWith("src/intl/")) { - // Fallback: if for some reason "en" segment is missing, inject lang after src/intl/ - const parts = normalized.split("/") - // parts: [src, intl, ...] - parts.splice(2, 0, internalLanguageCode) - destinationPath = parts.join("/") - } - } else if (isMarkdown) { - // Markdown: public/content//index.md -> public/content/translations///index.md - if (normalized.startsWith("public/content/")) { - const rel = normalized.replace(/^public\/content\//, "") - // If already inside translations/, avoid duplicating; rewrite to current lang - const relParts = rel.split("/").filter(Boolean) - if (relParts[0] === "translations") { - // Drop existing translations// - const rest = relParts.slice(2).join("/") - destinationPath = `public/content/translations/${internalLanguageCode}/${rest}` - } else { - destinationPath = `public/content/translations/${internalLanguageCode}/${rel}` - } - } - } - - console.log( - `[DEBUG] Destination mapping: ${crowdinFilePath} -> ${destinationPath} (lang=${internalLanguageCode})` - ) - return destinationPath -} -/** - * method: PUT - */ -const postCreateBranchFrom = async (ref = env.baseBranch) => { - const { sha } = await getBranchObject(ref) - - const branch = createBranchName() - - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/git/refs` - ) + // Phase 8: Run syntax tree validation + await runSyntaxValidation( + pr, + 
translationResult.committedFiles, + context.englishBuffers, + translationResult.fileIdToPathMapping + ) - try { + // Success! + logSection("SUCCESS") + console.log(`Pull Request: ${pr.html_url}`) console.log( - `[DEBUG] Creating branch from base="${ref}" sha=${sha} -> new branch="${branch}"` + `Languages: ${translationResult.languagePairs.map((p) => p.internalLanguageCode).join(", ")}` ) - const res = await fetchWithRetry(url.toString(), { - method: "POST", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify({ ref: `refs/heads/${branch}`, sha }), - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - console.error( - `[ERROR] Failed to create branch. URL=${url.toString()} status=${res.status}` - ) - throw new Error(`GitHub createBranchFrom (${res.status}): ${body}`) - } - - return { branch, sha } - } catch (error) { - console.error(error) - process.exit(1) - } -} - -const getPathSha = async (path: string, branch: string) => { - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/contents/${path}?ref=${branch}` - ) - - const res = await fetchWithRetry(url.toString(), { - headers: gitHubBearerHeaders, - }) - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub getPathSha (${res.status}): ${body}`) - } - - type JsonResponse = { sha: string } - const { sha }: JsonResponse = await res.json() - - return { sha } -} -const putCommitFile = async ( - buffer: Buffer, - destinationPath: string, - branch: string, - sha?: string, - attempt = 0 -): Promise => { - const url = `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/contents/${destinationPath}` - - try { - // Use the buffer contents as base64-encoded content for the commit - const contentBase64 = buffer.toString("base64") - - const body = { - message: `update(i18n): ${destinationPath}`, - content: 
contentBase64, - branch, - } - - if (sha) body["sha"] = sha - - const res = await fetchWithRetry(url.toString(), { - method: "PUT", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }) - - if (res.status === 422) { - const { sha: fileSha } = await getPathSha(destinationPath, branch) - console.warn( - `[RETRY] 422 Unprocessable for ${destinationPath}. Retrying with existing SHA ${fileSha}` - ) - return await putCommitFile( - buffer, - destinationPath, - branch, - fileSha, - attempt - ) - } - - if (res.status === 409) { - if (attempt >= 5) { - const bodyText = await res.text().catch(() => "") - throw new Error( - `GitHub putCommitFile conflict persists after ${attempt} retries (${res.status}): ${bodyText}` - ) - } - const backoff = 500 * Math.pow(2, attempt) // 500ms, 1s, 2s, 4s, 8s - console.warn( - `[RETRY] 409 Conflict for ${destinationPath}. Attempt ${attempt + 1}. Waiting ${backoff}ms before retry.` - ) - await delay(backoff) - const { sha: latestSha } = await getPathSha(destinationPath, branch) - return await putCommitFile( - buffer, - destinationPath, - branch, - latestSha, - attempt + 1 - ) - } - - if (!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`GitHub putCommitFile (${res.status}): ${body}`) - } - } catch (error) { - console.error(error) - process.exit(1) + console.log(`Files: ${preTranslateResult.fileIds.length}`) } -} -const postPullRequest = async (head: string, base = env.baseBranch) => { - const url = new URL( - `https://api.github.com/repos/${env.ghOrganization}/${env.ghRepo}/pulls` - ) - - const body = { - title: "i18n: automated Crowdin translation import", - head, - base, - body: "Automated Crowdin translation import", - } - - const res = await fetchWithRetry(url.toString(), { - method: "POST", - headers: { - ...gitHubBearerHeaders, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }) - - if 
(!res.ok) { - console.warn("Res not OK") - const body = await res.text().catch(() => "") - throw new Error(`Crowdin postPullRequest failed (${res.status}): ${body}`) - } - - const json = await res.json() - return json -} - -async function main(options?: { allLangs: boolean }) { - console.log(`[DEBUG] Starting main function with options:`, options) - console.log(`[DEBUG] Environment config:`, { - projectId: env.projectId, - baseBranch: env.baseBranch, - jsonRoot: env.jsonRoot, - mdRoot: env.mdRoot, - allCrowdinCodes: env.allCrowdinCodes, - }) - - // Fetch English files with the configured file limit - const allEnglishFiles = await getAllEnglishFiles(fileLimit) - console.log( - `[DEBUG] Found ${allEnglishFiles.length} English files from GitHub` - ) - - // TODO: Add filter here to select specific files - const fileMetadata = await getFileMetadata(allEnglishFiles) - console.log(`[DEBUG] Generated metadata for ${fileMetadata.length} files`) - console.log(`[DEBUG] First file metadata:`, fileMetadata[0]) - - const crowdinProjectFiles = await getCrowdinProjectFiles() // *** - console.log( - `[DEBUG] Found ${crowdinProjectFiles.length} files in Crowdin project` - ) - - /** - * Iterate through each file and upload - */ - const fileIdsSet = new Set() - // Maintain authoritative mapping of processed Crowdin fileId -> path (including newly added files this run) - const processedFileIdToPath: Record = {} - // Keep original English buffers to detect untranslated outputs - const englishBuffers: Record = {} - for (const file of fileMetadata) { - console.log(`[DEBUG] Processing file: ${file.filePath}`) - await (async () => { - let foundFile: CrowdinFileData | undefined + // Cleanup all ephemeral prompts (best effort - don't fail the workflow if cleanup fails) + if (context.languageJobs.length > 0 && context.crowdinUserId) { + logSection("Cleaning Up Ephemeral Prompts") + for (const job of context.languageJobs) { try { - foundFile = findCrowdinFile(file, crowdinProjectFiles) - } 
catch { - console.log("File not found in Crowdin, attempting to add new file") - } - - let crowdinFileResponse: CrowdinAddFileResponse | undefined - let effectiveFileId: number - let effectivePath: string - - if (foundFile) { - // File exists - DO NOT update to preserve parsed string structure - console.log( - `[SKIP-UPDATE] File already exists in Crowdin with ID: ${foundFile.id}, using existing structure` - ) - console.log( - `[SKIP-UPDATE] Skipping upload/update to preserve existing parsed strings` - ) - effectiveFileId = foundFile.id - effectivePath = foundFile.path - - // Still download English for buffer comparison later - console.log( - `[DOWNLOAD] Downloading English source for buffer comparison: ${file.download_url}` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) - englishBuffers[effectiveFileId] = fileBuffer - } else { - // File doesn't exist - create it - console.log(`[UPLOAD] File NOT found in Crowdin, creating new file`) - console.log( - `[UPLOAD] Downloading English source from: ${file.download_url}` - ) - const fileBuffer = await downloadGitHubFile(file.download_url) - console.log(`[UPLOAD] Downloaded ${fileBuffer.length} bytes`) - - const storageInfo = await postFileToStorage( - fileBuffer, - file["Crowdin-API-FileName"] - ) - console.log( - `[UPLOAD] Uploaded to Crowdin storage with ID: ${storageInfo.id}` - ) - - // Derive full parent directory path (exclude filename) - const parts = file.filePath.split("/").filter(Boolean) - parts.pop() // remove filename - const parentDirPath = parts.join("/") || "/" - console.log( - `[UPLOAD] Creating new Crowdin file in directory path: ${parentDirPath}` - ) - crowdinFileResponse = await postCrowdinFile( - storageInfo.id, - file["Crowdin-API-FileName"], - parentDirPath + await deleteEphemeralPrompt( + context.crowdinUserId, + job.ephemeralPromptId ) console.log( - `[UPLOAD] ✓ Created new Crowdin file with ID: ${crowdinFileResponse.id}` + `✓ Deleted prompt for ${job.internalCode} (ID: 
${job.ephemeralPromptId})` ) - - effectiveFileId = crowdinFileResponse.id - effectivePath = crowdinFileResponse.path - englishBuffers[effectiveFileId] = fileBuffer - - // Wait for new file parsing - const delayMs = 10000 - console.log( - `[UPLOAD] ⏱️ Waiting ${delayMs / 1000}s for Crowdin to parse new file...` - ) - await delay(delayMs) - console.log(`[UPLOAD] ✓ Parsing delay complete`) - } - - fileIdsSet.add(effectiveFileId) - // Record path for destination mapping later (Crowdin returns leading slash paths) - if (effectivePath) processedFileIdToPath[effectiveFileId] = effectivePath - })() - } - - // Unhide any hidden/duplicate strings before pre-translation - console.log( - `\n[UNHIDE] ========== Unhiding strings in ${fileIdsSet.size} files ==========` - ) - for (const fileId of fileIdsSet) { - await unhideStringsInFile(fileId) - } - - console.log( - `\n[PRE-TRANSLATE] ========== Requesting AI Pre-Translation ==========` - ) - console.log(`[PRE-TRANSLATE] FileIds to translate:`, Array.from(fileIdsSet)) - console.log(`[PRE-TRANSLATE] Target languages:`, env.allCrowdinCodes) - console.log(`[PRE-TRANSLATE] AI Prompt ID:`, env.preTranslatePromptId) - - const applyPreTranslationResponse = await postApplyPreTranslation( - Array.from(fileIdsSet), - options?.allLangs ? env.allCrowdinCodes : env.allCrowdinCodes - ) - console.log( - `[PRE-TRANSLATE] ✓ Pre-translation job created with ID: ${applyPreTranslationResponse.identifier}` - ) - console.log( - `[PRE-TRANSLATE] Initial status:`, - applyPreTranslationResponse.status - ) - - console.log(`\n[PRE-TRANSLATE] Waiting for job to complete...`) - const preTranslateJobCompletedResponse = await awaitPreTranslationCompleted( - applyPreTranslationResponse.identifier - ) - - if (preTranslateJobCompletedResponse.status !== "finished") { - console.error( - "[PRE-TRANSLATE] ❌ Pre-translation did not finish successfully. 
Full response:", - preTranslateJobCompletedResponse - ) - throw new Error( - `Pre-translation ended with unexpected status: ${preTranslateJobCompletedResponse.status}` - ) - } - - console.log(`[PRE-TRANSLATE] ✓ Job completed successfully!`) - console.log( - `[PRE-TRANSLATE] Progress: ${preTranslateJobCompletedResponse.progress}%` - ) - console.log( - `[PRE-TRANSLATE] Full response:`, - JSON.stringify(preTranslateJobCompletedResponse, null, 2) - ) - - const { languageIds, fileIds } = preTranslateJobCompletedResponse.attributes - - // Build mapping for commit phase. Prefer processed mapping (includes newly added files); fall back to existing Crowdin snapshot for any missed IDs. - const fileIdToPathMapping: Record = {} - for (const fid of fileIds) { - if (processedFileIdToPath[fid]) { - fileIdToPathMapping[fid] = processedFileIdToPath[fid] - } else { - const existing = crowdinProjectFiles.find((f) => f.id === fid) - if (existing) fileIdToPathMapping[fid] = existing.path - } - if (!fileIdToPathMapping[fid]) { - console.warn( - `[WARN] Missing path mapping for fileId=${fid} (may impact destination path calculation)` - ) - } - } - // Build mapping between Crowdin IDs (e.g. "es-EM") and internal codes (e.g. 
"es") - const languagePairs = languageIds.map((crowdinId) => ({ - crowdinId, - internalLanguageCode: crowdinToInternalCodeMapping[crowdinId], - })) - - const { branch } = await postCreateBranchFrom(env.baseBranch) - console.log(`\n[BRANCH] ✓ Created branch: ${branch}`) - - // For each language - for (const { crowdinId, internalLanguageCode } of languagePairs) { - console.log( - `\n[BUILD] ========== Building translations for language: ${crowdinId} (internal: ${internalLanguageCode}) ==========` - ) - - // Build, download and commit each file updated - for (const fileId of fileIds) { - console.log(`\n[BUILD] --- Processing fileId: ${fileId} ---`) - const crowdinPath = fileIdToPathMapping[fileId] - console.log(`[BUILD] Crowdin path: ${crowdinPath}`) - - // 1- Build - console.log( - `[BUILD] Requesting build for fileId=${fileId}, language=${crowdinId}` - ) - const { url: downloadUrl } = await postBuildProjectFileTranslation( - fileId, - crowdinId, // Crowdin expects the Crowdin language ID here (e.g., "es-EM") - env.projectId - ) - console.log(`[BUILD] ✓ Build complete, download URL: ${downloadUrl}`) - - // 2- Download - console.log(`[BUILD] Downloading translated file...`) - const { buffer } = await getBuiltFile(downloadUrl) - console.log(`[BUILD] Downloaded ${buffer.length} bytes`) - - // Check if translation differs from English - const originalEnglish = englishBuffers[fileId] - if (originalEnglish) { - console.log( - `[BUILD] Original English size: ${originalEnglish.length} bytes` + } catch (err) { + console.warn( + `[WARN] Failed to cleanup ephemeral prompt ${job.ephemeralPromptId} (${job.internalCode}):`, + err instanceof Error ? 
err.message : err ) - if (originalEnglish.compare(buffer) === 0) { - console.warn( - `[BUILD] ⚠️ Skipping commit - content identical to English (no translation occurred)` - ) - continue - } else { - console.log(`[BUILD] ✓ Translation differs from English, will commit`) - } } - - // 3a- Get destination path - const destinationPath = getDestinationFromPath( - crowdinPath, - internalLanguageCode // Use internal code (e.g., "es") for repo path replacement - ) - console.log(`[BUILD] Destination path: ${destinationPath}`) - - // 3b- Commit - console.log(`[BUILD] Committing to branch: ${branch}`) - await putCommitFile(buffer, destinationPath, branch) - console.log(`[BUILD] ✓ Committed successfully`) } } - - console.log(`\n[PR] ========== Creating Pull Request ==========`) - console.log(`[PR] Head branch: ${branch}`) - console.log(`[PR] Base branch: ${env.baseBranch}`) - - const pr = await postPullRequest(branch, env.baseBranch) - - console.log(`\n[SUCCESS] ========== Translation import complete! ==========`) - console.log(`[SUCCESS] Pull Request URL: ${pr.html_url}`) - console.log(`[SUCCESS] PR Number: #${pr.number}`) - console.log(pr) } main().catch((err) => { - console.error("Fatal error:", err) + console.error("\n========== ERROR ==========") + console.error(err) process.exit(1) }) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts new file mode 100644 index 00000000000..75456a69d94 --- /dev/null +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -0,0 +1,1437 @@ +import * as fs from "fs" +import * as path from "path" + +/** + * Post-import sanitizer for Crowdin translations. 
+ * + * - Synchronize custom Markdown header IDs `{#...}` with English source (ASCII-only) + * - Normalize block HTML tag line breaks (opening and closing tags on their own lines) + * - Protect known brand/team names from inadvertent translation + * - Validate JSON files; report issues + * + * Usage: + * npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts + * + * Env: + * TARGET_LANGUAGES (comma-separated, e.g. "es-EM") optional; defaults to scanning all `translations/*` folders + */ + +const ROOT = process.cwd() +const CONTENT_ROOT = path.join(ROOT, "public", "content") + +const BLOCK_HTML_TAGS = [ + "section", + "div", + "article", + "aside", + "header", + "footer", +] + +/** + * MDX block components that need opening/closing tags on separate lines. + * ButtonLink is intentionally excluded - it's an inline component. + */ +const BLOCK_MDX_COMPONENTS = [ + "Card", + "ExpandableCard", + "Alert", + "AlertEmoji", + "AlertContent", + "AlertDescription", + "CardGrid", + "InfoGrid", + "InfoBanner", + "Tabs", + "TabItem", +] + +function listFiles( + dir: string, + predicate: (file: string) => boolean +): string[] { + const out: string[] = [] + const stack: string[] = [dir] + while (stack.length) { + const d = stack.pop()! + const entries = fs.readdirSync(d, { withFileTypes: true }) + for (const e of entries) { + const full = path.join(d, e.name) + if (e.isDirectory()) stack.push(full) + else if (predicate(full)) out.push(full) + } + } + return out +} + +function toAsciiId(id: string): string { + // keep only ASCII letters, numbers, hyphens and underscores; strip accents + const normalized = id.normalize("NFD").replace(/[\u0300-\u036f]/g, "") + return normalized.replace(/[^A-Za-z0-9_-]/g, "-") +} + +// Critical regex checks adapted from legacy markdownChecker +const BROKEN_LINK_REGEX = /\[[^\]]+\]\([^)\s]+\s[^)]+\)/g +const INVALID_LINK_REGEX = + /(? 
0) { + // Brand is in English, check if it's preserved in translation + const inTranslation = translatedContent.match(brandRegex) + const englishCount = inEnglish.length + const translationCount = inTranslation?.length ?? 0 + + if (translationCount < englishCount) { + warnings.push( + `Protected brand "${brand}" appears ${englishCount}x in English but ${translationCount}x in translation - may have been mistranslated` + ) + } + } + } + + return warnings +} + +/** + * Fix duplicated headings where the text is repeated. + * Pattern: ## Text? Text? {#id} → ## Text? {#id} + * This happens when translators accidentally duplicate question headings. + */ +function fixDuplicatedHeadings(content: string): { + content: string + fixCount: number +} { + let result = content + let fixCount = 0 + + // Match headings where text is duplicated: ## Text Text {#id} or ## Text? Text? {#id} + // Captures: (hashes) (text including punctuation) (same text) (custom id) + const duplicatedHeadingRe = + /^(#{1,6})\s+(.+?[?!.]?)\s+\2\s*(\{#[^}]+\})\s*$/gm + + result = result.replace(duplicatedHeadingRe, (_, hashes, text, id) => { + fixCount++ + return `${hashes} ${text} ${id}` + }) + + return { content: result, fixCount } +} + +/** + * Fix broken markdown links where there's a space between ] and (. + * Pattern: ] (https://... → ](https://... + * This is a common translation artifact from Crowdin. + */ +function fixBrokenMarkdownLinks(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Match ] followed by space(s) then ( - this breaks markdown links + const result = content.replace(/\]\s+\(/g, () => { + fixCount++ + return "](" + }) + + return { content: result, fixCount } +} + +/** + * Fix collapsed line breaks between consecutive MDX components. + * Pattern: \n + * This happens when translators collapse multiple components onto one line. 
+ */ +function fixCollapsedComponentLineBreaks( + translatedContent: string, + englishContent: string +): { content: string; fixCount: number } { + let result = translatedContent + let fixCount = 0 + + // Find components that appear consecutively in English (on separate lines) + // and restore line breaks in translation if they were collapsed + const consecutiveComponentRe = + /<\/([A-Z][A-Za-z]*)[^>]*>\s*<([A-Z][A-Za-z]*)/g + + // Check English for line break patterns between components + const englishMatches = [...englishContent.matchAll(consecutiveComponentRe)] + for (const match of englishMatches) { + const fullMatch = match[0] + // If English has a newline between these components + if (fullMatch.includes("\n")) { + // Find same pattern in translation (possibly without newline) + const closingTag = match[1] + const openingTag = match[2] + const collapsedRe = new RegExp( + `[ \\t]+<${openingTag}`, + "g" + ) + const collapsedMatches = result.match(collapsedRe) + if (collapsedMatches) { + fixCount += collapsedMatches.length + result = result.replace(collapsedRe, `\n<${openingTag}`) + } + } + } + + return { content: result, fixCount } +} + +/** + * Extract all href values from content (both markdown links and JSX/HTML attributes). + */ +function extractHrefs(content: string): Set { + const hrefs = new Set() + + // Markdown links: [text](href) + const markdownLinkRe = /\[[^\]]*\]\(([^)]+)\)/g + let match + while ((match = markdownLinkRe.exec(content))) { + hrefs.add(match[1]) + } + + // JSX/HTML href attributes: href="..." or href='...' + const hrefAttrRe = /href=["']([^"']+)["']/g + while ((match = hrefAttrRe.exec(content))) { + hrefs.add(match[1]) + } + + return hrefs +} + +/** + * Extract hrefs from a single text block (paragraph/section). + * Returns array to preserve duplicates within the block. 
+ */ +function extractHrefsFromBlock(block: string): string[] { + const hrefs: string[] = [] + + // Markdown links: [text](href) + const markdownLinkRe = /\[[^\]]*\]\(([^)]+)\)/g + let match + while ((match = markdownLinkRe.exec(block))) { + hrefs.push(match[1]) + } + + // JSX/HTML href attributes: href="..." or href='...' + const hrefAttrRe = /href=["']([^"']+)["']/g + while ((match = hrefAttrRe.exec(block))) { + hrefs.push(match[1]) + } + + return hrefs +} + +/** + * Split markdown content into logical blocks (paragraphs/sections). + * Blocks are separated by blank lines. + */ +function splitIntoBlocks(content: string): string[] { + // Split on one or more blank lines + return content.split(/\n\s*\n/).filter((block) => block.trim().length > 0) +} + +/** + * Fix translated hrefs by comparing against English source. + * Uses paragraph-scoped set comparison for robust matching across languages. + * + * Strategy: + * 1. Split both documents into blocks (paragraphs separated by blank lines) + * 2. For each block pair, compare internal href sets + * 3. Within a block: if invalid href count equals missing href count, we can match + * 4. This handles grammatical reordering within sentences (common in non-English) + * + * Only auto-fixes unambiguous cases; warns for complex mismatches. 
+ */ +function fixTranslatedHrefs( + translatedContent: string, + englishContent: string +): { content: string; fixCount: number; fixes: string[]; warnings: string[] } { + const englishBlocks = splitIntoBlocks(englishContent) + const translatedBlocks = splitIntoBlocks(translatedContent) + + // Collect all English internal hrefs as the "valid" set + const allEnglishHrefs = extractHrefs(englishContent) + + const allFixes: Array<[string, string]> = [] // [wrong, correct] + const allWarnings: string[] = [] + + // Process block by block + const blockCount = Math.min(englishBlocks.length, translatedBlocks.length) + + for (let i = 0; i < blockCount; i++) { + const engBlock = englishBlocks[i] + const transBlock = translatedBlocks[i] + + const engHrefs = extractHrefsFromBlock(engBlock).filter(isInternalHref) + const transHrefs = extractHrefsFromBlock(transBlock).filter(isInternalHref) + + // Skip blocks with no internal hrefs + if (engHrefs.length === 0 && transHrefs.length === 0) continue + + // Find hrefs in translation that don't exist in English (invalid) + const transHrefSet = new Set(transHrefs) + + const invalidInTrans: string[] = [] // In translation but not in any English href + const missingFromTrans: string[] = [] // In English block but not in translation + + for (const href of transHrefs) { + if (!allEnglishHrefs.has(href)) { + invalidInTrans.push(href) + } + } + + for (const href of engHrefs) { + if (!transHrefSet.has(href)) { + missingFromTrans.push(href) + } + } + + // No issues in this block + if (invalidInTrans.length === 0 && missingFromTrans.length === 0) continue + + // Deduplicate for set comparison + const uniqueInvalid = [...new Set(invalidInTrans)] + const uniqueMissing = [...new Set(missingFromTrans)] + + // Only auto-fix when there's exactly 1 invalid and 1 missing in block + // Multiple mismatches within same block could be reordered - don't guess + if (uniqueInvalid.length === 1 && uniqueMissing.length === 1) { + allFixes.push([uniqueInvalid[0], 
uniqueMissing[0]]) + } else if (uniqueInvalid.length > 0 || uniqueMissing.length > 0) { + // Count mismatch - can't safely fix, warn instead + for (const href of uniqueInvalid) { + allWarnings.push( + `Block ${i + 1}: Invalid href "${href}" - not a valid English path` + ) + } + for (const href of uniqueMissing) { + allWarnings.push( + `Block ${i + 1}: Missing href "${href}" - present in English but not translation` + ) + } + } + } + + // Warn about block count mismatch + if (englishBlocks.length !== translatedBlocks.length) { + allWarnings.push( + `Block count mismatch: English has ${englishBlocks.length}, translation has ${translatedBlocks.length}` + ) + } + + // Apply all fixes + let result = translatedContent + const appliedFixes: string[] = [] + + for (const [wrong, correct] of allFixes) { + // Replace in markdown links: [text](wrong) → [text](correct) + const markdownRe = new RegExp( + `(\\[[^\\]]*\\]\\()${escapeRegex(wrong)}(\\))`, + "g" + ) + const beforeMd = result + result = result.replace(markdownRe, `$1${correct}$2`) + + // Replace in href attributes: href="wrong" → href="correct" + const hrefRe = new RegExp(`(href=["'])${escapeRegex(wrong)}(["'])`, "g") + const beforeAttr = result + result = result.replace(hrefRe, `$1${correct}$2`) + + if (result !== beforeMd || result !== beforeAttr) { + appliedFixes.push(`${wrong} → ${correct}`) + } + } + + return { + content: result, + fixCount: appliedFixes.length, + fixes: appliedFixes, + warnings: allWarnings, + } +} + +/** + * Escape special regex characters in a string. + */ +function escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") +} + +/** + * Check if href is an internal link (starts with / but not //). 
+ */ +function isInternalHref(href: string): boolean { + return href.startsWith("/") && !href.startsWith("//") +} + +function lineAt(file: string, index: number): string { + const fileSubstring = file.substring(0, index) + const lines = fileSubstring.split("\n") + const linePosition = lines.length + const charPosition = lines[lines.length - 1].length + 1 + const lineNumber = `${linePosition}:${charPosition}` + return lineNumber +} +interface HeaderInfo { + level: number // Number of # symbols + text: string // Header text (translated or English) + id: string // Custom ID from {#id} + fullMatch: string // Full matched string for replacement +} + +function extractHeaderStructure(md: string): HeaderInfo[] { + const headers: HeaderInfo[] = [] + const headingRe = /^(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/gm + let m: RegExpExecArray | null + while ((m = headingRe.exec(md))) { + headers.push({ + level: m[1].length, + text: m[2].trim(), + id: m[3].trim(), + fullMatch: m[0], + }) + } + return headers +} + +function syncHeaderIdsWithEnglish( + translatedMd: string, + englishMd: string +): string { + // Extract header structure from both files + const englishHeaders = extractHeaderStructure(englishMd) + const translatedHeaders = extractHeaderStructure(translatedMd) + + // Match headers by position and level in the document structure + // If structure matches, copy English IDs to translated headers + if (englishHeaders.length !== translatedHeaders.length) { + console.warn( + `[WARN] Header count mismatch: English has ${englishHeaders.length}, translated has ${translatedHeaders.length}` + ) + } + + let result = translatedMd + // Match headers by index - same position = same semantic header + for (let i = 0; i < translatedHeaders.length; i++) { + const translatedHeader = translatedHeaders[i] + const englishHeader = englishHeaders[i] + + if (!englishHeader) { + // More headers in translation than English - skip + continue + } + + if (translatedHeader.level !== englishHeader.level) { 
+ console.warn( + `[WARN] Header level mismatch at position ${i}: English H${englishHeader.level} vs translated H${translatedHeader.level}` + ) + // Still try to sync the ID even if levels don't match + } + + // Replace the translated header's ID with the English ID (ASCII-normalized) + const asciiId = toAsciiId(englishHeader.id) + const updatedHeader = `${"#".repeat(translatedHeader.level)} ${translatedHeader.text} {#${asciiId}}` + + // Use a more specific replacement to avoid affecting other occurrences + result = result.replace(translatedHeader.fullMatch, updatedHeader) + } + + return result +} + +function normalizeBlockHtmlLines(md: string): string { + for (const tag of BLOCK_HTML_TAGS) { + const inlineCloseRe = new RegExp(`([^\\n])\\s*`, "g") + md = md.replace(inlineCloseRe, (_, before) => `${before}\n`) + } + return md +} + +/** + * Restore blank lines after headers and block components by comparing + * with English source structure. This preserves readability and formatting. + */ +function restoreBlankLinesFromEnglish( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const translatedLines = translatedMd.split("\n") + const englishLines = englishMd.split("\n") + + let fixCount = 0 + const result: string[] = [] + + // Patterns that should have blank lines after them + const headerPattern = /^#{1,6}\s+/ + const blockComponentClosePattern = new RegExp( + `` + ) + + for (let i = 0; i < translatedLines.length; i++) { + const line = translatedLines[i] + result.push(line) + + // Check if this line should be followed by a blank line + const isHeader = headerPattern.test(line) + const isBlockClose = blockComponentClosePattern.test(line) + + if (isHeader || isBlockClose) { + const nextLine = translatedLines[i + 1] + const hasBlankAfter = nextLine === "" + + // Find corresponding line in English by matching pattern + let englishShouldHaveBlank = false + for (let j = 0; j < englishLines.length; j++) { + const englishLine = 
englishLines[j] + if (isHeader && headerPattern.test(englishLine)) { + // Headers should match by structure (level) + const transLevel = (line.match(/^#+/) || [""])[0].length + const engLevel = (englishLine.match(/^#+/) || [""])[0].length + if (transLevel === engLevel) { + englishShouldHaveBlank = englishLines[j + 1] === "" + break + } + } else if ( + isBlockClose && + blockComponentClosePattern.test(englishLine) + ) { + englishShouldHaveBlank = englishLines[j + 1] === "" + break + } + } + + // Add blank line if English has it but translation doesn't + if (englishShouldHaveBlank && !hasBlankAfter && nextLine !== undefined) { + result.push("") + fixCount++ + } + } + } + + return { content: result.join("\n"), fixCount } +} + +/** + * Normalize inline component formatting to match English source. + * If English has the component on one line, collapse translated version too. + * This prevents MDX from wrapping multi-line content in

tags. + */ +function normalizeInlineComponentsFromEnglish( + translatedMd: string, + englishMd: string +): { + content: string + fixCount: number +} { + const inlineComponents = ["ButtonLink"] + + let content = translatedMd + let fixCount = 0 + + for (const component of inlineComponents) { + // Extract English instances and check if they're single-line + // Key by href attribute since that's preserved in translation + const englishRe = new RegExp( + `<${component}[^>]*href="([^"]*)"[^>]*>([\\s\\S]*?)`, + "g" + ) + const englishFormats = new Map() // href -> isOneLine + + let match + while ((match = englishRe.exec(englishMd))) { + const href = match[1] + const innerContent = match[2] + const isOneLine = !innerContent.includes("\n") + englishFormats.set(href, isOneLine) + } + + // For each translated instance, mirror English format + const translatedRe = new RegExp( + `(<${component}[^>]*href="([^"]*)"[^>]*>)([\\s\\S]*?)()`, + "g" + ) + content = content.replace( + translatedRe, + (fullMatch, openTag, href, innerContent, closeTag) => { + const englishIsOneLine = englishFormats.get(href) + const translatedHasLineBreaks = innerContent.includes("\n") + + // If English is single-line but translated has line breaks, collapse it + if (englishIsOneLine && translatedHasLineBreaks) { + fixCount++ + return `${openTag}${innerContent.trim()}${closeTag}` + } + return fullMatch + } + ) + } + + return { content, fixCount } +} + +function fixBlockComponentLineBreaks(md: string): { + content: string + fixCount: number +} { + let content = md + let fixCount = 0 + + for (const component of BLOCK_MDX_COMPONENTS) { + // Fix inline closing tags: content → content\n + const inlineCloseRe = new RegExp(`([^\\n])\\s*`, "g") + content = content.replace(inlineCloseRe, (_, before) => { + fixCount++ + return `${before}\n` + }) + + // Fix inline opening tags: content → \ncontent + // Match any non-newline character after the tag (including other tags) + const inlineOpenRe = new 
RegExp(`(<${component}[^>]*>)([^\\n])`, "g") + content = content.replace(inlineOpenRe, (_, tag, after) => { + fixCount++ + return `${tag}\n${after}` + }) + } + + return { content, fixCount } +} + +/** + * Collapse inline HTML tags to single line when English source has them on one line. + * Fixes MDX paragraph wrapping issues:

content\n
content
+ */ +function collapseInlineHtmlFromEnglish( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const inlineTags = ["div", "span", "p", "strong", "em"] + let content = translatedMd + let fixCount = 0 + + // Build a set of lines in English where tag opens and closes on same line + const englishLines = englishMd.split("\n") + + for (const tag of inlineTags) { + // Find English lines that have ... all on one line + // (content can include nested tags like ,
, etc.) + const singleLinePattern = new RegExp(`<${tag}[^>]*>.*`) + const englishSingleLineSet = new Set() + + for (const line of englishLines) { + if (singleLinePattern.test(line)) { + // Extract just the opening tag to use as a key + const openTagMatch = line.match(new RegExp(`<${tag}[^>]*>`)) + if (openTagMatch) { + englishSingleLineSet.add(openTagMatch[0]) + } + } + } + + // In translated content, find cases where: + // - Opening tag + content is on one line (content may include nested tags) + // - Newline follows + // - Closing tag is on the next line (possibly with leading whitespace) + // Pattern: content-with-possible-nested-tags\n + const translatedMultiLineRe = new RegExp( + `(<${tag}[^>]*>)([^\\n]+)\\n(\\s*)`, + "g" + ) + + content = content.replace( + translatedMultiLineRe, + (fullMatch, openTag, innerContent, closeTagLine) => { + // Check if this opening tag should be single-line per English + if (englishSingleLineSet.has(openTag)) { + fixCount++ + // Collapse: opening tag + trimmed content + closing tag (no newline) + return `${openTag}${innerContent.trim()}${closeTagLine.trim()}` + } + return fullMatch + } + ) + } + + return { content, fixCount } +} + +/** + * Fix JSX component closing tags that are merged with content. + * English format: + * + * Content + * + * Spanish (broken): + * + * Content + * This function splits the closing tag to its own line when English has it that way. 
+ */ +function fixMergedClosingTags( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const componentTags = ["ButtonLink", "Link"] + let content = translatedMd + let fixCount = 0 + + for (const tag of componentTags) { + // Find patterns in English where the closing tag is on its own line + // Pattern: \n content\n or \n content\n + const englishMultiLineRe = new RegExp( + `<${tag}[^>]*>\\n[\\s\\S]*?\\n\\s*`, + "g" + ) + + // Check if English uses multi-line format for this component + if (!englishMultiLineRe.test(englishMd)) continue + + // In translated content, find cases where closing tag is merged with content on same line + // Pattern: \n content (content and closing tag on same line) + const mergedPattern = new RegExp( + `(<${tag}[^>]*>\\n)(\\s*)([^\\n]+)()`, + "g" + ) + + content = content.replace( + mergedPattern, + (match, openTagLine, indent, innerContent, closeTag) => { + // Only fix if the inner content doesn't end with just whitespace + // and the closing tag is directly after content (not on its own line) + const trimmedContent = innerContent.trimEnd() + if (trimmedContent.length > 0 && !innerContent.includes("\n")) { + fixCount++ + // Split: put closing tag on its own line with same indentation + return `${openTagLine}${indent}${trimmedContent}\n${indent}${closeTag}` + } + return match + } + ) + } + + return { content, fixCount } +} + +/** + * Repair unclosed backticks by comparing with English source. + * Detects lines with odd backtick counts containing < and attempts repair. 
+ */ +function repairUnclosedBackticks( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + const translatedLines = translatedMd.split("\n") + const englishLines = englishMd.split("\n") + let fixCount = 0 + + for (let i = 0; i < translatedLines.length; i++) { + const line = translatedLines[i] + const backtickCount = (line.match(/`/g) || []).length + + // Odd number of backticks and contains < means potentially unclosed code with HTML-like content + if ( + backtickCount % 2 === 1 && + line.includes("<") && + !line.includes("```") + ) { + // Try to find a matching English line with similar structure + for (const engLine of englishLines) { + // Look for English lines with balanced backticks containing similar patterns + const engBackticks = (engLine.match(/`/g) || []).length + if ( + engBackticks % 2 === 0 && + engBackticks > 0 && + engLine.includes("<") + ) { + // Extract inline code blocks from English + const codeBlockRe = /`([^`]+)`/g + let engMatch + while ((engMatch = codeBlockRe.exec(engLine))) { + const engCode = engMatch[1] + // Check if the translated line contains this code pattern without closing backtick + const unbalancedPattern = new RegExp( + "`" + + engCode + .replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + .replace(/\s+/g, "\\s*") + ) + if ( + unbalancedPattern.test(line) && + !line.includes("`" + engCode + "`") + ) { + // Found a match - add the missing closing backtick + translatedLines[i] = line.replace( + new RegExp( + "`" + + engCode + .replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + .replace(/\s+/g, "\\s*") + ), + "`" + engCode + "`" + ) + fixCount++ + break + } + } + if (fixCount > 0) break + } + } + } + } + + return { content: translatedLines.join("\n"), fixCount } +} + +/** + * Normalize frontmatter dates from localized format (DD-MM-YYYY) back to ISO (YYYY-MM-DD). 
+ */ +function normalizeFrontmatterDates(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Match frontmatter block + const frontmatterRe = /^---\n([\s\S]*?)\n---/ + const match = content.match(frontmatterRe) + if (!match) return { content, fixCount } + + let frontmatter = match[1] + const originalFrontmatter = frontmatter + + // Fix published: dates in DD-MM-YYYY or DD/MM/YYYY format + frontmatter = frontmatter.replace( + /^(published:\s*)(\d{1,2})[-/](\d{1,2})[-/](\d{4})$/gm, + (_, prefix, day, month, year) => { + fixCount++ + // Pad day and month with leading zeros if needed + const paddedDay = day.padStart(2, "0") + const paddedMonth = month.padStart(2, "0") + return `${prefix}${year}-${paddedMonth}-${paddedDay}` + } + ) + + if (frontmatter !== originalFrontmatter) { + content = content.replace(frontmatterRe, `---\n${frontmatter}\n---`) + } + + return { content, fixCount } +} + +/** + * Sync protected frontmatter fields from English source. + * These fields should never be translated (e.g., template, sidebar). 
+ */ +function syncProtectedFrontmatterFields( + translatedMd: string, + englishMd: string +): { content: string; fixCount: number } { + // Fields that should never be translated - sync from English canonical + // Note: 'buttons' array needs special handling (content translatable, toId/isSecondary not) + // Note: 'lang' must NOT be protected - it must remain as target language code + const protectedFields = [ + "template", + "sidebar", + "sidebarDepth", + "published", + "author", + "source", + "sourceUrl", + "address", + "emoji", + "skill", + "isOutdated", + "incomplete", + "hideEditButton", + "showDropdown", + "image", + "blurDataURL", + ] + let fixCount = 0 + + // Extract frontmatter from both + const frontmatterRe = /^---\n([\s\S]*?)\n---/ + const transMatch = translatedMd.match(frontmatterRe) + const engMatch = englishMd.match(frontmatterRe) + + if (!transMatch || !engMatch) return { content: translatedMd, fixCount } + + let transFrontmatter = transMatch[1] + const engFrontmatter = engMatch[1] + + for (const field of protectedFields) { + // Get English value + const engFieldRe = new RegExp(`^${field}:\\s*(.+)$`, "m") + const engFieldMatch = engFrontmatter.match(engFieldRe) + if (!engFieldMatch) continue + + const englishValue = engFieldMatch[1].trim() + + // Check if translated value differs + const transFieldRe = new RegExp(`^${field}:\\s*(.+)$`, "m") + const transFieldMatch = transFrontmatter.match(transFieldRe) + + if (transFieldMatch) { + const translatedValue = transFieldMatch[1].trim() + // Remove quotes for comparison + const cleanTranslated = translatedValue.replace(/^["']|["']$/g, "") + const cleanEnglish = englishValue.replace(/^["']|["']$/g, "") + + if (cleanTranslated !== cleanEnglish) { + // Replace with English value + transFrontmatter = transFrontmatter.replace( + transFieldRe, + `${field}: ${englishValue}` + ) + fixCount++ + } + } + } + + if (fixCount > 0) { + return { + content: translatedMd.replace( + frontmatterRe, + 
`---\n${transFrontmatter}\n---` + ), + fixCount, + } + } + + return { content: translatedMd, fixCount } +} + +/** + * Fix ASCII guillemets (<< and >>) to proper Unicode guillemets (« and »). + * Prevents MDX parsing errors from malformed angle bracket sequences. + * IMPORTANT: Skips code blocks where << and >> are valid bit-shift operators. + */ +function fixAsciiGuillemets(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Split content to preserve code blocks (both fenced and inline) + // Fenced: ```...``` or ~~~...~~~ + // Inline: `...` + const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g + const parts = content.split(codeBlockPattern) + + for (let i = 0; i < parts.length; i++) { + // Skip code blocks (odd indices after split with capturing group) + if (i % 2 === 1) continue + + // Count and replace in non-code parts only + const leftMatches = parts[i].match(/<>/g) + + if (leftMatches) { + fixCount += leftMatches.length + parts[i] = parts[i].replace(/<>/g, "»") + } + } + + return { content: parts.join(""), fixCount } +} + +/** + * Wrap frontmatter string values containing non-ASCII characters in double quotes. + * Prevents YAML parsing issues with accented characters. 
+ */ +function quoteFrontmatterNonAscii(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // Match frontmatter block + const frontmatterRe = /^---\n([\s\S]*?)\n---/ + const match = content.match(frontmatterRe) + if (!match) return { content, fixCount } + + let frontmatter = match[1] + const originalFrontmatter = frontmatter + + // Find lines with unquoted values containing non-ASCII + const lines = frontmatter.split("\n") + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + // Match key: value pattern + const keyValueRe = /^(\s*\w+:\s*)(.+)$/ + const kvMatch = line.match(keyValueRe) + if (kvMatch) { + const [, prefix, value] = kvMatch + const trimmedValue = value.trim() + + // Skip if already quoted (starts and ends with matching quotes) + if ( + (trimmedValue.startsWith('"') && trimmedValue.endsWith('"')) || + (trimmedValue.startsWith("'") && trimmedValue.endsWith("'")) + ) { + continue + } + + // Skip YAML arrays - they handle their own internal quoting + // Inline arrays: tags: [ "value1", "value2" ] + if (trimmedValue.startsWith("[") && trimmedValue.endsWith("]")) { + continue + } + // Multi-line array items with - prefix won't match our key:value regex, + // but check explicitly for robustness (e.g., `key: - value` edge case) + if (trimmedValue.startsWith("-")) { + continue + } + + // Check if value contains non-ASCII characters + // eslint-disable-next-line no-control-regex + if (/[^\x00-\x7F]/.test(value)) { + // Escape any existing double quotes in the value + const escapedValue = trimmedValue.replace(/"/g, '\\"') + lines[i] = `${prefix}"${escapedValue}"` + fixCount++ + } + } + } + + frontmatter = lines.join("\n") + if (frontmatter !== originalFrontmatter) { + content = content.replace(frontmatterRe, `---\n${frontmatter}\n---`) + } + + return { content, fixCount } +} + +function processMarkdownFile( + mdPath: string, + providedContent?: string +): { + fixed: boolean + issues: string[] + content: string +} { 
+ const issues: string[] = [] + let content = providedContent || fs.readFileSync(mdPath, "utf8") + + let englishMd: string | undefined + + // Map translated path to English path: remove `/translations//` segment + const parts = mdPath.split(path.sep) + const idx = parts.lastIndexOf("translations") + if (idx === -1 || idx + 2 >= parts.length) { + issues.push("No translations segment found; skipping formatting sync") + } else { + // Use path.resolve to preserve absolute paths (path.join loses leading /) + const englishPath = path.resolve( + path.sep, + ...parts.slice(0, idx), + ...parts.slice(idx + 2) // drop translations/ + ) + if (fs.existsSync(englishPath)) { + englishMd = fs.readFileSync(englishPath, "utf8") + content = syncHeaderIdsWithEnglish(content, englishMd) + } else { + issues.push(`English source missing: ${path.relative(ROOT, englishPath)}`) + } + } + + const before = content + + // Fix duplicated headings (e.g., ## Text? Text? {#id} → ## Text? {#id}) + const duplicatedResult = fixDuplicatedHeadings(content) + content = duplicatedResult.content + if (duplicatedResult.fixCount > 0) { + issues.push(`Fixed ${duplicatedResult.fixCount} duplicated headings`) + } + + // Fix broken markdown links (] (https:// → ](https://) + const brokenLinksResult = fixBrokenMarkdownLinks(content) + content = brokenLinksResult.content + if (brokenLinksResult.fixCount > 0) { + issues.push(`Fixed ${brokenLinksResult.fixCount} broken markdown links`) + } + + // Fix frontmatter issues (don't need English source) + const dateResult = normalizeFrontmatterDates(content) + content = dateResult.content + if (dateResult.fixCount > 0) { + issues.push( + `Normalized ${dateResult.fixCount} frontmatter dates to ISO format` + ) + } + + const quoteResult = quoteFrontmatterNonAscii(content) + content = quoteResult.content + if (quoteResult.fixCount > 0) { + issues.push( + `Quoted ${quoteResult.fixCount} frontmatter values with non-ASCII chars` + ) + } + + const guillemetResult = 
fixAsciiGuillemets(content) + content = guillemetResult.content + if (guillemetResult.fixCount > 0) { + issues.push( + `Fixed ${guillemetResult.fixCount} ASCII guillemets (<< >>) to Unicode (« »)` + ) + } + + // Fix escaped backticks (\`) to regular backticks (`) + // Crowdin sometimes escapes backticks unnecessarily + const escapedBacktickCount = (content.match(/\\`/g) || []).length + if (escapedBacktickCount > 0) { + content = content.replace(/\\`/g, "`") + issues.push(`Unescaped ${escapedBacktickCount} backslash-escaped backticks`) + } + + // Fix block component line breaks (critical for MDX parser) + const blockResult = fixBlockComponentLineBreaks(content) + content = blockResult.content + if (blockResult.fixCount > 0) { + issues.push(`Fixed ${blockResult.fixCount} inline block component tags`) + } + + content = normalizeBlockHtmlLines(content) + + // Normalize inline components and restore blank lines from English source + if (englishMd) { + // Sync protected frontmatter fields (template, sidebar, etc.) 
+ const protectedResult = syncProtectedFrontmatterFields(content, englishMd) + content = protectedResult.content + if (protectedResult.fixCount > 0) { + issues.push( + `Synced ${protectedResult.fixCount} protected frontmatter fields from English` + ) + } + + // Collapse inline HTML tags to match English single-line format + const inlineHtmlResult = collapseInlineHtmlFromEnglish(content, englishMd) + content = inlineHtmlResult.content + if (inlineHtmlResult.fixCount > 0) { + issues.push( + `Collapsed ${inlineHtmlResult.fixCount} inline HTML tags to match English` + ) + } + + // Fix JSX component closing tags merged with content (split to own line) + const mergedTagResult = fixMergedClosingTags(content, englishMd) + content = mergedTagResult.content + if (mergedTagResult.fixCount > 0) { + issues.push( + `Split ${mergedTagResult.fixCount} merged closing tags to own lines` + ) + } + + // Collapse inline component line breaks to match English format + const inlineResult = normalizeInlineComponentsFromEnglish( + content, + englishMd + ) + content = inlineResult.content + if (inlineResult.fixCount > 0) { + issues.push( + `Normalized ${inlineResult.fixCount} inline components to match English` + ) + } + + // Repair unclosed backticks in inline code + const backtickResult = repairUnclosedBackticks(content, englishMd) + content = backtickResult.content + if (backtickResult.fixCount > 0) { + issues.push(`Repaired ${backtickResult.fixCount} unclosed backticks`) + } + + const blankLineResult = restoreBlankLinesFromEnglish(content, englishMd) + content = blankLineResult.content + if (blankLineResult.fixCount > 0) { + issues.push( + `Restored ${blankLineResult.fixCount} blank lines from English` + ) + } + + // Fix collapsed line breaks between consecutive components + const collapsedResult = fixCollapsedComponentLineBreaks(content, englishMd) + content = collapsedResult.content + if (collapsedResult.fixCount > 0) { + issues.push( + `Fixed ${collapsedResult.fixCount} collapsed 
component line breaks` + ) + } + + // Check for mistranslated brand names (report-only) + const brandWarnings = checkProtectedBrandNames(content, englishMd) + issues.push(...brandWarnings) + + // Fix translated hrefs using set comparison + const hrefResult = fixTranslatedHrefs(content, englishMd) + content = hrefResult.content + if (hrefResult.fixCount > 0) { + issues.push( + `Fixed ${hrefResult.fixCount} translated hrefs: ${hrefResult.fixes.join(", ")}` + ) + } + issues.push(...hrefResult.warnings) + } + + const fixed = before !== content + // Only write to disk if no content was provided (legacy mode) + if (fixed && !providedContent) { + fs.writeFileSync(mdPath, content, "utf8") + } + // Run critical checks (report-only) + let m: RegExpExecArray | null + // Broken links containing spaces inside URL + while ((m = BROKEN_LINK_REGEX.exec(content))) { + issues.push(`Broken link format at ${mdPath}:${lineAt(content, m.index)}`) + } + // Invalid links (exclude images/internal/hash/http/mailto/pdf/<...>) + while ((m = INVALID_LINK_REGEX.exec(content))) { + issues.push(`Invalid link at ${mdPath}:${lineAt(content, m.index)}`) + } + // Empty link text + while ((m = LINK_TEXT_MISSING_REGEX.exec(content))) { + issues.push(`Link text missing at ${mdPath}:${lineAt(content, m.index)}`) + } + // Incorrect image path in translated markdown + if (mdPath.includes(`${path.sep}translations${path.sep}`)) { + while ((m = INCORRECT_PATH_IN_TRANSLATED_MARKDOWN.exec(content))) { + issues.push( + `Incorrect image path at ${mdPath}:${lineAt(content, m.index)}` + ) + } + } + // Spelling mistakes (case-insensitive) + for (const mistake of COMMON_SPELLING_MISTAKES) { + const re = new RegExp(mistake, "gi") + while ((m = re.exec(content))) { + issues.push( + `Spelling mistake "${mistake}" at ${mdPath}:${lineAt(content, m.index)}` + ) + } + } + // Case-sensitive mistakes for brands + for (const mistake of CASE_SENSITIVE_SPELLING_MISTAKES) { + const re = new RegExp(mistake, "g") + while ((m = 
re.exec(content))) { + issues.push( + `Brand capitalization issue "${mistake}" at ${mdPath}:${lineAt(content, m.index)}` + ) + } + } + return { fixed, issues, content } +} + +function processJsonFile( + jsonPath: string, + providedContent?: string +): { + fixed: boolean + issues: string[] + content: string +} { + const issues: string[] = [] + let content = providedContent || fs.readFileSync(jsonPath, "utf8") + const before = content + + // Normalize BOM and smart quotes + content = content + .replace(/^\uFEFF/, "") + .replace(/[""]/g, '"') + .replace(/['']/g, "'") + + // Try parsing to validate JSON + try { + JSON.parse(content) + } catch (e) { + const error = e as Error + issues.push(`JSON parse error: ${error.message}`) + } + + const fixed = before !== content + // Only write to disk if no content was provided (legacy mode) + if (fixed && !providedContent) { + fs.writeFileSync(jsonPath, content, "utf8") + } + + return { fixed, issues, content } +} + +function languagesFromEnv(): string[] | undefined { + const env = process.env.TARGET_LANGUAGES?.trim() + if (!env) return undefined + return env + .split(",") + .map((s) => s.trim()) + .filter(Boolean) +} + +export function runSanitizer( + filesWithContent?: Array<{ path: string; content: string }>, + langs?: string[] +) { + console.log("[SANITIZE] Starting post-import sanitizer") + + let mdFilesToProcess: Array<{ path: string; content: string }> = [] + let jsonFilesToProcess: Array<{ path: string; content: string }> = [] + + if (filesWithContent && filesWithContent.length > 0) { + // Process only the specific files provided with their in-memory content + console.log( + `[SANITIZE] Target: ${filesWithContent.length} specific file(s)` + ) + mdFilesToProcess = filesWithContent.filter((f) => f.path.endsWith(".md")) + jsonFilesToProcess = filesWithContent.filter((f) => + f.path.endsWith(".json") + ) + } else { + // Fallback to language-based scanning (reads from disk) + const effectiveLangs = langs || languagesFromEnv() 
+ console.log( + "[SANITIZE] Target languages:", + effectiveLangs ?? "ALL detected in translations/" + ) + const mdFilePaths = listFiles(CONTENT_ROOT, (f) => { + if (!f.endsWith(".md")) return false + if (!f.includes(`${path.sep}translations${path.sep}`)) return false + if (effectiveLangs) + return effectiveLangs.some((l) => + f.includes(`${path.sep}translations${path.sep}${l}${path.sep}`) + ) + return true + }) + const jsonFilePaths = listFiles(CONTENT_ROOT, (f) => { + if (!f.endsWith(".json")) return false + if (!f.includes(`${path.sep}translations${path.sep}`)) return false + if (effectiveLangs) + return effectiveLangs.some((l) => + f.includes(`${path.sep}translations${path.sep}${l}${path.sep}`) + ) + return true + }) + // Convert file paths to objects without content (will be read from disk) + mdFilesToProcess = mdFilePaths.map((p) => ({ path: p, content: "" })) + jsonFilesToProcess = jsonFilePaths.map((p) => ({ path: p, content: "" })) + } + + let mdFixed = 0 + const mdIssues: Array<{ file: string; issues: string[] }> = [] + const mdChanged: Array<{ path: string; content: string }> = [] + + for (const fileInfo of mdFilesToProcess) { + const { fixed, issues, content } = processMarkdownFile( + fileInfo.path, + fileInfo.content + ) + if (fixed) { + mdFixed++ + mdChanged.push({ path: fileInfo.path, content }) + } + if (issues.length) + mdIssues.push({ file: path.relative(ROOT, fileInfo.path), issues }) + } + + let jsonFixed = 0 + const jsonIssues: Array<{ file: string; issues: string[] }> = [] + const jsonChanged: Array<{ path: string; content: string }> = [] + + for (const fileInfo of jsonFilesToProcess) { + const { fixed, issues, content } = processJsonFile( + fileInfo.path, + fileInfo.content + ) + if (fixed) { + jsonFixed++ + jsonChanged.push({ path: fileInfo.path, content }) + } + if (issues.length) + jsonIssues.push({ file: path.relative(ROOT, fileInfo.path), issues }) + } + + console.log( + `\n[SANITIZE] Markdown files scanned: ${mdFilesToProcess.length}, 
fixed: ${mdFixed}` + ) + console.log( + `[SANITIZE] JSON files scanned: ${jsonFilesToProcess.length}, fixed: ${jsonFixed}` + ) + + if (mdIssues.length || jsonIssues.length) { + console.log("\n[SANITIZE] Issues detected:") + for (const i of mdIssues) { + console.log(` - MD ${i.file}`) + for (const msg of i.issues) console.log(` • ${msg}`) + } + for (const i of jsonIssues) { + console.log(` - JSON ${i.file}`) + for (const msg of i.issues) console.log(` • ${msg}`) + } + } else { + console.log("\n[SANITIZE] No issues detected.") + } + + const changedFiles = [...mdChanged, ...jsonChanged].map((f) => ({ + path: f.path, + content: f.content, + })) + return { + changedFiles, + markdown: { scanned: mdFilesToProcess.length, fixed: mdFixed }, + json: { scanned: jsonFilesToProcess.length, fixed: jsonFixed }, + issues: { markdown: mdIssues, json: jsonIssues }, + } +} + +if (require.main === module) { + runSanitizer() +} diff --git a/src/scripts/i18n/translate-jsx-attributes.ts b/src/scripts/i18n/translate-jsx-attributes.ts new file mode 100644 index 00000000000..ae195b65fca --- /dev/null +++ b/src/scripts/i18n/translate-jsx-attributes.ts @@ -0,0 +1,252 @@ +/** + * Standalone JSX attribute translation module + * + * Can be called from: + * 1. Main i18n workflow (after Crowdin download, before sanitizer) + * 2. 
Dedicated GitHub Action (accepts branch/PR, runs in isolation) + * + * Usage: + * npx ts-node translate-jsx-attributes.ts --language es --files file1.md,file2.md + * npx ts-node translate-jsx-attributes.ts --language es --branch translations/es + */ + +import fs from "fs" +import path from "path" + +import { isGeminiAvailable, translateAttributesByFile } from "./lib/ai" +import type { + ExtractedAttribute, + FileExtractionResult, + FileTranslationResult, + JsxTranslationSummary, +} from "./lib/jsx-attributes" +import { + countExtractedAttributes, + extractAttributesFromFile, + reinsertTranslatedAttributes, +} from "./lib/jsx-attributes" + +/** + * Options for JSX attribute translation + */ +export interface TranslateJsxOptions { + /** Target language code (e.g., "es", "fr") */ + targetLanguage: string + /** Files to process (path and content) */ + files: { path: string; content: string }[] + /** Glossary terms for this language (English term -> translated term) */ + glossaryTerms?: Map + /** Whether to log verbose output */ + verbose?: boolean +} + +/** + * Translate JSX attributes in a batch of files for a single language. + * This is the main entry point for both workflow integration and standalone use. 
+ */ +export async function translateJsxAttributes( + options: TranslateJsxOptions +): Promise { + const { targetLanguage, files, glossaryTerms, verbose = false } = options + + console.log(`\n[JSX-TRANSLATE] Starting JSX attribute translation`) + console.log(`[JSX-TRANSLATE] Target language: ${targetLanguage}`) + console.log(`[JSX-TRANSLATE] Files to process: ${files.length}`) + + // Check Gemini availability + const geminiAvailable = isGeminiAvailable() + if (!geminiAvailable) { + console.warn( + `[JSX-TRANSLATE] ⚠️ GEMINI_API_KEY not available, skipping translation` + ) + return { + filesProcessed: files.length, + filesWithChanges: 0, + attributesTranslated: 0, + attributesFailed: 0, + geminiAvailable: false, + updatedFiles: [], + } + } + + // Extract attributes from all files + const extractions: FileExtractionResult[] = [] + const attributesByFile = new Map() + + for (const file of files) { + // Only process markdown files + if (!file.path.endsWith(".md") && !file.path.endsWith(".mdx")) { + continue + } + + const extraction = extractAttributesFromFile(file.content, file.path) + extractions.push(extraction) + + if (extraction.attributes.length > 0) { + attributesByFile.set(file.path, extraction.attributes) + if (verbose) { + console.log( + `[JSX-TRANSLATE] Found ${extraction.attributes.length} attributes in ${file.path}` + ) + } + } + } + + const totalAttributes = countExtractedAttributes(extractions) + console.log( + `[JSX-TRANSLATE] Found ${totalAttributes} translatable attributes in ${attributesByFile.size} files` + ) + + if (totalAttributes === 0) { + console.log(`[JSX-TRANSLATE] No attributes to translate`) + return { + filesProcessed: files.length, + filesWithChanges: 0, + attributesTranslated: 0, + attributesFailed: 0, + geminiAvailable: true, + updatedFiles: [], + } + } + + // Translate attributes via Gemini (one API call per file batch) + const translatedByFile = await translateAttributesByFile( + attributesByFile, + targetLanguage, + glossaryTerms + ) 
+ + // Re-insert translated attributes into files + const updatedFiles: FileTranslationResult[] = [] + let attributesTranslated = 0 + let attributesFailed = 0 + + for (const extraction of extractions) { + const translated = translatedByFile.get(extraction.filePath) || [] + const result = reinsertTranslatedAttributes(extraction, translated) + + if (result.hasChanges) { + updatedFiles.push(result) + attributesTranslated += translated.length + } + + // Count failed as those we extracted but didn't get back + const originalCount = extraction.attributes.length + const translatedCount = translated.length + if (translatedCount < originalCount) { + attributesFailed += originalCount - translatedCount + } + } + + console.log(`[JSX-TRANSLATE] ✓ Translation complete`) + console.log(`[JSX-TRANSLATE] - Files with changes: ${updatedFiles.length}`) + console.log( + `[JSX-TRANSLATE] - Attributes translated: ${attributesTranslated}` + ) + if (attributesFailed > 0) { + console.log(`[JSX-TRANSLATE] - Attributes failed: ${attributesFailed}`) + } + + return { + filesProcessed: files.length, + filesWithChanges: updatedFiles.length, + attributesTranslated, + attributesFailed, + geminiAvailable: true, + updatedFiles, + } +} + +/** + * Read files from disk for standalone execution + */ +function readFilesFromDisk( + filePaths: string[] +): { path: string; content: string }[] { + return filePaths.map((filePath) => { + const absolutePath = path.isAbsolute(filePath) + ? filePath + : path.join(process.cwd(), filePath) + const content = fs.readFileSync(absolutePath, "utf-8") + return { path: filePath, content } + }) +} + +/** + * Write updated files back to disk + */ +function writeFilesToDisk(files: FileTranslationResult[]): void { + for (const file of files) { + const absolutePath = path.isAbsolute(file.filePath) + ? 
file.filePath + : path.join(process.cwd(), file.filePath) + fs.writeFileSync(absolutePath, file.updatedContent, "utf-8") + console.log(`[JSX-TRANSLATE] Wrote: ${file.filePath}`) + } +} + +/** + * Parse CLI arguments + */ +function parseArgs(): { language: string; files: string[] } | null { + const args = process.argv.slice(2) + let language = "" + let files: string[] = [] + + for (let i = 0; i < args.length; i++) { + if (args[i] === "--language" || args[i] === "-l") { + language = args[++i] + } else if (args[i] === "--files" || args[i] === "-f") { + files = args[++i].split(",").map((f) => f.trim()) + } + } + + if (!language || files.length === 0) { + return null + } + + return { language, files } +} + +/** + * CLI entry point for standalone execution + */ +async function main() { + const parsed = parseArgs() + + if (!parsed) { + console.log(` +Usage: npx ts-node translate-jsx-attributes.ts --language --files + +Options: + --language, -l Target language code (e.g., "es", "fr", "de") + --files, -f Comma-separated list of file paths to process + +Example: + npx ts-node translate-jsx-attributes.ts -l es -f public/content/roadmap/pbs/index.md +`) + process.exit(1) + } + + const fileContents = readFilesFromDisk(parsed.files) + const result = await translateJsxAttributes({ + targetLanguage: parsed.language, + files: fileContents, + verbose: true, + }) + + if (result.updatedFiles.length > 0) { + writeFilesToDisk(result.updatedFiles) + console.log(`\n✓ Updated ${result.updatedFiles.length} files`) + } else { + console.log(`\nNo files were modified`) + } +} + +// Run CLI if executed directly +if (require.main === module) { + main().catch((err) => { + console.error("Error:", err) + process.exit(1) + }) +} diff --git a/src/scripts/i18n/unhide-strings.ts b/src/scripts/i18n/unhide-strings.ts deleted file mode 100644 index 40aed1b9298..00000000000 --- a/src/scripts/i18n/unhide-strings.ts +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Unhide all hidden/duplicate strings in a Crowdin file 
- */ - -import dotenv from "dotenv" - -dotenv.config({ path: ".env.local" }) - -const API_KEY = process.env.I18N_CROWDIN_API_KEY! -const PROJ_ID = 834930 -const TARGET_FILE_ID = 17434 // organizing/index.md - -const requestHeaders = { - Authorization: `Bearer ${API_KEY}`, - "Content-Type": "application/json", -} - -async function unhideAllStrings() { - console.log(`\n=== Unhiding strings in file ${TARGET_FILE_ID} ===`) - - // Get all strings from the file - const listUrl = `https://api.crowdin.com/api/v2/projects/${PROJ_ID}/strings?fileId=${TARGET_FILE_ID}&limit=500` - - const listRes = await fetch(listUrl, { headers: requestHeaders }) - if (!listRes.ok) { - throw new Error(`Failed to list strings: ${listRes.status}`) - } - - const listJson = await listRes.json() - console.log(`Found ${listJson.data.length} strings`) - - let unhiddenCount = 0 - - for (const item of listJson.data) { - const stringId = item.data.id - const isHidden = item.data.isHidden - - if (!isHidden) { - continue - } - - // Unhide the string using PATCH - const patchUrl = `https://api.crowdin.com/api/v2/projects/${PROJ_ID}/strings/${stringId}` - - const patchRes = await fetch(patchUrl, { - method: "PATCH", - headers: requestHeaders, - body: JSON.stringify([ - { - op: "replace", - path: "/isHidden", - value: false, - }, - ]), - }) - - if (!patchRes.ok) { - const text = await patchRes.text() - console.error(`Failed to unhide string ${stringId}: ${text}`) - continue - } - - unhiddenCount++ - if (unhiddenCount % 10 === 0) { - console.log(`Unhidden ${unhiddenCount} strings...`) - } - } - - console.log(`\n✅ Successfully unhidden ${unhiddenCount} strings!`) -} - -unhideAllStrings().catch(console.error)