diff --git a/.claude/commands/fix-sanitizer-bug.md b/.claude/commands/fix-sanitizer-bug.md new file mode 100644 index 00000000000..b41395a41d6 --- /dev/null +++ b/.claude/commands/fix-sanitizer-bug.md @@ -0,0 +1,392 @@ +--- +description: Guided workflow for fixing translation sanitizer bugs — triage, test, fix, verify +allowed-tools: Bash, Read, Glob, Grep, Task, Edit, Write, AskUserQuestion +argument-hint: [--language=CODE] [--issue="description"] [--file=PATH] [--skip-build] +--- + +# Fix Sanitizer Bug + +Iterative workflow for fixing bugs in the post-import translation sanitizer (`src/scripts/i18n/post_import_sanitize.ts`). Follows a test-first approach: triage the issue, write a failing test, implement the fix, verify across languages. + +## Context +- Current branch: !`git branch --show-current` +- Arguments: $ARGUMENTS +- Sanitizer: `src/scripts/i18n/post_import_sanitize.ts` +- Test files: `tests/unit/sanitizer/*.spec.ts` +- Research docs: `docs/solutions/integration-issues/` + +## Phase 0: Gather Context + +### Parse Flags + +Extract from $ARGUMENTS: +- `LANGUAGE`: from `--language=CODE` (e.g., `ja`, `zh-tw`, `es`) +- `ISSUE_DESC`: from `--issue="..."` (brief description of the bug) +- `FILE_PATH`: from `--file=PATH` (specific file where issue was spotted) +- `SKIP_BUILD`: from `--skip-build` (skip build verification) + +### Collect Information + +If flags are missing, use AskUserQuestion to gather: + +1. **What language?** — Which locale has the issue (e.g., `ja`, `tr`, `zh-tw`) +2. **What's the artifact?** — Exact text of the translation bug (copy-paste the broken string) +3. **Where?** — File path or general area (markdown content, JSON translations, frontmatter) +4. **English source?** — What does the correct English look like + +Read the affected file and the English source to confirm the issue: +``` +Translated: {FILE_PATH} +English: {ENGLISH_EQUIVALENT_PATH} +``` + +**IMPORTANT:** Capture the exact broken pattern NOW before any processing. 
Copy the raw artifact verbatim — you'll need it for the test. + +## Phase 1: Document the Problem + +Append the new pattern to the research doc: + +**File:** `docs/solutions/integration-issues/sanitizer-test-research.md` + +Add a row to the "New Patterns Not Yet Covered by Sanitizer" table: + +```markdown +| N+1 | {PATTERN_DESCRIPTION} | {SOURCE_PR_OR_REVIEW} | `{EXACT_EXAMPLE}` | {SEVERITY} | +``` + +Severity guide: +- **Critical** — Breaks rendering, navigation, or MDX compilation +- **High** — Breaks links, images, or loses content +- **Medium** — Wrong text displayed, semantic errors +- **Low** — Cosmetic, formatting-only + +## Phase 2: Triage — Fix, Warn, or Document Only + +This is the most important decision. Use AskUserQuestion: + +**"What type of fix does this need?"** + +### Option A: Deterministic Fix (auto-correct) +Use when the pattern is: +- Regex-matchable with no false positives +- The correct output is always the same (no judgment needed) +- Safe to apply across all languages + +Examples: escaped bold `\*\*text\*\*`, ticker typos `EHT→ETH`, date format `DD/MM/YYYY→YYYY-MM-DD` + +→ Proceed to Phase 3A (write fix function + test) + +### Option B: Warning Only (detect + report) +Use when: +- The pattern is detectable but the fix requires context/judgment +- Auto-fixing could cause collateral damage (see Bug #1: href substitution) +- Different files may need different resolutions + +Examples: translated hrefs, missing brand names, code fence drift + +→ Proceed to Phase 3B (write warn function + test) + +### Option C: Document Only (not automatable) +Use when: +- The issue is semantic (wrong word choice, not a pattern) +- No reliable regex can detect it +- It needs human/AI review judgment + +Examples: "Gas" → "Sprit" (gasoline) in German, tone inconsistency + +→ Skip to Phase 7 (update docs only) + +## Phase 3A: Write Failing Test (Fix Function) + +### Determine which test file + +- Pure function (no English source needed) → `tests/unit/sanitizer/standalone-fixes.spec.ts` +- Needs 
English comparison → `tests/unit/sanitizer/english-comparison.spec.ts` +- End-to-end through processMarkdownFile/processJsonFile → `tests/unit/sanitizer/integration.spec.ts` + +### Write the test FIRST + +Add a `test.describe` block for the new function. Include at minimum: + +```typescript +test.describe("fixNewIssue", () => { + test("fixes the broken pattern", () => { + const input = "{EXACT_BROKEN_PATTERN_FROM_PHASE_0}" + const { content, fixCount } = fixNewIssue(input) + expect(content).toBe("{EXPECTED_CORRECT_OUTPUT}") + expect(fixCount).toBe(1) + }) + + test("leaves correct content unchanged", () => { + const input = "{ALREADY_CORRECT_CONTENT}" + const { content, fixCount } = fixNewIssue(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips code blocks", () => { + // If the fix operates on prose, it MUST skip code blocks + const input = "```\n{PATTERN_INSIDE_CODE}\n```" + const { content, fixCount } = fixNewIssue(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) +}) +``` + +### Add import to test file + +Add the new function name to the destructured import from `_testOnly` at the top of the test file. + +### Verify test fails + +```bash +npx playwright test --project=unit tests/unit/sanitizer/{FILE}.spec.ts +``` + +The new test MUST fail (function doesn't exist yet). Existing tests should still pass. 
+ +## Phase 3B: Write Failing Test (Warn Function) + +Same as 3A but assert warnings instead of content changes: + +```typescript +test.describe("warnNewIssue", () => { + test("warns on broken pattern", () => { + const warnings = warnNewIssue("{BROKEN_INPUT}", "{ENGLISH_INPUT}") + expect(warnings.length).toBeGreaterThan(0) + expect(warnings[0]).toContain("{EXPECTED_WARNING_SUBSTRING}") + }) + + test("no warning on clean content", () => { + const warnings = warnNewIssue("{CLEAN_INPUT}", "{ENGLISH_INPUT}") + expect(warnings).toHaveLength(0) + }) +}) +``` + +Use `tests/unit/sanitizer/warnings.spec.ts` for warn-only functions. + +## Phase 4: Implement the Fix + +### Write the function in the sanitizer + +**File:** `src/scripts/i18n/post_import_sanitize.ts` + +**For fix functions** — follow the established pattern: + +```typescript +function fixNewIssue(content: string): { + content: string + fixCount: number +} { + let fixCount = 0 + + // MANDATORY: Split to preserve code blocks + const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g + const parts = content.split(codeBlockPattern) + + for (let i = 0; i < parts.length; i++) { + if (i % 2 === 1) continue // Skip code blocks + + // Your fix logic here + parts[i] = parts[i].replace(/{PATTERN}/g, () => { + fixCount++ + return "{REPLACEMENT}" + }) + } + + return { content: parts.join(""), fixCount } +} +``` + +**Critical rules:** +- ALWAYS split on code blocks first (fenced + inline) +- ALWAYS return `{ content, fixCount }` for fix functions +- ALWAYS return `string[]` for warn functions +- Use word boundaries `\b` for brand names to avoid partial matches +- Use `escapeRegex()` when building regex from dynamic strings + +### Add to _testOnly export + +Add the function name to the `_testOnly` export object near the bottom of the file. 
+ +### Wire into processMarkdownFile or processJsonFile + +Add the function call using the `applyFix` helper pattern: + +```typescript +applyFix( + () => fixNewIssue(content), + (n) => `Fixed ${n} new issues` +) +``` + +**Placement matters:** Consider whether the fix should run before or after existing fixes. Some fixes depend on others having run first. + +For warn functions, add directly: +```typescript +const newWarnings = warnNewIssue(content, englishMd) +issues.push(...newWarnings) +``` + +## Phase 5: Run Tests and Verify + +### Step 1: Unit tests + +```bash +npx playwright test --project=unit tests/unit/sanitizer/ +``` + +**All tests must pass** — both the new test and all existing 99+ tests. + +If a test fails: +- New test fails → fix the implementation, not the test +- Existing test fails → your fix has a regression, investigate the interaction + +### Step 2: Run sanitizer against real files + +```bash +TARGET_LANGUAGES={LANGUAGE} npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts +``` + +Check the output for: +- Your fix being applied (look for the issue label in the log) +- No unexpected fixes in other areas +- Fix count looks reasonable (not 0, not thousands) + +### Step 3: Inspect the actual changes + +```bash +git diff public/content/translations/{LANGUAGE}/ +``` + +Verify: +- The broken pattern is corrected +- No collateral damage to surrounding content +- Changes look correct to a human reader + +### Step 4: Build verification (conditional) + +**Only run this if the fix touches MDX syntax** — angle brackets, tags, components, backticks. + +**Skip if** `--skip-build` flag is set, or fix is purely textual (ticker corrections, brand tags, date normalization, guillemets, bold/italic escaping). + +```bash +NEXT_PUBLIC_BUILD_LOCALES=en,{LANGUAGE} pnpm build +``` + +**NOTE:** This step requires `dangerouslyDisableSandbox: true` and significant RAM. Only use when the fix could affect MDX compilation. 
+ +### Step 5: Cross-language spot check + +Run the sanitizer against 2-3 other languages to check for false positives: + +```bash +TARGET_LANGUAGES=es,tr,ja npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts +``` + +Check that your fix doesn't trigger unexpectedly in other languages. + +## Phase 6: If Not Resolved + +If the fix doesn't resolve the issue after Phase 5: + +### Diagnose the root cause + +Use AskUserQuestion: + +**"What went wrong?"** + +1. **Pattern mismatch** — regex doesn't match the real-world variant + - Get more examples of the broken pattern + - Broaden the regex + - Add another test case for the variant + - Go back to Phase 4 + +2. **Interaction effect** — another fix runs first and changes content + - Identify which fix runs first and transforms the input + - Reorder the fix in `processMarkdownFile` (earlier or later) + - Add an interaction test in `integration.spec.ts` + - Go back to Phase 4 + +3. **False positives in other languages** — fix breaks something elsewhere + - Add language-specific exclusions + - Add a cross-language test case + - Consider making it warn-only instead + - Go back to Phase 3 + +4. **Not actually automatable** — needs more context than regex can provide + - Convert to warn function or document-only + - Go back to Phase 2 and re-triage + +## Phase 7: Update Documentation + +### Update research doc + +**File:** `docs/solutions/integration-issues/sanitizer-test-research.md` + +If the pattern was new (not already in the table), ensure it was added in Phase 1. + +If the fix worked, move the pattern from "New Patterns Not Yet Covered" to "Patterns Already Handled by Sanitizer" with the function name. 
+ +### Update existing bug docs if relevant + +Check if this relates to previously documented bugs: +- `docs/solutions/integration-issues/post-import-sanitizer-bugs-found-japanese-review.md` + +### Report summary + +Display to user: +``` +## Fix Complete + +**Issue:** {ISSUE_DESCRIPTION} +**Type:** {fix | warn | document-only} +**Function:** {FUNCTION_NAME} +**Test file:** {TEST_FILE} +**Tests:** {N} new tests added, {TOTAL} total passing +**Languages verified:** {LANGUAGES_CHECKED} +**Files changed:** + - src/scripts/i18n/post_import_sanitize.ts (fix + export) + - tests/unit/sanitizer/{FILE}.spec.ts (new tests) + - docs/solutions/integration-issues/sanitizer-test-research.md (documentation) +``` + +## Quick Reference + +### Run all sanitizer tests +```bash +npx playwright test --project=unit tests/unit/sanitizer/ +``` + +### Run sanitizer against a language +```bash +TARGET_LANGUAGES=ja npx ts-node -O '{"module":"commonjs"}' ./src/scripts/i18n/post_import_sanitize.ts +``` + +### Key files +| File | Purpose | +|------|---------| +| `src/scripts/i18n/post_import_sanitize.ts` | Sanitizer source (~2100 lines) | +| `tests/unit/sanitizer/standalone-fixes.spec.ts` | Tests for pure functions | +| `tests/unit/sanitizer/english-comparison.spec.ts` | Tests needing English source | +| `tests/unit/sanitizer/warnings.spec.ts` | Tests for warn-only functions | +| `tests/unit/sanitizer/integration.spec.ts` | End-to-end tests | +| `docs/solutions/integration-issues/sanitizer-test-research.md` | Pattern catalog | + +### Code block awareness pattern +Every text transformation MUST use this split pattern: +```typescript +const codeBlockPattern = /(```[\s\S]*?```|~~~[\s\S]*?~~~|`[^`]+`)/g +const parts = content.split(codeBlockPattern) +for (let i = 0; i < parts.length; i++) { + if (i % 2 === 1) continue // Skip code blocks + // Transform parts[i] only +} +``` + +### Function signature conventions +- **Fix functions:** `(content: string) => { content: string; fixCount: number }` +- 
**Fix w/ English:** `(translated: string, english: string) => { content: string; fixCount: number }` +- **Warn functions:** `(content: string, ...) => string[]` diff --git a/docs/solutions/integration-issues/sanitizer-test-research.md b/docs/solutions/integration-issues/sanitizer-test-research.md new file mode 100644 index 00000000000..2ef75c1291c --- /dev/null +++ b/docs/solutions/integration-issues/sanitizer-test-research.md @@ -0,0 +1,59 @@ +# Sanitizer Test Research: Patterns from PR Analysis + +> **Date:** 2026-02-25 +> **Source PRs:** #17544 (zh-tw), #17529 (sw), #17492 (ta), #17441 (bn), #17467 (ur), #17389 (de), #17182 (tr), #17132 (ja), #17090 (zh), #16979 (es) +> **Purpose:** Document new translation artifact patterns found during PR research, informing future sanitizer improvements and test coverage. + +## New Patterns Not Yet Covered by Sanitizer + +| # | Pattern | Source PR | Example | Severity | +|---|---------|-----------|---------|----------| +| 1 | Full-width parentheses break markdown links | zh-tw #17544 | `[text]（/url/）` instead of `[text](/url/)` | High — breaks navigation | +| 2 | Lorem ipsum placeholder left in JSON | zh #17090 | "Lorem ipsum dolor sit amet" in real translation value | Medium — user-visible | +| 3 | Protocol separator corruption | ta #17492 | `http.bitcoinmagazine.com` instead of `http://` | High — breaks links | +| 4 | Chinese text leaking into image paths | zh #17090 | `![alt](./file.png 中文)` | High — breaks images | +| 5 | Missing whitespace around inline HTML in JSON | es #16979 | `wordwordword` | Low — cosmetic | +| 6 | Crowdin `''text''` double-apostrophe artifacts | sw #17529 | 86 occurrences across 3 files | Medium — unnatural text | +| 7 | Translated `@username` GitHub handles | sw #17529 | `@axic` → `kwaaxic` | Medium — broken attribution | +| 8 | Translated interpolation placeholders in JSON | bn #17441 | `{appName}` → `{Bengali script}` | Critical — breaks rendering | +| 9 | Simplified Chinese contamination in 
zh-tw | zh-tw #17544 | `着` (simplified) instead of `著` (traditional) | Medium — wrong variant | +| 10 | "Gas" translated as "Sprit" (gasoline) in German | de #17389 | 31 replacements needed across files | Medium — semantic error | +| 11 | Dropped glossary links during translation | ur #17467 | Entire `` tag removed, only text remains | High — loses links | + +## Patterns Already Handled by Sanitizer (Confirmed Working) + +These patterns are covered by existing fix functions and should have regression tests: + +- **Duplicated headings** (`fixDuplicatedHeadings`) — `## Text? Text? {#id}` +- **Broken markdown links** (`fixBrokenMarkdownLinks`) — `] (url)` space +- **Escaped bold/italic** (`fixEscapedBoldAndItalic`) — `\*\*text\*\*` +- **ASCII guillemets** (`fixAsciiGuillemets`) — `<>` +- **Ticker transpositions** (`fixTickerTranspositions`) — `EHT` → `ETH` +- **MDX angle brackets** (`escapeMdxAngleBrackets`) — `<5GB` +- **Orphaned closing tags** (`removeOrphanedClosingTags`) — trailing `` +- **Block component line breaks** (`fixBlockComponentLineBreaks`) +- **Frontmatter date normalization** (`normalizeFrontmatterDates`) +- **Frontmatter non-ASCII quoting** (`quoteFrontmatterNonAscii`) +- **Header ID sync** (`syncHeaderIdsWithEnglish`) +- **Brand tag restoration** (`fixBrandTags`) +- **Protected frontmatter sync** (`syncProtectedFrontmatterFields`) +- **Translated href detection** (`fixTranslatedHrefs`) — warn only +- **Cross-script contamination** (`detectCrossScriptContamination`) +- **Code fence drift** (`warnCodeFenceContentDrift`) +- **Backslash escape restoration** (`restoreDroppedBackslashEscapes`) +- **Unclosed backtick repair** (`repairUnclosedBackticks`) + +## Recommendations for Future Sanitizer Iteration + +1. **Full-width parentheses** (#1) — Add regex to normalize `（` → `(` and `）` → `)` inside markdown link syntax +2. **Translated interpolation placeholders** (#8) — Compare `{placeholder}` tokens between English and translated JSON; flag mismatches +3. 
**Protocol corruption** (#3) — Detect `http.` or `https.` followed by a domain and flag as potential `://` corruption +4. **Lorem ipsum detection** (#2) — Simple regex check in JSON values for "Lorem ipsum" +5. **Double-apostrophe artifacts** (#6) — Replace `''` with `'` in non-code contexts +6. **Translated @handles** (#7) — Compare `@username` patterns against English source + +## Related Documentation + +- [Post-Import Sanitizer Bugs Found During Japanese Review](./post-import-sanitizer-bugs-found-japanese-review.md) +- [Crowdin Import Review Agent Calibration](./crowdin-import-review-agent-calibration.md) +- [Crowdin File Path Mapping and Review Workflow](./crowdin-file-path-mapping-and-review-workflow.md) diff --git a/src/scripts/i18n/post_import_sanitize.ts b/src/scripts/i18n/post_import_sanitize.ts index 72bb32c09d7..e3ec46ded32 100644 --- a/src/scripts/i18n/post_import_sanitize.ts +++ b/src/scripts/i18n/post_import_sanitize.ts @@ -2099,6 +2099,49 @@ export async function runSanitizer( } } +/** @internal Exposed for unit testing only. Not part of the public API. 
*/ +export const _testOnly = { + // Standalone fixes + fixDuplicatedHeadings, + fixBrokenMarkdownLinks, + fixEscapedBoldAndItalic, + fixAsciiGuillemets, + fixBlockComponentLineBreaks, + fixTickerTranspositions, + escapeMdxAngleBrackets, + removeOrphanedClosingTags, + normalizeFrontmatterDates, + quoteFrontmatterNonAscii, + normalizeBlockHtmlLines, + // English-comparison fixes + syncHeaderIdsWithEnglish, + fixTranslatedHrefs, + fixBrandTags, + fixProtectedBrandNames, + syncProtectedFrontmatterFields, + restoreBlankLinesFromEnglish, + collapseInlineHtmlFromEnglish, + fixMergedClosingTags, + normalizeInlineComponentsFromEnglish, + repairUnclosedBackticks, + restoreDroppedBackslashEscapes, + fixCollapsedComponentLineBreaks, + // Warnings + warnPunctuationOnlyHeadings, + warnCodeFenceContentDrift, + detectCrossScriptContamination, + // Utilities + toAsciiId, + extractHeaderStructure, + escapeRegex, + extractHrefs, + isInternalHref, + splitIntoBlocks, + // Entry points + processMarkdownFile, + processJsonFile, +} + if (require.main === module) { runSanitizer().catch(console.error) } diff --git a/tests/unit/sanitizer/english-comparison.spec.ts b/tests/unit/sanitizer/english-comparison.spec.ts new file mode 100644 index 00000000000..e706e68e519 --- /dev/null +++ b/tests/unit/sanitizer/english-comparison.spec.ts @@ -0,0 +1,397 @@ +/** + * Unit tests for sanitizer functions that compare translated content + * against English source content. 
+ */ + +import { expect, test } from "@playwright/test" + +import { _testOnly } from "@/scripts/i18n/post_import_sanitize" + +const { + syncHeaderIdsWithEnglish, + fixBrandTags, + fixProtectedBrandNames, + syncProtectedFrontmatterFields, + restoreBlankLinesFromEnglish, + collapseInlineHtmlFromEnglish, + fixMergedClosingTags, + normalizeInlineComponentsFromEnglish, + repairUnclosedBackticks, + restoreDroppedBackslashEscapes, + fixCollapsedComponentLineBreaks, +} = _testOnly + +test.describe("English Comparison Fixes", () => { + test.describe("syncHeaderIdsWithEnglish", () => { + test("replaces translated IDs with ASCII English IDs when counts match", () => { + const english = [ + "## What is Ethereum? {#what-is-ethereum}", + "## How does it work? {#how-does-it-work}", + ].join("\n") + const translated = [ + "## \u30A4\u30FC\u30B5\u30EA\u30A2\u30E0\u3068\u306F? {#\u30A4\u30FC\u30B5\u30EA\u30A2\u30E0\u3068\u306F}", + "## \u3069\u306E\u3088\u3046\u306B\u6A5F\u80FD\u3059\u308B\u304B? {#\u3069\u306E\u3088\u3046\u306B}", + ].join("\n") + const result = syncHeaderIdsWithEnglish(translated, english) + expect(result).toContain("{#what-is-ethereum}") + expect(result).toContain("{#how-does-it-work}") + }) + + test("returns original when header counts mismatch", () => { + const english = "## One heading {#one}" + const translated = [ + "## \u898B\u51FA\u30571 {#one}", + "## \u898B\u51FA\u30572 {#two}", + ].join("\n") + const result = syncHeaderIdsWithEnglish(translated, english) + expect(result).toBe(translated) + }) + + test("normalizes accented IDs to ASCII", () => { + const english = "## \u00DCber uns {#\u00FCber-uns}" + const translated = "## \u79C1\u305F\u3061\u306B\u3064\u3044\u3066 {#\u79C1\u305F\u3061}" + const result = syncHeaderIdsWithEnglish(translated, english) + expect(result).toContain("{#uber-uns}") + }) + }) + + test.describe("fixBrandTags", () => { + test("restores brand tags to canonical casing", () => { + const english = [ + "---", + 'tags: ["solidity", 
"ethereum"]', + "---", + "Content", + ].join("\n") + const translated = [ + "---", + 'tags: ["\u30BD\u30EA\u30C7\u30A3\u30C6\u30A3", "\u30A4\u30FC\u30B5\u30EA\u30A2\u30E0"]', + "---", + "Content", + ].join("\n") + const { content, fixCount } = fixBrandTags(translated, english) + expect(content).toContain('"Solidity"') + expect(content).toContain('"Ethereum"') + expect(fixCount).toBe(2) + }) + + test("leaves non-brand concept tags translated", () => { + const english = [ + "---", + 'tags: ["zero-knowledge", "solidity"]', + "---", + ].join("\n") + const translated = [ + "---", + 'tags: ["\u30BC\u30ED\u77E5\u8B58", "\u30BD\u30EA\u30C7\u30A3\u30C6\u30A3"]', + "---", + ].join("\n") + const { content, fixCount } = fixBrandTags(translated, english) + // "zero-knowledge" is not a brand, so it stays translated + expect(content).toContain('"\u30BC\u30ED\u77E5\u8B58"') + // "solidity" is a brand, so it becomes "Solidity" + expect(content).toContain('"Solidity"') + expect(fixCount).toBe(1) + }) + + test("returns unchanged when tag counts mismatch", () => { + const english = [ + "---", + 'tags: ["solidity"]', + "---", + ].join("\n") + const translated = [ + "---", + 'tags: ["\u30BD\u30EA\u30C7\u30A3\u30C6\u30A3", "extra"]', + "---", + ].join("\n") + const { content, fixCount } = fixBrandTags(translated, english) + expect(content).toBe(translated) + expect(fixCount).toBe(0) + }) + + test("returns unchanged when no frontmatter", () => { + const { content, fixCount } = fixBrandTags("no frontmatter", "no frontmatter") + expect(content).toBe("no frontmatter") + expect(fixCount).toBe(0) + }) + }) + + test.describe("fixProtectedBrandNames", () => { + test("warns when brand count drops in translation", () => { + const english = "Ethereum is great. Ethereum rocks. Ethereum forever." + const translated = "Ethereum is great. Something else. Something more." 
+ const { warnings } = fixProtectedBrandNames(translated, english) + const ethereumWarning = warnings.find((w) => + w.includes('"Ethereum"') + ) + expect(ethereumWarning).toBeDefined() + expect(ethereumWarning).toContain("3x in English") + expect(ethereumWarning).toContain("1x in translation") + }) + + test("delegates tag fixing to fixBrandTags", () => { + const english = [ + "---", + 'tags: ["solidity"]', + "---", + "Content", + ].join("\n") + const translated = [ + "---", + 'tags: ["\u30BD\u30EA\u30C7\u30A3\u30C6\u30A3"]', + "---", + "Content", + ].join("\n") + const { content, fixCount } = fixProtectedBrandNames(translated, english) + expect(content).toContain('"Solidity"') + expect(fixCount).toBe(1) + }) + }) + + test.describe("syncProtectedFrontmatterFields", () => { + test("restores translated protected fields from English", () => { + const english = [ + "---", + "template: tutorial", + "sidebar: true", + "published: 2024-01-01", + "---", + ].join("\n") + const translated = [ + "---", + "template: \u30C1\u30E5\u30FC\u30C8\u30EA\u30A2\u30EB", + "sidebar: \u306F\u3044", + "published: 01-01-2024", + "---", + ].join("\n") + const { content, fixCount } = syncProtectedFrontmatterFields( + translated, + english + ) + expect(content).toContain("template: tutorial") + expect(content).toContain("sidebar: true") + expect(content).toContain("published: 2024-01-01") + expect(fixCount).toBe(3) + }) + + test("does NOT sync lang field", () => { + const english = "---\nlang: en\n---" + const translated = "---\nlang: ja\n---" + const { content, fixCount } = syncProtectedFrontmatterFields( + translated, + english + ) + expect(content).toContain("lang: ja") + expect(fixCount).toBe(0) + }) + + test("leaves already-correct fields unchanged", () => { + const english = "---\ntemplate: tutorial\n---" + const translated = "---\ntemplate: tutorial\n---" + const { content, fixCount } = syncProtectedFrontmatterFields( + translated, + english + ) + expect(content).toBe(translated) + 
expect(fixCount).toBe(0) + }) + }) + + test.describe("restoreBlankLinesFromEnglish", () => { + test("adds blank line after heading when English has it", () => { + const english = "## Heading {#id}\n\nParagraph text" + const translated = "## \u898B\u51FA\u3057 {#id}\nParagraph text" + const { content, fixCount } = restoreBlankLinesFromEnglish( + translated, + english + ) + expect(content).toContain("## \u898B\u51FA\u3057 {#id}\n\nParagraph text") + expect(fixCount).toBe(1) + }) + + test("leaves unchanged when both already have blank lines", () => { + const english = "## Heading {#id}\n\nText" + const translated = "## \u898B\u51FA\u3057 {#id}\n\nText" + const { content, fixCount } = restoreBlankLinesFromEnglish( + translated, + english + ) + expect(content).toBe(translated) + expect(fixCount).toBe(0) + }) + }) + + test.describe("collapseInlineHtmlFromEnglish", () => { + test("collapses multi-line to single line when English is single-line", () => { + const english = "
content here
" + const translated = "
content here\n
" + const { content, fixCount } = collapseInlineHtmlFromEnglish( + translated, + english + ) + expect(content).toBe("
content here
") + expect(fixCount).toBe(1) + }) + + test("leaves multi-line when English is multi-line", () => { + const english = "
\ncontent\n
" + const translated = "
content\n
" + const { content, fixCount } = collapseInlineHtmlFromEnglish( + translated, + english + ) + // English is not single-line for this div, so no collapse + expect(content).toBe(translated) + expect(fixCount).toBe(0) + }) + }) + + test.describe("fixMergedClosingTags", () => { + test("splits merged closing tag when English has it on own line", () => { + const english = [ + '', + " Click here", + "", + ].join("\n") + const translated = [ + '', + " \u30AF\u30EA\u30C3\u30AF", + ].join("\n") + const { content, fixCount } = fixMergedClosingTags(translated, english) + expect(content).toContain("\u30AF\u30EA\u30C3\u30AF\n") + expect(content).toContain("") + expect(fixCount).toBe(1) + }) + + test("leaves unchanged when English has single-line format", () => { + const english = 'Click' + const translated = '\u30AF\u30EA\u30C3\u30AF' + const { content, fixCount } = fixMergedClosingTags(translated, english) + expect(content).toBe(translated) + expect(fixCount).toBe(0) + }) + }) + + test.describe("normalizeInlineComponentsFromEnglish", () => { + test("collapses multi-line ButtonLink to match English single-line", () => { + const english = + 'Learn more' + const translated = + '\n \u8A73\u7D30\u306F\u3053\u3061\u3089\n' + const { content, fixCount } = normalizeInlineComponentsFromEnglish( + translated, + english + ) + expect(content).not.toContain("\n") + expect(content).toContain("\u8A73\u7D30\u306F\u3053\u3061\u3089") + expect(fixCount).toBe(1) + }) + + test("keys by href attribute for matching", () => { + const english = [ + 'Text A', + '\n Text B\n', + ].join("\n") + const translated = [ + '\n Text A\n', + '\n Text B\n', + ].join("\n") + const { content, fixCount } = normalizeInlineComponentsFromEnglish( + translated, + english + ) + // Only /a should be collapsed (English is single-line) + // /b stays multi-line (English is multi-line) + expect(fixCount).toBe(1) + // The collapsed one should not have newlines around its content + expect(content).toMatch(/Text 
A<\/ButtonLink>/) + }) + }) + + test.describe("repairUnclosedBackticks", () => { + test("adds closing backtick when English has balanced pair", () => { + const english = "Use the `` to store data" + const translated = "Use the ` to store data" + const { content, fixCount } = repairUnclosedBackticks( + translated, + english + ) + expect(content).toContain("``") + expect(fixCount).toBe(1) + }) + + test("leaves balanced backticks unchanged", () => { + const english = "Use the `` to store data" + const translated = "Use the `` to store data" + const { content, fixCount } = repairUnclosedBackticks( + translated, + english + ) + expect(content).toBe(translated) + expect(fixCount).toBe(0) + }) + }) + + test.describe("restoreDroppedBackslashEscapes", () => { + test("restores backslash before < when English has it", () => { + const english = "Values \\ are mapped" + const translated = "Values are mapped" + const { content, fixCount } = restoreDroppedBackslashEscapes( + translated, + english + ) + expect(content).toContain("\\") + expect(fixCount).toBe(1) + }) + + test("restores backslash for <= comparison", () => { + const english = "When x \\<=256" + const translated = "When x <=256" + const { content, fixCount } = restoreDroppedBackslashEscapes( + translated, + english + ) + expect(content).toContain("\\<=256") + expect(fixCount).toBe(1) + }) + + test("skips code blocks", () => { + const english = "```\n\\\n```\nProse \\" + const translated = "```\n\n```\nProse " + const { content, fixCount } = restoreDroppedBackslashEscapes( + translated, + english + ) + // Code block should not be touched + expect(content).toContain("```\n\n```") + // Prose should be fixed + expect(content).toContain("Prose \\") + expect(fixCount).toBe(1) + }) + }) + + test.describe("fixCollapsedComponentLineBreaks", () => { + test("inserts newline between components when English has it", () => { + const english = "\n" + const translated = " " + const { content, fixCount } = 
fixCollapsedComponentLineBreaks( + translated, + english + ) + expect(content).toBe("\n") + expect(fixCount).toBe(1) + }) + + test("leaves already-separated components unchanged", () => { + const english = "\n" + const translated = "\n" + const { content, fixCount } = fixCollapsedComponentLineBreaks( + translated, + english + ) + expect(content).toBe(translated) + expect(fixCount).toBe(0) + }) + }) +}) diff --git a/tests/unit/sanitizer/integration.spec.ts b/tests/unit/sanitizer/integration.spec.ts new file mode 100644 index 00000000000..d3dc55edb75 --- /dev/null +++ b/tests/unit/sanitizer/integration.spec.ts @@ -0,0 +1,88 @@ +/** + * Integration tests for the sanitizer entry points. + * Tests processMarkdownFile and processJsonFile end-to-end. + */ + +import { expect, test } from "@playwright/test" + +import { _testOnly } from "@/scripts/i18n/post_import_sanitize" + +const { processMarkdownFile, processJsonFile } = _testOnly + +test.describe("Integration Tests", () => { + test.describe("processMarkdownFile", () => { + test("fixes multiple issues in a single pass", () => { + const content = [ + "---", + "title: Test", + "---", + "", + "## Heading? Heading? {#heading}", + "", + "See [link] (https://example.com) for more.", + "", + "\\*\\*bold\\*\\* text here", + "", + "Use <> for quoting", + ].join("\n") + + // Use a path without /translations/ to skip English-comparison fixes + const result = processMarkdownFile("/tmp/test.md", content) + + expect(result.fixed).toBe(true) + // Duplicated heading fixed + expect(result.content).toContain("## Heading? {#heading}") + expect(result.content).not.toContain("Heading? 
Heading?") + // Broken link fixed + expect(result.content).toContain("[link](https://example.com)") + // Escaped bold fixed + expect(result.content).toContain("**bold**") + // Guillemets fixed + expect(result.content).toContain("\u00AB") + expect(result.content).toContain("\u00BB") + }) + + test("standalone fixes applied when path has no translations segment", () => { + const content = "Some EHT content with \\*\\*bold\\*\\*" + const result = processMarkdownFile("/tmp/test.md", content) + + expect(result.content).toContain("ETH") + expect(result.content).toContain("**bold**") + // Should note that translations segment is missing + expect(result.issues.some((i) => i.includes("No translations segment"))).toBe(true) + }) + }) + + test.describe("processJsonFile", () => { + test("removes BOM", () => { + const content = '\uFEFF{"key": "value"}' + const result = processJsonFile("/tmp/test.json", content) + expect(result.fixed).toBe(true) + expect(result.content).toBe('{"key": "value"}') + }) + + test("removes BOM and validates JSON", () => { + const content = '\uFEFF{"key": "value", "num": 42}' + const result = processJsonFile("/tmp/test.json", content) + expect(result.fixed).toBe(true) + expect(result.content).toBe('{"key": "value", "num": 42}') + expect(result.content).not.toContain("\uFEFF") + expect(result.issues).toHaveLength(0) + }) + + test("reports JSON parse errors", () => { + const content = '{"key": broken}' + const result = processJsonFile("/tmp/test.json", content) + expect(result.issues.some((i) => i.includes("JSON parse error"))).toBe( + true + ) + }) + + test("leaves valid JSON unchanged", () => { + const content = '{"key": "value"}' + const result = processJsonFile("/tmp/test.json", content) + expect(result.fixed).toBe(false) + expect(result.content).toBe(content) + }) + }) +}) diff --git a/tests/unit/sanitizer/standalone-fixes.spec.ts b/tests/unit/sanitizer/standalone-fixes.spec.ts new file mode 100644 index 00000000000..ef4cfd1a538 --- /dev/null +++ 
b/tests/unit/sanitizer/standalone-fixes.spec.ts @@ -0,0 +1,409 @@ +/** + * Unit tests for standalone sanitizer fix functions. + * These functions take only content (no English source needed). + */ + +import { expect, test } from "@playwright/test" + +import { _testOnly } from "@/scripts/i18n/post_import_sanitize" + +const { + fixDuplicatedHeadings, + fixBrokenMarkdownLinks, + fixEscapedBoldAndItalic, + fixAsciiGuillemets, + fixBlockComponentLineBreaks, + fixTickerTranspositions, + escapeMdxAngleBrackets, + removeOrphanedClosingTags, + normalizeFrontmatterDates, + quoteFrontmatterNonAscii, + normalizeBlockHtmlLines, + toAsciiId, + escapeRegex, + extractHrefs, + isInternalHref, + splitIntoBlocks, +} = _testOnly + +test.describe("Standalone Fixes", () => { + test.describe("fixDuplicatedHeadings", () => { + test("removes duplicated heading text", () => { + const input = "## What is Ethereum? What is Ethereum? {#what-is-ethereum}" + const { content, fixCount } = fixDuplicatedHeadings(input) + expect(content).toBe("## What is Ethereum? {#what-is-ethereum}") + expect(fixCount).toBe(1) + }) + + test("leaves non-duplicated headings unchanged", () => { + const input = "## Normal heading {#normal}" + const { content, fixCount } = fixDuplicatedHeadings(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("handles multiple headings with only some duplicated", () => { + const input = [ + "## Good heading {#good}", + "## Bad? Bad? {#bad}", + "### Also fine {#fine}", + ].join("\n") + const { content, fixCount } = fixDuplicatedHeadings(input) + expect(content).toContain("## Good heading {#good}") + expect(content).toContain("## Bad? 
{#bad}") + expect(content).toContain("### Also fine {#fine}") + expect(fixCount).toBe(1) + }) + }) + + test.describe("fixBrokenMarkdownLinks", () => { + test("removes space between ] and (", () => { + const input = "[text] (https://example.com)" + const { content, fixCount } = fixBrokenMarkdownLinks(input) + expect(content).toBe("[text](https://example.com)") + expect(fixCount).toBe(1) + }) + + test("leaves correct links unchanged", () => { + const input = "[text](https://example.com)" + const { content, fixCount } = fixBrokenMarkdownLinks(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("fixes multiple broken links in one string", () => { + const input = + "See [link1] (url1) and [link2] (url2) for more." + const { content, fixCount } = fixBrokenMarkdownLinks(input) + expect(content).toBe("See [link1](url1) and [link2](url2) for more.") + expect(fixCount).toBe(2) + }) + }) + + test.describe("fixEscapedBoldAndItalic", () => { + test("unescapes bold markers", () => { + const input = "This is \\*\\*bold\\*\\* text" + const { content, fixCount } = fixEscapedBoldAndItalic(input) + expect(content).toBe("This is **bold** text") + expect(fixCount).toBe(1) + }) + + test("unescapes italic markers", () => { + const input = "This is \\*italic\\* text" + const { content, fixCount } = fixEscapedBoldAndItalic(input) + expect(content).toBe("This is *italic* text") + expect(fixCount).toBe(1) + }) + + test("skips table rows where escaped stars may be intentional", () => { + const input = "| 2\\*\\*256 | exponent |" + const { content, fixCount } = fixEscapedBoldAndItalic(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips code fences", () => { + const input = "```\n\\*\\*bold\\*\\*\n```" + const { content, fixCount } = fixEscapedBoldAndItalic(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("fixes prose but skips table in mixed content", () => { + const input = [ + "\\*\\*bold\\*\\* prose", + 
"| 2\\*\\*256 | value |", + ].join("\n") + const { content, fixCount } = fixEscapedBoldAndItalic(input) + expect(content).toContain("**bold** prose") + expect(content).toContain("| 2\\*\\*256 | value |") + expect(fixCount).toBe(1) + }) + }) + + test.describe("fixAsciiGuillemets", () => { + test("converts << and >> to Unicode guillemets", () => { + const input = "<>" + const { content, fixCount } = fixAsciiGuillemets(input) + expect(content).toBe("\u00ABtext\u00BB") + expect(fixCount).toBe(2) + }) + + test("skips inline code", () => { + const input = "Use `<>` for shift" + const { content, fixCount } = fixAsciiGuillemets(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips fenced code blocks", () => { + const input = "```\nresult = a << b\n```" + const { content, fixCount } = fixAsciiGuillemets(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + }) + + test.describe("fixTickerTranspositions", () => { + test("corrects EHT to ETH", () => { + const input = "Send some EHT to the address" + const { content, fixCount } = fixTickerTranspositions(input) + expect(content).toBe("Send some ETH to the address") + expect(fixCount).toBe(1) + }) + + test("corrects BSL to BLS and ECDAS to ECDSA", () => { + const input = "BSL signatures use ECDAS" + const { content, fixCount } = fixTickerTranspositions(input) + expect(content).toBe("BLS signatures use ECDSA") + expect(fixCount).toBe(2) + }) + + test("skips code fences", () => { + const input = "```\nconst EHT = 'ticker'\n```" + const { content, fixCount } = fixTickerTranspositions(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips inline code", () => { + const input = "The `EHT` variable is used here" + const { content, fixCount } = fixTickerTranspositions(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + }) + + test.describe("escapeMdxAngleBrackets", () => { + test("escapes < before digit", () => { + const input = "Requires 
<5GB of disk space" + const { content, fixCount } = escapeMdxAngleBrackets(input) + expect(content).toBe("Requires <5GB of disk space") + expect(fixCount).toBe(1) + }) + + test("escapes bare JSX fragment <>", () => { + const input = "Returns <> from the function" + const { content, fixCount } = escapeMdxAngleBrackets(input) + expect(content).toBe("Returns \\<> from the function") + expect(fixCount).toBe(1) + }) + + test("escapes bare closing fragment ", () => { + const input = "Ends with here" + const { content, fixCount } = escapeMdxAngleBrackets(input) + expect(content).toBe("Ends with \\ here") + expect(fixCount).toBe(1) + }) + + test("skips code blocks", () => { + const input = "```\nif (x <5) return\n```" + const { content, fixCount } = escapeMdxAngleBrackets(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("does not double-escape already escaped content", () => { + const input = "Requires <5GB of space" + const { content, fixCount } = escapeMdxAngleBrackets(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + }) + + test.describe("removeOrphanedClosingTags", () => { + test("removes trailing orphan when paired closer exists", () => { + const input = 'Home some prose ' + const { content, fixCount } = removeOrphanedClosingTags(input) + expect(content).toBe('Home some prose') + expect(fixCount).toBe(1) + }) + + test("leaves balanced tags unchanged", () => { + const input = 'Home' + const { content, fixCount } = removeOrphanedClosingTags(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips code spans", () => { + const input = "`` - description" + const { content, fixCount } = removeOrphanedClosingTags(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips fenced code blocks", () => { + const input = "```html\n\n```" + const { content, fixCount } = removeOrphanedClosingTags(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + 
test("handles multiple orphan types on different lines", () => { + const input = "text \nmore text " + const { content, fixCount } = removeOrphanedClosingTags(input) + expect(content).not.toContain("") + expect(content).not.toContain("") + expect(fixCount).toBe(2) + }) + + test("keeps first closer when one opener exists but two closers", () => { + const input = 'text' + const { content, fixCount } = removeOrphanedClosingTags(input) + expect(content).toBe('text') + expect(fixCount).toBe(1) + }) + }) + + test.describe("fixBlockComponentLineBreaks", () => { + test("adds newline before closing tag", () => { + const input = "Some content" + const { content, fixCount } = fixBlockComponentLineBreaks(input) + expect(content).toBe("Some content\n") + expect(fixCount).toBeGreaterThanOrEqual(1) + }) + + test("adds newline after opening tag", () => { + const input = "Some content" + const { content, fixCount } = fixBlockComponentLineBreaks(input) + expect(content).toBe("\nSome content") + expect(fixCount).toBeGreaterThanOrEqual(1) + }) + + test("leaves already separated tags unchanged in content", () => { + const input = "\nSome content\n" + const { content } = fixBlockComponentLineBreaks(input) + // Content should be identical even if regex matches (replacement is a no-op) + expect(content).toBe(input) + }) + + test("handles multiple component types", () => { + const input = "text\nmore" + const { content } = fixBlockComponentLineBreaks(input) + expect(content).toContain("text\n") + expect(content).toContain("more\n") + }) + }) + + test.describe("normalizeFrontmatterDates", () => { + test("converts DD-MM-YYYY to ISO format", () => { + const input = "---\npublished: 25-02-2026\n---\nContent" + const { content, fixCount } = normalizeFrontmatterDates(input) + expect(content).toContain("published: 2026-02-25") + expect(fixCount).toBe(1) + }) + + test("converts DD/MM/YYYY with zero-padding", () => { + const input = "---\npublished: 5/2/2026\n---\nContent" + const { content, 
fixCount } = normalizeFrontmatterDates(input) + expect(content).toContain("published: 2026-02-05") + expect(fixCount).toBe(1) + }) + + test("leaves ISO dates unchanged", () => { + const input = "---\npublished: 2026-02-25\n---\nContent" + const { content, fixCount } = normalizeFrontmatterDates(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("returns unchanged when no frontmatter", () => { + const input = "No frontmatter here" + const { content, fixCount } = normalizeFrontmatterDates(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + }) + + test.describe("quoteFrontmatterNonAscii", () => { + test("quotes values with non-ASCII characters", () => { + const input = '---\ntitle: \u00DCber Ethereum\n---\nContent' + const { content, fixCount } = quoteFrontmatterNonAscii(input) + expect(content).toContain('title: "\u00DCber Ethereum"') + expect(fixCount).toBe(1) + }) + + test("leaves already-quoted values unchanged", () => { + const input = '---\ntitle: "\u00DCber Ethereum"\n---\nContent' + const { content, fixCount } = quoteFrontmatterNonAscii(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("skips YAML arrays", () => { + const input = '---\ntags: ["\u00FCber", "test"]\n---\nContent' + const { content, fixCount } = quoteFrontmatterNonAscii(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + + test("leaves ASCII-only values unchanged", () => { + const input = "---\ntitle: About Ethereum\n---\nContent" + const { content, fixCount } = quoteFrontmatterNonAscii(input) + expect(content).toBe(input) + expect(fixCount).toBe(0) + }) + }) + + test.describe("normalizeBlockHtmlLines", () => { + test("splits inline closing tag to own line", () => { + const input = "some text" + const result = normalizeBlockHtmlLines(input) + expect(result).toBe("some text\n") + }) + + test("leaves already-separated tags unchanged", () => { + const input = "some text\n" + const result = 
normalizeBlockHtmlLines(input) + expect(result).toBe(input) + }) + }) + + test.describe("Utility functions", () => { + test("toAsciiId normalizes accented characters", () => { + expect(toAsciiId("qu-est-ce-qu-ethereum")).toBe( + "qu-est-ce-qu-ethereum" + ) + expect(toAsciiId("\u00FCber-ethereum")).toBe("uber-ethereum") + }) + + test("toAsciiId strips non-ASCII non-alphanumeric chars", () => { + // Each non-ASCII char (including NFD decomposition products) becomes "-" + const result = toAsciiId("\u4F55\u304C-ethereum") + expect(result).toMatch(/^-+-ethereum$/) + expect(result).not.toContain("\u4F55") + expect(result).not.toContain("\u304C") + }) + + test("escapeRegex escapes special regex characters", () => { + expect(escapeRegex("foo.bar[0]")).toBe("foo\\.bar\\[0\\]") + expect(escapeRegex("a+b*c")).toBe("a\\+b\\*c") + }) + + test("extractHrefs finds markdown and HTML hrefs", () => { + const input = + '[link](/path) and text and [ext](https://example.com)' + const hrefs = extractHrefs(input) + expect(hrefs.has("/path")).toBe(true) + expect(hrefs.has("/other")).toBe(true) + expect(hrefs.has("https://example.com")).toBe(true) + }) + + test("isInternalHref identifies internal links", () => { + expect(isInternalHref("/about")).toBe(true) + expect(isInternalHref("/en/docs")).toBe(true) + expect(isInternalHref("//cdn.example.com")).toBe(false) + expect(isInternalHref("https://example.com")).toBe(false) + }) + + test("splitIntoBlocks splits on blank lines", () => { + const input = "Block one\n\nBlock two\n\nBlock three" + const blocks = splitIntoBlocks(input) + expect(blocks).toHaveLength(3) + expect(blocks[0]).toBe("Block one") + expect(blocks[1]).toBe("Block two") + expect(blocks[2]).toBe("Block three") + }) + }) +}) diff --git a/tests/unit/sanitizer/warnings.spec.ts b/tests/unit/sanitizer/warnings.spec.ts new file mode 100644 index 00000000000..1f4419faa20 --- /dev/null +++ b/tests/unit/sanitizer/warnings.spec.ts @@ -0,0 +1,151 @@ +/** + * Unit tests for sanitizer 
warning functions. + * These functions detect issues and return warnings without modifying content. + */ + +import { expect, test } from "@playwright/test" + +import { _testOnly } from "@/scripts/i18n/post_import_sanitize" + +const { + warnPunctuationOnlyHeadings, + warnCodeFenceContentDrift, + fixTranslatedHrefs, + detectCrossScriptContamination, +} = _testOnly + +test.describe("Warning Functions", () => { + test.describe("warnPunctuationOnlyHeadings", () => { + test("warns on heading with only punctuation text", () => { + const input = "## \u3002 {#who-is-involved}" + const warnings = warnPunctuationOnlyHeadings(input) + expect(warnings.length).toBe(1) + expect(warnings[0]).toContain("only punctuation") + }) + + test("does not warn on real heading text", () => { + const input = "## Real heading {#id}" + const warnings = warnPunctuationOnlyHeadings(input) + expect(warnings).toHaveLength(0) + }) + + test("warns on question-mark-only heading", () => { + const input = "## ??? {#faq}" + const warnings = warnPunctuationOnlyHeadings(input) + expect(warnings.length).toBe(1) + expect(warnings[0]).toContain("only punctuation") + }) + }) + + test.describe("warnCodeFenceContentDrift", () => { + test("no warning when fences are identical", () => { + const content = "```js\nconst x = 1\n```" + const warnings = warnCodeFenceContentDrift(content, content) + expect(warnings).toHaveLength(0) + }) + + test("warns when code content was translated", () => { + const english = "```js\nconst x = 1\n```" + const translated = "```js\nconst x = 1\u306E\u5024\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + expect(warnings.length).toBe(1) + expect(warnings[0]).toContain("content differs") + }) + + test("warns on fence count mismatch", () => { + const english = "```js\ncode1\n```\n\n```py\ncode2\n```" + const translated = "```js\ncode1\n```" + const warnings = warnCodeFenceContentDrift(translated, english) + expect(warnings.length).toBe(1) + 
expect(warnings[0]).toContain("count mismatch") + }) + }) + + test.describe("fixTranslatedHrefs (warn-only)", () => { + test("NEVER modifies content", () => { + const english = + "See [docs](/docs) and [about](/about)" + const translated = + "See [\u30C9\u30AD\u30E5\u30E1\u30F3\u30C8](/wrong-path) and [\u6982\u8981](/about)" + const { content, fixCount } = fixTranslatedHrefs(translated, english) + expect(content).toBe(translated) + expect(fixCount).toBe(0) + }) + + test("warns about missing English hrefs", () => { + const english = "See [docs](/docs) and [about](/about)" + const translated = "See [\u30C9\u30AD\u30E5\u30E1\u30F3\u30C8](/docs)" + const { warnings } = fixTranslatedHrefs(translated, english) + const missingWarning = warnings.find((w) => w.includes("/about")) + expect(missingWarning).toBeDefined() + expect(missingWarning).toContain("Missing href") + }) + + test("warns about translation-only hrefs", () => { + const english = "See [docs](/docs)" + const translated = + "See [\u30C9\u30AD\u30E5\u30E1\u30F3\u30C8](/docs) and [\u4ED6](/other)" + const { warnings } = fixTranslatedHrefs(translated, english) + const invalidWarning = warnings.find((w) => w.includes("/other")) + expect(invalidWarning).toBeDefined() + expect(invalidWarning).toContain("Invalid internal href") + }) + + test("no warnings when all hrefs match", () => { + const content = "See [text](/docs) and [more](/about)" + const { warnings } = fixTranslatedHrefs(content, content) + expect(warnings).toHaveLength(0) + }) + + test("warns on block count mismatch without modifying content", () => { + const english = "Block one\n\nBlock two\n\nBlock three" + const translated = "Block one\n\nBlock two" + const { content, warnings } = fixTranslatedHrefs(translated, english) + expect(content).toBe(translated) + const blockWarning = warnings.find((w) => w.includes("Block count")) + expect(blockWarning).toBeDefined() + }) + }) + + test.describe("detectCrossScriptContamination", () => { + test("warns on 
Cyrillic chars in Japanese content", () => { + const content = "\u30A4\u30FC\u30B5\u30EA\u30A2\u30E0\u306F \u0410\u0411\u0412 \u3067\u3059" + const warnings = detectCrossScriptContamination(content, "ja") + expect(warnings.length).toBe(1) + expect(warnings[0]).toContain("Cyrillic") + }) + + test("warns on Devanagari chars in Bengali content", () => { + const content = "\u09AC\u09BE\u0982\u09B2\u09BE \u0915\u0916\u0917 \u099F\u09C7\u0995\u09CD\u09B8\u099F" + const warnings = detectCrossScriptContamination(content, "bn") + expect(warnings.length).toBe(1) + expect(warnings[0]).toContain("Devanagari") + }) + + test("warns on CJK chars in Tamil content", () => { + const content = "\u0BA4\u0BAE\u0BBF\u0BB4\u0BCD \u4E2D\u6587 \u0B89\u0BB0\u0BC8" + const warnings = detectCrossScriptContamination(content, "ta") + expect(warnings.length).toBe(1) + expect(warnings[0]).toContain("CJK") + }) + + test("returns no warnings for unknown locale", () => { + const content = "some \u0410\u0411\u0412 text" + const warnings = detectCrossScriptContamination(content, "xx-unknown") + expect(warnings).toHaveLength(0) + }) + + test("skips characters inside code blocks", () => { + const content = "```\n\u0410\u0411\u0412\n```\n\u30C6\u30B9\u30C8" + const warnings = detectCrossScriptContamination(content, "ja") + // Cyrillic is inside code block, should be skipped + expect(warnings).toHaveLength(0) + }) + + test("skips characters inside inline code", () => { + const content = "\u30C6\u30B9\u30C8 `\u0410\u0411\u0412` \u30C6\u30B9\u30C8" + const warnings = detectCrossScriptContamination(content, "ja") + expect(warnings).toHaveLength(0) + }) + }) +})