diff --git a/.github/workflows/validate-translations.yml b/.github/workflows/validate-translations.yml new file mode 100644 index 00000000000..7381eeb7e12 --- /dev/null +++ b/.github/workflows/validate-translations.yml @@ -0,0 +1,37 @@ +name: Validate Translation URLs + +on: + pull_request: + paths: + - "public/content/translations/**" + - "src/intl/**" + workflow_run: + workflows: ["Crowdin CI"] + types: [completed] + +jobs: + validate: + runs-on: ubuntu-latest + # Only run if triggered by PR or if Crowdin CI succeeded + if: ${{ github.event_name == 'pull_request' || github.event.workflow_run.conclusion == 'success' }} + timeout-minutes: 15 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup pnpm + uses: pnpm/action-setup@v2 + with: + version: 9 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: "20" + cache: "pnpm" + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Validate translated URLs + run: pnpm validate-urls diff --git a/package.json b/package.json index a43d2800816..80501f9fb12 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,8 @@ "crowdin-clean": "rm -rf .crowdin && mkdir .crowdin", "crowdin-import": "ts-node src/scripts/crowdin-import.ts", "markdown-checker": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/markdownChecker.ts", + "validate-urls": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/validateTranslatedUrls.ts", + "validate-urls:fix": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/validateTranslatedUrls.ts --fix", "events-import": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/events-import.ts", "crowdin-needs-review": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/crowdin/reports/generateReviewReport.ts", "update-tutorials": "ts-node -O '{ \"module\": \"commonjs\" }' src/scripts/update-tutorials-list.ts", diff --git a/public/content/translations/ga/community/research/index.md 
b/public/content/translations/ga/community/research/index.md index d84b5d13652..81a6f629d37 100644 --- a/public/content/translations/ga/community/research/index.md +++ b/public/content/translations/ga/community/research/index.md @@ -20,7 +20,7 @@ Tugann an tuarascáil seo a foilsíodh i mBealtaine 2022 ag [DelphiDigital](http ## Foinsí Maoinithe {#sources-of-funding} -Is féidir leat a bheith páirteach i dtaighde Ethereum agus íocaíocht a fháil! Mar shampla, reáchtáil [Fondúireacht Ethereum](/foundation/) [babhta maoinithe na nDeontas Acadúla] le déanaí ( https://esp.ethereum.foundation/academic-grants). Is féidir leat faisnéis a fháil ar dheiseanna maoinithe gníomhacha agus atá le teacht ar [leathanach deontas Ethereum](/pobal/deontais/). +Is féidir leat a bheith páirteach i dtaighde Ethereum agus íocaíocht a fháil! Mar shampla, reáchtáil [Fondúireacht Ethereum](/foundation/) [babhta maoinithe na nDeontas Acadúla] le déanaí ( https://esp.ethereum.foundation/academic-grants). Is féidir leat faisnéis a fháil ar dheiseanna maoinithe gníomhacha agus atá le teacht ar [leathanach deontas Ethereum](/community/grants/). ## Taighde prótacail {#protocol-research} diff --git a/public/content/translations/ga/developers/docs/scaling/state-channels/index.md b/public/content/translations/ga/developers/docs/scaling/state-channels/index.md index 031e728a966..e7c3f67895a 100644 --- a/public/content/translations/ga/developers/docs/scaling/state-channels/index.md +++ b/public/content/translations/ga/developers/docs/scaling/state-channels/index.md @@ -45,7 +45,7 @@ Mar sin is féidir conradh cliste a rith as slabhra idir dhá úsáideoir. Sa ch Cé go réitíonn sé seo an fhadhb inscálaithe a luadh níos luaithe, tá impleachtaí aige do shlándáil. Ar Ethereum, déantar bailíocht aistrithe staide a fhorghníomhú ag prótacal comhdhearcadh an líonra. Fágann sin go bhfuil sé dodhéanta nuashonrú neamhbhailí a mholadh do staid an chonartha chliste nó forghníomhú conartha cliste a athrú. 
-Níl na ráthaíochtaí slándála céanna ag cainéil staide. Go pointe áirithe, is leagan beag de Mainnet é cainéal staide. Agus líon teoranta rannpháirtithe ag cur rialacha i bhfeidhm, méadaítear an fhéidearthacht go ndéanfaí iompar mailíseach (m.sh. nuashonruithe staide neamhbhailí a mholadh). Díorthaíonn cainéil staide a slándáil ó chóras eadrána díospóide atá bunaithe ar [cruthúnais calaoise](/gluais/#fraud-proof). +Níl na ráthaíochtaí slándála céanna ag cainéil staide. Go pointe áirithe, is leagan beag de Mainnet é cainéal staide. Agus líon teoranta rannpháirtithe ag cur rialacha i bhfeidhm, méadaítear an fhéidearthacht go ndéanfaí iompar mailíseach (m.sh. nuashonruithe staide neamhbhailí a mholadh). Díorthaíonn cainéil staide a slándáil ó chóras eadrána díospóide atá bunaithe ar [cruthúnais calaoise](/glossary/#fraud-proof). ## Conas a oibríonn cainéil staide {#how-state-channels-work} @@ -115,7 +115,7 @@ Cibé cás, bíonn ráthaíochtaí críochnaitheachta láidre ag úsáideoirí c Cé go bhfuil siad ann mar phrótacail as slabhra, tá comhpháirt ar slabhra ag cainéil staide: an conradh cliste a imscartar ar Ethereum agus an cainéal á oscailt. Rialaíonn an conradh seo na sócmhainní a thaisctear sa chainéal, fíoraíonn sé nuashonruithe staide, agus eadránaíonn sé díospóidí idir rannpháirtithe. -Ní fhoilsíonn cainéil staide sonraí idirbhirt ná gealltanais staide chuig Mainnet, murab ionann agus réitigh scálaithe [ciseal 2](/ciseal-2/). Mar sin féin, tá siad níos nasctha le Mainnet ná, abair, [taobhshlabhraí](/developers/docs/scaling/sidechains/), rud a fhágann go bhfuil siad beagán níos sábháilte. +Ní fhoilsíonn cainéil staide sonraí idirbhirt ná gealltanais staide chuig Mainnet, murab ionann agus réitigh scálaithe [ciseal 2](/layer-2/). Mar sin féin, tá siad níos nasctha le Mainnet ná, abair, [taobhshlabhraí](/developers/docs/scaling/sidechains/), rud a fhágann go bhfuil siad beagán níos sábháilte. 
Braitheann cainéil staide ar phríomhphrótacal Ethereum le haghaidh na nithe seo a leanas: diff --git a/public/content/translations/ga/ethereum-forks/index.md b/public/content/translations/ga/ethereum-forks/index.md index 3913b5069d6..2fe0973f269 100644 --- a/public/content/translations/ga/ethereum-forks/index.md +++ b/public/content/translations/ga/ethereum-forks/index.md @@ -40,7 +40,7 @@ Tá na bogearraí atá mar bhonn le Ethereum comhdhéanta de dhá leath, ar a dt **Uasghrádú comhthola** -Ó seoladh an [Beacon Chain](/gluais/#beacon-chain), ainmnítear uasghráduithe ar an **ciseal comhdhearcadh** i ndiaidh réaltaí neamhaí ag tosú le litreacha a théann ar aghaidh in ord aibítre: +Ó seoladh an [Beacon Chain](/glossary/#beacon-chain), ainmnítear uasghráduithe ar an **ciseal comhdhearcadh** i ndiaidh réaltaí neamhaí ag tosú le litreacha a théann ar aghaidh in ord aibítre: | Uasghrádú Ainm | Dáta Uasghrádaithe | | ----------------------------------------------------------- | ------------ | diff --git a/public/content/translations/ga/roadmap/dencun/index.md b/public/content/translations/ga/roadmap/dencun/index.md index 575b0bdffd6..b49f0dc7004 100644 --- a/public/content/translations/ga/roadmap/dencun/index.md +++ b/public/content/translations/ga/roadmap/dencun/index.md @@ -31,11 +31,11 @@ Tugann Dencun aghaidh go príomha ar **inscálaitheacht** (níos mó úsáideoir Tá cur chuige "láraithe ar rollú suas" á ghlacadh ag pobal Ethereum maidir lena fhás, rud a dhéanann príomhbhealach chun tacú le níos mó úsáideoirí go sábháilte de rollú suas ciseal 2. -Láimhseálann líonraí rollta _próiseáil_ (nó “forghníomhú”) na n-idirbhearta scartha ó Mainnet agus ansin foilsíonn siad cruthúnas cripteagrafach agus/nó sonraí idirbhirt chomhbhrúite ar na torthaí ar ais go Mainnet chun taifid a choinneáil. 
Gabhann costas (i bhfoirm [gás](/gluais/#gas)) leis na cruthúnais seo a stóráil, a raibh ar gach oibreoir nód líonra é a stóráil go buan roimh Proto-Danksharding, rud a fhágann gur tasc costasach é. +Láimhseálann líonraí rollta _próiseáil_ (nó “forghníomhú”) na n-idirbhearta scartha ó Mainnet agus ansin foilsíonn siad cruthúnas cripteagrafach agus/nó sonraí idirbhirt chomhbhrúite ar na torthaí ar ais go Mainnet chun taifid a choinneáil. Gabhann costas (i bhfoirm [gás](/glossary/#gas)) leis na cruthúnais seo a stóráil, a raibh ar gach oibreoir nód líonra é a stóráil go buan roimh Proto-Danksharding, rud a fhágann gur tasc costasach é. Cuireann tabhairt isteach Proto-Danksharding in uasghrádú Dencun stóráil sonraí níos saoire ar fáil do na cruthúnais seo gan a éileamh ar oibreoirí nód na sonraí seo a stóráil ach ar feadh thart ar 18 lá, agus ina dhiaidh sin is féidir sonraí a bhaint go sábháilte chun leathnú ar riachtanais crua-earraí a chosc. Toisc go mbíonn tréimhse aistarraingthe 7 lá ag rollú suas go hiondúil, níl aon athrú ar a samhail slándála fad is a bhíonn blobaí ar fáil ar L1 don tréimhse sin. Soláthraíonn an fhuinneog bhearrtha 18-lá maolán suntasach don tréimhse seo. -[Tuilleadh ar scálú Ethereum](/treochlár/scálú/) +[Tuilleadh ar scálú Ethereum](/roadmap/scaling/) ## Conas a fhaightear rochtain ar sheanshonraí blobaí? {#historical-access} @@ -97,7 +97,7 @@ Ní féidir. Baineann buntáistí Proto-Danksharding go sonrach le rollaí cisea Ní chiallaíonn a chomhoiriúnacht le Meaisín Fíorúil Ethereum (EVM) go bhfaighidh líonra aon tairbhe as an uasghrádú seo. Ní stórálann líonraí a oibríonn go neamhspleách ar Ethereum (cibé acu comhoiriúnach le EVM nó nach gan a bheith) a gcuid sonraí ar Ethereum agus ní bhainfidh siad aon tairbhe as an uasghrádú seo. -[Tuilleadh faoi rolluithe suas ciseal 2](/ciseal-2/) +[Tuilleadh faoi rolluithe suas ciseal 2](/layer-2/) ## An bhfuil tú níos mó d’fhoghlaimeoir amhairc? 
{#visual-learner} diff --git a/public/content/translations/ha/dao/index.md b/public/content/translations/ha/dao/index.md index 0b6c2092498..2cff31df6f6 100644 --- a/public/content/translations/ha/dao/index.md +++ b/public/content/translations/ha/dao/index.md @@ -99,7 +99,7 @@ Akwai samfura daban-daban ga membobin DAO. Membobi na iya ƙayyade yadda zaɓe k ### Memban tushen kuɗi {#token-based-membership} -Yawanci a cikakke [marasa izini](/kamus/#permissionless), ya danganta da kuɗin da aka yi amfani da ita. Yawancin waɗannan shugabancin kuɗi ana iya siyar da su ba tare da izini ba akan [ musanya mara ƙarfi](/glossary/#dex). Wasu dole ne a sami su ta hanyar samar da kuɗi ko wasu 'hujja-na aiki'. Ko ta wani hanya, riƙe alamar kawai yana ba da damar yin zaɓe. +Yawanci a cikakke [marasa izini](/glossary/#permissionless), ya danganta da kuɗin da aka yi amfani da ita. Yawancin waɗannan shugabancin kuɗi ana iya siyar da su ba tare da izini ba akan [ musanya mara ƙarfi](/glossary/#dex). Wasu dole ne a sami su ta hanyar samar da kuɗi ko wasu 'hujja-na aiki'. Ko ta wani hanya, riƙe alamar kawai yana ba da damar yin zaɓe. _Yawanci ana amfani da su don gudanar da manyan tsare-tsare da/ko kuɗaɗe kansu._ diff --git a/public/content/translations/nl/governance/index.md b/public/content/translations/nl/governance/index.md index 91cb06563ec..495d7f4739f 100644 --- a/public/content/translations/nl/governance/index.md +++ b/public/content/translations/nl/governance/index.md @@ -152,7 +152,7 @@ Het governance-proces van Ethereum weegt vaak snelheid en efficiëntie af tegen Hoewel de specificatie en de ontwikkelingsimplementaties altijd volledig open-source zijn geweest, werden de formele processen om updates voor te stellen zoals hierboven beschreven niet gebruikt. Hierdoor konden onderzoekers en uitvoerders sneller wijzigingen doorvoeren en hierover overeenstemming bereiken. 
-Toen de Beacon Chain fuseerde met de Ethereum-uitvoeringslaag op 15 september 2022, was The Merge voltooid als onderdeel van de [netwerkupgrade van Parijs](/geschiedenis/#paris). Het voorstel [EIP-3675](https://eips.ethereum.org/EIPS/eip-3675) werd gewijzigd van 'Laatste oproep' naar 'Definitief', waarmee de overgang naar proof-of-stake werd voltooid. +Toen de Beacon Chain fuseerde met de Ethereum-uitvoeringslaag op 15 september 2022, was The Merge voltooid als onderdeel van de [netwerkupgrade van Parijs](/history/#paris). Het voorstel [EIP-3675](https://eips.ethereum.org/EIPS/eip-3675) werd gewijzigd van 'Laatste oproep' naar 'Definitief', waarmee de overgang naar proof-of-stake werd voltooid. Meer over de merge diff --git a/public/content/translations/nl/zero-knowledge-proofs/index.md b/public/content/translations/nl/zero-knowledge-proofs/index.md index f64ad486493..3fed0f32af8 100644 --- a/public/content/translations/nl/zero-knowledge-proofs/index.md +++ b/public/content/translations/nl/zero-knowledge-proofs/index.md @@ -44,7 +44,7 @@ Door zero-knowledge technologie in het protocol te integreren, zorgen privacyger De huidige identiteitsbeheersystemen brengen persoonlijke informatie in gevaar. Zero-knowledge bewijzen kunnen mensen helpen om hun identiteit te valideren en tegelijkertijd gevoelige gegevens te beschermen. -Nul-kennis bewijzen zijn vooral nuttig in de context van [gedecentraliseerde identiteit](/decentrale-identiteit/). Gedecentraliseerde identiteit (ook wel bekend als 'zelfsoevereine identiteity') geeft hde persoon de mogelijkheid om de toegang tot persoonlijke informatie te controleren. Je staatsburgerschap bewijzen zonder je paspoortgegevens te onthullen is een goed voorbeeld van hoe zero-knowledge technologie gedecentraliseerde identiteit mogelijk maakt. +Nul-kennis bewijzen zijn vooral nuttig in de context van [gedecentraliseerde identiteit](/decentralized-identity/). 
Gedecentraliseerde identiteit (ook wel bekend als 'zelfsoevereine identiteity') geeft hde persoon de mogelijkheid om de toegang tot persoonlijke informatie te controleren. Je staatsburgerschap bewijzen zonder je paspoortgegevens te onthullen is een goed voorbeeld van hoe zero-knowledge technologie gedecentraliseerde identiteit mogelijk maakt. ### Authenticatie {#authentication} diff --git a/public/content/translations/tr/developers/docs/consensus-mechanisms/pow/mining/mining-algorithms/dagger-hashimoto/index.md b/public/content/translations/tr/developers/docs/consensus-mechanisms/pow/mining/mining-algorithms/dagger-hashimoto/index.md index aa850ac8e61..412d47a0071 100644 --- a/public/content/translations/tr/developers/docs/consensus-mechanisms/pow/mining/mining-algorithms/dagger-hashimoto/index.md +++ b/public/content/translations/tr/developers/docs/consensus-mechanisms/pow/mining/mining-algorithms/dagger-hashimoto/index.md @@ -8,7 +8,7 @@ Dagger-Hashimoto, Ethereum'un madencilik algoritması için orijinal araştırma ## Ön koşullar {#prerequisites} -Bu sayfayı daha iyi anlamak için önce [iş kanıtı mutabakatı](/developers/docs/consensus-mekanizmalar/pow), [madencilik](/developers/docs/consensus-mechanisms/pow/mining) ve [>madencilik algoritmaları](/developers/docs/consensus-mechanisms/pow/mining/mining-algorithms) hakkında okumanızı öneririz. +Bu sayfayı daha iyi anlamak için önce [iş kanıtı mutabakatı](/developers/docs/consensus-mechanisms/pow), [madencilik](/developers/docs/consensus-mechanisms/pow/mining) ve [>madencilik algoritmaları](/developers/docs/consensus-mechanisms/pow/mining/mining-algorithms) hakkında okumanızı öneririz. 
## Dagger-Hashimoto {#dagger-hashimoto} diff --git a/src/intl/ha/page-run-a-node.json b/src/intl/ha/page-run-a-node.json index 4d55fff00d2..dd220627c81 100644 --- a/src/intl/ha/page-run-a-node.json +++ b/src/intl/ha/page-run-a-node.json @@ -60,7 +60,7 @@ "page-run-a-node-getting-started-software-section-1-link": "Juya saman Ethereum node", "page-run-a-node-getting-started-software-section-2": "Yanzu muna da DappNode, wanda shine kyauta da] buɗaɗɗen-tushen manhaja da ya bayarwa masu amfanuwa akanmanhaja-kamar kwarewa alhalin gudanar da 'node' ɗin sa.", "page-run-a-node-getting-started-software-section-3a": "Cikin ɗan danni kaɗan za ka iya samun 'node' sama da gudanarwa.", - "page-run-a-node-getting-started-software-section-3b": "DAppNode ya mai dashi da sauƙi ga masu amfanuwa da gudanar da cikakke 'nodes', a haka ma ga P2P sadarwar, wanda babu bukatar tabawan layin-umurni. Wannan ya mai dashi mai sauki ga wanda zai shiga da kera yawan maran-tsakakkiya sadarwa.", + "page-run-a-node-getting-started-software-section-3b": "DAppNode ya mai dashi da sauƙi ga masu amfanuwa da gudanar da cikakke 'nodes', a haka ma ga P2P sadarwar, wanda babu bukatar tabawan layin-umurni. Wannan ya mai dashi mai sauki ga wanda zai shiga da kera yawan maran-tsakakkiya sadarwa.", "page-run-a-node-getting-started-software-title": "Bangare na 2: Manhaja", "page-run-a-node-glyph-alt-terminal": "Ƙarshen glyph", "page-run-a-node-glyph-alt-phone": "Danna waya glyph", diff --git a/src/scripts/validateTranslatedUrls.ts b/src/scripts/validateTranslatedUrls.ts new file mode 100644 index 00000000000..33d3d0b9e04 --- /dev/null +++ b/src/scripts/validateTranslatedUrls.ts @@ -0,0 +1,663 @@ +/** + * Validate Translated URLs + * + * Detects translated URL paths in translation files. URLs should always use + * English paths regardless of content language. + * + * Strategy: Build a list of valid English URL paths from the content structure, + * then check each internal link in translations against this list. 
+ * + * Usage: + * pnpm validate-urls # Report errors + * pnpm validate-urls --fix # Auto-fix errors + * pnpm validate-urls --json # Output as JSON + */ + +import fs from "fs" +import path from "path" + +// Configuration +const CONTENT_DIR = "public/content" +const TRANSLATIONS_DIR = "public/content/translations" +const INTL_DIR = "src/intl" +const DEFAULT_LOCALE = "en" + +// Regex patterns for extracting links +// Captures internal links starting with / but not just /# (hash-only links) +const MD_LINK_REGEX = /\[([^\]]*)\]\((\/[^)#\s]+)/g +const JSON_HREF_REGEX = /href=\\?"(\/[^"#\\]+)/g + +// ReDoS protection: maximum line length to process +const MAX_LINE_LENGTH = 10000 +// ReDoS protection: maximum matches per line +const MAX_MATCHES_PER_LINE = 100 + +// Minimum similarity threshold for fuzzy matching suggestions +// Lower threshold to catch more potential matches +const SUGGEST_THRESHOLD = 0.4 +const AUTO_FIX_THRESHOLD = 0.7 + +// Maximum URL length for Levenshtein computation (prevents memory exhaustion) +const MAX_URL_LENGTH = 500 + +// Known valid prefixes that don't need content files +// NOTE: This list must be manually maintained when new routes are added +const VALID_PREFIXES = [ + "/developers/docs/", + "/developers/tutorials/", + "/glossary", + "/community/", + "/contributing/", + "/roadmap/", + "/staking/", + "/layer-2/", + "/run-a-node/", + "/gas/", + "/governance/", + "/enterprise/", + "/defi/", + "/dao/", + "/nft/", + "/desci/", + "/refi/", + "/social-networks/", + "/decentralized-identity/", + "/dapps/", + "/wallets/", + "/security/", + "/web3/", + "/zero-knowledge-proofs/", + "/bridges/", + "/history/", + "/whitepaper/", + "/energy-consumption/", + "/upgrades/", + "/eips/", + "/about/", + "/assets/", + "/bug-bounty/", + "/brand-assets/", + "/languages/", + "/privacy-policy/", + "/terms-of-use/", + "/cookie-policy/", + "/guides/", + "/quizzes/", + "/learn/", + "/eth/", + "/what-is-ethereum/", + "/get-eth/", +] + +// Exact paths to whitelist 
(valid routes not in content directory) +const WHITELISTED_PATHS = [ + "/apps", + "/what-is-ether", + "/developers", + "/stablecoins", + "/developers/local-environment", + "/developers/learning-tools", +] + +// Path prefixes to whitelist (dynamic routes) +const WHITELISTED_PREFIXES = [ + "/apps/", // /apps/categories/gaming, etc. +] + +// Sanitize filename to prevent path traversal +function sanitizeFilename(filename: string): string { + if ( + filename.includes("..") || + filename.includes("\0") || + path.isAbsolute(filename) + ) { + throw new Error(`Invalid filename detected: ${filename}`) + } + return filename +} + +// Recursively get all files matching an extension +function getAllFiles( + dirPath: string, + extension: string, + arrayOfFiles: string[] = [] +): string[] { + if (!fs.existsSync(dirPath)) { + console.warn(`Warning: Directory not found: ${dirPath}`) + return arrayOfFiles + } + + const files = fs.readdirSync(dirPath) + + for (const file of files) { + const sanitized = sanitizeFilename(file) + const fullPath = path.join(dirPath, sanitized) + if (fs.statSync(fullPath).isDirectory()) { + getAllFiles(fullPath, extension, arrayOfFiles) + } else if (file.endsWith(extension)) { + arrayOfFiles.push(fullPath) + } + } + + return arrayOfFiles +} + +// Get all translation JSON files (excluding English) +function getTranslationJsonFiles(): string[] { + const intlDir = INTL_DIR + if (!fs.existsSync(intlDir)) { + console.warn(`Warning: Directory not found: ${intlDir}`) + return [] + } + + const locales = fs.readdirSync(intlDir).filter((dir) => { + const sanitized = sanitizeFilename(dir) + const fullPath = path.join(intlDir, sanitized) + return fs.statSync(fullPath).isDirectory() && dir !== DEFAULT_LOCALE + }) + + const files: string[] = [] + for (const locale of locales) { + const localePath = path.join(intlDir, locale) + getAllFiles(localePath, ".json", files) + } + + return files +} + +// Build set of all valid URL paths from English content +function 
buildValidPaths(): Set { + const validPaths = new Set() + + // Add known valid prefixes + for (const prefix of VALID_PREFIXES) { + validPaths.add(prefix.replace(/\/$/, "")) + } + + // Add paths from English content files + const englishMdFiles = getAllFiles(CONTENT_DIR, ".md").filter( + (f) => !f.includes("/translations/") + ) + + for (const file of englishMdFiles) { + // Convert file path to URL path + // public/content/about/index.md -> /about + // public/content/developers/docs/intro-to-ethereum/index.md -> /developers/docs/intro-to-ethereum + const relativePath = file + .replace(/^public\/content\//, "") + .replace(/\/index\.md$/, "") + .replace(/\.md$/, "") + if (relativePath) { + validPaths.add("/" + relativePath) + } + } + + return validPaths +} + +interface LinkInfo { + text: string + url: string + line: number +} + +interface ValidationResult { + file: string + type: "error" | "warning" + message: string + found: LinkInfo + suggestion?: string + confidence?: number +} + +// Levenshtein distance for fuzzy matching with length protection +function levenshtein(a: string, b: string): number { + // Prevent memory exhaustion on very long strings + if (a.length > MAX_URL_LENGTH || b.length > MAX_URL_LENGTH) { + return Math.max(a.length, b.length) + } + + const matrix: number[][] = [] + + for (let i = 0; i <= b.length; i++) { + matrix[i] = [i] + } + for (let j = 0; j <= a.length; j++) { + matrix[0][j] = j + } + + for (let i = 1; i <= b.length; i++) { + for (let j = 1; j <= a.length; j++) { + if (b.charAt(i - 1) === a.charAt(j - 1)) { + matrix[i][j] = matrix[i - 1][j - 1] + } else { + matrix[i][j] = Math.min( + matrix[i - 1][j - 1] + 1, + matrix[i][j - 1] + 1, + matrix[i - 1][j] + 1 + ) + } + } + } + + return matrix[b.length][a.length] +} + +function similarity(a: string, b: string): number { + if (a === b) return 1 + const distance = levenshtein(a.toLowerCase(), b.toLowerCase()) + const maxLen = Math.max(a.length, b.length) + return 1 - distance / maxLen +} + 
/**
 * Normalize a URL for comparison against the valid path list.
 *
 * The query string / anchor is removed first and the trailing slash second,
 * so "/foo/?x=1" and "/foo/#bar" both normalize to "/foo".
 *
 * @param url - Raw URL captured from a link
 * @returns URL without query, anchor, or trailing slash
 */
function normalizeUrl(url: string): string {
  return url.replace(/[?#].*$/, "").replace(/\/$/, "")
}

/**
 * Tell whether a URL points outside the site.
 *
 * @param url - Raw URL captured from a link
 * @returns true for http(s), mailto:, ipfs: and protocol-relative URLs
 */
function isExternalLink(url: string): boolean {
  return (
    url.startsWith("http") ||
    url.startsWith("mailto:") ||
    url.startsWith("ipfs:") ||
    url.startsWith("//")
  )
}

/**
 * Extract internal links from file content, line by line.
 *
 * @param content - Full file content
 * @param regex - Pattern whose capture groups hold the URL (and optionally the text)
 * @param urlGroupIndex - Capture group index of the URL
 * @param textGroupIndex - Capture group index of the link text, or null if none
 * @returns One LinkInfo per internal link, with normalized URL and 1-based line
 */
function extractLinks(
  content: string,
  regex: RegExp,
  urlGroupIndex: number,
  textGroupIndex: number | null
): LinkInfo[] {
  const links: LinkInfo[] = []
  const lines = content.split("\n")
  // Private global copy so the shared module-level regex keeps no lastIndex state
  const lineRegex = new RegExp(regex.source, "g")

  for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
    const line = lines[lineIndex]

    // ReDoS protection: skip lines that are too long
    if (line.length > MAX_LINE_LENGTH) {
      console.warn(
        `Warning: Line ${lineIndex + 1} exceeds max length (${line.length}), skipping`
      )
      continue
    }

    lineRegex.lastIndex = 0
    let matchCount = 0
    let match: RegExpExecArray | null

    while (
      matchCount < MAX_MATCHES_PER_LINE &&
      (match = lineRegex.exec(line)) !== null
    ) {
      matchCount++
      const url = match[urlGroupIndex]
      if (!isExternalLink(url)) {
        links.push({
          text: textGroupIndex !== null ? match[textGroupIndex] : "",
          url: normalizeUrl(url),
          line: lineIndex + 1,
        })
      }
    }

    // Warn only when links were really left unprocessed on this line
    if (matchCount >= MAX_MATCHES_PER_LINE && lineRegex.exec(line) !== null) {
      console.warn(
        `Warning: Line ${lineIndex + 1} hit max match limit, some links may be skipped`
      )
    }
  }

  return links
}

// Wrapper functions for backwards compatibility
/** Extract internal `[text](/path)` links from markdown content. */
function extractLinksFromMarkdown(content: string): LinkInfo[] {
  return extractLinks(content, MD_LINK_REGEX, 2, 1)
}

/** Extract internal `href="/path"` links from raw JSON file content. */
function extractLinksFromJson(content: string): LinkInfo[] {
  return extractLinks(content, JSON_HREF_REGEX, 1, null)
}

/**
 * Find the valid path most similar to `url`.
 *
 * Candidates are first narrowed to paths of similar length (within 30%), then
 * to paths whose first segment is within 2 edits of the URL's first segment
 * (falling back to the length-filtered set when that leaves nothing).
 *
 * @param url - Normalized invalid URL
 * @param validPaths - Set of known valid URL paths
 * @returns Best candidate with similarity above SUGGEST_THRESHOLD, or null
 */
function findBestMatch(
  url: string,
  validPaths: Set<string>
): { path: string; confidence: number } | null {
  const urlLength = url.length
  // Nothing meaningful to compare, and the ratio below would divide by zero
  if (urlLength === 0) {
    return null
  }

  // Filter candidates by similar length (±30%) to reduce search space
  const candidates = [...validPaths].filter((p) => {
    const lengthRatio = Math.abs(p.length - urlLength) / urlLength
    return lengthRatio < 0.3
  })

  // Further filter by common first path segment
  const urlSegments = url.split("/").filter(Boolean)
  const urlPrefix = urlSegments[0]?.toLowerCase()

  let searchSet = candidates
  if (urlPrefix) {
    const prefixFiltered = candidates.filter((p) => {
      const pathPrefix = p.split("/").filter(Boolean)[0]?.toLowerCase()
      if (!pathPrefix) return false
      // Check if prefixes are similar (within 2 edits)
      return pathPrefix === urlPrefix || levenshtein(urlPrefix, pathPrefix) <= 2
    })
    // Use filtered set if it has results, otherwise fall back to length-filtered set
    if (prefixFiltered.length > 0) {
      searchSet = prefixFiltered
    }
  }

  let bestMatch: { path: string; confidence: number } | null = null

  for (const validPath of searchSet) {
    const conf = similarity(url, validPath)
    if (
      conf > SUGGEST_THRESHOLD &&
      (!bestMatch || conf > bestMatch.confidence)
    ) {
      bestMatch = { path: validPath, confidence: conf }
    }
  }

  return bestMatch
}

/**
 * Decide whether a normalized URL is an accepted English path.
 *
 * @param url - Normalized URL path
 * @param validPaths - Set of known valid URL paths
 * @returns true when the URL is an exact valid path, is whitelisted, or lies
 *   under a valid/whitelisted prefix
 */
function isValidPath(url: string, validPaths: Set<string>): boolean {
  // Check exact match in valid paths
  if (validPaths.has(url)) {
    return true
  }

  // Check exact whitelist
  if (WHITELISTED_PATHS.includes(url)) {
    return true
  }

  // Check whitelisted prefixes
  for (const prefix of WHITELISTED_PREFIXES) {
    if (url.startsWith(prefix)) {
      return true
    }
  }

  // Check if URL is a valid prefix itself or lives below one. The match must
  // end on a segment boundary: "/eth" must not accept "/ethxyz".
  for (const prefix of VALID_PREFIXES) {
    const base = prefix.replace(/\/$/, "")
    if (url === base || url.startsWith(`${base}/`)) {
      return true
    }
  }

  return false
}

/**
 * Validate every internal link in one translation file.
 *
 * @param translatedFilePath - Path of the file to check
 * @param isMarkdown - true for markdown files, false for JSON files
 * @param validPaths - Set of known valid URL paths
 * @returns One error result per invalid link, with a suggestion when a close
 *   valid path was found
 */
function validateFile(
  translatedFilePath: string,
  isMarkdown: boolean,
  validPaths: Set<string>
): ValidationResult[] {
  const results: ValidationResult[] = []

  // Read translated file
  const translatedContent = fs.readFileSync(translatedFilePath, "utf-8")

  // Extract links
  const extractFn = isMarkdown ? extractLinksFromMarkdown : extractLinksFromJson
  const links = extractFn(translatedContent)

  // Check each link
  for (const link of links) {
    // Skip if the URL is valid
    if (isValidPath(link.url, validPaths)) {
      continue
    }

    // Try to find a fuzzy match for suggestion
    const match = findBestMatch(link.url, validPaths)

    // Report all invalid paths as errors
    results.push({
      file: translatedFilePath,
      type: "error",
      message: match ? "Translated URL path detected" : "Invalid URL path",
      found: link,
      suggestion: match?.path,
      confidence: match?.confidence,
    })
  }

  return results
}

/** Escape special regex characters in a string so it matches literally. */
function escapeRegex(str: string): string {
  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
}

// Atomic fix application with rollback support
interface FixResult {
  success: boolean
  fixedCount: number
  errors: string[]
}

/**
 * Apply fixes to every file in the map; if anything fails, restore all files
 * to the content they had before the first fix was written.
 *
 * @param fileResultsMap - Validation results grouped by file path
 * @returns success flag, number of URL occurrences fixed (0 on failure), and
 *   any error messages collected during the failure/rollback
 */
function applyFixesWithRollback(
  fileResultsMap: Map<string, ValidationResult[]>
): FixResult {
  const backups = new Map<string, string>()
  const errors: string[] = []
  let totalFixed = 0

  try {
    // Phase 1: Create in-memory backups
    for (const [filePath] of fileResultsMap) {
      const originalContent = fs.readFileSync(filePath, "utf-8")
      backups.set(filePath, originalContent)
    }

    // Phase 2: Apply fixes
    for (const [filePath, results] of fileResultsMap) {
      const fixCount = applyFix(filePath, results)
      if (fixCount > 0) {
        console.log(`  Fixed ${fixCount} URL(s) in ${filePath}`)
        totalFixed += fixCount

        // Validate the fixed content (for JSON files)
        if (filePath.endsWith(".json")) {
          const fixedContent = fs.readFileSync(filePath, "utf-8")
          try {
            JSON.parse(fixedContent)
          } catch {
            throw new Error(`Fix corrupted JSON syntax in ${filePath}`)
          }
        }
      }
    }

    // Phase 3: Success
    return { success: true, fixedCount: totalFixed, errors: [] }
  } catch (error) {
    // Phase 4: Rollback on any error
    console.error("\nError during fix, rolling back changes...")
    for (const [filePath, originalContent] of backups) {
      try {
        fs.writeFileSync(filePath, originalContent)
      } catch (writeError) {
        errors.push(`Failed to rollback ${filePath}: ${writeError}`)
      }
    }
    errors.push(error instanceof Error ? error.message : String(error))
    return { success: false, fixedCount: 0, errors }
  }
}

/**
 * Rewrite high-confidence invalid URLs in a single file.
 *
 * Every occurrence of an invalid URL is replaced when it directly follows
 * `](`, `href="` or `href=\"` and is directly followed (after an optional
 * trailing slash) by `)`, `#`, `?`, whitespace, `"` or `\"`. The trailing
 * slash, anchor, and query of the original link are left untouched.
 *
 * @param filePath - File to rewrite in place
 * @param results - Validation results for this file
 * @returns Number of URL occurrences replaced (0 means the file was not written)
 */
function applyFix(filePath: string, results: ValidationResult[]): number {
  let content = fs.readFileSync(filePath, "utf-8")
  let fixCount = 0

  // Only fix errors with high confidence; one replacement per distinct URL,
  // since a single pass already rewrites every occurrence of it
  const replacements = new Map<string, string>()
  for (const result of results) {
    if (
      result.type === "error" &&
      result.suggestion &&
      result.confidence &&
      result.confidence >= AUTO_FIX_THRESHOLD &&
      !replacements.has(result.found.url)
    ) {
      replacements.set(result.found.url, result.suggestion)
    }
  }

  for (const [oldUrl, newUrl] of replacements) {
    // The lookahead requires the URL to end here, so "/old/sub" is not touched
    const pattern = new RegExp(
      `(\\]\\(|href=\\\\?")${escapeRegex(oldUrl)}(?=/?(?:[)#?"\\s]|\\\\"))`,
      "g"
    )
    // A replacer function keeps "$" in the suggestion from being interpreted
    content = content.replace(pattern, (_match: string, prefix: string) => {
      fixCount++
      return `${prefix}${newUrl}`
    })
  }

  if (fixCount > 0) {
    fs.writeFileSync(filePath, content)
  }

  return fixCount
}

/**
 * CLI entry point.
 *
 * Flags: `--fix` rewrites high-confidence errors in place, `--json` prints the
 * errors as a JSON document on stdout (progress text goes to stderr, and no
 * fixes are applied in this mode).
 *
 * Exits with code 1 when errors remain unfixed or when applying fixes failed.
 */
function main() {
  const args = process.argv.slice(2)
  const shouldFix = args.includes("--fix")
  const outputJson = args.includes("--json")

  // In JSON mode stdout must hold nothing but the JSON document
  const progress = (message: string): void => {
    if (outputJson) {
      console.error(message)
    } else {
      console.log(message)
    }
  }

  progress("Validating translated URLs...\n")
  progress("Building list of valid paths...")

  // Build valid paths from English content
  const validPaths = buildValidPaths()
  progress(`Found ${validPaths.size} valid URL paths\n`)

  // Find all translation files
  const mdFiles = getAllFiles(TRANSLATIONS_DIR, ".md")
  const jsonFiles = getTranslationJsonFiles()

  progress(
    `Scanning ${mdFiles.length} markdown files and ${jsonFiles.length} JSON files...\n`
  )

  const allResults: ValidationResult[] = []

  // Validate markdown files
  for (const file of mdFiles) {
    allResults.push(...validateFile(file, true, validPaths))
  }

  // Validate JSON files
  for (const file of jsonFiles) {
    allResults.push(...validateFile(file, false, validPaths))
  }

  const errors = allResults.filter((r) => r.type === "error")
  // Errors still present when the process ends; lowered after a successful fix
  let remainingErrors = errors.length

  if (outputJson) {
    console.log(JSON.stringify({ errors }, null, 2))
  } else {
    // Print errors
    if (errors.length > 0) {
      const fixable = errors.filter((e) => e.suggestion)
      const unfixable = errors.filter((e) => !e.suggestion)

      if (fixable.length > 0) {
        console.log("ERRORS (with suggestions):\n")
        for (const result of fixable) {
          console.log(`${result.file}:${result.found.line}`)
          console.log(`  Found: ${result.found.url}`)
          console.log(
            `  Fix:   ${result.found.url} → ${result.suggestion} (${Math.round(result.confidence! * 100)}% match)`
          )
          console.log()
        }
      }

      if (unfixable.length > 0) {
        console.log("ERRORS (no suggestion - review manually):\n")
        for (const result of unfixable) {
          console.log(`${result.file}:${result.found.line}`)
          console.log(`  Invalid: ${result.found.url}`)
          console.log()
        }
      }
    }

    const fixableCount = errors.filter(
      (e) => e.suggestion && e.confidence && e.confidence >= AUTO_FIX_THRESHOLD
    ).length
    console.log(
      `Summary: ${errors.length} errors (${fixableCount} auto-fixable)`
    )

    if (shouldFix && errors.length > 0) {
      console.log("\nApplying fixes with rollback support...")

      // Group results by file
      const resultsByFile = new Map<string, ValidationResult[]>()
      for (const result of errors) {
        const existing = resultsByFile.get(result.file) || []
        existing.push(result)
        resultsByFile.set(result.file, existing)
      }

      const fixResult = applyFixesWithRollback(resultsByFile)

      if (fixResult.success) {
        console.log(`\nFixed ${fixResult.fixedCount} URL(s) total.`)

        // Re-validate the touched files so the exit code reflects what is
        // really left (low-confidence or unmatched links are not rewritten)
        remainingErrors = 0
        for (const file of resultsByFile.keys()) {
          remainingErrors += validateFile(
            file,
            file.endsWith(".md"),
            validPaths
          ).length
        }
        if (remainingErrors > 0) {
          console.log(
            `${remainingErrors} error(s) could not be fixed automatically - review manually.`
          )
        }
      } else {
        console.error("\nFix failed, all changes rolled back.")
        for (const err of fixResult.errors) {
          console.error(`  - ${err}`)
        }
        process.exit(1)
      }
    } else if (errors.length > 0) {
      console.log("\nRun with --fix to auto-correct errors.")
    }
  }

  // Exit with error code if there are unfixed errors
  if (remainingErrors > 0) {
    process.exit(1)
  }
}

try {
  main()
} catch (error) {
  console.error("Error:", error)
  process.exit(1)
}