diff --git a/docs/backlog/P1/B-0272-atari-rom-canonical-naming-tosec-lookup-2026-05-08.md b/docs/backlog/P1/B-0272-atari-rom-canonical-naming-tosec-lookup-2026-05-08.md index 232e7d5f9..87ac96332 100644 --- a/docs/backlog/P1/B-0272-atari-rom-canonical-naming-tosec-lookup-2026-05-08.md +++ b/docs/backlog/P1/B-0272-atari-rom-canonical-naming-tosec-lookup-2026-05-08.md @@ -17,6 +17,16 @@ type: friction-reducer Hash each ROM file, look up in TOSEC/No-Intro DAT files, rename to canonical form. TS script at tools/roms/. +## Pre-start checklist + +- [x] Prior-art search: checked `tools/roms/` (empty), grepped for + TOSEC/No-Intro/canonicalize across repo (no existing TS tooling), + read parent B-0083 algorithm section and tooling design. +- [x] Dependency walk: parent B-0083 (decomposed umbrella), + sibling B-0273 (depends on this item), no other deps. +- [x] Datfile format: Logiqx XML used by both TOSEC and No-Intro; + `` is the match surface. + ## Acceptance criteria - Script at tools/roms/canonicalize.ts diff --git a/tools/roms/canonicalize.test.ts b/tools/roms/canonicalize.test.ts new file mode 100644 index 000000000..f7472d63f --- /dev/null +++ b/tools/roms/canonicalize.test.ts @@ -0,0 +1,181 @@ +import { describe, expect, test } from "bun:test"; +import { existsSync, mkdtempSync, writeFileSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { parseDatfile, hashFileSha1, scanRomFiles, matchAndReport } from "./canonicalize.ts"; + +const FIXTURE_DATFILE = ` + + +
+ Test Datfile +
+ + Combat (1977)(Atari) + + + + Adventure (1980)(Atari) + + +
`; + +describe("parseDatfile", () => { + test("extracts rom entries from Logiqx XML", () => { + const lookup = parseDatfile(FIXTURE_DATFILE); + expect(lookup.size).toBe(2); + }); + + test("keys by lowercase sha1", () => { + const lookup = parseDatfile(FIXTURE_DATFILE); + expect(lookup.has("da39a3ee5e6b4b0d3255bfef95601890afd80709")).toBe(true); + expect(lookup.has("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d")).toBe(true); + }); + + test("preserves canonical name from datfile", () => { + const lookup = parseDatfile(FIXTURE_DATFILE); + const entry = lookup.get("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"); + expect(entry?.name).toBe("Adventure (1980)(Atari).a26"); + }); + + test("decodes XML entities in canonical names", () => { + const xml = ``; + const lookup = parseDatfile(xml); + const entry = lookup.get("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"); + expect(entry?.name).toBe('Fish & Chips "Demo".bin'); + }); + + test("returns empty map for empty input", () => { + const lookup = parseDatfile(""); + expect(lookup.size).toBe(0); + }); + + test("handles datfile with no sha1 attributes gracefully", () => { + const xml = ``; + const lookup = parseDatfile(xml); + expect(lookup.size).toBe(0); + }); +}); + +describe("hashFileSha1", () => { + test("computes correct SHA1 for known content", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-test-")); + const file = join(tmp, "test.bin"); + writeFileSync(file, "hello"); + const sha1 = hashFileSha1(file); + expect(sha1).toBe("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"); + }); + + test("computes correct SHA1 for empty file", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-test-")); + const file = join(tmp, "empty.bin"); + writeFileSync(file, ""); + const sha1 = hashFileSha1(file); + expect(sha1).toBe("da39a3ee5e6b4b0d3255bfef95601890afd80709"); + }); +}); + +describe("scanRomFiles", () => { + test("finds .bin and .a26 files", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-scan-")); + writeFileSync(join(tmp, 
"game.bin"), "data"); + writeFileSync(join(tmp, "game2.a26"), "data"); + writeFileSync(join(tmp, "README.md"), "docs"); + + const files = scanRomFiles(tmp); + expect(files.length).toBe(2); + expect(files.some((f) => f.endsWith("game.bin"))).toBe(true); + expect(files.some((f) => f.endsWith("game2.a26"))).toBe(true); + }); + + test("ignores non-ROM files", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-scan-")); + writeFileSync(join(tmp, "notes.txt"), "data"); + writeFileSync(join(tmp, "save.sav"), "data"); + + const files = scanRomFiles(tmp); + expect(files.length).toBe(0); + }); +}); + +describe("matchAndReport", () => { + test("matches file by SHA1 and reports canonical name", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-match-")); + const file = join(tmp, "hello.bin"); + writeFileSync(file, "hello"); + + const lookup = parseDatfile(FIXTURE_DATFILE); + const results = matchAndReport(lookup, [file], false); + expect(results.length).toBe(1); + expect(results[0]?.matched).toBe(true); + expect(results[0]?.canonicalName).toBe("Adventure (1980)(Atari).a26"); + expect(results[0]?.renamed).toBe(false); + }); + + test("reports unmatched files", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-nomatch-")); + const file = join(tmp, "unknown.bin"); + writeFileSync(file, "this content has no datfile entry"); + + const lookup = parseDatfile(FIXTURE_DATFILE); + const results = matchAndReport(lookup, [file], false); + expect(results.length).toBe(1); + expect(results[0]?.matched).toBe(false); + expect(results[0]?.canonicalName).toBeNull(); + }); + + test("renames file when --apply is true", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-rename-")); + const file = join(tmp, "wrong-name.bin"); + writeFileSync(file, "hello"); + + const lookup = parseDatfile(FIXTURE_DATFILE); + const results = matchAndReport(lookup, [file], true); + expect(results[0]?.renamed).toBe(true); + + const renamedPath = join(tmp, "Adventure (1980)(Atari).a26"); + expect(() => 
readFileSync(renamedPath)).not.toThrow(); + }); + + test("does not apply unsafe canonical names with path separators", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-unsafe-name-")); + const file = join(tmp, "wrong-name.bin"); + writeFileSync(file, "hello"); + + const lookup = parseDatfile( + ``, + ); + const results = matchAndReport(lookup, [file], true); + + expect(results[0]?.matched).toBe(true); + expect(results[0]?.renamed).toBe(false); + expect(existsSync(file)).toBe(true); + expect(existsSync(join(tmp, "..", "escaped.a26"))).toBe(false); + }); + + test("does not apply unsafe canonical names with Windows separators", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-unsafe-win-name-")); + const file = join(tmp, "wrong-name.bin"); + writeFileSync(file, "hello"); + + const lookup = parseDatfile( + ``, + ); + const results = matchAndReport(lookup, [file], true); + + expect(results[0]?.matched).toBe(true); + expect(results[0]?.renamed).toBe(false); + expect(existsSync(file)).toBe(true); + }); + + test("does not rename when file already has canonical name", () => { + const tmp = mkdtempSync(join(tmpdir(), "rom-already-")); + const file = join(tmp, "Adventure (1980)(Atari).a26"); + writeFileSync(file, "hello"); + + const lookup = parseDatfile(FIXTURE_DATFILE); + const results = matchAndReport(lookup, [file], true); + expect(results[0]?.matched).toBe(true); + expect(results[0]?.renamed).toBe(false); + }); +}); diff --git a/tools/roms/canonicalize.ts b/tools/roms/canonicalize.ts new file mode 100644 index 000000000..82e9b7b3d --- /dev/null +++ b/tools/roms/canonicalize.ts @@ -0,0 +1,320 @@ +#!/usr/bin/env bun +// canonicalize.ts -- hash ROM files and match against a TOSEC / No-Intro +// datfile (Logiqx XML format). Smallest safe slice of B-0272. +// +// Usage: +// bun tools/roms/canonicalize.ts --datfile --dir +// bun tools/roms/canonicalize.ts --datfile --dir --apply +// +// Output (default dry-run): JSON array of { file, sha1, match, canonicalName }. 
// --apply: renames matched files to their canonical names.

import { createHash } from "node:crypto";
import {
  existsSync,
  readdirSync,
  readFileSync,
  renameSync,
} from "node:fs";
import { basename, dirname, extname, join } from "node:path";

// --- Datfile parsing (Logiqx XML) ---

/** Attributes of one `<rom>` element, as stored in the SHA-1 lookup table. */
interface DatEntry {
  /** Canonical file name, with XML entities already decoded. */
  readonly name: string;
  /** Declared size; 0 when the attribute is missing or not a number. */
  readonly size: number;
  /** Lower-cased SHA-1 hex digest (also the lookup key). */
  readonly sha1: string;
  /** Lower-cased MD5 hex digest, or "" when absent. */
  readonly md5: string;
  /** Lower-cased CRC, or "" when absent. */
  readonly crc: string;
}

/** Lower-case file extensions that `scanRomFiles` accepts as ROM images. */
const ROM_EXTENSIONS: ReadonlySet<string> = new Set<string>([
  ".bin",
  ".a26",
  ".rom",
  ".int",
  ".vec",
  ".gg",
  ".sms",
  ".gen",
  ".smc",
  ".sfc",
  ".nes",
  ".gb",
  ".gbc",
  ".gba",
  ".n64",
  ".z64",
  ".v64",
  ".pce",
  ".ngp",
  ".ngc",
  ".ws",
  ".wsc",
  ".sg",
  ".32x",
  ".col",
]);

/** The five predefined XML entities. */
const NAMED_XML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/** Largest value `String.fromCodePoint` accepts without throwing RangeError. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decodes the predefined XML entities and numeric character references in an
 * attribute value.
 *
 * A numeric reference outside the Unicode range is left in the output exactly
 * as written instead of being passed to `String.fromCodePoint`, which would
 * throw and abort parsing of the whole datfile.
 */
function decodeXmlAttributeValue(value: string): string {
  return value.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
    (match: string, entity: string) => {
      const named = NAMED_XML_ENTITIES.get(entity);
      if (named !== undefined) return named;

      // The pattern guarantees `entity` is "#x<hex>" or "#<decimal>" here.
      const codePoint = entity.startsWith("#x")
        ? Number.parseInt(entity.slice(2), 16)
        : Number.parseInt(entity.slice(1), 10);
      return codePoint <= MAX_CODE_POINT
        ? String.fromCodePoint(codePoint)
        : match;
    },
  );
}

/**
 * Returns true when `name` can be used as a single path component: non-empty,
 * not "." or "..", and free of NUL, "/" and "\".
 */
function isSafeCanonicalName(name: string): boolean {
  if (name.length === 0 || name === "." || name === "..") return false;
  return !["\0", "/", "\\"].some((forbidden) => name.includes(forbidden));
}

// A `<rom ...>` / `<rom .../>` tag. Quoted values are consumed as units so a
// literal ">" inside an attribute value does not end the tag early.
const ROM_TAG_RE = /<rom\s((?:[^>"']|"[^"]*"|'[^']*')*)>/g;

// name="value" or name='value'; group 2 is the double-quoted value, group 3
// the single-quoted one.
const ATTRIBUTE_RE = /([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;

/**
 * Extracts every `<rom>` element that carries both a `sha1` and a `name`
 * attribute from Logiqx XML text.
 *
 * @param xml Raw datfile contents.
 * @returns Map from lower-cased SHA-1 to the entry; when several elements
 *   share a SHA-1 the last one wins. Empty for input without usable elements.
 */
export function parseDatfile(xml: string): ReadonlyMap<string, DatEntry> {
  const lookup = new Map<string, DatEntry>();

  for (const tag of xml.matchAll(ROM_TAG_RE)) {
    const attrs = new Map<string, string>();
    for (const attr of (tag[1] ?? "").matchAll(ATTRIBUTE_RE)) {
      const key = attr[1];
      const raw = attr[2] ?? attr[3];
      if (key !== undefined && raw !== undefined) {
        attrs.set(key, decodeXmlAttributeValue(raw));
      }
    }

    const sha1 = attrs.get("sha1")?.toLowerCase();
    const name = attrs.get("name");
    if (!sha1 || !name) continue;

    const size = Number.parseInt(attrs.get("size") ?? "0", 10);
    lookup.set(sha1, {
      name,
      size: Number.isNaN(size) ? 0 : size,
      sha1,
      md5: (attrs.get("md5") ?? "").toLowerCase(),
      crc: (attrs.get("crc") ?? "").toLowerCase(),
    });
  }

  return lookup;
}

// --- File hashing ---

/**
 * Reads the whole file at `path` and returns its SHA-1 as lower-case hex.
 *
 * @throws Whatever `readFileSync` throws (missing file, permissions, ...).
 */
export function hashFileSha1(path: string): string {
  const data = readFileSync(path);
  return createHash("sha1").update(data).digest("hex");
}

// --- Directory scanning ---

/**
 * Lists the regular files directly inside `dir` (no recursion) that look like
 * ROM images: a known ROM extension, or no extension at all.
 *
 * Extension-less dotfiles (".DS_Store", ".gitignore") are excluded; `extname`
 * reports "" for them, which would otherwise count as "no extension".
 *
 * @returns Paths joined onto `dir`, sorted.
 */
export function scanRomFiles(dir: string): readonly string[] {
  const files: string[] = [];
  for (const entry of readdirSync(dir, { withFileTypes: true })) {
    if (!entry.isFile()) continue;
    const ext = extname(entry.name).toLowerCase();
    const bareDotfile = ext === "" && entry.name.startsWith(".");
    if (ROM_EXTENSIONS.has(ext) || (ext === "" && !bareDotfile)) {
      files.push(join(dir, entry.name));
    }
  }
  return files.sort();
}

// --- Match + rename ---

/** Outcome for one scanned file. */
interface MatchResult {
  readonly file: string;
  readonly sha1: string;
  readonly matched: boolean;
  readonly canonicalName: string | null;
  readonly renamed: boolean;
}

/**
 * Renames `filePath` to `canonicalName` inside the same directory.
 *
 * Refuses (with a note on stderr) when the name is not a safe single path
 * component or when something already exists at the target path.
 *
 * @returns true only when the rename was performed.
 */
function renameToCanonical(filePath: string, canonicalName: string): boolean {
  if (!isSafeCanonicalName(canonicalName)) {
    process.stderr.write(
      `skip: unsafe canonical name from datfile: ${canonicalName}\n`,
    );
    return false;
  }

  const target = join(dirname(filePath), canonicalName);
  if (existsSync(target)) {
    process.stderr.write(`skip: target already exists: ${target}\n`);
    return false;
  }

  renameSync(filePath, target);
  return true;
}

/**
 * Hashes each file, looks the SHA-1 up in `datLookup`, and optionally renames
 * matches to their canonical names.
 *
 * Collisions are decided per target path on disk, so files with the same
 * canonical name in different directories are each renamed in place.
 *
 * @param datLookup SHA-1 -> entry table from `parseDatfile`.
 * @param romFiles Paths to examine, processed in the given order.
 * @param apply When false nothing on disk is touched (dry run).
 * @returns One result per input path, in input order.
 * @throws Whatever `readFileSync` / `renameSync` throw.
 */
export function matchAndReport(
  datLookup: ReadonlyMap<string, DatEntry>,
  romFiles: readonly string[],
  apply: boolean,
): readonly MatchResult[] {
  const results: MatchResult[] = [];

  for (const filePath of romFiles) {
    const sha1 = hashFileSha1(filePath);
    const entry = datLookup.get(sha1);

    if (entry === undefined) {
      results.push({
        file: filePath,
        sha1,
        matched: false,
        canonicalName: null,
        renamed: false,
      });
      continue;
    }

    const canonicalName = entry.name;
    const needsRename = apply && basename(filePath) !== canonicalName;
    results.push({
      file: filePath,
      sha1,
      matched: true,
      canonicalName,
      renamed: needsRename ? renameToCanonical(filePath, canonicalName) : false,
    });
  }

  return results;
}

// --- CLI ---

interface Args {
  readonly datfile: string;
  readonly dir: string;
  readonly apply: boolean;
}

/** Either the parsed options, or the exit code to stop with immediately. */
type ParsedArgs =
  | { readonly kind: "run"; readonly args: Args }
  | { readonly kind: "exit"; readonly code: number };

/** Exit code used for command-line usage errors. */
const EX_USAGE = 64;

const USAGE =
  "Usage: canonicalize.ts --datfile <path> --dir <dir> [--apply]\n\n" +
  "  --datfile, -d <path>  Path to a TOSEC / No-Intro datfile (Logiqx XML).\n" +
  "  --dir <dir>           Directory containing ROM files to match.\n" +
  "  --apply               Actually rename files (default: dry-run report).\n";

/** Writes `message` to stderr and yields the usage-error outcome. */
function usageError(message: string): ParsedArgs {
  process.stderr.write(`${message}\n`);
  return { kind: "exit", code: EX_USAGE };
}

/**
 * Parses command-line flags. Never terminates the process itself: help and
 * usage errors are reported as an exit code for `main` to return.
 */
function parseArgs(argv: readonly string[]): ParsedArgs {
  let datfile: string | undefined;
  let dir: string | undefined;
  let apply = false;

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    switch (arg) {
      case "--datfile":
      case "-d":
        datfile = argv[++i];
        break;
      case "--dir":
        dir = argv[++i];
        break;
      case "--apply":
        apply = true;
        break;
      case "--help":
      case "-h":
        process.stdout.write(USAGE);
        return { kind: "exit", code: 0 };
      default:
        return usageError(`unknown arg: ${arg}`);
    }
  }

  if (!datfile) return usageError("--datfile is required");
  if (!dir) return usageError("--dir is required");
  return { kind: "run", args: { datfile, dir, apply } };
}

/**
 * CLI entry point.
 *
 * Writes the JSON report to stdout and progress/summary lines to stderr.
 *
 * @param argv Arguments after the script name.
 * @returns Process exit code: 0 on success or `--help`, 1 when the datfile or
 *   directory does not exist, 64 on a usage error.
 */
export function main(argv: readonly string[]): number {
  const parsed = parseArgs(argv);
  if (parsed.kind === "exit") return parsed.code;
  const { args } = parsed;

  if (!existsSync(args.datfile)) {
    process.stderr.write(`datfile not found: ${args.datfile}\n`);
    return 1;
  }
  if (!existsSync(args.dir)) {
    process.stderr.write(`directory not found: ${args.dir}\n`);
    return 1;
  }

  const lookup = parseDatfile(readFileSync(args.datfile, "utf8"));
  process.stderr.write(`datfile: ${lookup.size} entries loaded\n`);

  const romFiles = scanRomFiles(args.dir);
  process.stderr.write(`directory: ${romFiles.length} ROM files found\n`);

  if (romFiles.length === 0) {
    process.stdout.write("[]\n");
    return 0;
  }

  const results = matchAndReport(lookup, romFiles, args.apply);
  const matchedCount = results.filter((r) => r.matched).length;
  const renamedCount = results.filter((r) => r.renamed).length;

  process.stdout.write(JSON.stringify(results, null, 2) + "\n");
  process.stderr.write(
    `\nsummary: ${matchedCount} matched, ` +
      `${results.length - matchedCount} unmatched, ` +
      `${renamedCount} renamed\n`,
  );

  return 0;
}

// Run only when this file is the program entry point. The code is set via
// process.exitCode rather than process.exit() so pending stdout/stderr writes
// are not cut short by a forced exit.
if (import.meta.main) {
  process.exitCode = main(process.argv.slice(2));
}