From 72b27e4c9f2be25b75ad3a842aac8395ad626b7b Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Sun, 14 Dec 2025 18:44:48 +0900 Subject: [PATCH 1/4] fix(file): remove jschardet confidence check for encoding detection Remove the confidence < 0.2 check that was causing valid UTF-8/ASCII files to be incorrectly skipped. Files are now only skipped if they contain actual decode errors (U+FFFD replacement characters). This fixes issues where: - Valid Python files were skipped with confidence=0.00 (#869) - HTML files with Thymeleaf syntax (~{}) were incorrectly detected as binary (#847) The isbinaryfile library (added in PR #1006) now handles binary detection more accurately, making the confidence-based heuristic unnecessary. Fixes #869 --- src/core/file/fileRead.ts | 11 ++- tests/core/file/fileRead.test.ts | 128 +++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 6 deletions(-) create mode 100644 tests/core/file/fileRead.test.ts diff --git a/src/core/file/fileRead.ts b/src/core/file/fileRead.ts index 122d7ac5b..a48625455 100644 --- a/src/core/file/fileRead.ts +++ b/src/core/file/fileRead.ts @@ -43,16 +43,15 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis return { content: null, skippedReason: 'binary-content' }; } - const { encoding: detectedEncoding, confidence } = jschardet.detect(buffer) ?? {}; + const { encoding: detectedEncoding } = jschardet.detect(buffer) ?? {}; const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8'; const content = iconv.decode(buffer, encoding, { stripBOM: true }); - // Heuristics: U+FFFD indicates decode errors; very low confidence implies unreliable guess. - if (content.includes('\uFFFD') || (typeof confidence === 'number' && confidence < 0.2)) { - logger.debug( - `Skipping file due to encoding errors (${encoding}, confidence=${(confidence ?? 0).toFixed(2)}): ${filePath}`, - ); + // Only skip if there are actual decode errors (U+FFFD replacement characters) + // Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files + if (content.includes('\uFFFD')) { + logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`); return { content: null, skippedReason: 'encoding-error' }; } diff --git a/tests/core/file/fileRead.test.ts b/tests/core/file/fileRead.test.ts new file mode 100644 index 000000000..0513bbc6c --- /dev/null +++ b/tests/core/file/fileRead.test.ts @@ -0,0 +1,128 @@ +import * as fs from 'node:fs/promises'; +import path from 'node:path'; +import { afterEach, beforeEach, describe, expect, test } from 'vitest'; +import { readRawFile } from '../../../src/core/file/fileRead.js'; + +describe('readRawFile', () => { + const testDir = path.join(process.cwd(), 'tests', 'fixtures', 'fileRead'); + + beforeEach(async () => { + await fs.mkdir(testDir, { recursive: true }); + }); + + afterEach(async () => { + try { + await fs.rm(testDir, { recursive: true }); + } catch { + // Ignore if directory doesn't exist + } + }); + + test('should read normal text file successfully', async () => { + const filePath = path.join(testDir, 'normal.txt'); + const content = 'Hello World'; + await fs.writeFile(filePath, content, 'utf-8'); + + const result = await readRawFile(filePath, 1024); + + expect(result.content).toBe(content); + expect(result.skippedReason).toBeUndefined(); + }); + + test('should read file with low jschardet confidence (Issue #869)', async () => { + // This tests that files with low confidence scores from jschardet + // are NOT skipped if they contain valid UTF-8 content + const filePath = path.join(testDir, 'server.py'); + const content = `import json +import time +import uuid + +def hello(): + print("Hello, World!") +`; + await fs.writeFile(filePath, content, 'utf-8'); + + const result = await readRawFile(filePath, 1024 * 1024); + + expect(result.content).toBe(content); + expect(result.skippedReason).toBeUndefined(); + }); + + test('should read HTML file with Thymeleaf syntax (Issue #847)', async () => { + // This tests that HTML files with special syntax like Thymeleaf (~{}) + // are NOT skipped even if jschardet returns low confidence + const filePath = path.join(testDir, 'thymeleaf.html'); + const content = ''; + await fs.writeFile(filePath, content, 'utf-8'); + + const result = await readRawFile(filePath, 1024); + + expect(result.content).toBe(content); + expect(result.skippedReason).toBeUndefined(); + }); + + test('should read empty file successfully', async () => { + // Empty files should not be skipped (jschardet may return 0 confidence for empty files) + const filePath = path.join(testDir, '__init__.py'); + await fs.writeFile(filePath, '', 'utf-8'); + + const result = await readRawFile(filePath, 1024); + + expect(result.content).toBe(''); + expect(result.skippedReason).toBeUndefined(); + }); + + test('should skip file with actual decode errors (U+FFFD)', async () => { + const filePath = path.join(testDir, 'invalid.txt'); + // Create a file with a UTF-8 BOM followed by valid text and invalid UTF-8 sequences + // The BOM forces UTF-8 detection, and the invalid sequence will produce U+FFFD + const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf]); // UTF-8 BOM + const validText = 'Hello World\n'.repeat(50); + // Invalid UTF-8: 0x80 is a continuation byte without a leading byte + const invalidSequence = Buffer.from([0x80, 0x81, 0x82]); + const buffer = Buffer.concat([utf8Bom, Buffer.from(validText), invalidSequence, Buffer.from(validText)]); + await fs.writeFile(filePath, buffer); + + const result = await readRawFile(filePath, 1024 * 1024); + + expect(result.content).toBeNull(); + expect(result.skippedReason).toBe('encoding-error'); + }); + + test('should skip file if it exceeds size limit', async () => { + const filePath = path.join(testDir, 'large.txt'); + const content = 'x'.repeat(1000); + await fs.writeFile(filePath, content, 'utf-8'); + + const result = await readRawFile(filePath, 100); + + expect(result.content).toBeNull(); + expect(result.skippedReason).toBe('size-limit'); + }); + + test('should skip binary file by extension', async () => { + const filePath = path.join(testDir, 'test.jpg'); + const binaryData = Buffer.from([0xff, 0xd8, 0xff, 0xe0]); + await fs.writeFile(filePath, binaryData); + + const result = await readRawFile(filePath, 1024); + + expect(result.content).toBeNull(); + expect(result.skippedReason).toBe('binary-extension'); + }); + + test('should skip binary content in text extension file', async () => { + const filePath = path.join(testDir, 'binary.txt'); + // Create file with binary content (null bytes and control characters) + const binaryData = Buffer.alloc(256); + for (let i = 0; i < 256; i++) { + binaryData[i] = i; + } + await fs.writeFile(filePath, binaryData); + + const result = await readRawFile(filePath, 1024); + + expect(result.content).toBeNull(); + expect(result.skippedReason).toBe('binary-content'); + }); +}); From c4354e77455d9c6e6cb6a0a97378f423d5ab3914 Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Sun, 14 Dec 2025 18:56:34 +0900 Subject: [PATCH 2/4] fix(file): improve U+FFFD detection for UTF-8 encoding - Use TextDecoder('utf-8', { fatal: true }) to distinguish actual decode errors from legitimate U+FFFD characters in UTF-8 files - Change test temp directory from tests/fixtures to os.tmpdir() to avoid clobbering committed fixtures and reduce parallel-run collisions - Non-UTF-8 files still use iconv.decode() fallback behavior Addresses CodeRabbit review comments on PR #1007 --- src/core/file/fileRead.ts | 10 ++++++++++ tests/core/file/fileRead.test.ts | 11 ++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/core/file/fileRead.ts b/src/core/file/fileRead.ts index a48625455..85166354d 100644 --- a/src/core/file/fileRead.ts +++ b/src/core/file/fileRead.ts @@ -51,6 +51,16 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis // Only skip if there are actual decode errors (U+FFFD replacement characters) // Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files if (content.includes('\uFFFD')) { + // For UTF-8, distinguish invalid byte sequences from a legitimate U+FFFD in the source + if (encoding.toLowerCase() === 'utf-8') { + try { + let utf8 = new TextDecoder('utf-8', { fatal: true }).decode(buffer); + if (utf8.charCodeAt(0) === 0xfeff) utf8 = utf8.slice(1); // strip UTF-8 BOM + return { content: utf8 }; + } catch { + // fall through to skip below + } + } logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`); return { content: null, skippedReason: 'encoding-error' }; } diff --git a/tests/core/file/fileRead.test.ts b/tests/core/file/fileRead.test.ts index 0513bbc6c..ae7d446fa 100644 --- a/tests/core/file/fileRead.test.ts +++ b/tests/core/file/fileRead.test.ts @@ -1,21 +1,18 @@ import * as fs from 'node:fs/promises'; +import os from 'node:os'; import path from 'node:path'; import { afterEach, beforeEach, describe, expect, test } from 'vitest'; import { readRawFile } from '../../../src/core/file/fileRead.js'; describe('readRawFile', () => { - const testDir = path.join(process.cwd(), 'tests', 'fixtures', 'fileRead'); + let testDir: string; beforeEach(async () => { - await fs.mkdir(testDir, { recursive: true }); + testDir = await fs.mkdtemp(path.join(os.tmpdir(), 'repomix-fileRead-')); }); afterEach(async () => { - try { - await fs.rm(testDir, { recursive: true }); - } catch { - // Ignore if directory doesn't exist - } + await fs.rm(testDir, { recursive: true, force: true }); }); test('should read normal text file successfully', async () => { From 0604b7e2e11ec648bcd8a08314535bb6db2a1cd3 Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Sun, 14 Dec 2025 19:10:48 +0900 Subject: [PATCH 3/4] fix(deps): downgrade isbinaryfile to v5.0.2 for Node.js 20+ support isbinaryfile v6.0.0 requires Node.js >= 24.0.0, but repomix supports Node.js >= 20.0.0. Downgrade to v5.0.2 (requires Node.js >= 18.0.0) to maintain compatibility with current LTS versions. Addresses gemini-code-assist review on PR #1006 --- package-lock.json | 10 +++++----- package.json | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/package-lock.json b/package-lock.json index 73fcb1359..4dfd6928b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -23,7 +23,7 @@ "handlebars": "^4.7.8", "iconv-lite": "^0.7.0", "is-binary-path": "^3.0.0", - "isbinaryfile": "^6.0.0", + "isbinaryfile": "^5.0.2", "jiti": "^2.6.1", "jschardet": "^3.1.4", "json5": "^2.2.3", @@ -3483,12 +3483,12 @@ } }, "node_modules/isbinaryfile": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-6.0.0.tgz", - "integrity": "sha512-2FN2B8MAqKv6d5TaKsLvMrwMcghxwHTpcKy0L5mhNbRqjNqo2++SpCqN6eG1lCC1GmTQgvrYJYXv2+Chvyevag==", + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-5.0.2.tgz", + "integrity": "sha512-GvcjojwonMjWbTkfMpnVHVqXW/wKMYDfEpY94/8zy8HFMOqb/VL6oeONq9v87q4ttVlaTLnGXnJD4B5B1OTGIg==", "license": "MIT", "engines": { - "node": ">= 24.0.0" + "node": ">= 18.0.0" }, "funding": { "url": "https://github.com/sponsors/gjtorikian/" diff --git a/package.json b/package.json index 06bfdcec9..4fa53f9ab 100644 --- a/package.json +++ b/package.json @@ -87,7 +87,7 @@ "handlebars": "^4.7.8", "iconv-lite": "^0.7.0", "is-binary-path": "^3.0.0", - "isbinaryfile": "^6.0.0", + "isbinaryfile": "^5.0.2", "jiti": "^2.6.1", "jschardet": "^3.1.4", "json5": "^2.2.3", From 47398ae8206ef066bd9212e3a6069734b599f06d Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Sun, 14 Dec 2025 19:44:47 +0900 Subject: [PATCH 4/4] test(file): Add test for legitimate U+FFFD character handling Verify that files containing intentional U+FFFD characters in the source are correctly read (not skipped), testing the TextDecoder validation path. --- tests/core/file/fileRead.test.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/core/file/fileRead.test.ts b/tests/core/file/fileRead.test.ts index ae7d446fa..5fb5f9266 100644 --- a/tests/core/file/fileRead.test.ts +++ b/tests/core/file/fileRead.test.ts @@ -69,6 +69,20 @@ def hello(): expect(result.skippedReason).toBeUndefined(); }); + test('should read file containing legitimate U+FFFD character', async () => { + // This tests that files with intentional U+FFFD characters in the source + // are NOT skipped (TextDecoder can decode them successfully) + const filePath = path.join(testDir, 'with-replacement-char.txt'); + // U+FFFD is a valid Unicode character that can appear in source files + const content = 'Some text with replacement char: \uFFFD and more text'; + await fs.writeFile(filePath, content, 'utf-8'); + + const result = await readRawFile(filePath, 1024); + + expect(result.content).toBe(content); + expect(result.skippedReason).toBeUndefined(); + }); + test('should skip file with actual decode errors (U+FFFD)', async () => { const filePath = path.join(testDir, 'invalid.txt'); // Create a file with a UTF-8 BOM followed by valid text and invalid UTF-8 sequences