From 72b27e4c9f2be25b75ad3a842aac8395ad626b7b Mon Sep 17 00:00:00 2001
From: Kazuki Yamada <koukun0120@gmail.com>
Date: Sun, 14 Dec 2025 18:44:48 +0900
Subject: [PATCH 1/4] fix(file): remove jschardet confidence check for encoding
 detection

Remove the confidence < 0.2 check that was causing valid UTF-8/ASCII files
to be incorrectly skipped. Files are now skipped with an encoding error only
if the decoded content contains U+FFFD replacement characters, i.e. an actual
decode error.

This fixes issues where:
- Valid Python files were skipped with confidence=0.00 (#869)
- HTML files with Thymeleaf syntax (~{}) were incorrectly detected as binary (#847)

The isbinaryfile library (added in PR #1006) now handles binary detection more
accurately, making the confidence-based heuristic unnecessary.

Fixes #869
---
 src/core/file/fileRead.ts        |  11 ++-
 tests/core/file/fileRead.test.ts | 128 +++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+), 6 deletions(-)
 create mode 100644 tests/core/file/fileRead.test.ts

diff --git a/src/core/file/fileRead.ts b/src/core/file/fileRead.ts
index 122d7ac5b..a48625455 100644
--- a/src/core/file/fileRead.ts
+++ b/src/core/file/fileRead.ts
@@ -43,16 +43,15 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis
       return { content: null, skippedReason: 'binary-content' };
     }
 
-    const { encoding: detectedEncoding, confidence } = jschardet.detect(buffer) ?? {};
+    const { encoding: detectedEncoding } = jschardet.detect(buffer) ?? {};
     const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8';
 
     const content = iconv.decode(buffer, encoding, { stripBOM: true });
 
-    // Heuristics: U+FFFD indicates decode errors; very low confidence implies unreliable guess.
-    if (content.includes('\uFFFD') || (typeof confidence === 'number' && confidence < 0.2)) {
-      logger.debug(
-        `Skipping file due to encoding errors (${encoding}, confidence=${(confidence ?? 0).toFixed(2)}): ${filePath}`,
-      );
+    // Only skip if there are actual decode errors (U+FFFD replacement characters)
+    // Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files
+    if (content.includes('\uFFFD')) {
+      logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`);
       return { content: null, skippedReason: 'encoding-error' };
     }
 
diff --git a/tests/core/file/fileRead.test.ts b/tests/core/file/fileRead.test.ts
new file mode 100644
index 000000000..0513bbc6c
--- /dev/null
+++ b/tests/core/file/fileRead.test.ts
@@ -0,0 +1,128 @@
+import * as fs from 'node:fs/promises';
+import path from 'node:path';
+import { afterEach, beforeEach, describe, expect, test } from 'vitest';
+import { readRawFile } from '../../../src/core/file/fileRead.js';
+
+describe('readRawFile', () => {
+  const testDir = path.join(process.cwd(), 'tests', 'fixtures', 'fileRead');
+
+  beforeEach(async () => {
+    await fs.mkdir(testDir, { recursive: true });
+  });
+
+  afterEach(async () => {
+    try {
+      await fs.rm(testDir, { recursive: true });
+    } catch {
+      // Ignore if directory doesn't exist
+    }
+  });
+
+  test('should read normal text file successfully', async () => {
+    const filePath = path.join(testDir, 'normal.txt');
+    const content = 'Hello World';
+    await fs.writeFile(filePath, content, 'utf-8');
+
+    const result = await readRawFile(filePath, 1024);
+
+    expect(result.content).toBe(content);
+    expect(result.skippedReason).toBeUndefined();
+  });
+
+  test('should read file with low jschardet confidence (Issue #869)', async () => {
+    // This tests that files with low confidence scores from jschardet
+    // are NOT skipped if they contain valid UTF-8 content
+    const filePath = path.join(testDir, 'server.py');
+    const content = `import json
+import time
+import uuid
+
+def hello():
+    print("Hello, World!")
+`;
+    await fs.writeFile(filePath, content, 'utf-8');
+
+    const result = await readRawFile(filePath, 1024 * 1024);
+
+    expect(result.content).toBe(content);
+    expect(result.skippedReason).toBeUndefined();
+  });
+
+  test('should read HTML file with Thymeleaf syntax (Issue #847)', async () => {
+    // This tests that HTML files with special syntax like Thymeleaf (~{})
+    // are NOT skipped even if jschardet returns low confidence
+    const filePath = path.join(testDir, 'thymeleaf.html');
+    const content = '<html lang="en" xmlns:th="http://www.thymeleaf.org" layout:decorate="~{layouts/default}"></html>';
+    await fs.writeFile(filePath, content, 'utf-8');
+
+    const result = await readRawFile(filePath, 1024);
+
+    expect(result.content).toBe(content);
+    expect(result.skippedReason).toBeUndefined();
+  });
+
+  test('should read empty file successfully', async () => {
+    // Empty files should not be skipped (jschardet may return 0 confidence for empty files)
+    const filePath = path.join(testDir, '__init__.py');
+    await fs.writeFile(filePath, '', 'utf-8');
+
+    const result = await readRawFile(filePath, 1024);
+
+    expect(result.content).toBe('');
+    expect(result.skippedReason).toBeUndefined();
+  });
+
+  test('should skip file with actual decode errors (U+FFFD)', async () => {
+    const filePath = path.join(testDir, 'invalid.txt');
+    // Create a file with a UTF-8 BOM followed by valid text and invalid UTF-8 sequences
+    // The BOM forces UTF-8 detection, and the invalid sequence will produce U+FFFD
+    const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf]); // UTF-8 BOM
+    const validText = 'Hello World\n'.repeat(50);
+    // Invalid UTF-8: 0x80 is a continuation byte without a leading byte
+    const invalidSequence = Buffer.from([0x80, 0x81, 0x82]);
+    const buffer = Buffer.concat([utf8Bom, Buffer.from(validText), invalidSequence, Buffer.from(validText)]);
+    await fs.writeFile(filePath, buffer);
+
+    const result = await readRawFile(filePath, 1024 * 1024);
+
+    expect(result.content).toBeNull();
+    expect(result.skippedReason).toBe('encoding-error');
+  });
+
+  test('should skip file if it exceeds size limit', async () => {
+    const filePath = path.join(testDir, 'large.txt');
+    const content = 'x'.repeat(1000);
+    await fs.writeFile(filePath, content, 'utf-8');
+
+    const result = await readRawFile(filePath, 100);
+
+    expect(result.content).toBeNull();
+    expect(result.skippedReason).toBe('size-limit');
+  });
+
+  test('should skip binary file by extension', async () => {
+    const filePath = path.join(testDir, 'test.jpg');
+    const binaryData = Buffer.from([0xff, 0xd8, 0xff, 0xe0]);
+    await fs.writeFile(filePath, binaryData);
+
+    const result = await readRawFile(filePath, 1024);
+
+    expect(result.content).toBeNull();
+    expect(result.skippedReason).toBe('binary-extension');
+  });
+
+  test('should skip binary content in text extension file', async () => {
+    const filePath = path.join(testDir, 'binary.txt');
+    // Create file with binary content (null bytes and control characters)
+    const binaryData = Buffer.alloc(256);
+    for (let i = 0; i < 256; i++) {
+      binaryData[i] = i;
+    }
+    await fs.writeFile(filePath, binaryData);
+
+    const result = await readRawFile(filePath, 1024);
+
+    expect(result.content).toBeNull();
+    expect(result.skippedReason).toBe('binary-content');
+  });
+});

From c4354e77455d9c6e6cb6a0a97378f423d5ab3914 Mon Sep 17 00:00:00 2001
From: Kazuki Yamada <koukun0120@gmail.com>
Date: Sun, 14 Dec 2025 18:56:34 +0900
Subject: [PATCH 2/4] fix(file): improve U+FFFD detection for UTF-8 encoding

- Use TextDecoder('utf-8', { fatal: true }) to distinguish actual decode
  errors from legitimate U+FFFD characters in UTF-8 files
- Change test temp directory from tests/fixtures to os.tmpdir() to avoid
  clobbering committed fixtures and reduce parallel-run collisions
- Files detected as non-UTF-8 keep the existing behavior: they are decoded
  with iconv.decode() and skipped if the result contains U+FFFD

Addresses CodeRabbit review comments on PR #1007
---
 src/core/file/fileRead.ts        | 10 ++++++++++
 tests/core/file/fileRead.test.ts | 11 ++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/core/file/fileRead.ts b/src/core/file/fileRead.ts
index a48625455..85166354d 100644
--- a/src/core/file/fileRead.ts
+++ b/src/core/file/fileRead.ts
@@ -51,6 +51,16 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis
     // Only skip if there are actual decode errors (U+FFFD replacement characters)
     // Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files
     if (content.includes('\uFFFD')) {
+      // For UTF-8, distinguish invalid byte sequences from a legitimate U+FFFD in the source
+      if (encoding.toLowerCase() === 'utf-8') {
+        try {
+          let utf8 = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
+          if (utf8.charCodeAt(0) === 0xfeff) utf8 = utf8.slice(1); // strip UTF-8 BOM
+          return { content: utf8 };
+        } catch {
+          // fall through to skip below
+        }
+      }
       logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`);
       return { content: null, skippedReason: 'encoding-error' };
     }
diff --git a/tests/core/file/fileRead.test.ts b/tests/core/file/fileRead.test.ts
index 0513bbc6c..ae7d446fa 100644
--- a/tests/core/file/fileRead.test.ts
+++ b/tests/core/file/fileRead.test.ts
@@ -1,21 +1,18 @@
 import * as fs from 'node:fs/promises';
+import os from 'node:os';
 import path from 'node:path';
 import { afterEach, beforeEach, describe, expect, test } from 'vitest';
 import { readRawFile } from '../../../src/core/file/fileRead.js';
 
 describe('readRawFile', () => {
-  const testDir = path.join(process.cwd(), 'tests', 'fixtures', 'fileRead');
+  let testDir: string;
 
   beforeEach(async () => {
-    await fs.mkdir(testDir, { recursive: true });
+    testDir = await fs.mkdtemp(path.join(os.tmpdir(), 'repomix-fileRead-'));
   });
 
   afterEach(async () => {
-    try {
-      await fs.rm(testDir, { recursive: true });
-    } catch {
-      // Ignore if directory doesn't exist
-    }
+    await fs.rm(testDir, { recursive: true, force: true });
   });
 
   test('should read normal text file successfully', async () => {

From 0604b7e2e11ec648bcd8a08314535bb6db2a1cd3 Mon Sep 17 00:00:00 2001
From: Kazuki Yamada <koukun0120@gmail.com>
Date: Sun, 14 Dec 2025 19:10:48 +0900
Subject: [PATCH 3/4] fix(deps): downgrade isbinaryfile to v5.0.2 for Node.js
 20+ support

isbinaryfile v6.0.0 requires Node.js >= 24.0.0, but repomix supports
Node.js >= 20.0.0. Downgrade to v5.0.2 (requires Node.js >= 18.0.0)
to maintain compatibility with current LTS versions.

Addresses gemini-code-assist review on PR #1006
---
 package-lock.json | 10 +++++-----
 package.json      |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 73fcb1359..4dfd6928b 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -23,7 +23,7 @@
         "handlebars": "^4.7.8",
         "iconv-lite": "^0.7.0",
         "is-binary-path": "^3.0.0",
-        "isbinaryfile": "^6.0.0",
+        "isbinaryfile": "^5.0.2",
         "jiti": "^2.6.1",
         "jschardet": "^3.1.4",
         "json5": "^2.2.3",
@@ -3483,12 +3483,12 @@
       }
     },
     "node_modules/isbinaryfile": {
-      "version": "6.0.0",
-      "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-6.0.0.tgz",
-      "integrity": "sha512-2FN2B8MAqKv6d5TaKsLvMrwMcghxwHTpcKy0L5mhNbRqjNqo2++SpCqN6eG1lCC1GmTQgvrYJYXv2+Chvyevag==",
+      "version": "5.0.2",
+      "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-5.0.2.tgz",
+      "integrity": "sha512-GvcjojwonMjWbTkfMpnVHVqXW/wKMYDfEpY94/8zy8HFMOqb/VL6oeONq9v87q4ttVlaTLnGXnJD4B5B1OTGIg==",
       "license": "MIT",
       "engines": {
-        "node": ">= 24.0.0"
+        "node": ">= 18.0.0"
       },
       "funding": {
         "url": "https://github.com/sponsors/gjtorikian/"
diff --git a/package.json b/package.json
index 06bfdcec9..4fa53f9ab 100644
--- a/package.json
+++ b/package.json
@@ -87,7 +87,7 @@
     "handlebars": "^4.7.8",
     "iconv-lite": "^0.7.0",
     "is-binary-path": "^3.0.0",
-    "isbinaryfile": "^6.0.0",
+    "isbinaryfile": "^5.0.2",
     "jiti": "^2.6.1",
     "jschardet": "^3.1.4",
     "json5": "^2.2.3",

From 47398ae8206ef066bd9212e3a6069734b599f06d Mon Sep 17 00:00:00 2001
From: Kazuki Yamada <koukun0120@gmail.com>
Date: Sun, 14 Dec 2025 19:44:47 +0900
Subject: [PATCH 4/4] test(file): add test for legitimate U+FFFD character
 handling

Verify that files containing intentional U+FFFD characters in the source
are correctly read (not skipped), testing the TextDecoder validation path.
---
 tests/core/file/fileRead.test.ts | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/core/file/fileRead.test.ts b/tests/core/file/fileRead.test.ts
index ae7d446fa..5fb5f9266 100644
--- a/tests/core/file/fileRead.test.ts
+++ b/tests/core/file/fileRead.test.ts
@@ -69,6 +69,20 @@ def hello():
     expect(result.skippedReason).toBeUndefined();
   });
 
+  test('should read file containing legitimate U+FFFD character', async () => {
+    // This tests that files with intentional U+FFFD characters in the source
+    // are NOT skipped (TextDecoder can decode them successfully)
+    const filePath = path.join(testDir, 'with-replacement-char.txt');
+    // U+FFFD is a valid Unicode character that can appear in source files
+    const content = 'Some text with replacement char: \uFFFD and more text';
+    await fs.writeFile(filePath, content, 'utf-8');
+
+    const result = await readRawFile(filePath, 1024);
+
+    expect(result.content).toBe(content);
+    expect(result.skippedReason).toBeUndefined();
+  });
+
   test('should skip file with actual decode errors (U+FFFD)', async () => {
     const filePath = path.join(testDir, 'invalid.txt');
     // Create a file with a UTF-8 BOM followed by valid text and invalid UTF-8 sequences