Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"handlebars": "^4.7.8",
"iconv-lite": "^0.7.0",
"is-binary-path": "^3.0.0",
"isbinaryfile": "^6.0.0",
"isbinaryfile": "^5.0.2",
"jiti": "^2.6.1",
"jschardet": "^3.1.4",
"json5": "^2.2.3",
Expand Down
21 changes: 15 additions & 6 deletions src/core/file/fileRead.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,25 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis
return { content: null, skippedReason: 'binary-content' };
}

const { encoding: detectedEncoding, confidence } = jschardet.detect(buffer) ?? {};
const { encoding: detectedEncoding } = jschardet.detect(buffer) ?? {};
const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8';

const content = iconv.decode(buffer, encoding, { stripBOM: true });

// Heuristics: U+FFFD indicates decode errors; very low confidence implies unreliable guess.
if (content.includes('\uFFFD') || (typeof confidence === 'number' && confidence < 0.2)) {
logger.debug(
`Skipping file due to encoding errors (${encoding}, confidence=${(confidence ?? 0).toFixed(2)}): ${filePath}`,
);
// Only skip if there are actual decode errors (U+FFFD replacement characters)
// Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files
if (content.includes('\uFFFD')) {
// For UTF-8, distinguish invalid byte sequences from a legitimate U+FFFD in the source
if (encoding.toLowerCase() === 'utf-8') {
try {
let utf8 = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
if (utf8.charCodeAt(0) === 0xfeff) utf8 = utf8.slice(1); // strip UTF-8 BOM
return { content: utf8 };
} catch {
// fall through to skip below
}
}
logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`);
return { content: null, skippedReason: 'encoding-error' };
}

Expand Down
139 changes: 139 additions & 0 deletions tests/core/file/fileRead.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import * as fs from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { afterEach, beforeEach, describe, expect, test } from 'vitest';
import { readRawFile } from '../../../src/core/file/fileRead.js';

describe('readRawFile', () => {
  // Size limits handed to readRawFile by the cases below.
  const ONE_KIB = 1024;
  const ONE_MIB = 1024 * 1024;

  type ReadResult = Awaited<ReturnType<typeof readRawFile>>;

  // Scratch directory, recreated for every test and removed afterwards.
  let testDir: string;

  beforeEach(async () => {
    const prefix = path.join(os.tmpdir(), 'repomix-fileRead-');
    testDir = await fs.mkdtemp(prefix);
  });

  afterEach(async () => {
    await fs.rm(testDir, { recursive: true, force: true });
  });

  /** Writes `data` to `name` inside the scratch directory and returns the full path. */
  const writeFixture = async (name: string, data: string | Buffer): Promise<string> => {
    const target = path.join(testDir, name);
    if (typeof data === 'string') {
      await fs.writeFile(target, data, 'utf-8');
    } else {
      await fs.writeFile(target, data);
    }
    return target;
  };

  /** Asserts that the file was read and its content equals `expected`. */
  const expectContent = (outcome: ReadResult, expected: string): void => {
    expect(outcome.content).toBe(expected);
    expect(outcome.skippedReason).toBeUndefined();
  };

  /** Asserts that the file was skipped (null content) for the given reason. */
  const expectSkipped = (outcome: ReadResult, reason: string): void => {
    expect(outcome.content).toBeNull();
    expect(outcome.skippedReason).toBe(reason);
  };

  test('should read normal text file successfully', async () => {
    const text = 'Hello World';
    const target = await writeFixture('normal.txt', text);

    expectContent(await readRawFile(target, ONE_KIB), text);
  });

  test('should read file with low jschardet confidence (Issue #869)', async () => {
    // Valid UTF-8 content must be returned even when jschardet reports
    // a low confidence score for it.
    const text = `import json
import time
import uuid

def hello():
print("Hello, World!")
`;
    const target = await writeFixture('server.py', text);

    expectContent(await readRawFile(target, ONE_MIB), text);
  });

  test('should read HTML file with Thymeleaf syntax (Issue #847)', async () => {
    // HTML using special syntax such as Thymeleaf's ~{} must not be skipped,
    // even if jschardet returns low confidence.
    const text = '<html lang="en" xmlns:th="http://www.thymeleaf.org" layout:decorate="~{layouts/default}"></html>';
    const target = await writeFixture('thymeleaf.html', text);

    expectContent(await readRawFile(target, ONE_KIB), text);
  });

  test('should read empty file successfully', async () => {
    // An empty file must come back as an empty string rather than being skipped
    // (jschardet may return 0 confidence for empty files).
    const target = await writeFixture('__init__.py', '');

    expectContent(await readRawFile(target, ONE_KIB), '');
  });

  test('should read file containing legitimate U+FFFD character', async () => {
    // U+FFFD is a valid Unicode character; a file that intentionally contains it
    // decodes cleanly and must not be treated as an encoding error.
    const text = 'Some text with replacement char: \uFFFD and more text';
    const target = await writeFixture('with-replacement-char.txt', text);

    expectContent(await readRawFile(target, ONE_KIB), text);
  });

  test('should skip file with actual decode errors (U+FFFD)', async () => {
    // Layout: UTF-8 BOM, valid text, invalid bytes, valid text.
    // The BOM forces UTF-8 detection; the invalid bytes then produce U+FFFD.
    const bom = Buffer.from([0xef, 0xbb, 0xbf]);
    const filler = Buffer.from('Hello World\n'.repeat(50));
    // 0x80-0x82 are continuation bytes with no leading byte: invalid UTF-8.
    const strayContinuations = Buffer.from([0x80, 0x81, 0x82]);
    const bytes = Buffer.concat([bom, filler, strayContinuations, filler]);
    const target = await writeFixture('invalid.txt', bytes);

    expectSkipped(await readRawFile(target, ONE_MIB), 'encoding-error');
  });

  test('should skip file if it exceeds size limit', async () => {
    // 1000 bytes on disk against a 100-byte limit.
    const target = await writeFixture('large.txt', 'x'.repeat(1000));

    expectSkipped(await readRawFile(target, 100), 'size-limit');
  });

  test('should skip binary file by extension', async () => {
    const jpegHeader = Buffer.from([0xff, 0xd8, 0xff, 0xe0]);
    const target = await writeFixture('test.jpg', jpegHeader);

    expectSkipped(await readRawFile(target, ONE_KIB), 'binary-extension');
  });

  test('should skip binary content in text extension file', async () => {
    // Every byte value 0..255 once, which includes null bytes and control characters.
    const everyByte = Buffer.from(Array.from({ length: 256 }, (_, byte) => byte));
    const target = await writeFixture('binary.txt', everyByte);

    expectSkipped(await readRawFile(target, ONE_KIB), 'binary-content');
  });
});
Loading