diff --git a/src/cli/cliReport.ts b/src/cli/cliReport.ts index c308e8f73..107589c99 100644 --- a/src/cli/cliReport.ts +++ b/src/cli/cliReport.ts @@ -1,6 +1,7 @@ import path from 'node:path'; import pc from 'picocolors'; import type { RepomixConfigMerged } from '../config/configSchema.js'; +import type { SkippedFileInfo } from '../core/file/fileCollect.js'; import type { PackResult } from '../core/packager.js'; import type { SuspiciousFileResult } from '../core/security/securityCheck.js'; import { logger } from '../shared/logger.js'; @@ -36,6 +37,9 @@ export const reportResults = (cwd: string, packResult: PackResult, config: Repom ); logger.log(''); + reportSkippedFiles(cwd, packResult.skippedFiles); + logger.log(''); + reportSummary(packResult, config); logger.log(''); @@ -157,6 +161,31 @@ export const reportTopFiles = ( }); }; +export const reportSkippedFiles = (rootDir: string, skippedFiles: SkippedFileInfo[]) => { + const binaryContentFiles = skippedFiles.filter((file) => file.reason === 'binary-content'); + + if (binaryContentFiles.length === 0) { + return; + } + + logger.log(pc.white('📄 Binary Files Detected:')); + logger.log(pc.dim('─────────────────────────')); + + if (binaryContentFiles.length === 1) { + logger.log(pc.yellow('1 file detected as binary by content inspection:')); + } else { + logger.log(pc.yellow(`${binaryContentFiles.length} files detected as binary by content inspection:`)); + } + + binaryContentFiles.forEach((file, index) => { + const relativeFilePath = path.relative(rootDir, file.path); + logger.log(`${pc.white(`${index + 1}.`)} ${pc.white(relativeFilePath)}`); + }); + + logger.log(pc.yellow('\nThese files have been excluded from the output.')); + logger.log(pc.yellow('Please review these files if you expected them to contain text content.')); +}; + export const reportCompletion = () => { logger.log(pc.green('🎉 All Done!')); logger.log(pc.white('Your repository has been successfully packed.')); diff --git a/src/core/file/fileCollect.ts b/src/core/file/fileCollect.ts index 1e28b16cb..6cd284f52 100644 --- a/src/core/file/fileCollect.ts +++ b/src/core/file/fileCollect.ts @@ -4,7 +4,15 @@ import { logger } from '../../shared/logger.js'; import { initTaskRunner } from '../../shared/processConcurrency.js'; import type { RepomixProgressCallback } from '../../shared/types.js'; import type { RawFile } from './fileTypes.js'; -import type { FileCollectTask } from './workers/fileCollectWorker.js'; +import type { FileCollectResult, FileCollectTask, SkippedFileInfo } from './workers/fileCollectWorker.js'; + +export interface FileCollectResults { + rawFiles: RawFile[]; + skippedFiles: SkippedFileInfo[]; +} + +// Re-export SkippedFileInfo for external use +export type { SkippedFileInfo } from './workers/fileCollectWorker.js'; export const collectFiles = async ( filePaths: string[], @@ -14,8 +22,8 @@ export const collectFiles = async ( deps = { initTaskRunner, }, -): Promise => { - const taskRunner = deps.initTaskRunner( +): Promise => { + const taskRunner = deps.initTaskRunner( filePaths.length, new URL('./workers/fileCollectWorker.js', import.meta.url).href, ); @@ -50,7 +58,19 @@ export const collectFiles = async ( const duration = Number(endTime - startTime) / 1e6; logger.trace(`File collection completed in ${duration.toFixed(2)}ms`); - return results.filter((file): file is RawFile => file !== null); + const rawFiles: RawFile[] = []; + const skippedFiles: SkippedFileInfo[] = []; + + for (const result of results) { + if (result.rawFile) { + rawFiles.push(result.rawFile); + } + if (result.skippedFile) { + skippedFiles.push(result.skippedFile); + } + } + + return { rawFiles, skippedFiles }; } catch (error) { logger.error('Error during file collection:', error); throw error; diff --git a/src/core/file/fileRead.ts b/src/core/file/fileRead.ts index b544551af..2c3300d43 100644 --- a/src/core/file/fileRead.ts +++ b/src/core/file/fileRead.ts @@ -4,13 +4,20 @@ import { isBinary } from 'istextorbinary'; import jschardet from 'jschardet'; import { logger } from '../../shared/logger.js'; +export type FileSkipReason = 'binary-extension' | 'binary-content' | 'size-limit' | 'encoding-error'; + +export interface FileReadResult { + content: string | null; + skippedReason?: FileSkipReason; +} + /** * Read a file and return its text content * @param filePath Path to the file * @param maxFileSize Maximum file size in bytes - * @returns File content as string, or null if the file is binary or exceeds size limit + * @returns File content as string and skip reason if file was skipped */ -export const readRawFile = async (filePath: string, maxFileSize: number): Promise => { +export const readRawFile = async (filePath: string, maxFileSize: number): Promise => { try { const stats = await fs.stat(filePath); @@ -18,12 +25,12 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis const sizeKB = (stats.size / 1024).toFixed(1); const maxSizeKB = (maxFileSize / 1024).toFixed(1); logger.trace(`File exceeds size limit: ${sizeKB}KB > ${maxSizeKB}KB (${filePath})`); - return null; + return { content: null, skippedReason: 'size-limit' }; } if (isBinary(filePath)) { logger.debug(`Skipping binary file: ${filePath}`); - return null; + return { content: null, skippedReason: 'binary-extension' }; } logger.trace(`Reading file: ${filePath}`); @@ -32,15 +39,25 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis if (isBinary(null, buffer)) { logger.debug(`Skipping binary file (content check): ${filePath}`); - return null; + return { content: null, skippedReason: 'binary-content' }; } - const encoding = jschardet.detect(buffer).encoding || 'utf-8'; - const content = iconv.decode(buffer, encoding); + const { encoding: detectedEncoding, confidence } = jschardet.detect(buffer) ?? {}; + const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8'; + + const content = iconv.decode(buffer, encoding, { stripBOM: true }); + + // Heuristics: U+FFFD indicates decode errors; very low confidence implies unreliable guess. + if (content.includes('\uFFFD') || (typeof confidence === 'number' && confidence < 0.2)) { + logger.debug( + `Skipping file due to encoding errors (${encoding}, confidence=${(confidence ?? 0).toFixed(2)}): ${filePath}`, + ); + return { content: null, skippedReason: 'encoding-error' }; + } - return content; + return { content }; } catch (error) { logger.warn(`Failed to read file: ${filePath}`, error); - return null; + return { content: null, skippedReason: 'encoding-error' }; } }; diff --git a/src/core/file/workers/fileCollectWorker.ts b/src/core/file/workers/fileCollectWorker.ts index c20cf98da..364329010 100644 --- a/src/core/file/workers/fileCollectWorker.ts +++ b/src/core/file/workers/fileCollectWorker.ts @@ -1,6 +1,7 @@ import path from 'node:path'; -import { logger, setLogLevelByWorkerData } from '../../../shared/logger.js'; -import { readRawFile } from '../fileRead.js'; +import { setLogLevelByWorkerData } from '../../../shared/logger.js'; +import { type FileSkipReason, readRawFile } from '../fileRead.js'; +import type { RawFile } from '../fileTypes.js'; // Initialize logger configuration from workerData at module load time // This must be called before any logging operations in the worker @@ -12,16 +13,39 @@ export interface FileCollectTask { maxFileSize: number; } -export default async ({ filePath, rootDir, maxFileSize }: FileCollectTask) => { +export interface SkippedFileInfo { + path: string; + reason: FileSkipReason; +} + +export interface FileCollectResult { + rawFile?: RawFile; + skippedFile?: SkippedFileInfo; +} + +export default async ({ filePath, rootDir, maxFileSize }: FileCollectTask): Promise => { const fullPath = path.resolve(rootDir, filePath); - const content = await readRawFile(fullPath, maxFileSize); + const result = await readRawFile(fullPath, maxFileSize); + + if (result.content !== null) { + return { + rawFile: { + path: filePath, + content: result.content, + }, + }; + } - if (content) { + if (result.skippedReason) { return { - path: filePath, - content, + skippedFile: { + path: filePath, + reason: result.skippedReason, + }, }; } - return null; + throw new Error( + `File processing for ${filePath} resulted in an unexpected state: content is null but no skip reason was provided.`, + ); }; diff --git a/src/core/packager.ts b/src/core/packager.ts index 037c99ba3..94ab1b8bf 100644 --- a/src/core/packager.ts +++ b/src/core/packager.ts @@ -2,7 +2,7 @@ import type { RepomixConfigMerged } from '../config/configSchema.js'; import { RepomixError } from '../shared/errorHandle.js'; import { logMemoryUsage, withMemoryLogging } from '../shared/memoryUtils.js'; import type { RepomixProgressCallback } from '../shared/types.js'; -import { collectFiles } from './file/fileCollect.js'; +import { type SkippedFileInfo, collectFiles } from './file/fileCollect.js'; import { sortPaths } from './file/filePathSort.js'; import { processFiles } from './file/fileProcess.js'; import { searchFiles } from './file/fileSearch.js'; @@ -29,6 +29,7 @@ export interface PackResult { suspiciousGitLogResults: SuspiciousFileResult[]; processedFiles: ProcessedFile[]; safeFilePaths: string[]; + skippedFiles: SkippedFileInfo[]; } const defaultDeps = { @@ -83,16 +84,19 @@ export const pack = async ( })); progressCallback('Collecting files...'); - const rawFiles = await withMemoryLogging('Collect Files', async () => - ( + const collectResults = await withMemoryLogging( + 'Collect Files', + async () => await Promise.all( sortedFilePathsByDir.map(({ rootDir, filePaths }) => deps.collectFiles(filePaths, rootDir, config, progressCallback), ), - ) - ).reduce((acc: RawFile[], curr: RawFile[]) => acc.concat(...curr), []), + ), ); + const rawFiles = collectResults.flatMap((curr) => curr.rawFiles); + const allSkippedFiles = collectResults.flatMap((curr) => curr.skippedFiles); + // Get git diffs if enabled - run this before security check progressCallback('Getting git diffs...'); const gitDiffResult = await deps.getGitDiffs(rootDirs, config); @@ -135,6 +139,7 @@ export const pack = async ( suspiciousGitLogResults, processedFiles, safeFilePaths, + skippedFiles: allSkippedFiles, }; logMemoryUsage('Pack - End'); diff --git a/tests/cli/actions/defaultAction.test.ts b/tests/cli/actions/defaultAction.test.ts index bdfe76ec0..7a31dfc86 100644 --- a/tests/cli/actions/defaultAction.test.ts +++ b/tests/cli/actions/defaultAction.test.ts @@ -107,6 +107,7 @@ describe('defaultAction', () => { safeFilePaths: [], gitDiffTokenCount: 0, gitLogTokenCount: 0, + skippedFiles: [], }); }); @@ -665,6 +666,7 @@ describe('defaultAction', () => { fileCharCounts: {}, fileTokenCounts: {}, outputFilePath: 'output.txt', + skippedFiles: [], } as PackResult); }); @@ -796,6 +798,7 @@ describe('defaultAction', () => { fileCharCounts: {}, fileTokenCounts: {}, outputFilePath: 'output.txt', + skippedFiles: [], } as PackResult); }); @@ -865,6 +868,7 @@ describe('defaultAction', () => { fileCharCounts: {}, fileTokenCounts: {}, outputFilePath: 'output.txt', + skippedFiles: [], } as PackResult; }); diff --git a/tests/cli/actions/remoteAction.test.ts b/tests/cli/actions/remoteAction.test.ts index 041efc7a4..9765b12fc 100644 --- a/tests/cli/actions/remoteAction.test.ts +++ b/tests/cli/actions/remoteAction.test.ts @@ -50,6 +50,7 @@ describe('remoteAction functions', () => { safeFilePaths: [], gitDiffTokenCount: 0, gitLogTokenCount: 0, + skippedFiles: [], }, config: createMockConfig(), } satisfies DefaultActionRunnerResult; @@ -92,6 +93,7 @@ describe('remoteAction functions', () => { safeFilePaths: [], gitDiffTokenCount: 0, gitLogTokenCount: 0, + skippedFiles: [], }, config: createMockConfig(), } satisfies DefaultActionRunnerResult; @@ -137,6 +139,7 @@ describe('remoteAction functions', () => { safeFilePaths: [], gitDiffTokenCount: 0, gitLogTokenCount: 0, + skippedFiles: [], }, config: createMockConfig(), } satisfies DefaultActionRunnerResult; @@ -182,6 +185,7 @@ describe('remoteAction functions', () => { safeFilePaths: [], gitDiffTokenCount: 0, gitLogTokenCount: 0, + skippedFiles: [], }, config: createMockConfig(), } satisfies DefaultActionRunnerResult; diff --git a/tests/cli/cliReport.binaryFiles.test.ts b/tests/cli/cliReport.binaryFiles.test.ts new file mode 100644 index 000000000..37052f3c8 --- /dev/null +++ b/tests/cli/cliReport.binaryFiles.test.ts @@ -0,0 +1,67 @@ +import path from 'node:path'; +import { beforeEach, describe, expect, test, vi } from 'vitest'; +import { reportSkippedFiles } from '../../src/cli/cliReport.js'; +import type { SkippedFileInfo } from '../../src/core/file/fileCollect.js'; +import { logger } from '../../src/shared/logger.js'; + +vi.mock('../../src/shared/logger'); + +describe('reportSkippedFiles', () => { + beforeEach(() => { + vi.resetAllMocks(); + }); + + test('should not report anything when there are no binary-content files', () => { + const skippedFiles: SkippedFileInfo[] = [ + { path: 'large.txt', reason: 'size-limit' }, + { path: 'binary.bin', reason: 'binary-extension' }, + { path: 'error.txt', reason: 'encoding-error' }, + ]; + + reportSkippedFiles('/root', skippedFiles); + + expect(logger.log).not.toHaveBeenCalled(); + }); + + test('should report single binary-content file', () => { + const skippedFiles: SkippedFileInfo[] = [{ path: '/root/dir/malformed.txt', reason: 'binary-content' }]; + + reportSkippedFiles('/root', skippedFiles); + + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('📄 Binary Files Detected:')); + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('1 file detected as binary')); + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining(path.join('dir', 'malformed.txt'))); + }); + + test('should report multiple binary-content files', () => { + const skippedFiles: SkippedFileInfo[] = [ + { path: '/root/file1.txt', reason: 'binary-content' }, + { path: '/root/dir/file2.md', reason: 'binary-content' }, + { path: '/root/normal.bin', reason: 'binary-extension' }, // Should be ignored + ]; + + reportSkippedFiles('/root', skippedFiles); + + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('📄 Binary Files Detected:')); + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('2 files detected as binary')); + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('file1.txt')); + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining(path.join('dir', 'file2.md'))); + }); + + test('should show relative paths correctly', () => { + const skippedFiles: SkippedFileInfo[] = [{ path: '/root/src/components/app.tsx', reason: 'binary-content' }]; + + reportSkippedFiles('/root', skippedFiles); + + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining(path.join('src', 'components', 'app.tsx'))); + }); + + test('should show warning messages about excluded files', () => { + const skippedFiles: SkippedFileInfo[] = [{ path: '/root/file.txt', reason: 'binary-content' }]; + + reportSkippedFiles('/root', skippedFiles); + + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('These files have been excluded')); + expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('Please review these files if you expected')); + }); +}); diff --git a/tests/cli/cliReport.test.ts b/tests/cli/cliReport.test.ts index c4c213775..6c50f5968 100644 --- a/tests/cli/cliReport.test.ts +++ b/tests/cli/cliReport.test.ts @@ -46,6 +46,7 @@ describe('cliReport', () => { safeFilePaths: [], gitDiffTokenCount: 0, gitLogTokenCount: 0, + skippedFiles: [], }; reportSummary(packResult, config); @@ -71,6 +72,7 @@ describe('cliReport', () => { safeFilePaths: [], gitDiffTokenCount: 0, gitLogTokenCount: 0, + skippedFiles: [], }; reportSummary(packResult, config); diff --git a/tests/cli/cliRun.test.ts b/tests/cli/cliRun.test.ts index c4434c1da..4fea9a092 100644 --- a/tests/cli/cliRun.test.ts +++ b/tests/cli/cliRun.test.ts @@ -99,6 +99,7 @@ describe('cliRun', () => { suspiciousGitLogResults: [], processedFiles: [], safeFilePaths: [], + skippedFiles: [], } satisfies PackResult, }); vi.mocked(initAction.runInitAction).mockResolvedValue(); @@ -154,6 +155,7 @@ describe('cliRun', () => { suspiciousGitLogResults: [], processedFiles: [], safeFilePaths: [], + skippedFiles: [], } satisfies PackResult, }); vi.mocked(versionAction.runVersionAction).mockResolvedValue(); diff --git a/tests/core/file/fileCollect.test.ts b/tests/core/file/fileCollect.test.ts index 2b8e6e50b..2761ed451 100644 --- a/tests/core/file/fileCollect.test.ts +++ b/tests/core/file/fileCollect.test.ts @@ -61,10 +61,13 @@ describe('fileCollect', () => { initTaskRunner: mockInitTaskRunner, }); - expect(result).toEqual([ - { path: 'file1.txt', content: 'decoded content' }, - { path: 'file2.txt', content: 'decoded content' }, - ]); + expect(result).toEqual({ + rawFiles: [ + { path: 'file1.txt', content: 'decoded content' }, + { path: 'file2.txt', content: 'decoded content' }, + ], + skippedFiles: [], + }); }); it('should skip binary files', async () => { @@ -83,7 +86,10 @@ describe('fileCollect', () => { initTaskRunner: mockInitTaskRunner, }); - expect(result).toEqual([{ path: 'text.txt', content: 'decoded content' }]); + expect(result).toEqual({ + rawFiles: [{ path: 'text.txt', content: 'decoded content' }], + skippedFiles: [{ path: 'binary.bin', reason: 'binary-extension' }], + }); expect(logger.debug).toHaveBeenCalledWith(`Skipping binary file: ${path.resolve('/root/binary.bin')}`); }); @@ -113,7 +119,10 @@ describe('fileCollect', () => { initTaskRunner: mockInitTaskRunner, }); - expect(result).toEqual([{ path: 'normal.txt', content: 'decoded content' }]); + expect(result).toEqual({ + rawFiles: [{ path: 'normal.txt', content: 'decoded content' }], + skippedFiles: [{ path: 'large.txt', reason: 'size-limit' }], + }); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining(largePath)); @@ -153,7 +162,10 @@ describe('fileCollect', () => { initTaskRunner: mockInitTaskRunner, }); - expect(result).toEqual([{ path: 'small.txt', content: 'decoded content' }]); + expect(result).toEqual({ + rawFiles: [{ path: 'small.txt', content: 'decoded content' }], + skippedFiles: [{ path: 'medium.txt', reason: 'size-limit' }], + }); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('10240.0KB > 5120.0KB')); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining(mediumPath)); @@ -175,7 +187,10 @@ describe('fileCollect', () => { initTaskRunner: mockInitTaskRunner, }); - expect(result).toEqual([]); + expect(result).toEqual({ + rawFiles: [], + skippedFiles: [{ path: 'error.txt', reason: 'encoding-error' }], + }); expect(logger.warn).toHaveBeenCalledWith( `Failed to read file: ${path.resolve('/root/error.txt')}`, expect.any(Error), diff --git a/tests/core/packager.test.ts b/tests/core/packager.test.ts index 7fdc6e41f..fa792e9be 100644 --- a/tests/core/packager.test.ts +++ b/tests/core/packager.test.ts @@ -42,7 +42,7 @@ describe('packager', () => { emptyDirPaths: [], }), sortPaths: vi.fn().mockImplementation((paths) => paths), - collectFiles: vi.fn().mockResolvedValue(mockRawFiles), + collectFiles: vi.fn().mockResolvedValue({ rawFiles: mockRawFiles, skippedFiles: [] }), processFiles: vi.fn().mockReturnValue(mockProcessedFiles), validateFileSafety: vi.fn().mockResolvedValue({ safeFilePaths: mockFilePaths, @@ -118,5 +118,6 @@ describe('packager', () => { 'file1.txt': 10, [file2Path]: 10, }); + expect(result.skippedFiles).toEqual([]); }); }); diff --git a/tests/core/packager/diffsFunctionality.test.ts b/tests/core/packager/diffsFunctionality.test.ts index 173ab4dbb..5069aaffd 100644 --- a/tests/core/packager/diffsFunctionality.test.ts +++ b/tests/core/packager/diffsFunctionality.test.ts @@ -53,7 +53,7 @@ index 123..456 100644 test('should not fetch diffs when includeDiffs is disabled', async () => { // Mock the dependencies for pack const mockSearchFiles = vi.fn().mockResolvedValue({ filePaths: [] }); - const mockCollectFiles = vi.fn().mockResolvedValue([]); + const mockCollectFiles = vi.fn().mockResolvedValue({ rawFiles: [], skippedFiles: [] }); const mockProcessFiles = vi.fn().mockResolvedValue([]); const mockGenerateOutput = vi.fn().mockResolvedValue('mocked output'); const mockValidateFileSafety = vi.fn().mockResolvedValue({ @@ -104,7 +104,7 @@ index 123..456 100644 // Mock dependencies const mockSearchFiles = vi.fn().mockResolvedValue({ filePaths: ['test.js'] }); - const mockCollectFiles = vi.fn().mockResolvedValue(processedFiles); + const mockCollectFiles = vi.fn().mockResolvedValue({ rawFiles: processedFiles, skippedFiles: [] }); const mockProcessFiles = vi.fn().mockResolvedValue(processedFiles); const mockGenerateOutput = vi.fn().mockResolvedValue('Generated output with diffs included'); const mockValidateFileSafety = vi.fn().mockResolvedValue({ diff --git a/tests/mcp/tools/packCodebaseTool.test.ts b/tests/mcp/tools/packCodebaseTool.test.ts index 4225b5bc6..d630b656a 100644 --- a/tests/mcp/tools/packCodebaseTool.test.ts +++ b/tests/mcp/tools/packCodebaseTool.test.ts @@ -44,6 +44,7 @@ describe('PackCodebaseTool', () => { suspiciousGitLogResults: [], processedFiles: [], safeFilePaths: [], + skippedFiles: [], }; beforeEach(() => {