diff --git a/src/core/git/archiveEntryFilter.ts b/src/core/git/archiveEntryFilter.ts
new file mode 100644
index 000000000..49d6f2b61
--- /dev/null
+++ b/src/core/git/archiveEntryFilter.ts
@@ -0,0 +1,36 @@
+import isBinaryPath from 'is-binary-path';
+import { logger } from '../../shared/logger.js';
+
+// Matches the leading path segment (e.g. "repo-branch/") that tar's strip:1 removes.
+const LEADING_SEGMENT_PATTERN = /^[^/]+\//;
+
+/**
+ * Creates a filter function for tar extraction that skips binary files.
+ * The filter is called with the raw tar entry path (before strip is applied),
+ * so we manually remove the leading segment (e.g. "repo-branch/") before checking.
+ *
+ * Directory entries (paths ending in "/") are always allowed. The default
+ * `is-binary-path` check is extension-based, so a directory such as "data.bin/"
+ * would otherwise be logged and skipped as if it were a binary file.
+ *
+ * @param deps - Injectable dependencies; `deps.isBinaryPath` receives the stripped path.
+ * @returns A predicate that returns `true` to extract an entry and `false` to skip it.
+ */
+export const createArchiveEntryFilter = (deps = { isBinaryPath }) => {
+  return (entryPath: string): boolean => {
+    // Remove the leading directory segment that tar's strip:1 would remove
+    const strippedPath = entryPath.replace(LEADING_SEGMENT_PATTERN, '');
+
+    if (!strippedPath || strippedPath.endsWith('/')) {
+      // Root directory entry (empty after stripping) or a nested directory entry — always allow
+      return true;
+    }
+
+    if (deps.isBinaryPath(strippedPath)) {
+      logger.trace(`Skipping binary file in archive: ${strippedPath}`);
+      return false;
+    }
+
+    return true;
+  };
+};
diff --git a/src/core/git/gitHubArchive.ts b/src/core/git/gitHubArchive.ts
index e513d15e1..4c5c68e69 100644
--- a/src/core/git/gitHubArchive.ts
+++ b/src/core/git/gitHubArchive.ts
@@ -4,6 +4,7 @@ import * as zlib from 'node:zlib';
 import { extract as tarExtract } from 'tar';
 import { RepomixError } from '../../shared/errorHandle.js';
 import { logger } from '../../shared/logger.js';
+import { createArchiveEntryFilter } from './archiveEntryFilter.js';
 import {
   buildGitHubArchiveUrl,
   buildGitHubMasterArchiveUrl,
@@ -31,6 +32,7 @@ export interface ArchiveDownloadDeps {
   Transform: typeof Transform;
   tarExtract: typeof tarExtract;
   createGunzip: typeof zlib.createGunzip;
+  createArchiveEntryFilter: typeof createArchiveEntryFilter;
 }
 
 const defaultDeps: ArchiveDownloadDeps = {
@@ -39,6 +41,7 @@ const defaultDeps: ArchiveDownloadDeps = {
   Transform,
   tarExtract,
   createGunzip: zlib.createGunzip,
+  createArchiveEntryFilter,
 };
 
 /**
@@ -164,9 +167,12 @@ const
downloadAndExtractArchive = async (
 
   // Stream: HTTP response -> progress tracking -> gunzip -> tar extract to disk
   // strip: 1 removes the top-level "repo-branch/" directory from archive paths
+  // filter: skips binary files (e.g. images, fonts, executables) to avoid unnecessary disk I/O
+  const entryFilter = deps.createArchiveEntryFilter();
   const extractStream = deps.tarExtract({
     cwd: targetDirectory,
     strip: 1,
+    filter: (entryPath: string) => entryFilter(entryPath),
   });
 
   const gunzipStream = deps.createGunzip();
diff --git a/tests/core/git/archiveEntryFilter.test.ts b/tests/core/git/archiveEntryFilter.test.ts
new file mode 100644
index 000000000..09b30a1a5
--- /dev/null
+++ b/tests/core/git/archiveEntryFilter.test.ts
@@ -0,0 +1,69 @@
+import { describe, expect, test, vi } from 'vitest';
+import { createArchiveEntryFilter } from '../../../src/core/git/archiveEntryFilter.js';
+
+vi.mock('../../../src/shared/logger');
+
+describe('archiveEntryFilter', () => {
+  describe('createArchiveEntryFilter', () => {
+    /**
+     * Builds a filter with the default dependencies and checks that it
+     * returns `expected` for every path in `entryPaths`.
+     */
+    const expectFilterResult = (entryPaths: string[], expected: boolean): void => {
+      const filter = createArchiveEntryFilter();
+      for (const entryPath of entryPaths) {
+        expect(filter(entryPath)).toBe(expected);
+      }
+    };
+
+    test('should allow text files', () => {
+      expectFilterResult(['repo-main/src/index.ts', 'repo-main/README.md', 'repo-main/package.json'], true);
+    });
+
+    test('should skip binary image files', () => {
+      expectFilterResult(['repo-main/assets/logo.png', 'repo-main/images/photo.jpg', 'repo-main/icon.gif'], false);
+    });
+
+    test('should skip font files', () => {
+      expectFilterResult(
+        ['repo-main/fonts/inter.woff2', 'repo-main/fonts/roboto.woff', 'repo-main/fonts/arial.ttf'],
+        false,
+      );
+    });
+
+    test('should skip archive and executable files', () => {
+      expectFilterResult(['repo-main/dist/app.exe', 'repo-main/vendor/lib.zip'], false);
+    });
+
+    test('should allow root directory entry', () => {
+      expectFilterResult(['repo-main/'], true);
+    });
+
+    test('should handle nested directory paths', () => {
+      expectFilterResult(['repo-main/src/components/Button.tsx'], true);
+      expectFilterResult(['repo-main/src/assets/icons/arrow.png'], false);
+    });
+
+    test('should strip leading segment correctly for various repo name formats', () => {
+      // Different repository name formats in tar archives
+      expectFilterResult(['yamadashy-repomix-abc123/src/index.ts'], true);
+      expectFilterResult(['yamadashy-repomix-abc123/logo.png'], false);
+    });
+
+    test('should use injected isBinaryPath dependency', () => {
+      const isBinaryPathStub = vi.fn().mockReturnValue(true);
+      const filter = createArchiveEntryFilter({ isBinaryPath: isBinaryPathStub });
+
+      expect(filter('repo-main/src/index.ts')).toBe(false);
+      expect(isBinaryPathStub).toHaveBeenCalledWith('src/index.ts');
+    });
+
+    test('should pass stripped path to isBinaryPath', () => {
+      const isBinaryPathStub = vi.fn().mockReturnValue(false);
+
+      createArchiveEntryFilter({ isBinaryPath: isBinaryPathStub })('repo-main/src/deep/file.ts');
+
+      expect(isBinaryPathStub).toHaveBeenCalledWith('src/deep/file.ts');
+    });
+  });
+});
diff --git a/tests/core/git/gitHubArchive.test.ts b/tests/core/git/gitHubArchive.test.ts
index f325418d1..0ca6182a9 100644
--- a/tests/core/git/gitHubArchive.test.ts
+++ b/tests/core/git/gitHubArchive.test.ts
@@ -3,6 +3,7 @@ import type { pipeline as pipelineType } from 'node:stream/promises';
 import type * as zlib from 'node:zlib';
 import type { extract as tarExtractType } from 'tar';
 import { beforeEach, describe, expect, test, vi } from 'vitest';
+import type { createArchiveEntryFilter as
createArchiveEntryFilterType } from '../../../src/core/git/archiveEntryFilter.js'; import { type ArchiveDownloadOptions, downloadGitHubArchive, @@ -22,6 +23,7 @@ interface MockDeps { Transform: typeof Transform; tarExtract: typeof tarExtractType; createGunzip: typeof zlib.createGunzip; + createArchiveEntryFilter: typeof createArchiveEntryFilterType; } // Simple test data @@ -32,6 +34,7 @@ describe('gitHubArchive', () => { let mockPipeline: ReturnType>; let mockTarExtract: ReturnType>; let mockCreateGunzip: ReturnType>; + let mockCreateArchiveEntryFilter: ReturnType>; let mockDeps: MockDeps; beforeEach(() => { @@ -53,6 +56,7 @@ describe('gitHubArchive', () => { }, }) as unknown as ReturnType, ); + mockCreateArchiveEntryFilter = vi.fn().mockReturnValue(() => true); mockDeps = { fetch: mockFetch, @@ -60,6 +64,7 @@ describe('gitHubArchive', () => { Transform, tarExtract: mockTarExtract as unknown as typeof tarExtractType, createGunzip: mockCreateGunzip as unknown as typeof zlib.createGunzip, + createArchiveEntryFilter: mockCreateArchiveEntryFilter, }; }); @@ -104,10 +109,11 @@ describe('gitHubArchive', () => { }), ); - // Verify tar extract was called with correct options + // Verify tar extract was called with correct options including filter expect(mockTarExtract).toHaveBeenCalledWith({ cwd: mockTargetDirectory, strip: 1, + filter: expect.any(Function), }); // Verify streaming pipeline was used