Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions src/core/git/archiveEntryFilter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import isBinaryPath from 'is-binary-path';
import { logger } from '../../shared/logger.js';

/**
* Creates a filter function for tar extraction that skips binary files.
* The filter is called with the raw tar entry path (before strip is applied),
* so we manually remove the leading segment (e.g. "repo-branch/") before checking.
*/
export const createArchiveEntryFilter = (deps = { isBinaryPath }) => {
return (entryPath: string): boolean => {
  // The filter receives the raw archive path (before tar's strip:1 is applied),
  // so drop the leading "repo-branch/" segment ourselves before inspecting it.
  const strippedPath = entryPath.replace(/^[^/]+\//, '');

  // Always allow directory entries:
  //  - an empty remainder is the archive's root directory entry ("repo-branch/");
  //  - a trailing slash marks a nested directory entry.
  // The binary check below is path-based, so without this guard a directory
  // whose name merely looks binary (e.g. "fixtures.bin/") could be reported
  // and skipped as if it were a binary file.
  if (!strippedPath || strippedPath.endsWith('/')) {
    return true;
  }

  // Returning false tells the extractor to skip this entry.
  if (deps.isBinaryPath(strippedPath)) {
    logger.trace(`Skipping binary file in archive: ${strippedPath}`);
    return false;
  }

  return true;
};
Comment on lines +10 to +25
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The filter should explicitly check the entry type to ensure it only applies to regular files. This prevents potential issues where directories or other entry types with names resembling binary extensions (e.g., a repository named test.zip) might be incorrectly skipped if the stripping logic fails or the trailing slash is missing. Using the entry object provided by tar is a more robust approach.

  return (entryPath: string, entry?: any): boolean => {
    // Only filter regular files; allow directories, symlinks, etc.
    if (entry && entry.type !== 'File') {
      return true;
    }

    // Remove the leading directory segment that tar's strip:1 would remove
    const strippedPath = entryPath.replace(/^[^/]+\//, '');

    if (!strippedPath) {
      // Root directory entry — always allow
      return true;
    }

    if (deps.isBinaryPath(strippedPath)) {
      logger.trace(`Skipping binary file in archive: ${strippedPath}`);
      return false;
    }

    return true;
  };

};
6 changes: 6 additions & 0 deletions src/core/git/gitHubArchive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import * as zlib from 'node:zlib';
import { extract as tarExtract } from 'tar';
import { RepomixError } from '../../shared/errorHandle.js';
import { logger } from '../../shared/logger.js';
import { createArchiveEntryFilter } from './archiveEntryFilter.js';
import {
buildGitHubArchiveUrl,
buildGitHubMasterArchiveUrl,
Expand Down Expand Up @@ -31,6 +32,7 @@ export interface ArchiveDownloadDeps {
Transform: typeof Transform;
tarExtract: typeof tarExtract;
createGunzip: typeof zlib.createGunzip;
createArchiveEntryFilter: typeof createArchiveEntryFilter;
}

const defaultDeps: ArchiveDownloadDeps = {
Expand All @@ -39,6 +41,7 @@ const defaultDeps: ArchiveDownloadDeps = {
Transform,
tarExtract,
createGunzip: zlib.createGunzip,
createArchiveEntryFilter,
};

/**
Expand Down Expand Up @@ -164,9 +167,12 @@ const downloadAndExtractArchive = async (

// Stream: HTTP response -> progress tracking -> gunzip -> tar extract to disk
// strip: 1 removes the top-level "repo-branch/" directory from archive paths
// filter: skips binary files (e.g. images, fonts, executables) to avoid unnecessary disk I/O
const entryFilter = deps.createArchiveEntryFilter();
const extractStream = deps.tarExtract({
cwd: targetDirectory,
strip: 1,
filter: (entryPath: string) => entryFilter(entryPath),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The filter callback can be simplified by passing the entryFilter function directly. This also ensures that all arguments provided by the tar library (like the entry object) are correctly passed to the filter function.

Suggested change
filter: (entryPath: string) => entryFilter(entryPath),
filter: entryFilter,

});
const gunzipStream = deps.createGunzip();

Expand Down
72 changes: 72 additions & 0 deletions tests/core/git/archiveEntryFilter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { describe, expect, test, vi } from 'vitest';
import { createArchiveEntryFilter } from '../../../src/core/git/archiveEntryFilter.js';

// Replace the shared logger with an auto-mock so trace output from the filter stays silent.
vi.mock('../../../src/shared/logger');

describe('archiveEntryFilter', () => {
  describe('createArchiveEntryFilter', () => {
    /**
     * Runs every path through a freshly created default filter, in order,
     * and asserts that each one yields the expected verdict.
     */
    const expectVerdicts = (paths: string[], expected: boolean): void => {
      const entryFilter = createArchiveEntryFilter();
      for (const archivePath of paths) {
        expect(entryFilter(archivePath)).toBe(expected);
      }
    };

    test('should allow text files', () => {
      expectVerdicts(['repo-main/src/index.ts', 'repo-main/README.md', 'repo-main/package.json'], true);
    });

    test('should skip binary image files', () => {
      expectVerdicts(['repo-main/assets/logo.png', 'repo-main/images/photo.jpg', 'repo-main/icon.gif'], false);
    });

    test('should skip font files', () => {
      expectVerdicts(
        ['repo-main/fonts/inter.woff2', 'repo-main/fonts/roboto.woff', 'repo-main/fonts/arial.ttf'],
        false,
      );
    });

    test('should skip archive and executable files', () => {
      expectVerdicts(['repo-main/dist/app.exe', 'repo-main/vendor/lib.zip'], false);
    });

    test('should allow root directory entry', () => {
      expectVerdicts(['repo-main/'], true);
    });

    test('should handle nested directory paths', () => {
      const entryFilter = createArchiveEntryFilter();
      const allowed = entryFilter('repo-main/src/components/Button.tsx');
      const skipped = entryFilter('repo-main/src/assets/icons/arrow.png');

      expect(allowed).toBe(true);
      expect(skipped).toBe(false);
    });

    test('should strip leading segment correctly for various repo name formats', () => {
      // Archive roots may also look like "<owner>-<repo>-<sha>/"
      const entryFilter = createArchiveEntryFilter();
      const verdicts = ['yamadashy-repomix-abc123/src/index.ts', 'yamadashy-repomix-abc123/logo.png'].map(
        (archivePath) => entryFilter(archivePath),
      );

      expect(verdicts[0]).toBe(true);
      expect(verdicts[1]).toBe(false);
    });

    test('should use injected isBinaryPath dependency', () => {
      // A stub that flags everything as binary proves the injected dependency decides the outcome.
      const isBinaryPathStub = vi.fn().mockReturnValue(true);
      const entryFilter = createArchiveEntryFilter({ isBinaryPath: isBinaryPathStub });

      expect(entryFilter('repo-main/src/index.ts')).toBe(false);
      expect(isBinaryPathStub).toHaveBeenCalledWith('src/index.ts');
    });

    test('should pass stripped path to isBinaryPath', () => {
      const isBinaryPathStub = vi.fn().mockReturnValue(false);
      const entryFilter = createArchiveEntryFilter({ isBinaryPath: isBinaryPathStub });

      entryFilter('repo-main/src/deep/file.ts');

      expect(isBinaryPathStub).toHaveBeenCalledWith('src/deep/file.ts');
    });
  });
});
8 changes: 7 additions & 1 deletion tests/core/git/gitHubArchive.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import type { pipeline as pipelineType } from 'node:stream/promises';
import type * as zlib from 'node:zlib';
import type { extract as tarExtractType } from 'tar';
import { beforeEach, describe, expect, test, vi } from 'vitest';
import type { createArchiveEntryFilter as createArchiveEntryFilterType } from '../../../src/core/git/archiveEntryFilter.js';
import {
type ArchiveDownloadOptions,
downloadGitHubArchive,
Expand All @@ -22,6 +23,7 @@ interface MockDeps {
Transform: typeof Transform;
tarExtract: typeof tarExtractType;
createGunzip: typeof zlib.createGunzip;
createArchiveEntryFilter: typeof createArchiveEntryFilterType;
}

// Simple test data
Expand All @@ -32,6 +34,7 @@ describe('gitHubArchive', () => {
let mockPipeline: ReturnType<typeof vi.fn<typeof pipelineType>>;
let mockTarExtract: ReturnType<typeof vi.fn<typeof tarExtractType>>;
let mockCreateGunzip: ReturnType<typeof vi.fn<typeof zlib.createGunzip>>;
let mockCreateArchiveEntryFilter: ReturnType<typeof vi.fn<typeof createArchiveEntryFilterType>>;
let mockDeps: MockDeps;

beforeEach(() => {
Expand All @@ -53,13 +56,15 @@ describe('gitHubArchive', () => {
},
}) as unknown as ReturnType<typeof zlib.createGunzip>,
);
mockCreateArchiveEntryFilter = vi.fn<typeof createArchiveEntryFilterType>().mockReturnValue(() => true);

mockDeps = {
fetch: mockFetch,
pipeline: mockPipeline as unknown as typeof pipelineType,
Transform,
tarExtract: mockTarExtract as unknown as typeof tarExtractType,
createGunzip: mockCreateGunzip as unknown as typeof zlib.createGunzip,
createArchiveEntryFilter: mockCreateArchiveEntryFilter,
};
});

Expand Down Expand Up @@ -104,10 +109,11 @@ describe('gitHubArchive', () => {
}),
);

// Verify tar extract was called with correct options
// Verify tar extract was called with correct options including filter
expect(mockTarExtract).toHaveBeenCalledWith({
cwd: mockTargetDirectory,
strip: 1,
filter: expect.any(Function),
});

// Verify streaming pipeline was used
Expand Down
Loading