Skip to content

Commit

Permalink
fix: secret scan pre-commit crashing on big merges
Browse files Browse the repository at this point in the history
salome-voltz committed Jan 16, 2025
1 parent 28f5153 commit 4af42f9
Showing 4 changed files with 58 additions and 17 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Fixed

- Fix `secret scan pre-commit` crashing on big merges (#1032).
16 changes: 8 additions & 8 deletions ggshield/core/scan/commit.py
Original file line number Diff line number Diff line change
@@ -8,6 +8,7 @@
from .commit_information import CommitInformation
from .commit_utils import (
DIFF_EMPTY_COMMIT_INFO_BLOCK,
MAX_FILES_PER_GIT_COMMAND,
PATCH_COMMON_ARGS,
PATCH_PREFIX,
STAGED_PREFIX,
@@ -96,14 +97,13 @@ def from_merge(
files_to_scan.add(path)

def parser_merge(commit: "Commit") -> Iterable[Scannable]:
patch = git(
["diff", "--staged", *PATCH_COMMON_ARGS, *files_to_scan], cwd=cwd
)
yield from parse_patch(
STAGED_PREFIX,
DIFF_EMPTY_COMMIT_INFO_BLOCK + patch,
exclusion_regexes,
)
for files in batched(files_to_scan, MAX_FILES_PER_GIT_COMMAND):
patch = git(["diff", "--staged", *PATCH_COMMON_ARGS, *files], cwd=cwd)
yield from parse_patch(
STAGED_PREFIX,
DIFF_EMPTY_COMMIT_INFO_BLOCK + patch,
exclusion_regexes,
)

info = CommitInformation.from_staged(cwd)
return Commit(sha=None, patch_parser=parser_merge, info=info)
21 changes: 13 additions & 8 deletions ggshield/core/scan/commit_utils.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@

from ggshield.utils.files import is_path_excluded
from ggshield.utils.git_shell import Filemode, git
from ggshield.utils.itertools import batched

from .scannable import Scannable

@@ -43,6 +44,8 @@
r"^(?P<at>@@+) (?P<from>-\d+(?:,\d+)?) .* (?P<to>\+\d+(?:,\d+)?) @@+(?P<trailing_content>.+)?"
)

MAX_FILES_PER_GIT_COMMAND = 100


class PatchParseError(Exception):
"""
@@ -370,10 +373,11 @@ def get_file_sha_in_ref(
"""
Helper function to get the shas of files in the git reference.
"""
output = git(["ls-tree", "-z", ref] + files, cwd=cwd)
for line in output.split("\0")[:-1]:
_, _, sha, path = line.split(maxsplit=3)
yield (path, sha)
for files in batched(files, MAX_FILES_PER_GIT_COMMAND):
output = git(["ls-tree", "-z", ref] + files, cwd=cwd)
for line in output.split("\0")[:-1]:
_, _, sha, path = line.split(maxsplit=3)
yield (path, sha)


def get_file_sha_stage(
@@ -382,10 +386,11 @@ def get_file_sha_stage(
"""
Helper function to get the shas currently staged of files.
"""
output = git(["ls-files", "--stage", "-z"] + files, cwd=cwd)
for line in output.split("\0")[:-1]:
_, sha, _, path = line.split(maxsplit=3)
yield (path, sha)
for files in batched(files, MAX_FILES_PER_GIT_COMMAND):
output = git(["ls-files", "--stage", "-z"] + files, cwd=cwd)
for line in output.split("\0")[:-1]:
_, sha, _, path = line.split(maxsplit=3)
yield (path, sha)


def get_diff_files(cwd: Optional[Path] = None) -> List[str]:
35 changes: 34 additions & 1 deletion tests/unit/core/scan/test_commit_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import tempfile
from pathlib import Path
from typing import Optional, Tuple

import pytest

from ggshield.core.scan.commit_utils import PatchFileInfo, convert_multi_parent_diff
from ggshield.core.scan.commit_utils import (
PatchFileInfo,
convert_multi_parent_diff,
get_file_sha_in_ref,
)
from ggshield.utils.git_shell import Filemode
from tests.repository import Repository


@pytest.mark.parametrize(
@@ -124,3 +130,30 @@ def test_convert_multi_parent_diff(diff: str, expected: str):
expected = expected.strip()
result = convert_multi_parent_diff(diff)
assert result == expected


def test_get_file_sha_in_ref():
"""
Assert that get_file_sha_in_ref doesn't crash when called
with a large number of files
"""
with tempfile.TemporaryDirectory() as tmp_path_str:

tmp_path = Path(tmp_path_str)
repo = Repository.create(tmp_path)

for i in range(200):
file_path = tmp_path / f"{i:0200d}.txt"
file_path.write_text("dummy content")

repo.add(".")
repo.create_commit("Add 200 dummy files")

try:
files = [f"{i:0200d}.txt" for i in range(200)]
result = list(get_file_sha_in_ref("HEAD", files))

assert isinstance(result, list), "The result should be a list."

except Exception as e:
assert False, f"get_file_sha_in_ref crashed with error: {e}"

0 comments on commit 4af42f9

Please sign in to comment.