Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
pack-objects: create new name-hash algorithm (#5157)
Browse files Browse the repository at this point in the history
This is an updated version of gitgitgadget#1785, intended for early
consumption into Git for Windows.

The idea here is to add a new `--full-name-hash` option to `git
pack-objects` and `git repack`. This adjusts the name-hash value used
for finding delta bases in such a way that uses the full path name with
a lower likelihood of collisions than the default name-hash algorithm.
In many repositories with name-hash collisions and many versions of
those paths, this can significantly reduce the size of a full repack. It
can also help in certain cases of `git push`, but only if the pack is
already artificially inflated by name-hash collisions; cases that find
"sibling" deltas as better choices become worse with `--full-name-hash`.

Thus, this option is currently recommended for full repacks of large
repos, and on client machines without reachability bitmaps.

Some care is taken to ignore this option when using bitmaps, either
writing bitmaps or using a bitmap walk during reads. The bitmap file
format contains name-hash values, but no way to indicate which function
is used, so compatibility is a concern for bitmaps. Future work could
explore this idea.

After this PR is merged, then the more-involved `--path-walk` option may
be considered.
dscho committed Jan 7, 2025

Verified

This commit was signed with the committer’s verified signature.
rm3l Armel Soro
2 parents be75295 + e274a63 commit d8a424f
Showing 22 changed files with 311 additions and 13 deletions.
3 changes: 2 additions & 1 deletion Documentation/git-pack-objects.txt
Original file line number Diff line number Diff line change
@@ -15,7 +15,8 @@ SYNOPSIS
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
[--cruft] [--cruft-expiration=<time>]
[--stdout [--filter=<filter-spec>] | <base-name>]
[--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
[--shallow] [--keep-true-parents] [--[no-]sparse]
[--full-name-hash] < <object-list>


DESCRIPTION
4 changes: 3 additions & 1 deletion Documentation/git-repack.txt
Original file line number Diff line number Diff line change
@@ -9,7 +9,9 @@ git-repack - Pack unpacked objects in a repository
SYNOPSIS
--------
[verse]
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>] [--write-midx]
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
[--write-midx] [--full-name-hash]

DESCRIPTION
-----------
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -824,6 +824,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
TEST_BUILTINS_OBJS += test-match-trees.o
TEST_BUILTINS_OBJS += test-mergesort.o
TEST_BUILTINS_OBJS += test-mktemp.o
TEST_BUILTINS_OBJS += test-name-hash.o
TEST_BUILTINS_OBJS += test-online-cpus.o
TEST_BUILTINS_OBJS += test-pack-mtimes.o
TEST_BUILTINS_OBJS += test-parse-options.o
25 changes: 20 additions & 5 deletions builtin/pack-objects.c
Original file line number Diff line number Diff line change
@@ -267,6 +267,14 @@ struct configured_exclusion {
static struct oidmap configured_exclusions;

static struct oidset excluded_by_config;
static int use_full_name_hash = -1;

static inline uint32_t pack_name_hash_fn(const char *name)
{
if (use_full_name_hash)
return pack_full_name_hash(name);
return pack_name_hash(name);
}

/*
* stats
@@ -1684,7 +1692,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
return 0;
}

create_object_entry(oid, type, pack_name_hash(name),
create_object_entry(oid, type, pack_name_hash_fn(name),
exclude, name && no_try_delta(name),
found_pack, found_offset);
return 1;
@@ -1898,7 +1906,7 @@ static void add_preferred_base_object(const char *name)
{
struct pbase_tree *it;
size_t cmplen;
unsigned hash = pack_name_hash(name);
unsigned hash = pack_name_hash_fn(name);

if (!num_preferred_base || check_pbase_path(hash))
return;
@@ -3408,7 +3416,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
* here using a now in order to perhaps improve the delta selection
* process.
*/
oe->hash = pack_name_hash(name);
oe->hash = pack_name_hash_fn(name);
oe->no_try_delta = name && no_try_delta(name);

stdin_packs_hints_nr++;
@@ -3558,7 +3566,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
entry = packlist_find(&to_pack, oid);
if (entry) {
if (name) {
entry->hash = pack_name_hash(name);
entry->hash = pack_name_hash_fn(name);
entry->no_try_delta = no_try_delta(name);
}
} else {
@@ -3581,7 +3589,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
return;
}

entry = create_object_entry(oid, type, pack_name_hash(name),
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
0, name && no_try_delta(name),
pack, offset);
}
@@ -4433,6 +4441,8 @@ int cmd_pack_objects(int argc,
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
N_("protocol"),
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
N_("(EXPERIMENTAL!) optimize delta compression across identical path names over time")),
OPT_END(),
};

@@ -4588,6 +4598,11 @@ int cmd_pack_objects(int argc,
if (pack_to_stdout || !rev_list_all)
write_bitmap_index = 0;

if (write_bitmap_index && use_full_name_hash > 0)
die(_("currently, the --full-name-hash option is incompatible with --write-bitmap-index"));
if (use_full_name_hash < 0)
use_full_name_hash = git_env_bool("GIT_TEST_FULL_NAME_HASH", 0);

if (use_delta_islands)
strvec_push(&rp, "--topo-order");

9 changes: 8 additions & 1 deletion builtin/repack.c
Original file line number Diff line number Diff line change
@@ -39,7 +39,9 @@ static int run_update_server_info = 1;
static char *packdir, *packtmp_name, *packtmp;

static const char *const git_repack_usage[] = {
N_("git repack [<options>]"),
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
"[--write-midx] [--full-name-hash]"),
NULL
};

@@ -58,6 +60,7 @@ struct pack_objects_args {
int no_reuse_object;
int quiet;
int local;
int full_name_hash;
struct list_objects_filter_options filter_options;
};

@@ -306,6 +309,8 @@ static void prepare_pack_objects(struct child_process *cmd,
strvec_pushf(&cmd->args, "--no-reuse-delta");
if (args->no_reuse_object)
strvec_pushf(&cmd->args, "--no-reuse-object");
if (args->full_name_hash)
strvec_pushf(&cmd->args, "--full-name-hash");
if (args->local)
strvec_push(&cmd->args, "--local");
if (args->quiet)
@@ -1203,6 +1208,8 @@ int cmd_repack(int argc,
N_("pass --no-reuse-delta to git-pack-objects")),
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
N_("pass --no-reuse-object to git-pack-objects")),
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
N_("(EXPERIMENTAL!) pass --full-name-hash to git-pack-objects")),
OPT_NEGBIT('n', NULL, &run_update_server_info,
N_("do not run git-update-server-info"), 1),
OPT__QUIET(&po_args.quiet, N_("be quiet")),
1 change: 1 addition & 0 deletions ci/run-build-and-tests.sh
Original file line number Diff line number Diff line change
@@ -25,6 +25,7 @@ linux-TEST-vars)
export GIT_TEST_NO_WRITE_REV_INDEX=1
export GIT_TEST_CHECKOUT_WORKERS=2
export GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL=1
export GIT_TEST_FULL_NAME_HASH=1
;;
linux-clang)
export GIT_TEST_DEFAULT_HASH=sha1
21 changes: 21 additions & 0 deletions pack-objects.h
Original file line number Diff line number Diff line change
@@ -208,6 +208,27 @@ static inline uint32_t pack_name_hash(const char *name)
return hash;
}

static inline uint32_t pack_full_name_hash(const char *name)
{
const uint32_t bigp = 1234572167U;
uint32_t c, hash = bigp;

if (!name)
return 0;

/*
* Do the simplest thing that will resemble pseudo-randomness: add
* random multiples of a large prime number with a binary shift.
* The goal is not to be cryptographic, but to be generally
* uniformly distributed.
*/
while ((c = *name++) != 0) {
hash += c * bigp;
hash = (hash >> 5) | (hash << 27);
}
return hash;
}

static inline enum object_type oe_type(const struct object_entry *e)
{
return e->type_valid ? e->type_ : OBJ_BAD;
4 changes: 4 additions & 0 deletions t/README
Original file line number Diff line number Diff line change
@@ -471,6 +471,10 @@ a test and then fails then the whole test run will abort. This can help to make
sure the expected tests are executed and not silently skipped when their
dependency breaks or is simply not present in a new environment.

GIT_TEST_FULL_NAME_HASH=<boolean>, when true, sets the default name-hash
function in 'git pack-objects' to be the one used by the --full-name-hash
option.

Naming Tests
------------

1 change: 1 addition & 0 deletions t/helper/meson.build
Original file line number Diff line number Diff line change
@@ -34,6 +34,7 @@ test_tool_sources = [
'test-match-trees.c',
'test-mergesort.c',
'test-mktemp.c',
'test-name-hash.c',
'test-online-cpus.c',
'test-pack-mtimes.c',
'test-parse-options.c',
24 changes: 24 additions & 0 deletions t/helper/test-name-hash.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* test-name-hash.c: Read a list of paths over stdin and report on their
* name-hash and full name-hash.
*/

#include "test-tool.h"
#include "git-compat-util.h"
#include "pack-objects.h"
#include "strbuf.h"

int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
{
struct strbuf line = STRBUF_INIT;

while (!strbuf_getline(&line, stdin)) {
uint32_t name_hash = pack_name_hash(line.buf);
uint32_t full_hash = pack_full_name_hash(line.buf);

printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
}

strbuf_release(&line);
return 0;
}
1 change: 1 addition & 0 deletions t/helper/test-tool.c
Original file line number Diff line number Diff line change
@@ -44,6 +44,7 @@ static struct test_cmd cmds[] = {
{ "match-trees", cmd__match_trees },
{ "mergesort", cmd__mergesort },
{ "mktemp", cmd__mktemp },
{ "name-hash", cmd__name_hash },
{ "online-cpus", cmd__online_cpus },
{ "pack-mtimes", cmd__pack_mtimes },
{ "parse-options", cmd__parse_options },
1 change: 1 addition & 0 deletions t/helper/test-tool.h
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
int cmd__match_trees(int argc, const char **argv);
int cmd__mergesort(int argc, const char **argv);
int cmd__mktemp(int argc, const char **argv);
int cmd__name_hash(int argc, const char **argv);
int cmd__online_cpus(int argc, const char **argv);
int cmd__pack_mtimes(int argc, const char **argv);
int cmd__parse_options(int argc, const char **argv);
73 changes: 73 additions & 0 deletions t/perf/p5313-pack-objects.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/sh

test_description='Tests pack performance using bitmaps'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

test_expect_success 'create rev input' '
cat >in-thin <<-EOF &&
$(git rev-parse HEAD)
^$(git rev-parse HEAD~1)
EOF
cat >in-big <<-EOF
$(git rev-parse HEAD)
^$(git rev-parse HEAD~1000)
EOF
'

test_perf 'thin pack' '
git pack-objects --thin --stdout --revs --sparse <in-thin >out
'

test_size 'thin pack size' '
test_file_size out
'

test_perf 'thin pack with --full-name-hash' '
git pack-objects --thin --stdout --revs --sparse --full-name-hash <in-thin >out
'

test_size 'thin pack size with --full-name-hash' '
test_file_size out
'

test_perf 'big pack' '
git pack-objects --stdout --revs --sparse <in-big >out
'

test_size 'big pack size' '
test_file_size out
'

test_perf 'big pack with --full-name-hash' '
git pack-objects --stdout --revs --sparse --full-name-hash <in-big >out
'

test_size 'big pack size with --full-name-hash' '
test_file_size out
'

test_perf 'repack' '
git repack -adf
'

test_size 'repack size' '
pack=$(ls .git/objects/pack/pack-*.pack) &&
test_file_size "$pack"
'

test_perf 'repack with --full-name-hash' '
git repack -adf --full-name-hash
'

test_size 'repack size with --full-name-hash' '
pack=$(ls .git/objects/pack/pack-*.pack) &&
test_file_size "$pack"
'

test_done
41 changes: 41 additions & 0 deletions t/perf/p5314-name-hash.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/sh

test_description='Tests pack performance using bitmaps'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

test_size 'paths at head' '
git ls-tree -r --name-only HEAD >path-list &&
wc -l <path-list
'

test_size 'number of distinct name-hashes' '
cat path-list | test-tool name-hash >name-hashes &&
cat name-hashes | awk "{ print \$1; }" | sort -n | uniq -c >name-hash-count &&
wc -l <name-hash-count
'

test_size 'number of distinct full-name-hashes' '
cat name-hashes | awk "{ print \$2; }" | sort -n | uniq -c >full-name-hash-count &&
wc -l <full-name-hash-count
'

test_size 'maximum multiplicity of name-hashes' '
cat name-hash-count | \
sort -nr | \
head -n 1 | \
awk "{ print \$1; }"
'

test_size 'maximum multiplicity of fullname-hashes' '
cat full-name-hash-count | \
sort -nr | \
head -n 1 | \
awk "{ print \$1; }"
'

test_done
1 change: 0 additions & 1 deletion t/t0450/txt-help-mismatches
Original file line number Diff line number Diff line change
@@ -45,7 +45,6 @@ rebase
remote
remote-ext
remote-fd
repack
reset
restore
rev-parse
15 changes: 15 additions & 0 deletions t/t5300-pack-object.sh
Original file line number Diff line number Diff line change
@@ -671,4 +671,19 @@ do
'
done

# The following test is not necessarily a permanent choice, but since we do not
# have a "name hash version" bit in the .bitmap file format, we cannot write the
# full-name hash values into the .bitmap file without risking breakage later.
#
# TODO: Make these compatible in the future and replace this test with the
# expected behavior when both are specified.
test_expect_success '--full-name-hash and --write-bitmap-index are incompatible' '
test_must_fail git pack-objects base --all \
--full-name-hash --write-bitmap-index 2>err &&
grep incompatible err &&
# --stdout option silently removes --write-bitmap-index
git pack-objects --stdout --all --full-name-hash --write-bitmap-index >out
'

test_done
26 changes: 26 additions & 0 deletions t/t5310-pack-bitmaps.sh
Original file line number Diff line number Diff line change
@@ -26,6 +26,32 @@ has_any () {
grep -Ff "$1" "$2"
}

# Since name-hash values are stored in the .bitmap files, add a test
# that checks that the name-hash calculations are stable across versions.
# Not exhaustive, but these hashing algorithms would be hard to change
# without causing deviations here.
test_expect_success 'name-hash value stability' '
cat >names <<-\EOF &&
first
second
third
one-long-enough-for-collisions
two-long-enough-for-collisions
EOF
test-tool name-hash <names >out &&
cat >expect <<-\EOF &&
2582249472 3109209818 first
2289942528 3781118409 second
2300837888 3028707182 third
2544516325 3241327563 one-long-enough-for-collisions
2544516325 4207880830 two-long-enough-for-collisions
EOF
test_cmp expect out
'

test_bitmap_cases () {
writeLookupTable=false
for i in "$@"
7 changes: 6 additions & 1 deletion t/t5510-fetch.sh
Original file line number Diff line number Diff line change
@@ -1061,7 +1061,12 @@ test_expect_success 'all boundary commits are excluded' '
test_tick &&
git merge otherside &&
ad=$(git log --no-walk --format=%ad HEAD) &&
git bundle create twoside-boundary.bdl main --since="$ad" &&
# If the --full-name-hash function is used here, then no delta
# pair is found and the bundle does not expand to three objects
# when fixing the thin object.
GIT_TEST_FULL_NAME_HASH=0 \
git bundle create twoside-boundary.bdl main --since="$ad" &&
test_bundle_object_count --thin twoside-boundary.bdl 3
'

26 changes: 24 additions & 2 deletions t/t5616-partial-clone.sh
Original file line number Diff line number Diff line change
@@ -515,7 +515,18 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas' '
# Exercise to make sure it works. Git will not fetch anything from the
# promisor remote other than for the big tree (because it needs to
# resolve the delta).
GIT_TRACE_PACKET="$(pwd)/trace" git -C client \
#
# TODO: the --full-name-hash option is disabled here, since this test
# is fundamentally broken! When GIT_TEST_FULL_NAME_HASH=1, the server
# recognizes delta bases in a different way and then sends a _blob_ to
# the client with a delta base that the client does not have! This is
# because the client is cloned from "promisor-server" with tree:0 but
# is now fetching from "server" withot any filter. This is violating the
# promise to the server that all reachable objects exist and could be
# used as delta bases!
GIT_TRACE_PACKET="$(pwd)/trace" \
GIT_TEST_FULL_NAME_HASH=0 \
git -C client \
fetch "file://$(pwd)/server" main &&
# Verify the assumption that the client needed to fetch the delta base
@@ -534,7 +545,18 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas, protocol v2' '
# Exercise to make sure it works. Git will not fetch anything from the
# promisor remote other than for the big blob (because it needs to
# resolve the delta).
GIT_TRACE_PACKET="$(pwd)/trace" git -C client \
#
# TODO: the --full-name-hash option is disabled here, since this test
# is fundamentally broken! When GIT_TEST_FULL_NAME_HASH=1, the server
# recognizes delta bases in a different way and then sends a _blob_ to
# the client with a delta base that the client does not have! This is
# because the client is cloned from "promisor-server" with tree:0 but
# is now fetching from "server" withot any filter. This is violating the
# promise to the server that all reachable objects exist and could be
# used as delta bases!
GIT_TRACE_PACKET="$(pwd)/trace" \
GIT_TEST_FULL_NAME_HASH=0 \
git -C client \
fetch "file://$(pwd)/server" main &&
# Verify that protocol version 2 was used.
6 changes: 5 additions & 1 deletion t/t6020-bundle-misc.sh
Original file line number Diff line number Diff line change
@@ -246,7 +246,11 @@ test_expect_success 'create bundle with --since option' '
EOF
test_cmp expect actual &&
git bundle create since.bdl \
# If the --full-name-hash option is used, then one fewer
# delta base is found and this counts a different number
# of objects after performing --fix-thin.
GIT_TEST_FULL_NAME_HASH=0 \
git bundle create since.bdl \
--since "Thu Apr 7 15:27:00 2005 -0700" \
--all &&
7 changes: 7 additions & 0 deletions t/t7700-repack.sh
Original file line number Diff line number Diff line change
@@ -776,6 +776,13 @@ test_expect_success 'repack -ad cleans up old .tmp-* packs' '
test_must_be_empty tmpfiles
'

test_expect_success '--full-name-hash option passes through to pack-objects' '
GIT_TRACE2_EVENT="$(pwd)/full-trace.txt" \
git repack -a --full-name-hash &&
test_subcommand_flex git pack-objects --full-name-hash <full-trace.txt
'


test_expect_success 'setup for update-server-info' '
git init update-server-info &&
test_commit -C update-server-info message
27 changes: 27 additions & 0 deletions t/test-lib-functions.sh
Original file line number Diff line number Diff line change
@@ -1886,6 +1886,33 @@ test_subcommand () {
fi
}


# Check that the given subcommand was run with the given set of
# arguments in order (but with possible extra arguments).
#
# test_subcommand_flex [!] <command> <args>... < <trace>
#
# If the first parameter passed is !, this instead checks that
# the given command was not called.
#
test_subcommand_flex () {
local negate=
if test "$1" = "!"
then
negate=t
shift
fi

local expr="$(printf '"%s".*' "$@")"

if test -n "$negate"
then
! grep "\[$expr\]"
else
grep "\[$expr\]"
fi
}

# Check that the given command was invoked as part of the
# trace2-format trace on stdin.
#

0 comments on commit d8a424f

Please sign in to comment.