Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/manual/rl-next/tarball-fixes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
synopsis: "Improve handling of tarballs that don't consist of a single top-level directory"
prs:
- 11195
---

In previous Nix releases, the tarball fetcher (used by `builtins.fetchTarball`) erroneously merged top-level directories into a single directory, and silently discarded top-level files that are not directories. This is no longer the case. The new behaviour is that *only* if the tarball consists of a single directory, the top-level path component of the files in the tarball is removed (similar to `tar`'s `--strip-components=1`).

Author: [**Eelco Dolstra (@edolstra)**](https://github.com/edolstra)
10 changes: 5 additions & 5 deletions src/libexpr/primops/fetchTree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -559,11 +559,11 @@ static RegisterPrimOp primop_fetchTarball({
.doc = R"(
Download the specified URL, unpack it and return the path of the
unpacked tree. The file must be a tape archive (`.tar`) compressed
with `gzip`, `bzip2` or `xz`. The top-level path component of the
files in the tarball is removed, so it is best if the tarball
contains a single directory at top level. The typical use of the
function is to obtain external Nix expression dependencies, such as
a particular version of Nixpkgs, e.g.
with `gzip`, `bzip2` or `xz`. If the tarball consists of a
single directory, then the top-level path component of the files
in the tarball is removed. The typical use of the function is to
obtain external Nix expression dependencies, such as a
particular version of Nixpkgs, e.g.

```nix
with import (fetchTarball https://github.com/NixOS/nixpkgs/archive/nixos-14.12.tar.gz) {};
Expand Down
81 changes: 60 additions & 21 deletions src/libfetchers/git-utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,39 @@ Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t typ
}

template<typename T>
T peelObject(git_repository * repo, git_object * obj, git_object_t type)
T peelObject(git_object * obj, git_object_t type)
{
T obj2;
if (git_object_peel((git_object * *) (typename T::pointer *) Setter(obj2), obj, type)) {
auto err = git_error_last();
throw Error("peeling Git object '%s': %s", git_object_id(obj), err->message);
throw Error("peeling Git object '%s': %s", *git_object_id(obj), err->message);
}
return obj2;
}

template<typename T>
T dupObject(typename T::pointer obj)
{
T obj2;
if (git_object_dup((git_object * *) (typename T::pointer *) Setter(obj2), (git_object *) obj))
throw Error("duplicating object '%s': %s", *git_object_id((git_object *) obj), git_error_last()->message);
return obj2;
}

/**
* Peel the specified object (i.e. follow tag and commit objects) to
* either a blob or a tree.
*/
static Object peelToTreeOrBlob(git_object * obj)
{
/* git_object_peel() doesn't handle blob objects, so handle those
specially. */
if (git_object_type(obj) == GIT_OBJECT_BLOB)
return dupObject<Object>(obj);
else
return peelObject<Object>(obj, GIT_OBJECT_TREE);
}

struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
{
/** Location of the repository on disk. */
Expand Down Expand Up @@ -166,7 +189,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
std::unordered_set<git_oid> done;
std::queue<Commit> todo;

todo.push(peelObject<Commit>(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT));
todo.push(peelObject<Commit>(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT));

while (auto commit = pop(todo)) {
if (!done.insert(*git_commit_id(commit->get())).second) continue;
Expand All @@ -184,7 +207,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>

uint64_t getLastModified(const Hash & rev) override
{
auto commit = peelObject<Commit>(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT);
auto commit = peelObject<Commit>(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT);

return git_commit_time(commit.get());
}
Expand Down Expand Up @@ -463,6 +486,23 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>

return narHash;
}

Hash dereferenceSingletonDirectory(const Hash & oid_) override
{
auto oid = hashToOID(oid_);

auto _tree = lookupObject(*this, oid, GIT_OBJECT_TREE);
auto tree = (const git_tree *) &*_tree;

if (git_tree_entrycount(tree) == 1) {
auto entry = git_tree_entry_byindex(tree, 0);
auto mode = git_tree_entry_filemode(entry);
if (mode == GIT_FILEMODE_TREE)
oid = *git_tree_entry_id(entry);
}

return toHash(oid);
}
};

ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create, bool bare)
Expand All @@ -476,11 +516,11 @@ ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create,
struct GitSourceAccessor : SourceAccessor
{
ref<GitRepoImpl> repo;
Tree root;
Object root;

GitSourceAccessor(ref<GitRepoImpl> repo_, const Hash & rev)
: repo(repo_)
, root(peelObject<Tree>(*repo, lookupObject(*repo, hashToOID(rev)).get(), GIT_OBJECT_TREE))
, root(peelToTreeOrBlob(lookupObject(*repo, hashToOID(rev)).get()))
{
}

Expand All @@ -506,7 +546,7 @@ struct GitSourceAccessor : SourceAccessor
std::optional<Stat> maybeLstat(const CanonPath & path) override
{
if (path.isRoot())
return Stat { .type = tDirectory };
return Stat { .type = git_object_type(root.get()) == GIT_OBJECT_TREE ? tDirectory : tRegular };

auto entry = lookup(path);
if (!entry)
Expand Down Expand Up @@ -616,10 +656,10 @@ struct GitSourceAccessor : SourceAccessor
std::optional<Tree> lookupTree(const CanonPath & path)
{
if (path.isRoot()) {
Tree tree;
if (git_tree_dup(Setter(tree), root.get()))
throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message);
return tree;
if (git_object_type(root.get()) == GIT_OBJECT_TREE)
return dupObject<Tree>((git_tree *) &*root);
else
return std::nullopt;
}

auto entry = lookup(path);
Expand All @@ -646,10 +686,10 @@ struct GitSourceAccessor : SourceAccessor
std::variant<Tree, Submodule> getTree(const CanonPath & path)
{
if (path.isRoot()) {
Tree tree;
if (git_tree_dup(Setter(tree), root.get()))
throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message);
return tree;
if (git_object_type(root.get()) == GIT_OBJECT_TREE)
return dupObject<Tree>((git_tree *) &*root);
else
throw Error("Git root object '%s' is not a directory", *git_object_id(root.get()));
}

auto entry = need(path);
Expand All @@ -669,6 +709,9 @@ struct GitSourceAccessor : SourceAccessor

Blob getBlob(const CanonPath & path, bool expectSymlink)
{
if (!expectSymlink && git_object_type(root.get()) == GIT_OBJECT_BLOB)
return dupObject<Blob>((git_blob *) &*root);

auto notExpected = [&]()
{
throw Error(
Expand Down Expand Up @@ -782,8 +825,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink

std::vector<PendingDir> pendingDirs;

size_t componentsToStrip = 1;

void pushBuilder(std::string name)
{
git_treebuilder * b;
Expand Down Expand Up @@ -839,9 +880,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
{
std::span<const std::string> pathComponents2{pathComponents};

if (pathComponents2.size() <= componentsToStrip) return false;
pathComponents2 = pathComponents2.subspan(componentsToStrip);

updateBuilders(
isDir
? pathComponents2
Expand Down Expand Up @@ -964,7 +1002,8 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
git_tree_entry_filemode(entry));
}

Hash sync() override {
Hash sync() override
{
updateBuilders({});

auto [oid, _name] = popBuilder();
Expand Down
7 changes: 7 additions & 0 deletions src/libfetchers/git-utils.hh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ struct GitRepo
* serialisation. This is memoised on-disk.
*/
virtual Hash treeHashToNarHash(const Hash & treeHash) = 0;

/**
* If the specified Git object is a directory with a single entry
* that is a directory, return the ID of that object.
* Otherwise, return the passed ID unchanged.
*/
virtual Hash dereferenceSingletonDirectory(const Hash & oid) = 0;
};

ref<GitRepo> getTarballCache();
Expand Down
5 changes: 3 additions & 2 deletions src/libfetchers/github.cc
Original file line number Diff line number Diff line change
Expand Up @@ -255,11 +255,12 @@ struct GitArchiveInputScheme : InputScheme
});

TarArchive archive { *source };
auto parseSink = getTarballCache()->getFileSystemObjectSink();
auto tarballCache = getTarballCache();
auto parseSink = tarballCache->getFileSystemObjectSink();
auto lastModified = unpackTarfileToSink(archive, *parseSink);

TarballInfo tarballInfo {
.treeHash = parseSink->sync(),
.treeHash = tarballCache->dereferenceSingletonDirectory(parseSink->sync()),
.lastModified = lastModified
};

Expand Down
6 changes: 4 additions & 2 deletions src/libfetchers/tarball.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ DownloadTarballResult downloadTarball(
TarArchive{path};
})
: TarArchive{*source};
auto parseSink = getTarballCache()->getFileSystemObjectSink();
auto tarballCache = getTarballCache();
auto parseSink = tarballCache->getFileSystemObjectSink();
auto lastModified = unpackTarfileToSink(archive, *parseSink);

auto res(_res->lock());
Expand All @@ -177,7 +178,8 @@ DownloadTarballResult downloadTarball(
infoAttrs = cached->value;
} else {
infoAttrs.insert_or_assign("etag", res->etag);
infoAttrs.insert_or_assign("treeHash", parseSink->sync().gitRev());
infoAttrs.insert_or_assign("treeHash",
tarballCache->dereferenceSingletonDirectory(parseSink->sync()).gitRev());
infoAttrs.insert_or_assign("lastModified", uint64_t(lastModified));
if (res->immutableUrl)
infoAttrs.insert_or_assign("immutableUrl", *res->immutableUrl);
Expand Down
17 changes: 17 additions & 0 deletions tests/functional/tarball.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,20 @@ path="$(nix flake prefetch --json "tarball+file://$(pwd)/tree.tar.gz" | jq -r .s
[[ $(cat "$path/a/zzz") = bar ]]
[[ $(cat "$path/c/aap") = bar ]]
[[ $(cat "$path/fnord") = bar ]]

# Test a tarball that has multiple top-level directories.
rm -rf "$TEST_ROOT/tar_root"
mkdir -p "$TEST_ROOT/tar_root" "$TEST_ROOT/tar_root/foo" "$TEST_ROOT/tar_root/bar"
tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" .
path="$(nix flake prefetch --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)"
[[ -d "$path/foo" ]]
[[ -d "$path/bar" ]]

# Test a tarball that has a single regular file.
rm -rf "$TEST_ROOT/tar_root"
mkdir -p "$TEST_ROOT/tar_root"
echo bar > "$TEST_ROOT/tar_root/foo"
chmod +x "$TEST_ROOT/tar_root/foo"
tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" .
path="$(nix flake prefetch --refresh --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)"
[[ $(cat "$path/foo") = bar ]]
4 changes: 2 additions & 2 deletions tests/unit/libfetchers/git-utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ TEST_F(GitUtilsTest, sink_basic)

// sink->createHardlink("foo-1.1/links/foo-2", CanonPath("foo-1.1/hello"));

auto result = sink->sync();
auto result = repo->dereferenceSingletonDirectory(sink->sync());
auto accessor = repo->getAccessor(result, false);
auto entries = accessor->readDirectory(CanonPath::root);
ASSERT_EQ(entries.size(), 5);
Expand All @@ -103,7 +103,7 @@ TEST_F(GitUtilsTest, sink_hardlink)
sink->createHardlink(CanonPath("foo-1.1/link"), CanonPath("hello"));
FAIL() << "Expected an exception";
} catch (const nix::Error & e) {
ASSERT_THAT(e.msg(), testing::HasSubstr("invalid hard link target"));
ASSERT_THAT(e.msg(), testing::HasSubstr("cannot find hard link target"));
ASSERT_THAT(e.msg(), testing::HasSubstr("/hello"));
ASSERT_THAT(e.msg(), testing::HasSubstr("foo-1.1/link"));
}
Expand Down