Merge pull request #11195 from DeterminateSystems/tarball-roots

Improve handling of tarballs that don't consist of a single top-level directory
This commit is contained in:
Eelco Dolstra 2024-07-29 16:58:59 +02:00 committed by GitHub
commit 0b96c586e0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 107 additions and 32 deletions

View file

@ -0,0 +1,9 @@
---
synopsis: "Improve handling of tarballs that don't consist of a single top-level directory"
prs:
- 11195
---
In previous Nix releases, the tarball fetcher (used by `builtins.fetchTarball`) erroneously merged top-level directories into a single directory, and silently discarded top-level files that are not directories. This is no longer the case. The new behaviour is that *only* if the tarball consists of a single directory, the top-level path component of the files in the tarball is removed (similar to `tar`'s `--strip-components=1`).
Author: [**Eelco Dolstra (@edolstra)**](https://github.com/edolstra)

View file

@ -559,11 +559,11 @@ static RegisterPrimOp primop_fetchTarball({
.doc = R"(
Download the specified URL, unpack it and return the path of the
unpacked tree. The file must be a tape archive (`.tar`) compressed
with `gzip`, `bzip2` or `xz`. The top-level path component of the
files in the tarball is removed, so it is best if the tarball
contains a single directory at top level. The typical use of the
function is to obtain external Nix expression dependencies, such as
a particular version of Nixpkgs, e.g.
with `gzip`, `bzip2` or `xz`. If the tarball consists of a
single directory, then the top-level path component of the files
in the tarball is removed. The typical use of the function is to
obtain external Nix expression dependencies, such as a
particular version of Nixpkgs, e.g.
```nix
with import (fetchTarball https://github.com/NixOS/nixpkgs/archive/nixos-14.12.tar.gz) {};

View file

@ -126,16 +126,39 @@ Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t typ
}
template<typename T>
T peelObject(git_repository * repo, git_object * obj, git_object_t type)
T peelObject(git_object * obj, git_object_t type)
{
T obj2;
if (git_object_peel((git_object * *) (typename T::pointer *) Setter(obj2), obj, type)) {
auto err = git_error_last();
throw Error("peeling Git object '%s': %s", git_object_id(obj), err->message);
throw Error("peeling Git object '%s': %s", *git_object_id(obj), err->message);
}
return obj2;
}
template<typename T>
T dupObject(typename T::pointer obj)
{
T obj2;
if (git_object_dup((git_object * *) (typename T::pointer *) Setter(obj2), (git_object *) obj))
throw Error("duplicating object '%s': %s", *git_object_id((git_object *) obj), git_error_last()->message);
return obj2;
}
/**
* Peel the specified object (i.e. follow tag and commit objects) to
* either a blob or a tree.
*/
static Object peelToTreeOrBlob(git_object * obj)
{
/* git_object_peel() doesn't handle blob objects, so handle those
specially. */
if (git_object_type(obj) == GIT_OBJECT_BLOB)
return dupObject<Object>(obj);
else
return peelObject<Object>(obj, GIT_OBJECT_TREE);
}
struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
{
/** Location of the repository on disk. */
@ -166,7 +189,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
std::unordered_set<git_oid> done;
std::queue<Commit> todo;
todo.push(peelObject<Commit>(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT));
todo.push(peelObject<Commit>(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT));
while (auto commit = pop(todo)) {
if (!done.insert(*git_commit_id(commit->get())).second) continue;
@ -184,7 +207,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
uint64_t getLastModified(const Hash & rev) override
{
auto commit = peelObject<Commit>(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT);
auto commit = peelObject<Commit>(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT);
return git_commit_time(commit.get());
}
@ -463,6 +486,23 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
return narHash;
}
Hash dereferenceSingletonDirectory(const Hash & oid_) override
{
auto oid = hashToOID(oid_);
auto _tree = lookupObject(*this, oid, GIT_OBJECT_TREE);
auto tree = (const git_tree *) &*_tree;
if (git_tree_entrycount(tree) == 1) {
auto entry = git_tree_entry_byindex(tree, 0);
auto mode = git_tree_entry_filemode(entry);
if (mode == GIT_FILEMODE_TREE)
oid = *git_tree_entry_id(entry);
}
return toHash(oid);
}
};
ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create, bool bare)
@ -476,11 +516,11 @@ ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create,
struct GitSourceAccessor : SourceAccessor
{
ref<GitRepoImpl> repo;
Tree root;
Object root;
GitSourceAccessor(ref<GitRepoImpl> repo_, const Hash & rev)
: repo(repo_)
, root(peelObject<Tree>(*repo, lookupObject(*repo, hashToOID(rev)).get(), GIT_OBJECT_TREE))
, root(peelToTreeOrBlob(lookupObject(*repo, hashToOID(rev)).get()))
{
}
@ -506,7 +546,7 @@ struct GitSourceAccessor : SourceAccessor
std::optional<Stat> maybeLstat(const CanonPath & path) override
{
if (path.isRoot())
return Stat { .type = tDirectory };
return Stat { .type = git_object_type(root.get()) == GIT_OBJECT_TREE ? tDirectory : tRegular };
auto entry = lookup(path);
if (!entry)
@ -616,10 +656,10 @@ struct GitSourceAccessor : SourceAccessor
std::optional<Tree> lookupTree(const CanonPath & path)
{
if (path.isRoot()) {
Tree tree;
if (git_tree_dup(Setter(tree), root.get()))
throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message);
return tree;
if (git_object_type(root.get()) == GIT_OBJECT_TREE)
return dupObject<Tree>((git_tree *) &*root);
else
return std::nullopt;
}
auto entry = lookup(path);
@ -646,10 +686,10 @@ struct GitSourceAccessor : SourceAccessor
std::variant<Tree, Submodule> getTree(const CanonPath & path)
{
if (path.isRoot()) {
Tree tree;
if (git_tree_dup(Setter(tree), root.get()))
throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message);
return tree;
if (git_object_type(root.get()) == GIT_OBJECT_TREE)
return dupObject<Tree>((git_tree *) &*root);
else
throw Error("Git root object '%s' is not a directory", *git_object_id(root.get()));
}
auto entry = need(path);
@ -669,6 +709,9 @@ struct GitSourceAccessor : SourceAccessor
Blob getBlob(const CanonPath & path, bool expectSymlink)
{
if (!expectSymlink && git_object_type(root.get()) == GIT_OBJECT_BLOB)
return dupObject<Blob>((git_blob *) &*root);
auto notExpected = [&]()
{
throw Error(
@ -782,8 +825,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
std::vector<PendingDir> pendingDirs;
size_t componentsToStrip = 1;
void pushBuilder(std::string name)
{
git_treebuilder * b;
@ -839,9 +880,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
{
std::span<const std::string> pathComponents2{pathComponents};
if (pathComponents2.size() <= componentsToStrip) return false;
pathComponents2 = pathComponents2.subspan(componentsToStrip);
updateBuilders(
isDir
? pathComponents2
@ -964,7 +1002,8 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
git_tree_entry_filemode(entry));
}
Hash sync() override {
Hash sync() override
{
updateBuilders({});
auto [oid, _name] = popBuilder();

View file

@ -98,6 +98,13 @@ struct GitRepo
* serialisation. This is memoised on-disk.
*/
virtual Hash treeHashToNarHash(const Hash & treeHash) = 0;
/**
* If the specified Git object is a directory with a single entry
* that is a directory, return the ID of that object.
* Otherwise, return the passed ID unchanged.
*/
virtual Hash dereferenceSingletonDirectory(const Hash & oid) = 0;
};
ref<GitRepo> getTarballCache();

View file

@ -258,13 +258,14 @@ struct GitArchiveInputScheme : InputScheme
fmt("unpacking '%s' into the Git cache", input.to_string()));
TarArchive archive { *source };
auto parseSink = getTarballCache()->getFileSystemObjectSink();
auto tarballCache = getTarballCache();
auto parseSink = tarballCache->getFileSystemObjectSink();
auto lastModified = unpackTarfileToSink(archive, *parseSink);
act.reset();
TarballInfo tarballInfo {
.treeHash = parseSink->sync(),
.treeHash = tarballCache->dereferenceSingletonDirectory(parseSink->sync()),
.lastModified = lastModified
};

View file

@ -167,7 +167,8 @@ DownloadTarballResult downloadTarball(
TarArchive{path};
})
: TarArchive{*source};
auto parseSink = getTarballCache()->getFileSystemObjectSink();
auto tarballCache = getTarballCache();
auto parseSink = tarballCache->getFileSystemObjectSink();
auto lastModified = unpackTarfileToSink(archive, *parseSink);
act.reset();
@ -182,7 +183,8 @@ DownloadTarballResult downloadTarball(
infoAttrs = cached->value;
} else {
infoAttrs.insert_or_assign("etag", res->etag);
infoAttrs.insert_or_assign("treeHash", parseSink->sync().gitRev());
infoAttrs.insert_or_assign("treeHash",
tarballCache->dereferenceSingletonDirectory(parseSink->sync()).gitRev());
infoAttrs.insert_or_assign("lastModified", uint64_t(lastModified));
if (res->immutableUrl)
infoAttrs.insert_or_assign("immutableUrl", *res->immutableUrl);

View file

@ -83,3 +83,20 @@ path="$(nix flake prefetch --json "tarball+file://$(pwd)/tree.tar.gz" | jq -r .s
[[ $(cat "$path/a/zzz") = bar ]]
[[ $(cat "$path/c/aap") = bar ]]
[[ $(cat "$path/fnord") = bar ]]
# Test a tarball that has multiple top-level directories.
rm -rf "$TEST_ROOT/tar_root"
mkdir -p "$TEST_ROOT/tar_root" "$TEST_ROOT/tar_root/foo" "$TEST_ROOT/tar_root/bar"
tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" .
path="$(nix flake prefetch --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)"
[[ -d "$path/foo" ]]
[[ -d "$path/bar" ]]
# Test a tarball that has a single regular file.
rm -rf "$TEST_ROOT/tar_root"
mkdir -p "$TEST_ROOT/tar_root"
echo bar > "$TEST_ROOT/tar_root/foo"
chmod +x "$TEST_ROOT/tar_root/foo"
tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" .
path="$(nix flake prefetch --refresh --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)"
[[ $(cat "$path/foo") = bar ]]

View file

@ -77,7 +77,7 @@ TEST_F(GitUtilsTest, sink_basic)
// sink->createHardlink("foo-1.1/links/foo-2", CanonPath("foo-1.1/hello"));
auto result = sink->sync();
auto result = repo->dereferenceSingletonDirectory(sink->sync());
auto accessor = repo->getAccessor(result, false);
auto entries = accessor->readDirectory(CanonPath::root);
ASSERT_EQ(entries.size(), 5);
@ -103,7 +103,7 @@ TEST_F(GitUtilsTest, sink_hardlink)
sink->createHardlink(CanonPath("foo-1.1/link"), CanonPath("hello"));
FAIL() << "Expected an exception";
} catch (const nix::Error & e) {
ASSERT_THAT(e.msg(), testing::HasSubstr("invalid hard link target"));
ASSERT_THAT(e.msg(), testing::HasSubstr("cannot find hard link target"));
ASSERT_THAT(e.msg(), testing::HasSubstr("/hello"));
ASSERT_THAT(e.msg(), testing::HasSubstr("foo-1.1/link"));
}