Merge pull request #11195 from DeterminateSystems/tarball-roots

Improve handling of tarballs that don't consist of a single top-level directory
Commit 0b96c586e0 by Eelco Dolstra, 2024-07-29 16:58:59 +02:00 (committed by GitHub)
8 changed files with 107 additions and 32 deletions

@@ -0,0 +1,9 @@
---
synopsis: "Improve handling of tarballs that don't consist of a single top-level directory"
prs:
- 11195
---
In previous Nix releases, the tarball fetcher (used by `builtins.fetchTarball`) erroneously merged top-level directories into a single directory, and silently discarded top-level files that are not directories. This is no longer the case. The new behaviour is that *only* if the tarball consists of a single directory, the top-level path component of the files in the tarball is removed (similar to `tar`'s `--strip-components=1`).
Author: [**Eelco Dolstra (@edolstra)**](https://github.com/edolstra)
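
For illustration, here is a minimal Nix sketch of the two cases described in the note above. The tarball paths (`/tmp/multi.tar.gz`, `/tmp/single.tar.gz`) and their layouts are hypothetical assumptions; only `builtins.fetchTarball` and `builtins.readDir` are real built-ins.

```nix
# Hypothetical sketch: the paths and archive layouts are assumptions,
# only builtins.fetchTarball and builtins.readDir are real.
let
  # multi.tar.gz contains two top-level directories, foo/ and bar/.
  # With this change nothing is stripped, so both are kept.
  multi = builtins.fetchTarball "file:///tmp/multi.tar.gz";

  # single.tar.gz contains exactly one top-level directory, pkg-1.0/.
  # As before, that single component is removed
  # (like `tar --strip-components=1`).
  single = builtins.fetchTarball "file:///tmp/single.tar.gz";
in {
  multiEntries = builtins.readDir multi;   # { bar = "directory"; foo = "directory"; }
  singleEntries = builtins.readDir single; # the contents of pkg-1.0/, not pkg-1.0 itself
}
```

Under the old behaviour the first case would have had its top-level directories merged into one and any top-level regular files silently dropped; now the store path mirrors the archive.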

@@ -559,11 +559,11 @@ static RegisterPrimOp primop_fetchTarball({
     .doc = R"(
       Download the specified URL, unpack it and return the path of the
       unpacked tree. The file must be a tape archive (`.tar`) compressed
-      with `gzip`, `bzip2` or `xz`. The top-level path component of the
-      files in the tarball is removed, so it is best if the tarball
-      contains a single directory at top level. The typical use of the
-      function is to obtain external Nix expression dependencies, such as
-      a particular version of Nixpkgs, e.g.
+      with `gzip`, `bzip2` or `xz`. If the tarball consists of a
+      single directory, then the top-level path component of the files
+      in the tarball is removed. The typical use of the function is to
+      obtain external Nix expression dependencies, such as a
+      particular version of Nixpkgs, e.g.
 
       ```nix
       with import (fetchTarball https://github.com/NixOS/nixpkgs/archive/nixos-14.12.tar.gz) {};

@@ -126,16 +126,39 @@ Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t typ
 }
 
 template<typename T>
-T peelObject(git_repository * repo, git_object * obj, git_object_t type)
+T peelObject(git_object * obj, git_object_t type)
 {
     T obj2;
     if (git_object_peel((git_object * *) (typename T::pointer *) Setter(obj2), obj, type)) {
         auto err = git_error_last();
-        throw Error("peeling Git object '%s': %s", git_object_id(obj), err->message);
+        throw Error("peeling Git object '%s': %s", *git_object_id(obj), err->message);
     }
     return obj2;
 }
 
+template<typename T>
+T dupObject(typename T::pointer obj)
+{
+    T obj2;
+    if (git_object_dup((git_object * *) (typename T::pointer *) Setter(obj2), (git_object *) obj))
+        throw Error("duplicating object '%s': %s", *git_object_id((git_object *) obj), git_error_last()->message);
+    return obj2;
+}
+
+/**
+ * Peel the specified object (i.e. follow tag and commit objects) to
+ * either a blob or a tree.
+ */
+static Object peelToTreeOrBlob(git_object * obj)
+{
+    /* git_object_peel() doesn't handle blob objects, so handle those
+       specially. */
+    if (git_object_type(obj) == GIT_OBJECT_BLOB)
+        return dupObject<Object>(obj);
+    else
+        return peelObject<Object>(obj, GIT_OBJECT_TREE);
+}
+
 struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
 {
     /** Location of the repository on disk. */
@@ -166,7 +189,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
         std::unordered_set<git_oid> done;
         std::queue<Commit> todo;
 
-        todo.push(peelObject<Commit>(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT));
+        todo.push(peelObject<Commit>(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT));
 
         while (auto commit = pop(todo)) {
             if (!done.insert(*git_commit_id(commit->get())).second) continue;
@@ -184,7 +207,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
     uint64_t getLastModified(const Hash & rev) override
     {
-        auto commit = peelObject<Commit>(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT);
+        auto commit = peelObject<Commit>(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT);
 
         return git_commit_time(commit.get());
     }
@@ -463,6 +486,23 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
         return narHash;
     }
 
+    Hash dereferenceSingletonDirectory(const Hash & oid_) override
+    {
+        auto oid = hashToOID(oid_);
+
+        auto _tree = lookupObject(*this, oid, GIT_OBJECT_TREE);
+        auto tree = (const git_tree *) &*_tree;
+
+        if (git_tree_entrycount(tree) == 1) {
+            auto entry = git_tree_entry_byindex(tree, 0);
+            auto mode = git_tree_entry_filemode(entry);
+            if (mode == GIT_FILEMODE_TREE)
+                oid = *git_tree_entry_id(entry);
+        }
+
+        return toHash(oid);
+    }
+
 };
 
 ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create, bool bare)
@@ -476,11 +516,11 @@ ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create,
 struct GitSourceAccessor : SourceAccessor
 {
     ref<GitRepoImpl> repo;
-    Tree root;
+    Object root;
 
     GitSourceAccessor(ref<GitRepoImpl> repo_, const Hash & rev)
         : repo(repo_)
-        , root(peelObject<Tree>(*repo, lookupObject(*repo, hashToOID(rev)).get(), GIT_OBJECT_TREE))
+        , root(peelToTreeOrBlob(lookupObject(*repo, hashToOID(rev)).get()))
     {
     }
@@ -506,7 +546,7 @@ struct GitSourceAccessor : SourceAccessor
     std::optional<Stat> maybeLstat(const CanonPath & path) override
     {
         if (path.isRoot())
-            return Stat { .type = tDirectory };
+            return Stat { .type = git_object_type(root.get()) == GIT_OBJECT_TREE ? tDirectory : tRegular };
 
         auto entry = lookup(path);
         if (!entry)
@@ -616,10 +656,10 @@ struct GitSourceAccessor : SourceAccessor
     std::optional<Tree> lookupTree(const CanonPath & path)
     {
         if (path.isRoot()) {
-            Tree tree;
-            if (git_tree_dup(Setter(tree), root.get()))
-                throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message);
-            return tree;
+            if (git_object_type(root.get()) == GIT_OBJECT_TREE)
+                return dupObject<Tree>((git_tree *) &*root);
+            else
+                return std::nullopt;
         }
 
         auto entry = lookup(path);
@@ -646,10 +686,10 @@ struct GitSourceAccessor : SourceAccessor
     std::variant<Tree, Submodule> getTree(const CanonPath & path)
     {
         if (path.isRoot()) {
-            Tree tree;
-            if (git_tree_dup(Setter(tree), root.get()))
-                throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message);
-            return tree;
+            if (git_object_type(root.get()) == GIT_OBJECT_TREE)
+                return dupObject<Tree>((git_tree *) &*root);
+            else
+                throw Error("Git root object '%s' is not a directory", *git_object_id(root.get()));
         }
 
         auto entry = need(path);
@@ -669,6 +709,9 @@ struct GitSourceAccessor : SourceAccessor
     Blob getBlob(const CanonPath & path, bool expectSymlink)
     {
+        if (!expectSymlink && git_object_type(root.get()) == GIT_OBJECT_BLOB)
+            return dupObject<Blob>((git_blob *) &*root);
+
         auto notExpected = [&]()
         {
             throw Error(
@@ -782,8 +825,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
     std::vector<PendingDir> pendingDirs;
 
-    size_t componentsToStrip = 1;
-
     void pushBuilder(std::string name)
     {
         git_treebuilder * b;
@@ -839,9 +880,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
     {
         std::span<const std::string> pathComponents2{pathComponents};
 
-        if (pathComponents2.size() <= componentsToStrip) return false;
-        pathComponents2 = pathComponents2.subspan(componentsToStrip);
-
         updateBuilders(
             isDir
             ? pathComponents2
@@ -964,7 +1002,8 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
             git_tree_entry_filemode(entry));
     }
 
-    Hash sync() override {
+    Hash sync() override
+    {
         updateBuilders({});
 
         auto [oid, _name] = popBuilder();

@@ -98,6 +98,13 @@ struct GitRepo
      * serialisation. This is memoised on-disk.
      */
     virtual Hash treeHashToNarHash(const Hash & treeHash) = 0;
+
+    /**
+     * If the specified Git object is a directory with a single entry
+     * that is a directory, return the ID of that object.
+     * Otherwise, return the passed ID unchanged.
+     */
+    virtual Hash dereferenceSingletonDirectory(const Hash & oid) = 0;
 };
 
 ref<GitRepo> getTarballCache();

@@ -258,13 +258,14 @@ struct GitArchiveInputScheme : InputScheme
             fmt("unpacking '%s' into the Git cache", input.to_string()));
 
         TarArchive archive { *source };
-        auto parseSink = getTarballCache()->getFileSystemObjectSink();
+        auto tarballCache = getTarballCache();
+        auto parseSink = tarballCache->getFileSystemObjectSink();
         auto lastModified = unpackTarfileToSink(archive, *parseSink);
 
         act.reset();
 
         TarballInfo tarballInfo {
-            .treeHash = parseSink->sync(),
+            .treeHash = tarballCache->dereferenceSingletonDirectory(parseSink->sync()),
             .lastModified = lastModified
         };

@@ -167,7 +167,8 @@ DownloadTarballResult downloadTarball(
                 TarArchive{path};
             })
         : TarArchive{*source};
-    auto parseSink = getTarballCache()->getFileSystemObjectSink();
+    auto tarballCache = getTarballCache();
+    auto parseSink = tarballCache->getFileSystemObjectSink();
     auto lastModified = unpackTarfileToSink(archive, *parseSink);
 
     act.reset();
@@ -182,7 +183,8 @@ DownloadTarballResult downloadTarball(
         infoAttrs = cached->value;
     } else {
         infoAttrs.insert_or_assign("etag", res->etag);
-        infoAttrs.insert_or_assign("treeHash", parseSink->sync().gitRev());
+        infoAttrs.insert_or_assign("treeHash",
+            tarballCache->dereferenceSingletonDirectory(parseSink->sync()).gitRev());
         infoAttrs.insert_or_assign("lastModified", uint64_t(lastModified));
         if (res->immutableUrl)
             infoAttrs.insert_or_assign("immutableUrl", *res->immutableUrl);

@@ -83,3 +83,20 @@ path="$(nix flake prefetch --json "tarball+file://$(pwd)/tree.tar.gz" | jq -r .s
 [[ $(cat "$path/a/zzz") = bar ]]
 [[ $(cat "$path/c/aap") = bar ]]
 [[ $(cat "$path/fnord") = bar ]]
+
+# Test a tarball that has multiple top-level directories.
+rm -rf "$TEST_ROOT/tar_root"
+mkdir -p "$TEST_ROOT/tar_root" "$TEST_ROOT/tar_root/foo" "$TEST_ROOT/tar_root/bar"
+tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" .
+path="$(nix flake prefetch --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)"
+[[ -d "$path/foo" ]]
+[[ -d "$path/bar" ]]
+
+# Test a tarball that has a single regular file.
+rm -rf "$TEST_ROOT/tar_root"
+mkdir -p "$TEST_ROOT/tar_root"
+echo bar > "$TEST_ROOT/tar_root/foo"
+chmod +x "$TEST_ROOT/tar_root/foo"
+tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" .
+path="$(nix flake prefetch --refresh --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)"
+[[ $(cat "$path/foo") = bar ]]

@@ -77,7 +77,7 @@ TEST_F(GitUtilsTest, sink_basic)
     // sink->createHardlink("foo-1.1/links/foo-2", CanonPath("foo-1.1/hello"));
 
-    auto result = sink->sync();
+    auto result = repo->dereferenceSingletonDirectory(sink->sync());
     auto accessor = repo->getAccessor(result, false);
     auto entries = accessor->readDirectory(CanonPath::root);
     ASSERT_EQ(entries.size(), 5);
@@ -103,7 +103,7 @@ TEST_F(GitUtilsTest, sink_hardlink)
         sink->createHardlink(CanonPath("foo-1.1/link"), CanonPath("hello"));
         FAIL() << "Expected an exception";
     } catch (const nix::Error & e) {
-        ASSERT_THAT(e.msg(), testing::HasSubstr("invalid hard link target"));
+        ASSERT_THAT(e.msg(), testing::HasSubstr("cannot find hard link target"));
         ASSERT_THAT(e.msg(), testing::HasSubstr("/hello"));
         ASSERT_THAT(e.msg(), testing::HasSubstr("foo-1.1/link"));
     }