Merge pull request #11077 from hercules-ci/support-hardlinks-in-tarballs

Support hardlinks in tarballs
This commit is contained in:
John Ericson 2024-07-11 07:10:25 -04:00 committed by GitHub
commit 426e2af6f7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 280 additions and 8 deletions

View file

@ -115,10 +115,10 @@ git_oid hashToOID(const Hash & hash)
return oid;
}
Object lookupObject(git_repository * repo, const git_oid & oid)
Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t type = GIT_OBJECT_ANY)
{
Object obj;
if (git_object_lookup(Setter(obj), repo, &oid, GIT_OBJECT_ANY)) {
if (git_object_lookup(Setter(obj), repo, &oid, type)) {
auto err = git_error_last();
throw Error("getting Git object '%s': %s", oid, err->message);
}
@ -909,6 +909,61 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK);
}
/**
 * Add `path` to the tree as a second name for the entry that `target`
 * refers to: the new entry reuses the target's object id and file mode.
 *
 * @param path   Path of the entry to create.
 * @param target Path of the existing entry to link to.
 * @throws Error if `target` cannot be resolved against the directories
 *         that are still pending, or if no entry exists at `target`.
 */
void createHardlink(const CanonPath & path, const CanonPath & target) override
{
    std::vector<std::string> pathComponents;
    for (auto & c : path)
        pathComponents.emplace_back(c);

    // NOTE(review): presumably a false return means this path must not be
    // added at all -- confirm against prepareDirs().
    if (!prepareDirs(pathComponents, false)) return;

    // We can't just look up the path from the start of the root, since
    // some parent directories may not have finished yet, so we compute
    // a relative path that helps us find the right git_tree_builder or object.
    auto relTarget = CanonPath(path).parent()->makeRelative(target);

    auto dir = pendingDirs.rbegin();

    // For each ../ component at the start, go up one directory.
    // CanonPath::makeRelative() always puts all .. elements at the start,
    // so they're all handled by this loop:
    std::string_view relTargetLeft(relTarget);
    while (hasPrefix(relTargetLeft, "../")) {
        if (dir == pendingDirs.rend())
            throw Error("invalid hard link target '%s' for path '%s'", target, path);
        ++dir;
        relTargetLeft = relTargetLeft.substr(3);
    }
    if (dir == pendingDirs.rend())
        throw Error("invalid hard link target '%s' for path '%s'", target, path);

    // Look up the remainder of the target, starting at the
    // top-most `git_treebuilder`.
    std::variant<git_treebuilder *, git_oid> curDir{dir->builder.get()};
    Object tree; // needed to keep `entry` alive
    const git_tree_entry * entry = nullptr;

    for (auto & c : CanonPath(relTargetLeft)) {
        const std::string name(c);

        if (auto builder = std::get_if<git_treebuilder *>(&curDir)) {
            // First step only: the directory is still being built.
            assert(*builder);
            entry = git_treebuilder_get(*builder, name.c_str());
        } else if (auto oid = std::get_if<git_oid>(&curDir)) {
            // Later steps: descend into a tree object that was already written.
            // lookupObject() was asked for GIT_OBJECT_TREE, so the object it
            // returns can be viewed as a `git_tree`.
            tree = lookupObject(*repo, *oid, GIT_OBJECT_TREE);
            entry = git_tree_entry_byname(reinterpret_cast<const git_tree *>(&*tree), name.c_str());
        }

        if (!entry)
            throw Error("cannot find hard link target '%s' for path '%s'", target, path);

        // Copy the id out of the entry; the next step continues from it.
        curDir = *git_tree_entry_id(entry);
    }

    // `entry` is still null if the loop above never ran, i.e. the remaining
    // relative target contained no path components. The target is supplied
    // by the caller (for tarballs, by the archive itself), so report it as
    // an error instead of asserting.
    if (!entry)
        throw Error("invalid hard link target '%s' for path '%s'", target, path);

    addToTree(*pathComponents.rbegin(),
        *git_tree_entry_id(entry),
        git_tree_entry_filemode(entry));
}
Hash sync() override {
updateBuilders({});

View file

@ -7,7 +7,7 @@ namespace nix {
namespace fetchers { struct PublicKey; }
struct GitFileSystemObjectSink : FileSystemObjectSink
struct GitFileSystemObjectSink : ExtendedFileSystemObjectSink
{
/**
* Flush builder and return a final Git hash.

View file

@ -41,6 +41,19 @@ struct FileSystemObjectSink
virtual void createSymlink(const CanonPath & path, const std::string & target) = 0;
};
/**
* An extension of `FileSystemObjectSink` that supports file types
* that are not supported by Nix's FSO model.
*
* At present the only such file type is the hard link.
*/
struct ExtendedFileSystemObjectSink : virtual FileSystemObjectSink
{
/**
* Create a hard link. The target must be the path of a previously
* encountered file relative to the root of the FSO.
*
* @param path Path at which the link is created.
* @param target Path of the already-encountered file that `path` links to.
*/
virtual void createHardlink(const CanonPath & path, const CanonPath & target) = 0;
};
/**
* Recursively copy file system objects from the source into the sink.
*/

View file

@ -174,7 +174,7 @@ void unpackTarfile(const Path & tarFile, const Path & destDir)
extract_archive(archive, destDir);
}
time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink)
time_t unpackTarfileToSink(TarArchive & archive, ExtendedFileSystemObjectSink & parseSink)
{
time_t lastModified = 0;
@ -195,7 +195,12 @@ time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSin
lastModified = std::max(lastModified, archive_entry_mtime(entry));
switch (archive_entry_filetype(entry)) {
if (auto target = archive_entry_hardlink(entry)) {
parseSink.createHardlink(cpath, CanonPath(target));
continue;
}
switch (auto type = archive_entry_filetype(entry)) {
case AE_IFDIR:
parseSink.createDirectory(cpath);
@ -232,7 +237,7 @@ time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSin
}
default:
throw Error("file '%s' in tarball has unsupported file type", path);
throw Error("file '%s' in tarball has unsupported file type %d", path, type);
}
}

View file

@ -41,6 +41,6 @@ void unpackTarfile(Source & source, const Path & destDir);
void unpackTarfile(const Path & tarFile, const Path & destDir);
time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink);
time_t unpackTarfileToSink(TarArchive & archive, ExtendedFileSystemObjectSink & parseSink);
}

View file

@ -71,3 +71,15 @@ test_tarball() {
test_tarball '' cat
test_tarball .xz xz
test_tarball .gz gzip
# Test hard links.
# Every entry in tree.tar.gz refers to one and the same file; GNU tar unpacks them all with the same inode.
# We don't preserve the hard links, because that's an optimization we think is not worth the complexity,
# so we only check that each path ends up with the right contents.
path="$(nix flake prefetch --json "tarball+file://$(pwd)/tree.tar.gz" | jq -r .storePath)"
for linked in a/b/foo a/b/xyzzy a/yyy a/zzz c/aap fnord; do
    [[ $(cat "$path/$linked") = bar ]]
done

Binary file not shown.

View file

@ -0,0 +1,112 @@
#include "git-utils.hh"
#include "file-system.hh"
#include "gmock/gmock.h"
#include <git2/global.h>
#include <git2/repository.h>
#include <git2/types.h>
#include <gtest/gtest.h>
#include "fs-sink.hh"
#include "serialise.hh"
namespace nix {
/**
 * Fixture that gives each test a temporary directory holding a freshly
 * initialised Git repository.
 */
class GitUtilsTest : public ::testing::Test
{
    // Directory containing the repository used by the current test.
    Path tmpDir;
    // Removes `tmpDir` again once it is destroyed.
    std::unique_ptr<AutoDelete> delTmpDir;

public:
    void SetUp() override
    {
        tmpDir = createTempDir();
        delTmpDir = std::make_unique<AutoDelete>(tmpDir, true);

        // Initialise the repository directly through libgit2; the handle is
        // only needed for the duration of the call.
        git_libgit2_init();
        git_repository * rawRepo = nullptr;
        const int status = git_repository_init(&rawRepo, tmpDir.c_str(), 0);
        ASSERT_EQ(status, 0);
        git_repository_free(rawRepo);
    }

    void TearDown() override
    {
        // This is std::unique_ptr::reset(): it destroys the AutoDelete and
        // thereby triggers the removal. It is not AutoDelete::reset(), which
        // would cancel the deletion.
        delTmpDir.reset();
    }

    /**
     * Open the repository that SetUp() created.
     *
     * NOTE(review): the two boolean arguments are presumably "create" and
     * "bare" -- confirm against GitRepo::openRepo().
     */
    ref<GitRepo> openRepo()
    {
        return GitRepo::openRepo(tmpDir, true, false);
    }
};
/**
 * Feed `contents` to `fileSink` as the body of a regular file.
 *
 * @param fileSink   Sink for the file being created.
 * @param contents   Bytes to write.
 * @param executable If true, `isExecutable()` is called on the sink first.
 */
void writeString(CreateRegularFileSink & fileSink, std::string contents, bool executable)
{
    if (executable) {
        fileSink.isExecutable();
    }

    const auto length = contents.size();
    fileSink.preallocateContents(length);
    fileSink(contents);
}
/**
 * Write a small tree (regular files, a symlink, an empty directory and a
 * hard link) through the sink, then read it back through an accessor.
 */
TEST_F(GitUtilsTest, sink_basic)
{
    auto repo = openRepo();
    auto sink = repo->getFileSystemObjectSink();

    // TODO/Question: It seems a little odd that we use the tarball-like
    //                convention of requiring a top-level directory here.
    //                The sync method does not document this behavior and
    //                should probably be renamed, because it's not very
    //                general; I can't imagine "non-conventional" archives or
    //                any other source being handled by this sink.

    sink->createDirectory(CanonPath("foo-1.1"));

    sink->createRegularFile(CanonPath("foo-1.1/hello"), [](CreateRegularFileSink & fileSink) {
        writeString(fileSink, "hello world", false);
    });
    sink->createRegularFile(CanonPath("foo-1.1/bye"), [](CreateRegularFileSink & fileSink) {
        writeString(fileSink, "thanks for all the fish", false);
    });
    sink->createSymlink(CanonPath("foo-1.1/bye-link"), "bye");
    sink->createDirectory(CanonPath("foo-1.1/empty"));
    sink->createDirectory(CanonPath("foo-1.1/links"));
    sink->createHardlink(CanonPath("foo-1.1/links/foo"), CanonPath("foo-1.1/hello"));

    // sink->createHardlink("foo-1.1/links/foo-2", CanonPath("foo-1.1/hello"));

    auto result = sink->sync();
    auto accessor = repo->getAccessor(result, false);

    // The paths below no longer carry the "foo-1.1" prefix used when writing.
    // Unsigned literals keep the size comparisons free of signed/unsigned
    // mismatches.
    auto entries = accessor->readDirectory(CanonPath::root);
    ASSERT_EQ(entries.size(), 5u);
    ASSERT_EQ(accessor->readFile(CanonPath("hello")), "hello world");
    ASSERT_EQ(accessor->readFile(CanonPath("bye")), "thanks for all the fish");
    ASSERT_EQ(accessor->readLink(CanonPath("bye-link")), "bye");
    ASSERT_EQ(accessor->readDirectory(CanonPath("empty")).size(), 0u);
    // The hard link must yield the contents of its target.
    ASSERT_EQ(accessor->readFile(CanonPath("links/foo")), "hello world");
}
/**
 * A hard link whose target was never written must be rejected.
 *
 * The only file written is "foo-1.1/hello"; the link below points at
 * "hello" instead.
 */
TEST_F(GitUtilsTest, sink_hardlink)
{
    auto repo = openRepo();
    auto sink = repo->getFileSystemObjectSink();

    sink->createDirectory(CanonPath("foo-1.1"));
    sink->createRegularFile(CanonPath("foo-1.1/hello"), [](CreateRegularFileSink & out) {
        writeString(out, "hello world", false);
    });

    try {
        sink->createHardlink(CanonPath("foo-1.1/link"), CanonPath("hello"));
        FAIL() << "Expected an exception";
    } catch (const nix::Error & e) {
        // The message has to name the problem as well as both paths involved.
        const auto & message = e.msg();
        ASSERT_THAT(message, testing::HasSubstr("invalid hard link target"));
        ASSERT_THAT(message, testing::HasSubstr("/hello"));
        ASSERT_THAT(message, testing::HasSubstr("foo-1.1/link"));
    }
}
} // namespace nix

View file

@ -29,7 +29,7 @@ libfetchers-tests_LIBS = \
libstore-test-support libutil-test-support \
libfetchers libstore libutil
libfetchers-tests_LDFLAGS := -lrapidcheck $(GTEST_LIBS)
libfetchers-tests_LDFLAGS := -lrapidcheck $(GTEST_LIBS) $(LIBGIT2_LIBS)
ifdef HOST_WINDOWS
# Increase the default reserved stack size to 65 MB so Nix doesn't run out of space

View file

@ -0,0 +1,34 @@
#include <iostream>
#include "tracing-file-system-object-sink.hh"
namespace nix::test {
/**
 * Write the call and its path to stderr, then forward it to the wrapped sink.
 */
void TracingFileSystemObjectSink::createDirectory(const CanonPath & path)
{
    std::cerr << "createDirectory(" << path << ")" << '\n';

    sink.createDirectory(path);
}
void TracingFileSystemObjectSink::createRegularFile(
const CanonPath & path, std::function<void(CreateRegularFileSink &)> fn)
{
std::cerr << "createRegularFile(" << path << ")\n";
sink.createRegularFile(path, [&](CreateRegularFileSink & crf) {
// We could wrap this and trace about the chunks of data and such
fn(crf);
});
}
/**
 * Write the call, its path and the link target to stderr, then forward it
 * to the wrapped sink.
 */
void TracingFileSystemObjectSink::createSymlink(const CanonPath & path, const std::string & target)
{
    std::cerr << "createSymlink(" << path;
    std::cerr << ", target: " << target << ")" << '\n';

    sink.createSymlink(path, target);
}
/**
 * Write the call, its path and the link target to stderr, then forward it
 * to the wrapped sink.
 */
void TracingExtendedFileSystemObjectSink::createHardlink(const CanonPath & path, const CanonPath & target)
{
    std::cerr << "createHardlink(" << path;
    std::cerr << ", target: " << target << ")" << '\n';

    sink.createHardlink(path, target);
}
} // namespace nix::test

View file

@ -0,0 +1,41 @@
#pragma once
#include "fs-sink.hh"
namespace nix::test {
/**
 * A `FileSystemObjectSink` decorator: each call is traced on stderr and
 * then passed on to the sink it wraps.
 */
class TracingFileSystemObjectSink : public virtual FileSystemObjectSink
{
    /** The wrapped sink. Held by reference, so it must outlive this object. */
    FileSystemObjectSink & sink;

public:
    /**
     * @param sink The sink that receives every call after it has been traced.
     */
    TracingFileSystemObjectSink(FileSystemObjectSink & sink) : sink(sink) {}

    void createDirectory(const CanonPath & path) override;

    void createRegularFile(const CanonPath & path, std::function<void(CreateRegularFileSink &)> fn) override;

    void createSymlink(const CanonPath & path, const std::string & target) override;
};
/**
 * An `ExtendedFileSystemObjectSink` decorator: on top of what
 * `TracingFileSystemObjectSink` traces, hard link creation is traced on
 * stderr and passed on to the wrapped sink.
 */
class TracingExtendedFileSystemObjectSink : public TracingFileSystemObjectSink, public ExtendedFileSystemObjectSink
{
    /**
     * The wrapped sink, kept with its extended type so that createHardlink()
     * can be forwarded. The base class holds its own reference to the same
     * object.
     */
    ExtendedFileSystemObjectSink & sink;

public:
    /**
     * @param sink The sink that receives every call after it has been traced.
     */
    TracingExtendedFileSystemObjectSink(ExtendedFileSystemObjectSink & sink)
        : TracingFileSystemObjectSink(sink), sink(sink)
    {
    }

    void createHardlink(const CanonPath & path, const CanonPath & target) override;
};
}