Merge pull request #9485 from edolstra/tarball-cache

Add a Git-based content-addressed tarball cache
This commit is contained in:
Eelco Dolstra 2024-02-15 22:37:57 +01:00 committed by GitHub
commit 06be819b89
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 345 additions and 44 deletions

View file

@ -104,4 +104,9 @@ std::map<std::string, std::string> attrsToQuery(const Attrs & attrs)
return query; return query;
} }
/**
 * Fetch the string attribute `name` from `attrs` and parse it as a
 * SHA-1 hash.
 */
Hash getRevAttr(const Attrs & attrs, const std::string & name)
{
    const auto revString = getStrAttr(attrs, name);
    return Hash::parseAny(revString, HashAlgorithm::SHA1);
}
} }

View file

@ -39,4 +39,6 @@ bool getBoolAttr(const Attrs & attrs, const std::string & name);
std::map<std::string, std::string> attrsToQuery(const Attrs & attrs); std::map<std::string, std::string> attrsToQuery(const Attrs & attrs);
/**
 * Return the string attribute `name` of `attrs`, parsed as a SHA-1 hash.
 */
Hash getRevAttr(const Attrs & attrs, const std::string & name);
} }

View file

@ -6,8 +6,8 @@
#include "finally.hh" #include "finally.hh"
#include "processes.hh" #include "processes.hh"
#include "signals.hh" #include "signals.hh"
#include "users.hh"
#include <boost/core/span.hpp> #include "fs-sink.hh"
#include <git2/attr.h> #include <git2/attr.h>
#include <git2/blob.h> #include <git2/blob.h>
@ -28,6 +28,7 @@
#include <unordered_set> #include <unordered_set>
#include <queue> #include <queue>
#include <regex> #include <regex>
#include <span>
namespace std { namespace std {
@ -356,6 +357,8 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
ref<InputAccessor> getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError e) override; ref<InputAccessor> getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError e) override;
ref<GitFileSystemObjectSink> getFileSystemObjectSink() override;
static int sidebandProgressCallback(const char * str, int len, void * payload) static int sidebandProgressCallback(const char * str, int len, void * payload)
{ {
auto act = (Activity *) payload; auto act = (Activity *) payload;
@ -770,6 +773,154 @@ struct GitExportIgnoreInputAccessor : CachingFilteringInputAccessor {
}; };
/**
 * A file system object sink that stores what it receives as Git blobs
 * and trees in `repo`.
 *
 * Trees are assembled with a stack of tree builders that mirrors the
 * directory the most recent entry lives in. A directory's tree object
 * is written as soon as an entry arrives whose path is outside that
 * directory, or when `sync()` is called.
 */
struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
{
    ref<GitRepoImpl> repo;

    /** A directory whose tree object is still being assembled. */
    struct PendingDir
    {
        std::string name;
        TreeBuilder builder;
    };

    /**
     * Stack of directories under construction. Element 0 is the root
     * (pushed with an empty name by the constructor); element `i + 1`
     * is a child of element `i`.
     */
    std::vector<PendingDir> pendingDirs;

    /** Number of leading path components dropped from every entry path. */
    size_t componentsToStrip = 1;

    /** Start a new, empty directory named `name` on top of the stack. */
    void pushBuilder(std::string name)
    {
        git_treebuilder * b;
        if (git_treebuilder_new(&b, *repo, nullptr))
            throw Error("creating a tree builder: %s", git_error_last()->message);
        pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) });
    }

    GitFileSystemObjectSinkImpl(ref<GitRepoImpl> repo) : repo(repo)
    {
        pushBuilder("");
    }

    /**
     * Write the tree for the directory on top of the stack and remove
     * it from the stack.
     *
     * @return The object ID of the written tree and the directory's name.
     */
    std::pair<git_oid, std::string> popBuilder()
    {
        assert(!pendingDirs.empty());
        auto & pending = pendingDirs.back();
        git_oid oid;
        /* Write before touching the stack, so that a failure does not
           leave a moved-from entry behind. */
        if (git_treebuilder_write(&oid, pending.builder.get()))
            throw Error("creating a tree object: %s", git_error_last()->message);
        auto name = std::move(pending.name);
        pendingDirs.pop_back();
        return {oid, std::move(name)};
    }

    /** Add an entry `name` -> `oid` with file mode `mode` to the directory on top of the stack. */
    void addToTree(const std::string & name, const git_oid & oid, git_filemode_t mode)
    {
        assert(!pendingDirs.empty());
        auto & pending = pendingDirs.back();
        if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode))
            throw Error("adding a file to a tree builder: %s", git_error_last()->message);
    }

    /**
     * Make the stack of pending directories (below the root) equal to
     * `names`: directories that are not a prefix of `names` are
     * written and added to their parent, and the missing ones are
     * started.
     */
    void updateBuilders(std::span<const std::string> names)
    {
        // Find the common prefix of pendingDirs and names.
        size_t prefixLen = 0;
        for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen)
            if (names[prefixLen] != pendingDirs[prefixLen + 1].name)
                break;

        // Finish the builders that are not part of the common prefix.
        for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) {
            auto [oid, name] = popBuilder();
            addToTree(name, oid, GIT_FILEMODE_TREE);
        }

        // Create builders for the new directories.
        for (auto n = prefixLen; n < names.size(); ++n)
            pushBuilder(names[n]);
    }

    /**
     * Strip the leading `componentsToStrip` components from
     * `pathComponents` and bring the builder stack in line with the
     * rest: the full remaining path if `isDir`, otherwise its parent
     * directory.
     *
     * @return false if nothing is left after stripping, in which case
     * the entry must be skipped; true otherwise.
     */
    bool prepareDirs(const std::vector<std::string> & pathComponents, bool isDir)
    {
        std::span<const std::string> pathComponents2{pathComponents};

        if (pathComponents2.size() <= componentsToStrip) return false;
        pathComponents2 = pathComponents2.subspan(componentsToStrip);

        updateBuilders(
            isDir
            ? pathComponents2
            : pathComponents2.first(pathComponents2.size() - 1));

        return true;
    }

    /**
     * Store a regular file as a blob. `func` is called with a sink
     * that streams the file contents into the blob and records whether
     * the file is executable.
     */
    void createRegularFile(
        const Path & path,
        std::function<void(CreateRegularFileSink &)> func) override
    {
        auto pathComponents = tokenizeString<std::vector<std::string>>(path, "/");
        if (!prepareDirs(pathComponents, false)) return;

        git_writestream * stream = nullptr;
        if (git_blob_create_from_stream(&stream, *repo, nullptr))
            throw Error("creating a blob stream object: %s", git_error_last()->message);

        struct CRF : CreateRegularFileSink {
            const Path & path;
            GitFileSystemObjectSinkImpl & back;
            git_writestream * stream;
            bool executable = false;
            CRF(const Path & path, GitFileSystemObjectSinkImpl & back, git_writestream * stream)
                : path(path), back(back), stream(stream)
            {}
            void operator () (std::string_view data) override
            {
                if (stream->write(stream, data.data(), data.size()))
                    throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message);
            }
            void isExecutable() override
            {
                executable = true;
            }
        } crf { path, *this, stream };

        try {
            func(crf);
        } catch (...) {
            /* The stream is otherwise only released by
               git_blob_create_from_stream_commit(), which is not
               reached on this path, so release it here. */
            stream->free(stream);
            throw;
        }

        git_oid oid;
        if (git_blob_create_from_stream_commit(&oid, stream))
            throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message);

        addToTree(*pathComponents.rbegin(), oid,
            crf.executable
            ? GIT_FILEMODE_BLOB_EXECUTABLE
            : GIT_FILEMODE_BLOB);
    }

    /** Make `path` (after stripping) the directory on top of the builder stack. */
    void createDirectory(const Path & path) override
    {
        auto pathComponents = tokenizeString<std::vector<std::string>>(path, "/");
        (void) prepareDirs(pathComponents, true);
    }

    /** Store a symlink as a blob containing `target`, with the link file mode. */
    void createSymlink(const Path & path, const std::string & target) override
    {
        auto pathComponents = tokenizeString<std::vector<std::string>>(path, "/");
        if (!prepareDirs(pathComponents, false)) return;

        git_oid oid;
        if (git_blob_create_from_buffer(&oid, *repo, target.c_str(), target.size()))
            throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message);

        addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK);
    }

    /**
     * Finish all pending subdirectories, write the root tree and
     * return its hash. The builder stack is empty afterwards.
     */
    Hash sync() override {
        updateBuilders({});

        auto [oid, _name] = popBuilder();

        return toHash(oid);
    }
};
ref<GitInputAccessor> GitRepoImpl::getRawAccessor(const Hash & rev) ref<GitInputAccessor> GitRepoImpl::getRawAccessor(const Hash & rev)
{ {
auto self = ref<GitRepoImpl>(shared_from_this()); auto self = ref<GitRepoImpl>(shared_from_this());
@ -804,6 +955,11 @@ ref<InputAccessor> GitRepoImpl::getAccessor(const WorkdirInfo & wd, bool exportI
} }
} }
/**
 * Create a sink that writes file system objects into this repository.
 * The sink holds a reference to the repository.
 */
ref<GitFileSystemObjectSink> GitRepoImpl::getFileSystemObjectSink()
{
    auto self = ref<GitRepoImpl>(shared_from_this());
    return make_ref<GitFileSystemObjectSinkImpl>(self);
}
std::vector<std::tuple<GitRepoImpl::Submodule, Hash>> GitRepoImpl::getSubmodules(const Hash & rev, bool exportIgnore) std::vector<std::tuple<GitRepoImpl::Submodule, Hash>> GitRepoImpl::getSubmodules(const Hash & rev, bool exportIgnore)
{ {
/* Read the .gitmodules files from this revision. */ /* Read the .gitmodules files from this revision. */
@ -830,5 +986,11 @@ std::vector<std::tuple<GitRepoImpl::Submodule, Hash>> GitRepoImpl::getSubmodules
return result; return result;
} }
/**
 * Open the Git repository used as the tarball cache, located at
 * `<cache dir>/nix/tarball-cache`.
 */
ref<GitRepo> getTarballCache()
{
    /* The location is computed on the first call only. */
    static auto repoDir = [] {
        auto dir = std::filesystem::path(getCacheDir());
        dir /= "nix";
        dir /= "tarball-cache";
        return dir;
    }();

    // NOTE(review): the two flags are presumably "create if missing" and
    // "bare repository" -- confirm against GitRepo::openRepo.
    return GitRepo::openRepo(repoDir, true, true);
}
} }

View file

@ -2,11 +2,20 @@
#include "filtering-input-accessor.hh" #include "filtering-input-accessor.hh"
#include "input-accessor.hh" #include "input-accessor.hh"
#include "fs-sink.hh"
namespace nix { namespace nix {
namespace fetchers { struct PublicKey; } namespace fetchers { struct PublicKey; }
/**
 * A `FileSystemObjectSink` that can additionally report a Git hash
 * for what has been written to it.
 */
struct GitFileSystemObjectSink : FileSystemObjectSink
{
/**
* Flush builder and return a final Git hash.
*
* Call this after the last object has been written to the sink.
*/
virtual Hash sync() = 0;
};
struct GitRepo struct GitRepo
{ {
virtual ~GitRepo() virtual ~GitRepo()
@ -64,18 +73,14 @@ struct GitRepo
const std::string & url, const std::string & url,
const std::string & base) = 0; const std::string & base) = 0;
struct TarballInfo
{
Hash treeHash;
time_t lastModified;
};
virtual bool hasObject(const Hash & oid) = 0; virtual bool hasObject(const Hash & oid) = 0;
virtual ref<InputAccessor> getAccessor(const Hash & rev, bool exportIgnore) = 0; virtual ref<InputAccessor> getAccessor(const Hash & rev, bool exportIgnore) = 0;
virtual ref<InputAccessor> getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError makeNotAllowedError) = 0; virtual ref<InputAccessor> getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError makeNotAllowedError) = 0;
virtual ref<GitFileSystemObjectSink> getFileSystemObjectSink() = 0;
virtual void fetch( virtual void fetch(
const std::string & url, const std::string & url,
const std::string & refspec, const std::string & refspec,
@ -90,4 +95,6 @@ struct GitRepo
const std::vector<fetchers::PublicKey> & publicKeys) = 0; const std::vector<fetchers::PublicKey> & publicKeys) = 0;
}; };
/**
 * Return the Git repository that serves as the tarball cache.
 */
ref<GitRepo> getTarballCache();
} }

View file

@ -8,6 +8,8 @@
#include "fetchers.hh" #include "fetchers.hh"
#include "fetch-settings.hh" #include "fetch-settings.hh"
#include "tarball.hh" #include "tarball.hh"
#include "tarfile.hh"
#include "git-utils.hh"
#include <optional> #include <optional>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
@ -180,49 +182,102 @@ struct GitArchiveInputScheme : InputScheme
return headers; return headers;
} }
virtual Hash getRevFromRef(nix::ref<Store> store, const Input & input) const = 0; struct RefInfo
{
Hash rev;
std::optional<Hash> treeHash;
};
virtual RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const = 0;
virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; virtual DownloadUrl getDownloadUrl(const Input & input) const = 0;
std::pair<StorePath, Input> fetch(ref<Store> store, const Input & _input) override struct TarballInfo
{ {
Input input(_input); Hash treeHash;
time_t lastModified;
};
std::pair<Input, TarballInfo> downloadArchive(ref<Store> store, Input input) const
{
if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD");
std::optional<Hash> upstreamTreeHash;
auto rev = input.getRev(); auto rev = input.getRev();
if (!rev) rev = getRevFromRef(store, input); if (!rev) {
auto refInfo = getRevFromRef(store, input);
rev = refInfo.rev;
upstreamTreeHash = refInfo.treeHash;
debug("HEAD revision for '%s' is %s", input.to_string(), refInfo.rev.gitRev());
}
input.attrs.erase("ref"); input.attrs.erase("ref");
input.attrs.insert_or_assign("rev", rev->gitRev()); input.attrs.insert_or_assign("rev", rev->gitRev());
Attrs lockedAttrs({ auto cache = getCache();
{"type", "git-tarball"},
{"rev", rev->gitRev()},
});
if (auto res = getCache()->lookup(*store, lockedAttrs)) { Attrs treeHashKey{{"_what", "gitRevToTreeHash"}, {"rev", rev->gitRev()}};
input.attrs.insert_or_assign("lastModified", getIntAttr(res->first, "lastModified")); Attrs lastModifiedKey{{"_what", "gitRevToLastModified"}, {"rev", rev->gitRev()}};
return {std::move(res->second), input};
if (auto treeHashAttrs = cache->lookup(treeHashKey)) {
if (auto lastModifiedAttrs = cache->lookup(lastModifiedKey)) {
auto treeHash = getRevAttr(*treeHashAttrs, "treeHash");
auto lastModified = getIntAttr(*lastModifiedAttrs, "lastModified");
if (getTarballCache()->hasObject(treeHash))
return {std::move(input), TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }};
else
debug("Git tree with hash '%s' has disappeared from the cache, refetching...", treeHash.gitRev());
}
} }
/* Stream the tarball into the tarball cache. */
auto url = getDownloadUrl(input); auto url = getDownloadUrl(input);
auto result = downloadTarball(store, url.url, input.getName(), true, url.headers); auto source = sinkToSource([&](Sink & sink) {
FileTransferRequest req(url.url);
req.headers = url.headers;
getFileTransfer()->download(std::move(req), sink);
});
input.attrs.insert_or_assign("lastModified", uint64_t(result.lastModified)); TarArchive archive { *source };
auto parseSink = getTarballCache()->getFileSystemObjectSink();
auto lastModified = unpackTarfileToSink(archive, *parseSink);
getCache()->add( TarballInfo tarballInfo {
*store, .treeHash = parseSink->sync(),
lockedAttrs, .lastModified = lastModified
{ };
{"rev", rev->gitRev()},
{"lastModified", uint64_t(result.lastModified)}
},
result.storePath,
true);
return {result.storePath, input}; cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}});
cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}});
#if 0
if (upstreamTreeHash != tarballInfo.treeHash)
warn(
"Git tree hash mismatch for revision '%s' of '%s': "
"expected '%s', got '%s'. "
"This can happen if the Git repository uses submodules.",
rev->gitRev(), input.to_string(), upstreamTreeHash->gitRev(), tarballInfo.treeHash.gitRev());
#endif
return {std::move(input), tarballInfo};
}
std::pair<ref<InputAccessor>, Input> getAccessor(ref<Store> store, const Input & _input) const override
{
auto [input, tarballInfo] = downloadArchive(store, _input);
input.attrs.insert_or_assign("treeHash", tarballInfo.treeHash.gitRev());
input.attrs.insert_or_assign("lastModified", uint64_t(tarballInfo.lastModified));
auto accessor = getTarballCache()->getAccessor(tarballInfo.treeHash, false);
accessor->setPathDisplay("«" + input.to_string() + "»");
accessor->fingerprint = input.getFingerprint(store);
return {accessor, input};
} }
std::optional<ExperimentalFeature> experimentalFeature() const override std::optional<ExperimentalFeature> experimentalFeature() const override
@ -269,7 +324,7 @@ struct GitHubInputScheme : GitArchiveInputScheme
return getStrAttr(input.attrs, "repo"); return getStrAttr(input.attrs, "repo");
} }
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
{ {
auto host = getHost(input); auto host = getHost(input);
auto url = fmt( auto url = fmt(
@ -284,9 +339,11 @@ struct GitHubInputScheme : GitArchiveInputScheme
readFile( readFile(
store->toRealPath( store->toRealPath(
downloadFile(store, url, "source", false, headers).storePath))); downloadFile(store, url, "source", false, headers).storePath)));
auto rev = Hash::parseAny(std::string { json["sha"] }, HashAlgorithm::SHA1);
debug("HEAD revision for '%s' is %s", url, rev.gitRev()); return RefInfo {
return rev; .rev = Hash::parseAny(std::string { json["sha"] }, HashAlgorithm::SHA1),
.treeHash = Hash::parseAny(std::string { json["commit"]["tree"]["sha"] }, HashAlgorithm::SHA1)
};
} }
DownloadUrl getDownloadUrl(const Input & input) const override DownloadUrl getDownloadUrl(const Input & input) const override
@ -343,7 +400,7 @@ struct GitLabInputScheme : GitArchiveInputScheme
return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1)); return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1));
} }
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
{ {
auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com"); auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com");
// See rate limiting note below // See rate limiting note below
@ -356,9 +413,10 @@ struct GitLabInputScheme : GitArchiveInputScheme
readFile( readFile(
store->toRealPath( store->toRealPath(
downloadFile(store, url, "source", false, headers).storePath))); downloadFile(store, url, "source", false, headers).storePath)));
auto rev = Hash::parseAny(std::string(json[0]["id"]), HashAlgorithm::SHA1);
debug("HEAD revision for '%s' is %s", url, rev.gitRev()); return RefInfo {
return rev; .rev = Hash::parseAny(std::string(json[0]["id"]), HashAlgorithm::SHA1)
};
} }
DownloadUrl getDownloadUrl(const Input & input) const override DownloadUrl getDownloadUrl(const Input & input) const override
@ -402,7 +460,7 @@ struct SourceHutInputScheme : GitArchiveInputScheme
// Once it is implemented, however, should work as expected. // Once it is implemented, however, should work as expected.
} }
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
{ {
// TODO: In the future, when the sourcehut graphql API is implemented for mercurial // TODO: In the future, when the sourcehut graphql API is implemented for mercurial
// and with anonymous access, this method should use it instead. // and with anonymous access, this method should use it instead.
@ -445,12 +503,12 @@ struct SourceHutInputScheme : GitArchiveInputScheme
id = parsedLine->target; id = parsedLine->target;
} }
if(!id) if (!id)
throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref); throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref);
auto rev = Hash::parseAny(*id, HashAlgorithm::SHA1); return RefInfo {
debug("HEAD revision for '%s' is %s", fmt("%s/%s", base_url, ref), rev.gitRev()); .rev = Hash::parseAny(*id, HashAlgorithm::SHA1)
return rev; };
} }
DownloadUrl getDownloadUrl(const Input & input) const override DownloadUrl getDownloadUrl(const Input & input) const override

View file

@ -26,6 +26,8 @@ struct CreateRegularFileSink : Sink
struct FileSystemObjectSink struct FileSystemObjectSink
{ {
virtual ~FileSystemObjectSink() = default;
virtual void createDirectory(const Path & path) = 0; virtual void createDirectory(const Path & path) = 0;
/** /**

View file

@ -132,4 +132,66 @@ void unpackTarfile(const Path & tarFile, const Path & destDir)
extract_archive(archive, destDir); extract_archive(archive, destDir);
} }
/**
 * Read the remaining entries of `archive` and hand each one to
 * `parseSink` as a directory, regular file or symlink.
 *
 * @return The largest modification time of any entry, or 0 if the
 * archive contains no entries.
 *
 * @throws Error if an entry has no path name, has a file type other
 * than directory, regular file or symlink, is a symlink without a
 * target, or if its contents cannot be read.
 */
time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink)
{
    time_t lastModified = 0;

    /* Scratch buffer for file contents, allocated once and reused for
       every read. */
    std::vector<unsigned char> buf(128 * 1024);

    for (;;) {
        // FIXME: merge with extract_archive
        struct archive_entry * entry;
        int r = archive_read_next_header(archive.archive, &entry);
        if (r == ARCHIVE_EOF) break;
        auto path = archive_entry_pathname(entry);
        if (!path)
            throw Error("cannot get archive member name: %s", archive_error_string(archive.archive));
        if (r == ARCHIVE_WARN)
            /* Pass the message as an argument rather than as the
               format string: its text comes from libarchive and may
               contain '%'. */
            warn("%s", archive_error_string(archive.archive));
        else
            archive.check(r);

        lastModified = std::max(lastModified, archive_entry_mtime(entry));

        switch (archive_entry_filetype(entry)) {

        case AE_IFDIR:
            parseSink.createDirectory(path);
            break;

        case AE_IFREG: {
            parseSink.createRegularFile(path, [&](auto & crf) {
                if (archive_entry_mode(entry) & S_IXUSR)
                    crf.isExecutable();

                while (true) {
                    auto n = archive_read_data(archive.archive, buf.data(), buf.size());
                    if (n < 0)
                        throw Error("cannot read file '%s' from tarball", path);
                    if (n == 0) break;
                    crf(std::string_view {
                        (const char *) buf.data(),
                        (size_t) n,
                    });
                }
            });

            break;
        }

        case AE_IFLNK: {
            auto target = archive_entry_symlink(entry);
            /* A null target must not be converted to std::string. */
            if (!target)
                throw Error("cannot get symlink target of tarball member '%s'", path);

            parseSink.createSymlink(path, target);

            break;
        }

        default:
            throw Error("file '%s' in tarball has unsupported file type", path);
        }
    }

    return lastModified;
}
} }

View file

@ -2,6 +2,7 @@
///@file ///@file
#include "serialise.hh" #include "serialise.hh"
#include "fs-sink.hh"
#include <archive.h> #include <archive.h>
namespace nix { namespace nix {
@ -29,4 +30,6 @@ void unpackTarfile(Source & source, const Path & destDir);
void unpackTarfile(const Path & tarFile, const Path & destDir); void unpackTarfile(const Path & tarFile, const Path & destDir);
/**
 * Read the entries of `archive` and pass each directory, regular file
 * and symlink to `parseSink`. Entries of any other file type cause an
 * `Error`.
 *
 * @return The largest modification time found among the entries, or 0
 * if there are none.
 */
time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink);
} }