Add a Git-based content-addressed tarball cache

GitArchiveInputScheme now streams tarballs into a Git repository. This
deduplicates data a lot, e.g. when you're fetching different revisions
of the Nixpkgs repo. It also warns if the tree hash returned by GitHub
doesn't match the tree hash of the imported tarball.
This commit is contained in:
Eelco Dolstra 2023-11-29 12:35:08 +01:00
parent a8fea5a54f
commit b36857ac8d
5 changed files with 272 additions and 37 deletions

View file

@ -104,4 +104,9 @@ std::map<std::string, std::string> attrsToQuery(const Attrs & attrs)
return query; return query;
} }
/* Look up the string attribute `name` in `attrs` and parse its value
   as a SHA-1 hash. */
Hash getRevAttr(const Attrs & attrs, const std::string & name)
{
    const auto revText = getStrAttr(attrs, name);
    return Hash::parseAny(revText, htSHA1);
}
} }

View file

@ -39,4 +39,6 @@ bool getBoolAttr(const Attrs & attrs, const std::string & name);
std::map<std::string, std::string> attrsToQuery(const Attrs & attrs); std::map<std::string, std::string> attrsToQuery(const Attrs & attrs);
Hash getRevAttr(const Attrs & attrs, const std::string & name);
} }

View file

@ -4,6 +4,7 @@
#include "finally.hh" #include "finally.hh"
#include "processes.hh" #include "processes.hh"
#include "signals.hh" #include "signals.hh"
#include "users.hh"
#include <boost/core/span.hpp> #include <boost/core/span.hpp>
@ -21,6 +22,9 @@
#include <git2/submodule.h> #include <git2/submodule.h>
#include <git2/tree.h> #include <git2/tree.h>
#include "tarfile.hh"
#include <archive_entry.h>
#include <unordered_set> #include <unordered_set>
#include <queue> #include <queue>
#include <regex> #include <regex>
@ -307,6 +311,158 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
return std::nullopt; return std::nullopt;
} }
/**
 * Stream a tarball read from `source` into this repository as Git
 * blob and tree objects.
 *
 * The first path component of every archive member is stripped, and
 * members that have no components left after stripping are skipped.
 * Regular files become blobs (executable if the owner-execute bit is
 * set), symlinks become link entries, directories become trees. Any
 * other file type raises an Error.
 *
 * @return the hash of the resulting root tree and the largest mtime
 * seen on any archive member.
 */
TarballInfo importTarball(Source & source) override
{
    TarArchive archive(source);

    /* A directory whose tree is still being assembled. */
    struct PendingDir
    {
        std::string name;
        TreeBuilder builder;
    };

    /* Frees a blob write stream that was never committed. */
    struct WriteStreamDeleter
    {
        void operator()(git_writestream * s) const { s->free(s); }
    };

    /* Stack of open directories; element 0 is the root. */
    std::vector<PendingDir> pendingDirs;

    auto pushBuilder = [&](std::string name)
    {
        git_treebuilder * b;
        if (git_treebuilder_new(&b, *this, nullptr))
            throw Error("creating a tree builder: %s", git_error_last()->message);
        pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) });
    };

    /* Write the innermost open directory as a tree object and remove
       it from the stack. */
    auto popBuilder = [&]() -> std::pair<git_oid, std::string>
    {
        assert(!pendingDirs.empty());
        auto pending = std::move(pendingDirs.back());
        git_oid oid;
        if (git_treebuilder_write(&oid, pending.builder.get()))
            throw Error("creating a tree object: %s", git_error_last()->message);
        pendingDirs.pop_back();
        return {oid, pending.name};
    };

    /* Add an entry to the innermost open directory. */
    auto addToTree = [&](const std::string & name, const git_oid & oid, git_filemode_t mode)
    {
        assert(!pendingDirs.empty());
        auto & pending = pendingDirs.back();
        if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode))
            throw Error("adding a file to a tree builder: %s", git_error_last()->message);
    };

    /* Make the stack of open directories equal to root + `names`.
       NOTE(review): a directory that is left and later re-entered gets
       a fresh, empty builder, so this appears to assume that all
       members of a directory are contiguous in the archive -- confirm. */
    auto updateBuilders = [&](boost::span<const std::string> names)
    {
        // Find the common prefix of pendingDirs and names.
        size_t prefixLen = 0;
        for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen)
            if (names[prefixLen] != pendingDirs[prefixLen + 1].name)
                break;

        // Finish the builders that are not part of the common prefix.
        for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) {
            auto [oid, name] = popBuilder();
            addToTree(name, oid, GIT_FILEMODE_TREE);
        }

        // Create builders for the new directories.
        for (auto n = prefixLen; n < names.size(); ++n)
            pushBuilder(names[n]);
    };

    pushBuilder("");

    size_t componentsToStrip = 1;

    time_t lastModified = 0;

    /* Read buffer shared by all regular-file members. */
    std::vector<unsigned char> buf(128 * 1024);

    for (;;) {
        // FIXME: merge with extract_archive
        struct archive_entry * entry;
        int r = archive_read_next_header(archive.archive, &entry);
        if (r == ARCHIVE_EOF) break;
        auto path = archive_entry_pathname(entry);
        if (!path)
            throw Error("cannot get archive member name: %s", archive_error_string(archive.archive));
        if (r == ARCHIVE_WARN)
            warn(archive_error_string(archive.archive));
        else
            archive.check(r);

        lastModified = std::max(lastModified, archive_entry_mtime(entry));

        auto pathComponents = tokenizeString<std::vector<std::string>>(path, "/");

        boost::span<const std::string> pathComponents2{pathComponents};

        if (pathComponents2.size() <= componentsToStrip) continue;
        pathComponents2 = pathComponents2.subspan(componentsToStrip);

        // For a directory, open the directory itself; for anything
        // else, open its parent.
        updateBuilders(
            archive_entry_filetype(entry) == AE_IFDIR
            ? pathComponents2
            : pathComponents2.first(pathComponents2.size() - 1));

        switch (archive_entry_filetype(entry)) {

        case AE_IFDIR:
            // Nothing to do right now.
            break;

        case AE_IFREG: {

            git_writestream * rawStream = nullptr;
            if (git_blob_create_from_stream(&rawStream, *this, nullptr))
                throw Error("creating a blob stream object: %s", git_error_last()->message);

            // Own the stream until it is handed to
            // git_blob_create_from_stream_commit(), so that it is
            // freed if reading or writing fails.
            std::unique_ptr<git_writestream, WriteStreamDeleter> stream(rawStream);

            while (true) {
                auto n = archive_read_data(archive.archive, buf.data(), buf.size());
                if (n < 0)
                    throw Error("cannot read file '%s' from tarball", path);
                if (n == 0) break;
                if (stream->write(stream.get(), reinterpret_cast<const char *>(buf.data()), n))
                    throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message);
            }

            git_oid oid;
            // The commit call closes and frees the stream itself.
            if (git_blob_create_from_stream_commit(&oid, stream.release()))
                throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message);

            addToTree(*pathComponents.rbegin(), oid,
                archive_entry_mode(entry) & S_IXUSR
                ? GIT_FILEMODE_BLOB_EXECUTABLE
                : GIT_FILEMODE_BLOB);

            break;
        }

        case AE_IFLNK: {
            auto target = archive_entry_symlink(entry);
            // libarchive returns a null pointer when no target is set.
            if (!target)
                throw Error("cannot get target of tarball symlink member '%s'", path);

            git_oid oid;
            if (git_blob_create_from_buffer(&oid, *this, target, strlen(target)))
                throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message);

            addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK);

            break;
        }

        default:
            throw Error("file '%s' in tarball has unsupported file type", path);
        }
    }

    // Close every directory except the root, then write the root.
    updateBuilders({});

    auto [oid, _name] = popBuilder();

    return TarballInfo {
        .treeHash = toHash(oid),
        .lastModified = lastModified
    };
}
std::vector<std::tuple<Submodule, Hash>> getSubmodules(const Hash & rev) override; std::vector<std::tuple<Submodule, Hash>> getSubmodules(const Hash & rev) override;
std::string resolveSubmoduleUrl( std::string resolveSubmoduleUrl(
@ -449,6 +605,22 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
else else
throw Error("Commit signature verification on commit %s failed: %s", rev.gitRev(), output); throw Error("Commit signature verification on commit %s failed: %s", rev.gitRev(), output);
} }
/**
 * Map a Git tree hash to the hash of the tree's NAR serialisation,
 * consulting the fetcher cache first and recording newly computed
 * results in it.
 */
Hash treeHashToNarHash(const Hash & treeHash) override
{
    // Kept ahead of the cache lookup to preserve the evaluation order.
    auto treeAccessor = getAccessor(treeHash);

    fetchers::Attrs key({{"_what", "treeHashToNarHash"}, {"treeHash", treeHash.gitRev()}});

    auto cached = fetchers::getCache()->lookup(key);
    if (cached)
        return Hash::parseAny(fetchers::getStrAttr(*cached, "narHash"), htSHA256);

    auto computed = treeAccessor->hashPath(CanonPath::root);

    fetchers::getCache()->upsert(key, fetchers::Attrs({{"narHash", computed.to_string(HashFormat::SRI, true)}}));

    return computed;
}
}; };
ref<GitRepo> GitRepo::openRepo(const CanonPath & path, bool create, bool bare) ref<GitRepo> GitRepo::openRepo(const CanonPath & path, bool create, bool bare)
@ -673,5 +845,11 @@ std::vector<std::tuple<GitRepoImpl::Submodule, Hash>> GitRepoImpl::getSubmodules
return result; return result;
} }
/* Open the repository under `<cache dir>/nix/tarball-cache`. The path
   is computed once, on first use. */
ref<GitRepo> getTarballCache()
{
    static CanonPath tarballCacheDir(getCacheDir() + "/nix/tarball-cache");

    // NOTE(review): presumably the `create` and `bare` flags, mirroring
    // GitRepo::openRepo(path, create, bare) -- confirm against the
    // GitRepoImpl constructor.
    const bool secondFlag = true;
    const bool thirdFlag = true;

    return make_ref<GitRepoImpl>(tarballCacheDir, secondFlag, thirdFlag);
}
} }

View file

@ -69,6 +69,8 @@ struct GitRepo
time_t lastModified; time_t lastModified;
}; };
virtual TarballInfo importTarball(Source & source) = 0;
virtual bool hasObject(const Hash & oid) = 0; virtual bool hasObject(const Hash & oid) = 0;
virtual ref<InputAccessor> getAccessor(const Hash & rev) = 0; virtual ref<InputAccessor> getAccessor(const Hash & rev) = 0;
@ -85,6 +87,14 @@ struct GitRepo
virtual void verifyCommit( virtual void verifyCommit(
const Hash & rev, const Hash & rev,
const std::vector<fetchers::PublicKey> & publicKeys) = 0; const std::vector<fetchers::PublicKey> & publicKeys) = 0;
/**
* Given a Git tree hash, compute the hash of its NAR
* serialisation. This is memoised on-disk.
*/
virtual Hash treeHashToNarHash(const Hash & treeHash) = 0;
}; };
ref<GitRepo> getTarballCache();
} }

View file

@ -8,6 +8,7 @@
#include "fetchers.hh" #include "fetchers.hh"
#include "fetch-settings.hh" #include "fetch-settings.hh"
#include "tarball.hh" #include "tarball.hh"
#include "git-utils.hh"
#include <optional> #include <optional>
#include <nlohmann/json.hpp> #include <nlohmann/json.hpp>
@ -180,49 +181,87 @@ struct GitArchiveInputScheme : InputScheme
return headers; return headers;
} }
virtual Hash getRevFromRef(nix::ref<Store> store, const Input & input) const = 0; struct RefInfo
{
// The commit hash that the ref resolved to.
Hash rev;
// The tree hash reported by the upstream service for that commit.
// Left empty by backends that only fill in `rev`.
std::optional<Hash> treeHash;
};
virtual RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const = 0;
virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; virtual DownloadUrl getDownloadUrl(const Input & input) const = 0;
std::pair<StorePath, Input> fetch(ref<Store> store, const Input & _input) override std::pair<Input, GitRepo::TarballInfo> downloadArchive(ref<Store> store, Input input) const
{ {
Input input(_input);
if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD");
std::optional<Hash> upstreamTreeHash;
auto rev = input.getRev(); auto rev = input.getRev();
if (!rev) rev = getRevFromRef(store, input); if (!rev) {
auto refInfo = getRevFromRef(store, input);
rev = refInfo.rev;
upstreamTreeHash = refInfo.treeHash;
debug("HEAD revision for '%s' is %s", input.to_string(), refInfo.rev.gitRev());
}
input.attrs.erase("ref"); input.attrs.erase("ref");
input.attrs.insert_or_assign("rev", rev->gitRev()); input.attrs.insert_or_assign("rev", rev->gitRev());
Attrs lockedAttrs({ auto cache = getCache();
{"type", "git-tarball"},
{"rev", rev->gitRev()},
});
if (auto res = getCache()->lookup(store, lockedAttrs)) { Attrs treeHashKey{{"_what", "gitRevToTreeHash"}, {"rev", rev->gitRev()}};
input.attrs.insert_or_assign("lastModified", getIntAttr(res->first, "lastModified")); Attrs lastModifiedKey{{"_what", "gitRevToLastModified"}, {"rev", rev->gitRev()}};
return {std::move(res->second), input};
if (auto treeHashAttrs = cache->lookup(treeHashKey)) {
if (auto lastModifiedAttrs = cache->lookup(lastModifiedKey)) {
auto treeHash = getRevAttr(*treeHashAttrs, "treeHash");
auto lastModified = getIntAttr(*lastModifiedAttrs, "lastModified");
if (getTarballCache()->hasObject(treeHash))
return {std::move(input), GitRepo::TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }};
else
debug("Git tree with hash '%s' has disappeared from the cache, refetching...", treeHash.gitRev());
}
} }
/* Stream the tarball into the tarball cache. */
auto url = getDownloadUrl(input); auto url = getDownloadUrl(input);
auto result = downloadTarball(store, url.url, input.getName(), true, url.headers); auto source = sinkToSource([&](Sink & sink) {
FileTransferRequest req(url.url);
req.headers = url.headers;
getFileTransfer()->download(std::move(req), sink);
});
input.attrs.insert_or_assign("lastModified", uint64_t(result.lastModified)); auto tarballInfo = getTarballCache()->importTarball(*source);
getCache()->add( cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}});
store, cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}});
lockedAttrs,
if (upstreamTreeHash != tarballInfo.treeHash)
warn(
"Git tree hash mismatch for revision '%s' of '%s': "
"expected '%s', got '%s'. "
"This can happen if the Git repository uses submodules.",
rev->gitRev(), input.to_string(), upstreamTreeHash->gitRev(), tarballInfo.treeHash.gitRev());
return {std::move(input), tarballInfo};
}
/**
 * Fetch `_input` into the tarball cache and return an accessor for
 * the resulting tree, together with the input extended with the
 * `treeHash` and `lastModified` attributes.
 */
std::pair<ref<InputAccessor>, Input> getAccessor(ref<Store> store, const Input & _input) const override
{
    auto archive = downloadArchive(store, _input);
    auto & lockedInput = archive.first;
    const auto & info = archive.second;

    lockedInput.attrs.insert_or_assign("treeHash", info.treeHash.gitRev());
    lockedInput.attrs.insert_or_assign("lastModified", uint64_t(info.lastModified));

    auto treeAccessor = getTarballCache()->getAccessor(info.treeHash);

    // The display string is built after the attributes above are set.
    const std::string display = "«" + lockedInput.to_string() + "»";
    treeAccessor->setPathDisplay(display);

    treeAccessor->fingerprint = lockedInput.getFingerprint(store);

    return {treeAccessor, lockedInput};
}
std::optional<ExperimentalFeature> experimentalFeature() const override std::optional<ExperimentalFeature> experimentalFeature() const override
@ -269,7 +308,7 @@ struct GitHubInputScheme : GitArchiveInputScheme
return getStrAttr(input.attrs, "repo"); return getStrAttr(input.attrs, "repo");
} }
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
{ {
auto host = getHost(input); auto host = getHost(input);
auto url = fmt( auto url = fmt(
@ -284,9 +323,10 @@ struct GitHubInputScheme : GitArchiveInputScheme
readFile( readFile(
store->toRealPath( store->toRealPath(
downloadFile(store, url, "source", false, headers).storePath))); downloadFile(store, url, "source", false, headers).storePath)));
auto rev = Hash::parseAny(std::string { json["sha"] }, htSHA1); return RefInfo {
debug("HEAD revision for '%s' is %s", url, rev.gitRev()); .rev = Hash::parseAny(std::string { json["sha"] }, htSHA1),
return rev; .treeHash = Hash::parseAny(std::string { json["commit"]["tree"]["sha"] }, htSHA1)
};
} }
DownloadUrl getDownloadUrl(const Input & input) const override DownloadUrl getDownloadUrl(const Input & input) const override
@ -343,7 +383,7 @@ struct GitLabInputScheme : GitArchiveInputScheme
return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1)); return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1));
} }
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
{ {
auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com"); auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com");
// See rate limiting note below // See rate limiting note below
@ -356,9 +396,9 @@ struct GitLabInputScheme : GitArchiveInputScheme
readFile( readFile(
store->toRealPath( store->toRealPath(
downloadFile(store, url, "source", false, headers).storePath))); downloadFile(store, url, "source", false, headers).storePath)));
auto rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1); return RefInfo {
debug("HEAD revision for '%s' is %s", url, rev.gitRev()); .rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1)
return rev; };
} }
DownloadUrl getDownloadUrl(const Input & input) const override DownloadUrl getDownloadUrl(const Input & input) const override
@ -402,7 +442,7 @@ struct SourceHutInputScheme : GitArchiveInputScheme
// Once it is implemented, however, should work as expected. // Once it is implemented, however, should work as expected.
} }
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
{ {
// TODO: In the future, when the sourcehut graphql API is implemented for mercurial // TODO: In the future, when the sourcehut graphql API is implemented for mercurial
// and with anonymous access, this method should use it instead. // and with anonymous access, this method should use it instead.
@ -448,9 +488,9 @@ struct SourceHutInputScheme : GitArchiveInputScheme
if (!id) if (!id)
throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref); throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref);
auto rev = Hash::parseAny(*id, htSHA1); return RefInfo {
debug("HEAD revision for '%s' is %s", fmt("%s/%s", base_url, ref), rev.gitRev()); .rev = Hash::parseAny(*id, htSHA1)
return rev; };
} }
DownloadUrl getDownloadUrl(const Input & input) const override DownloadUrl getDownloadUrl(const Input & input) const override