mirror of
https://github.com/privatevoid-net/nix-super.git
synced 2024-11-10 08:16:15 +02:00
Add a Git-based content-addressed tarball cache
GitArchiveInputScheme now streams tarballs into a Git repository. This deduplicates data a lot, e.g. when you're fetching different revisions of the Nixpkgs repo. It also warns if the tree hash returned by GitHub doesn't match the tree hash of the imported tarball.
This commit is contained in:
parent
a8fea5a54f
commit
b36857ac8d
5 changed files with 272 additions and 37 deletions
|
@ -104,4 +104,9 @@ std::map<std::string, std::string> attrsToQuery(const Attrs & attrs)
|
|||
return query;
|
||||
}
|
||||
|
||||
Hash getRevAttr(const Attrs & attrs, const std::string & name)
{
    /* Look up the attribute `name` in `attrs` and parse it as a
       SHA-1 Git revision hash. Throws if the attribute is missing
       or does not parse as a hash. */
    auto rev = getStrAttr(attrs, name);
    return Hash::parseAny(rev, htSHA1);
}
|
||||
|
||||
}
|
||||
|
|
|
@ -39,4 +39,6 @@ bool getBoolAttr(const Attrs & attrs, const std::string & name);
|
|||
|
||||
std::map<std::string, std::string> attrsToQuery(const Attrs & attrs);
|
||||
|
||||
Hash getRevAttr(const Attrs & attrs, const std::string & name);
|
||||
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include "finally.hh"
|
||||
#include "processes.hh"
|
||||
#include "signals.hh"
|
||||
#include "users.hh"
|
||||
|
||||
#include <boost/core/span.hpp>
|
||||
|
||||
|
@ -21,6 +22,9 @@
|
|||
#include <git2/submodule.h>
|
||||
#include <git2/tree.h>
|
||||
|
||||
#include "tarfile.hh"
|
||||
#include <archive_entry.h>
|
||||
|
||||
#include <unordered_set>
|
||||
#include <queue>
|
||||
#include <regex>
|
||||
|
@ -307,6 +311,158 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
TarballInfo importTarball(Source & source) override
{
    /* Stream the tarball `source` into this Git repository as a tree
       of blobs, without materialising it on disk first. Returns the
       hash of the resulting top-level tree together with the most
       recent modification time seen in the archive.

       The first path component of every member is stripped, since
       forge tarballs wrap everything in a `<repo>-<rev>/` directory.

       Members are assumed to arrive in depth-first order (as tar
       archives conventionally are), so at any moment we keep one
       open tree builder per directory level on a stack. */

    TarArchive archive(source);

    struct PendingDir
    {
        std::string name;       // directory name within its parent
        TreeBuilder builder;    // accumulates this directory's entries
    };

    // Stack of open directories; pendingDirs[0] is the root tree.
    std::vector<PendingDir> pendingDirs;

    // Open a new tree builder for a subdirectory called `name`.
    auto pushBuilder = [&](std::string name)
    {
        git_treebuilder * b;
        if (git_treebuilder_new(&b, *this, nullptr))
            throw Error("creating a tree builder: %s", git_error_last()->message);
        pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) });
    };

    // Write out the innermost tree builder as a tree object and pop it.
    auto popBuilder = [&]() -> std::pair<git_oid, std::string>
    {
        assert(!pendingDirs.empty());
        auto pending = std::move(pendingDirs.back());
        git_oid oid;
        if (git_treebuilder_write(&oid, pending.builder.get()))
            throw Error("creating a tree object: %s", git_error_last()->message);
        pendingDirs.pop_back();
        return {oid, pending.name};
    };

    // Add an entry to the innermost (current) tree builder.
    auto addToTree = [&](const std::string & name, const git_oid & oid, git_filemode_t mode)
    {
        assert(!pendingDirs.empty());
        auto & pending = pendingDirs.back();
        if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode))
            throw Error("adding a file to a tree builder: %s", git_error_last()->message);
    };

    /* Make the builder stack correspond to the directory path `names`:
       finish builders that are not on that path, then open builders
       for any new directories. */
    auto updateBuilders = [&](boost::span<const std::string> names)
    {
        // Find the common prefix of pendingDirs and names.
        size_t prefixLen = 0;
        for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen)
            if (names[prefixLen] != pendingDirs[prefixLen + 1].name)
                break;

        // Finish the builders that are not part of the common prefix.
        for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) {
            auto [oid, name] = popBuilder();
            addToTree(name, oid, GIT_FILEMODE_TREE);
        }

        // Create builders for the new directories.
        for (auto n = prefixLen; n < names.size(); ++n)
            pushBuilder(names[n]);
    };

    pushBuilder("");

    size_t componentsToStrip = 1;

    time_t lastModified = 0;

    /* Reusable read buffer, hoisted out of the per-chunk loop so we
       don't allocate and zero 128 KiB for every chunk of every file. */
    std::vector<unsigned char> buf(128 * 1024);

    for (;;) {
        // FIXME: merge with extract_archive
        struct archive_entry * entry;
        int r = archive_read_next_header(archive.archive, &entry);
        if (r == ARCHIVE_EOF) break;
        auto path = archive_entry_pathname(entry);
        if (!path)
            throw Error("cannot get archive member name: %s", archive_error_string(archive.archive));
        if (r == ARCHIVE_WARN)
            warn(archive_error_string(archive.archive));
        else
            archive.check(r);

        lastModified = std::max(lastModified, archive_entry_mtime(entry));

        auto pathComponents = tokenizeString<std::vector<std::string>>(path, "/");

        boost::span<const std::string> pathComponents2{pathComponents};

        // Skip members at or above the strip level (e.g. the wrapping
        // top-level directory itself).
        if (pathComponents2.size() <= componentsToStrip) continue;
        pathComponents2 = pathComponents2.subspan(componentsToStrip);

        /* For a directory the builder stack must reach the directory
           itself; for anything else, only its parent. */
        updateBuilders(
            archive_entry_filetype(entry) == AE_IFDIR
            ? pathComponents2
            : pathComponents2.first(pathComponents2.size() - 1));

        switch (archive_entry_filetype(entry)) {

        case AE_IFDIR:
            // Nothing to do right now.
            break;

        case AE_IFREG: {

            /* Stream the file's contents into a blob object. */
            git_writestream * stream = nullptr;
            if (git_blob_create_from_stream(&stream, *this, nullptr))
                throw Error("creating a blob stream object: %s", git_error_last()->message);

            while (true) {
                auto n = archive_read_data(archive.archive, buf.data(), buf.size());
                if (n < 0) {
                    // Dispose of the uncommitted stream before throwing.
                    stream->free(stream);
                    throw Error("cannot read file '%s' from tarball", path);
                }
                if (n == 0) break;
                if (stream->write(stream, (const char *) buf.data(), n)) {
                    stream->free(stream);
                    throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message);
                }
            }

            git_oid oid;
            if (git_blob_create_from_stream_commit(&oid, stream))
                throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message);

            addToTree(*pathComponents.rbegin(), oid,
                archive_entry_mode(entry) & S_IXUSR
                ? GIT_FILEMODE_BLOB_EXECUTABLE
                : GIT_FILEMODE_BLOB);

            break;
        }

        case AE_IFLNK: {
            auto target = archive_entry_symlink(entry);
            // libarchive can return null here (e.g. malformed archive);
            // don't pass it to strlen().
            if (!target)
                throw Error("cannot get symlink target of '%s' from tarball", path);

            git_oid oid;
            if (git_blob_create_from_buffer(&oid, *this, target, strlen(target)))
                throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message);

            addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK);

            break;
        }

        default:
            throw Error("file '%s' in tarball has unsupported file type", path);
        }
    }

    // Close every directory still open, then write the root tree.
    updateBuilders({});

    auto [oid, _name] = popBuilder();

    return TarballInfo {
        .treeHash = toHash(oid),
        .lastModified = lastModified
    };
}
|
||||
|
||||
std::vector<std::tuple<Submodule, Hash>> getSubmodules(const Hash & rev) override;
|
||||
|
||||
std::string resolveSubmoduleUrl(
|
||||
|
@ -449,6 +605,22 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
|
|||
else
|
||||
throw Error("Commit signature verification on commit %s failed: %s", rev.gitRev(), output);
|
||||
}
|
||||
|
||||
Hash treeHashToNarHash(const Hash & treeHash) override
{
    /* Given a Git tree hash, compute the SHA-256 hash of its NAR
       serialisation. The result is memoised on disk keyed on the
       tree hash. */
    fetchers::Attrs cacheKey({{"_what", "treeHashToNarHash"}, {"treeHash", treeHash.gitRev()}});

    // Fast path: return the memoised NAR hash if we have one.
    if (auto res = fetchers::getCache()->lookup(cacheKey))
        return Hash::parseAny(fetchers::getStrAttr(*res, "narHash"), htSHA256);

    /* Cache miss: only now open an accessor and serialise the whole
       tree. (Previously the accessor was created before the lookup,
       paying its construction cost even on a hit.) */
    auto accessor = getAccessor(treeHash);

    auto narHash = accessor->hashPath(CanonPath::root);

    fetchers::getCache()->upsert(cacheKey, fetchers::Attrs({{"narHash", narHash.to_string(HashFormat::SRI, true)}}));

    return narHash;
}
|
||||
};
|
||||
|
||||
ref<GitRepo> GitRepo::openRepo(const CanonPath & path, bool create, bool bare)
|
||||
|
@ -673,5 +845,11 @@ std::vector<std::tuple<GitRepoImpl::Submodule, Hash>> GitRepoImpl::getSubmodules
|
|||
return result;
|
||||
}
|
||||
|
||||
ref<GitRepo> getTarballCache()
{
    /* Shared Git repository used as a content-addressed cache for
       imported tarballs, kept under the user's cache directory. */
    static CanonPath repoDir(getCacheDir() + "/nix/tarball-cache");

    // Open (or create on first use) the cache as a bare repository.
    constexpr bool create = true;
    constexpr bool bare = true;
    return make_ref<GitRepoImpl>(repoDir, create, bare);
}
|
||||
|
||||
}
|
||||
|
|
|
@ -69,6 +69,8 @@ struct GitRepo
|
|||
time_t lastModified;
|
||||
};
|
||||
|
||||
virtual TarballInfo importTarball(Source & source) = 0;
|
||||
|
||||
virtual bool hasObject(const Hash & oid) = 0;
|
||||
|
||||
virtual ref<InputAccessor> getAccessor(const Hash & rev) = 0;
|
||||
|
@ -85,6 +87,14 @@ struct GitRepo
|
|||
virtual void verifyCommit(
|
||||
const Hash & rev,
|
||||
const std::vector<fetchers::PublicKey> & publicKeys) = 0;
|
||||
|
||||
/**
|
||||
* Given a Git tree hash, compute the hash of its NAR
|
||||
* serialisation. This is memoised on-disk.
|
||||
*/
|
||||
virtual Hash treeHashToNarHash(const Hash & treeHash) = 0;
|
||||
};
|
||||
|
||||
ref<GitRepo> getTarballCache();
|
||||
|
||||
}
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include "fetchers.hh"
|
||||
#include "fetch-settings.hh"
|
||||
#include "tarball.hh"
|
||||
#include "git-utils.hh"
|
||||
|
||||
#include <optional>
|
||||
#include <nlohmann/json.hpp>
|
||||
|
@ -180,49 +181,87 @@ struct GitArchiveInputScheme : InputScheme
|
|||
return headers;
|
||||
}
|
||||
|
||||
virtual Hash getRevFromRef(nix::ref<Store> store, const Input & input) const = 0;
|
||||
struct RefInfo
|
||||
{
|
||||
Hash rev;
|
||||
std::optional<Hash> treeHash;
|
||||
};
|
||||
|
||||
virtual RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const = 0;
|
||||
|
||||
virtual DownloadUrl getDownloadUrl(const Input & input) const = 0;
|
||||
|
||||
std::pair<StorePath, Input> fetch(ref<Store> store, const Input & _input) override
|
||||
std::pair<Input, GitRepo::TarballInfo> downloadArchive(ref<Store> store, Input input) const
|
||||
{
|
||||
Input input(_input);
|
||||
|
||||
if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD");
|
||||
|
||||
std::optional<Hash> upstreamTreeHash;
|
||||
|
||||
auto rev = input.getRev();
|
||||
if (!rev) rev = getRevFromRef(store, input);
|
||||
if (!rev) {
|
||||
auto refInfo = getRevFromRef(store, input);
|
||||
rev = refInfo.rev;
|
||||
upstreamTreeHash = refInfo.treeHash;
|
||||
debug("HEAD revision for '%s' is %s", input.to_string(), refInfo.rev.gitRev());
|
||||
}
|
||||
|
||||
input.attrs.erase("ref");
|
||||
input.attrs.insert_or_assign("rev", rev->gitRev());
|
||||
|
||||
Attrs lockedAttrs({
|
||||
{"type", "git-tarball"},
|
||||
{"rev", rev->gitRev()},
|
||||
});
|
||||
auto cache = getCache();
|
||||
|
||||
if (auto res = getCache()->lookup(store, lockedAttrs)) {
|
||||
input.attrs.insert_or_assign("lastModified", getIntAttr(res->first, "lastModified"));
|
||||
return {std::move(res->second), input};
|
||||
Attrs treeHashKey{{"_what", "gitRevToTreeHash"}, {"rev", rev->gitRev()}};
|
||||
Attrs lastModifiedKey{{"_what", "gitRevToLastModified"}, {"rev", rev->gitRev()}};
|
||||
|
||||
if (auto treeHashAttrs = cache->lookup(treeHashKey)) {
|
||||
if (auto lastModifiedAttrs = cache->lookup(lastModifiedKey)) {
|
||||
auto treeHash = getRevAttr(*treeHashAttrs, "treeHash");
|
||||
auto lastModified = getIntAttr(*lastModifiedAttrs, "lastModified");
|
||||
if (getTarballCache()->hasObject(treeHash))
|
||||
return {std::move(input), GitRepo::TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }};
|
||||
else
|
||||
debug("Git tree with hash '%s' has disappeared from the cache, refetching...", treeHash.gitRev());
|
||||
}
|
||||
}
|
||||
|
||||
/* Stream the tarball into the tarball cache. */
|
||||
auto url = getDownloadUrl(input);
|
||||
|
||||
auto result = downloadTarball(store, url.url, input.getName(), true, url.headers);
|
||||
auto source = sinkToSource([&](Sink & sink) {
|
||||
FileTransferRequest req(url.url);
|
||||
req.headers = url.headers;
|
||||
getFileTransfer()->download(std::move(req), sink);
|
||||
});
|
||||
|
||||
input.attrs.insert_or_assign("lastModified", uint64_t(result.lastModified));
|
||||
auto tarballInfo = getTarballCache()->importTarball(*source);
|
||||
|
||||
getCache()->add(
|
||||
store,
|
||||
lockedAttrs,
|
||||
{
|
||||
{"rev", rev->gitRev()},
|
||||
{"lastModified", uint64_t(result.lastModified)}
|
||||
},
|
||||
result.storePath,
|
||||
true);
|
||||
cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}});
|
||||
cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}});
|
||||
|
||||
return {result.storePath, input};
|
||||
if (upstreamTreeHash != tarballInfo.treeHash)
|
||||
warn(
|
||||
"Git tree hash mismatch for revision '%s' of '%s': "
|
||||
"expected '%s', got '%s'. "
|
||||
"This can happen if the Git repository uses submodules.",
|
||||
rev->gitRev(), input.to_string(), upstreamTreeHash->gitRev(), tarballInfo.treeHash.gitRev());
|
||||
|
||||
return {std::move(input), tarballInfo};
|
||||
}
|
||||
|
||||
std::pair<ref<InputAccessor>, Input> getAccessor(ref<Store> store, const Input & _input) const override
{
    /* Fetch (or reuse from the tarball cache) the archive for this
       input, then expose it through an accessor onto the cached Git
       tree. */
    auto [input, tarballInfo] = downloadArchive(store, _input);

    // Lock the input to the imported tree.
    input.attrs.insert_or_assign("treeHash", tarballInfo.treeHash.gitRev());
    input.attrs.insert_or_assign("lastModified", uint64_t(tarballInfo.lastModified));

    auto accessor = getTarballCache()->getAccessor(tarballInfo.treeHash);
    accessor->setPathDisplay("«" + input.to_string() + "»");
    accessor->fingerprint = input.getFingerprint(store);

    return {accessor, input};
}
|
||||
|
||||
std::optional<ExperimentalFeature> experimentalFeature() const override
|
||||
|
@ -269,7 +308,7 @@ struct GitHubInputScheme : GitArchiveInputScheme
|
|||
return getStrAttr(input.attrs, "repo");
|
||||
}
|
||||
|
||||
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override
|
||||
RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
|
||||
{
|
||||
auto host = getHost(input);
|
||||
auto url = fmt(
|
||||
|
@ -284,9 +323,10 @@ struct GitHubInputScheme : GitArchiveInputScheme
|
|||
readFile(
|
||||
store->toRealPath(
|
||||
downloadFile(store, url, "source", false, headers).storePath)));
|
||||
auto rev = Hash::parseAny(std::string { json["sha"] }, htSHA1);
|
||||
debug("HEAD revision for '%s' is %s", url, rev.gitRev());
|
||||
return rev;
|
||||
return RefInfo {
|
||||
.rev = Hash::parseAny(std::string { json["sha"] }, htSHA1),
|
||||
.treeHash = Hash::parseAny(std::string { json["commit"]["tree"]["sha"] }, htSHA1)
|
||||
};
|
||||
}
|
||||
|
||||
DownloadUrl getDownloadUrl(const Input & input) const override
|
||||
|
@ -343,7 +383,7 @@ struct GitLabInputScheme : GitArchiveInputScheme
|
|||
return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1));
|
||||
}
|
||||
|
||||
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override
|
||||
RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
|
||||
{
|
||||
auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com");
|
||||
// See rate limiting note below
|
||||
|
@ -356,9 +396,9 @@ struct GitLabInputScheme : GitArchiveInputScheme
|
|||
readFile(
|
||||
store->toRealPath(
|
||||
downloadFile(store, url, "source", false, headers).storePath)));
|
||||
auto rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1);
|
||||
debug("HEAD revision for '%s' is %s", url, rev.gitRev());
|
||||
return rev;
|
||||
return RefInfo {
|
||||
.rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1)
|
||||
};
|
||||
}
|
||||
|
||||
DownloadUrl getDownloadUrl(const Input & input) const override
|
||||
|
@ -402,7 +442,7 @@ struct SourceHutInputScheme : GitArchiveInputScheme
|
|||
// Once it is implemented, however, should work as expected.
|
||||
}
|
||||
|
||||
Hash getRevFromRef(nix::ref<Store> store, const Input & input) const override
|
||||
RefInfo getRevFromRef(nix::ref<Store> store, const Input & input) const override
|
||||
{
|
||||
// TODO: In the future, when the sourcehut graphql API is implemented for mercurial
|
||||
// and with anonymous access, this method should use it instead.
|
||||
|
@ -445,12 +485,12 @@ struct SourceHutInputScheme : GitArchiveInputScheme
|
|||
id = parsedLine->target;
|
||||
}
|
||||
|
||||
if(!id)
|
||||
if (!id)
|
||||
throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref);
|
||||
|
||||
auto rev = Hash::parseAny(*id, htSHA1);
|
||||
debug("HEAD revision for '%s' is %s", fmt("%s/%s", base_url, ref), rev.gitRev());
|
||||
return rev;
|
||||
return RefInfo {
|
||||
.rev = Hash::parseAny(*id, htSHA1)
|
||||
};
|
||||
}
|
||||
|
||||
DownloadUrl getDownloadUrl(const Input & input) const override
|
||||
|
|
Loading…
Reference in a new issue