From b36857ac8d60cbf9a78c3c69f6370d38a14facbc Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 29 Nov 2023 12:35:08 +0100 Subject: [PATCH 1/6] Add a Git-based content-addressed tarball cache GitArchiveInputScheme now streams tarballs into a Git repository. This deduplicates data a lot, e.g. when you're fetching different revisions of the Nixpkgs repo. It also warns if the tree hash returned by GitHub doesn't match the tree hash of the imported tarball. --- src/libfetchers/attrs.cc | 5 + src/libfetchers/attrs.hh | 2 + src/libfetchers/git-utils.cc | 178 +++++++++++++++++++++++++++++++++++ src/libfetchers/git-utils.hh | 10 ++ src/libfetchers/github.cc | 114 ++++++++++++++-------- 5 files changed, 272 insertions(+), 37 deletions(-) diff --git a/src/libfetchers/attrs.cc b/src/libfetchers/attrs.cc index a565d19d4..e3fa1d26a 100644 --- a/src/libfetchers/attrs.cc +++ b/src/libfetchers/attrs.cc @@ -104,4 +104,9 @@ std::map attrsToQuery(const Attrs & attrs) return query; } +Hash getRevAttr(const Attrs & attrs, const std::string & name) +{ + return Hash::parseAny(getStrAttr(attrs, name), htSHA1); +} + } diff --git a/src/libfetchers/attrs.hh b/src/libfetchers/attrs.hh index b9a2c824e..97a74bce0 100644 --- a/src/libfetchers/attrs.hh +++ b/src/libfetchers/attrs.hh @@ -39,4 +39,6 @@ bool getBoolAttr(const Attrs & attrs, const std::string & name); std::map attrsToQuery(const Attrs & attrs); +Hash getRevAttr(const Attrs & attrs, const std::string & name); + } diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 19eae0e1d..abad42c29 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -4,6 +4,7 @@ #include "finally.hh" #include "processes.hh" #include "signals.hh" +#include "users.hh" #include @@ -21,6 +22,9 @@ #include #include +#include "tarfile.hh" +#include + #include #include #include @@ -307,6 +311,158 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this return std::nullopt; } + TarballInfo importTarball(Source & source) override + { + TarArchive archive(source); + + struct PendingDir + { + std::string name; + TreeBuilder builder; + }; + + std::vector pendingDirs; + + auto pushBuilder = [&](std::string name) + { + git_treebuilder * b; + if (git_treebuilder_new(&b, *this, nullptr)) + throw Error("creating a tree builder: %s", git_error_last()->message); + pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) }); + }; + + auto popBuilder = [&]() -> std::pair + { + assert(!pendingDirs.empty()); + auto pending = std::move(pendingDirs.back()); + git_oid oid; + if (git_treebuilder_write(&oid, pending.builder.get())) + throw Error("creating a tree object: %s", git_error_last()->message); + pendingDirs.pop_back(); + return {oid, pending.name}; + }; + + auto addToTree = [&](const std::string & name, const git_oid & oid, git_filemode_t mode) + { + assert(!pendingDirs.empty()); + auto & pending = pendingDirs.back(); + if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode)) + throw Error("adding a file to a tree builder: %s", git_error_last()->message); + }; + + auto updateBuilders = [&](boost::span names) + { + // Find the common prefix of pendingDirs and names. + size_t prefixLen = 0; + for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen) + if (names[prefixLen] != pendingDirs[prefixLen + 1].name) + break; + + // Finish the builders that are not part of the common prefix. + for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) { + auto [oid, name] = popBuilder(); + addToTree(name, oid, GIT_FILEMODE_TREE); + } + + // Create builders for the new directories. + for (auto n = prefixLen; n < names.size(); ++n) + pushBuilder(names[n]); + }; + + pushBuilder(""); + + size_t componentsToStrip = 1; + + time_t lastModified = 0; + + for (;;) { + // FIXME: merge with extract_archive + struct archive_entry * entry; + int r = archive_read_next_header(archive.archive, &entry); + if (r == ARCHIVE_EOF) break; + auto path = archive_entry_pathname(entry); + if (!path) + throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); + if (r == ARCHIVE_WARN) + warn(archive_error_string(archive.archive)); + else + archive.check(r); + + lastModified = std::max(lastModified, archive_entry_mtime(entry)); + + auto pathComponents = tokenizeString>(path, "/"); + + boost::span pathComponents2{pathComponents}; + + if (pathComponents2.size() <= componentsToStrip) continue; + pathComponents2 = pathComponents2.subspan(componentsToStrip); + + updateBuilders( + archive_entry_filetype(entry) == AE_IFDIR + ? pathComponents2 + : pathComponents2.first(pathComponents2.size() - 1)); + + switch (archive_entry_filetype(entry)) { + + case AE_IFDIR: + // Nothing to do right now. + break; + + case AE_IFREG: { + + git_writestream * stream = nullptr; + if (git_blob_create_from_stream(&stream, *this, nullptr)) + throw Error("creating a blob stream object: %s", git_error_last()->message); + + while (true) { + std::vector buf(128 * 1024); + auto n = archive_read_data(archive.archive, buf.data(), buf.size()); + if (n < 0) + throw Error("cannot read file '%s' from tarball", path); + if (n == 0) break; + if (stream->write(stream, (const char *) buf.data(), n)) + throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); + } + + git_oid oid; + if (git_blob_create_from_stream_commit(&oid, stream)) + throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, + archive_entry_mode(entry) & S_IXUSR + ? GIT_FILEMODE_BLOB_EXECUTABLE + : GIT_FILEMODE_BLOB); + + break; + } + + case AE_IFLNK: { + auto target = archive_entry_symlink(entry); + + git_oid oid; + if (git_blob_create_from_buffer(&oid, *this, target, strlen(target))) + throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK); + + break; + } + + default: + throw Error("file '%s' in tarball has unsupported file type", path); + } + } + + updateBuilders({}); + + auto [oid, _name] = popBuilder(); + + return TarballInfo { + .treeHash = toHash(oid), + .lastModified = lastModified + }; + } + std::vector> getSubmodules(const Hash & rev) override; std::string resolveSubmoduleUrl( @@ -449,6 +605,22 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this else throw Error("Commit signature verification on commit %s failed: %s", rev.gitRev(), output); } + + Hash treeHashToNarHash(const Hash & treeHash) override + { + auto accessor = getAccessor(treeHash); + + fetchers::Attrs cacheKey({{"_what", "treeHashToNarHash"}, {"treeHash", treeHash.gitRev()}}); + + if (auto res = fetchers::getCache()->lookup(cacheKey)) + return Hash::parseAny(fetchers::getStrAttr(*res, "narHash"), htSHA256); + + auto narHash = accessor->hashPath(CanonPath::root); + + fetchers::getCache()->upsert(cacheKey, fetchers::Attrs({{"narHash", narHash.to_string(HashFormat::SRI, true)}})); + + return narHash; + } }; ref GitRepo::openRepo(const CanonPath & path, bool create, bool bare) @@ -673,5 +845,11 @@ std::vector> GitRepoImpl::getSubmodules return result; } +ref getTarballCache() +{ + static CanonPath repoDir(getCacheDir() + "/nix/tarball-cache"); + + return make_ref(repoDir, true, true); +} } diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index 1def82071..b8b31530a 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -69,6 +69,8 @@ struct GitRepo time_t lastModified; }; + virtual TarballInfo importTarball(Source & source) = 0; + virtual bool hasObject(const Hash & oid) = 0; virtual ref getAccessor(const Hash & rev) = 0; @@ -85,6 +87,14 @@ struct GitRepo virtual void verifyCommit( const Hash & rev, const std::vector & publicKeys) = 0; + + /** + * Given a Git tree hash, compute the hash of its NAR + * serialisation. This is memoised on-disk. + */ + virtual Hash treeHashToNarHash(const Hash & treeHash) = 0; }; +ref getTarballCache(); + } diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index 661ad4884..877f6378b 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -8,6 +8,7 @@ #include "fetchers.hh" #include "fetch-settings.hh" #include "tarball.hh" +#include "git-utils.hh" #include #include @@ -180,49 +181,87 @@ struct GitArchiveInputScheme : InputScheme return headers; } - virtual Hash getRevFromRef(nix::ref store, const Input & input) const = 0; + struct RefInfo + { + Hash rev; + std::optional treeHash; + }; + + virtual RefInfo getRevFromRef(nix::ref store, const Input & input) const = 0; virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; - std::pair fetch(ref store, const Input & _input) override + std::pair downloadArchive(ref store, Input input) const { - Input input(_input); - if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); + std::optional upstreamTreeHash; + auto rev = input.getRev(); - if (!rev) rev = getRevFromRef(store, input); + if (!rev) { + auto refInfo = getRevFromRef(store, input); + rev = refInfo.rev; + upstreamTreeHash = refInfo.treeHash; + debug("HEAD revision for '%s' is %s", input.to_string(), refInfo.rev.gitRev()); + } input.attrs.erase("ref"); input.attrs.insert_or_assign("rev", rev->gitRev()); - Attrs lockedAttrs({ - {"type", "git-tarball"}, - {"rev", rev->gitRev()}, - }); + auto cache = getCache(); - if (auto res = getCache()->lookup(store, lockedAttrs)) { - input.attrs.insert_or_assign("lastModified", getIntAttr(res->first, "lastModified")); - return {std::move(res->second), input}; + Attrs treeHashKey{{"_what", "gitRevToTreeHash"}, {"rev", rev->gitRev()}}; + Attrs lastModifiedKey{{"_what", "gitRevToLastModified"}, {"rev", rev->gitRev()}}; + + if (auto treeHashAttrs = cache->lookup(treeHashKey)) { + if (auto lastModifiedAttrs = cache->lookup(lastModifiedKey)) { + auto treeHash = getRevAttr(*treeHashAttrs, "treeHash"); + auto lastModified = getIntAttr(*lastModifiedAttrs, "lastModified"); + if (getTarballCache()->hasObject(treeHash)) + return {std::move(input), GitRepo::TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }}; + else + debug("Git tree with hash '%s' has disappeared from the cache, refetching...", treeHash.gitRev()); + } } + /* Stream the tarball into the tarball cache. */ auto url = getDownloadUrl(input); - auto result = downloadTarball(store, url.url, input.getName(), true, url.headers); + auto source = sinkToSource([&](Sink & sink) { + FileTransferRequest req(url.url); + req.headers = url.headers; + getFileTransfer()->download(std::move(req), sink); + }); - input.attrs.insert_or_assign("lastModified", uint64_t(result.lastModified)); + auto tarballInfo = getTarballCache()->importTarball(*source); - getCache()->add( - store, - lockedAttrs, - { - {"rev", rev->gitRev()}, - {"lastModified", uint64_t(result.lastModified)} - }, - result.storePath, - true); + cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}}); + cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}}); - return {result.storePath, input}; + if (upstreamTreeHash != tarballInfo.treeHash) + warn( + "Git tree hash mismatch for revision '%s' of '%s': " + "expected '%s', got '%s'. " + "This can happen if the Git repository uses submodules.", + rev->gitRev(), input.to_string(), upstreamTreeHash->gitRev(), tarballInfo.treeHash.gitRev()); + + return {std::move(input), tarballInfo}; + } + + std::pair, Input> getAccessor(ref store, const Input & _input) const override + { + auto [input, tarballInfo] = downloadArchive(store, _input); + + input.attrs.insert_or_assign("treeHash", tarballInfo.treeHash.gitRev()); + input.attrs.insert_or_assign("lastModified", uint64_t(tarballInfo.lastModified)); + + auto accessor = getTarballCache()->getAccessor(tarballInfo.treeHash); + + accessor->setPathDisplay("«" + input.to_string() + "»"); + + accessor->fingerprint = input.getFingerprint(store); + + return {accessor, input}; } std::optional experimentalFeature() const override @@ -269,7 +308,7 @@ struct GitHubInputScheme : GitArchiveInputScheme return getStrAttr(input.attrs, "repo"); } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { auto host = getHost(input); auto url = fmt( @@ -284,9 +323,10 @@ struct GitHubInputScheme : GitArchiveInputScheme readFile( store->toRealPath( downloadFile(store, url, "source", false, headers).storePath))); - auto rev = Hash::parseAny(std::string { json["sha"] }, htSHA1); - debug("HEAD revision for '%s' is %s", url, rev.gitRev()); - return rev; + return RefInfo { + .rev = Hash::parseAny(std::string { json["sha"] }, htSHA1), + .treeHash = Hash::parseAny(std::string { json["commit"]["tree"]["sha"] }, htSHA1) + }; } DownloadUrl getDownloadUrl(const Input & input) const override @@ -343,7 +383,7 @@ struct GitLabInputScheme : GitArchiveInputScheme return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1)); } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com"); // See rate limiting note below @@ -356,9 +396,9 @@ struct GitLabInputScheme : GitArchiveInputScheme readFile( store->toRealPath( downloadFile(store, url, "source", false, headers).storePath))); - auto rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1); - debug("HEAD revision for '%s' is %s", url, rev.gitRev()); - return rev; + return RefInfo { + .rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1) + }; } DownloadUrl getDownloadUrl(const Input & input) const override @@ -402,7 +442,7 @@ struct SourceHutInputScheme : GitArchiveInputScheme // Once it is implemented, however, should work as expected. } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { // TODO: In the future, when the sourcehut graphql API is implemented for mercurial // and with anonymous access, this method should use it instead. @@ -445,12 +485,12 @@ struct SourceHutInputScheme : GitArchiveInputScheme id = parsedLine->target; } - if(!id) + if (!id) throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref); - auto rev = Hash::parseAny(*id, htSHA1); - debug("HEAD revision for '%s' is %s", fmt("%s/%s", base_url, ref), rev.gitRev()); - return rev; + return RefInfo { + .rev = Hash::parseAny(*id, htSHA1) + }; } DownloadUrl getDownloadUrl(const Input & input) const override From 043413bb597760eefb983395a10141643db9ee8c Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 29 Nov 2023 12:38:46 +0100 Subject: [PATCH 2/6] boost::span -> std::span --- src/libfetchers/git-utils.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index abad42c29..2324fd9ee 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -6,8 +6,6 @@ #include "signals.hh" #include "users.hh" -#include - #include #include #include @@ -28,6 +26,7 @@ #include #include #include +#include namespace std { @@ -350,7 +349,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this throw Error("adding a file to a tree builder: %s", git_error_last()->message); }; - auto updateBuilders = [&](boost::span names) + auto updateBuilders = [&](std::span names) { // Find the common prefix of pendingDirs and names. size_t prefixLen = 0; @@ -392,7 +391,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this auto pathComponents = tokenizeString>(path, "/"); - boost::span pathComponents2{pathComponents}; + std::span pathComponents2{pathComponents}; if (pathComponents2.size() <= componentsToStrip) continue; pathComponents2 = pathComponents2.subspan(componentsToStrip); From 06e106beff4fe9922d1e5debe7a16daec26c398d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 14 Dec 2023 13:38:10 +0100 Subject: [PATCH 3/6] Disable GitHub tree hash mismatch warning --- src/libfetchers/github.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index d07aa3cea..0f30723cf 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -238,12 +238,14 @@ struct GitArchiveInputScheme : InputScheme cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}}); cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}}); + #if 0 if (upstreamTreeHash != tarballInfo.treeHash) warn( "Git tree hash mismatch for revision '%s' of '%s': " "expected '%s', got '%s'. " "This can happen if the Git repository uses submodules.", rev->gitRev(), input.to_string(), upstreamTreeHash->gitRev(), tarballInfo.treeHash.gitRev()); + #endif return {std::move(input), tarballInfo}; } From ba6a5f06eeaeb2a81f4e6871b8ef19927987409e Mon Sep 17 00:00:00 2001 From: John Ericson Date: Thu, 21 Dec 2023 03:49:52 -0500 Subject: [PATCH 4/6] Split `GitRepoImpl::importTarball` There is now a separation of: 1. A `FileSystemObjectSink` for writing to git repos 2. Adapting libarchive to use that parse sink. The prepares a proper separation of concerns. --- src/libfetchers/git-utils.cc | 388 +++++++++++++++++++++-------------- src/libfetchers/git-utils.hh | 15 +- src/libutil/fs-sink.hh | 2 + 3 files changed, 249 insertions(+), 156 deletions(-) diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 51631e769..980a5a4d7 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -7,6 +7,7 @@ #include "processes.hh" #include "signals.hh" #include "users.hh" +#include "fs-sink.hh" #include #include @@ -23,9 +24,6 @@ #include #include -#include "tarfile.hh" -#include - #include #include #include @@ -317,157 +315,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this return std::nullopt; } - TarballInfo importTarball(Source & source) override - { - TarArchive archive(source); - - struct PendingDir - { - std::string name; - TreeBuilder builder; - }; - - std::vector pendingDirs; - - auto pushBuilder = [&](std::string name) - { - git_treebuilder * b; - if (git_treebuilder_new(&b, *this, nullptr)) - throw Error("creating a tree builder: %s", git_error_last()->message); - pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) }); - }; - - auto popBuilder = [&]() -> std::pair - { - assert(!pendingDirs.empty()); - auto pending = std::move(pendingDirs.back()); - git_oid oid; - if (git_treebuilder_write(&oid, pending.builder.get())) - throw Error("creating a tree object: %s", git_error_last()->message); - pendingDirs.pop_back(); - return {oid, pending.name}; - }; - - auto addToTree = [&](const std::string & name, const git_oid & oid, git_filemode_t mode) - { - assert(!pendingDirs.empty()); - auto & pending = pendingDirs.back(); - if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode)) - throw Error("adding a file to a tree builder: %s", git_error_last()->message); - }; - - auto updateBuilders = [&](std::span names) - { - // Find the common prefix of pendingDirs and names. - size_t prefixLen = 0; - for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen) - if (names[prefixLen] != pendingDirs[prefixLen + 1].name) - break; - - // Finish the builders that are not part of the common prefix. - for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) { - auto [oid, name] = popBuilder(); - addToTree(name, oid, GIT_FILEMODE_TREE); - } - - // Create builders for the new directories. - for (auto n = prefixLen; n < names.size(); ++n) - pushBuilder(names[n]); - }; - - pushBuilder(""); - - size_t componentsToStrip = 1; - - time_t lastModified = 0; - - for (;;) { - // FIXME: merge with extract_archive - struct archive_entry * entry; - int r = archive_read_next_header(archive.archive, &entry); - if (r == ARCHIVE_EOF) break; - auto path = archive_entry_pathname(entry); - if (!path) - throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); - if (r == ARCHIVE_WARN) - warn(archive_error_string(archive.archive)); - else - archive.check(r); - - lastModified = std::max(lastModified, archive_entry_mtime(entry)); - - auto pathComponents = tokenizeString>(path, "/"); - - std::span pathComponents2{pathComponents}; - - if (pathComponents2.size() <= componentsToStrip) continue; - pathComponents2 = pathComponents2.subspan(componentsToStrip); - - updateBuilders( - archive_entry_filetype(entry) == AE_IFDIR - ? pathComponents2 - : pathComponents2.first(pathComponents2.size() - 1)); - - switch (archive_entry_filetype(entry)) { - - case AE_IFDIR: - // Nothing to do right now. - break; - - case AE_IFREG: { - - git_writestream * stream = nullptr; - if (git_blob_create_from_stream(&stream, *this, nullptr)) - throw Error("creating a blob stream object: %s", git_error_last()->message); - - while (true) { - std::vector buf(128 * 1024); - auto n = archive_read_data(archive.archive, buf.data(), buf.size()); - if (n < 0) - throw Error("cannot read file '%s' from tarball", path); - if (n == 0) break; - if (stream->write(stream, (const char *) buf.data(), n)) - throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); - } - - git_oid oid; - if (git_blob_create_from_stream_commit(&oid, stream)) - throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); - - addToTree(*pathComponents.rbegin(), oid, - archive_entry_mode(entry) & S_IXUSR - ? GIT_FILEMODE_BLOB_EXECUTABLE - : GIT_FILEMODE_BLOB); - - break; - } - - case AE_IFLNK: { - auto target = archive_entry_symlink(entry); - - git_oid oid; - if (git_blob_create_from_buffer(&oid, *this, target, strlen(target))) - throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message); - - addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK); - - break; - } - - default: - throw Error("file '%s' in tarball has unsupported file type", path); - } - } - - updateBuilders({}); - - auto [oid, _name] = popBuilder(); - - return TarballInfo { - .treeHash = toHash(oid), - .lastModified = lastModified - }; - } + TarballInfo importTarball(Source & source) override; std::vector> getSubmodules(const Hash & rev, bool exportIgnore) override; @@ -511,6 +359,8 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError e) override; + ref getFileSystemObjectSink() override; + static int sidebandProgressCallback(const char * str, int len, void * payload) { auto act = (Activity *) payload; @@ -884,6 +734,154 @@ struct GitExportIgnoreInputAccessor : CachingFilteringInputAccessor { }; +struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink +{ + ref repo; + + struct PendingDir + { + std::string name; + TreeBuilder builder; + }; + + std::vector pendingDirs; + + size_t componentsToStrip = 1; + + void pushBuilder(std::string name) + { + git_treebuilder * b; + if (git_treebuilder_new(&b, *repo, nullptr)) + throw Error("creating a tree builder: %s", git_error_last()->message); + pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) }); + }; + + GitFileSystemObjectSinkImpl(ref repo) : repo(repo) + { + pushBuilder(""); + } + + std::pair popBuilder() + { + assert(!pendingDirs.empty()); + auto pending = std::move(pendingDirs.back()); + git_oid oid; + if (git_treebuilder_write(&oid, pending.builder.get())) + throw Error("creating a tree object: %s", git_error_last()->message); + pendingDirs.pop_back(); + return {oid, pending.name}; + }; + + void addToTree(const std::string & name, const git_oid & oid, git_filemode_t mode) + { + assert(!pendingDirs.empty()); + auto & pending = pendingDirs.back(); + if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode)) + throw Error("adding a file to a tree builder: %s", git_error_last()->message); + }; + + void updateBuilders(std::span names) + { + // Find the common prefix of pendingDirs and names. + size_t prefixLen = 0; + for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen) + if (names[prefixLen] != pendingDirs[prefixLen + 1].name) + break; + + // Finish the builders that are not part of the common prefix. + for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) { + auto [oid, name] = popBuilder(); + addToTree(name, oid, GIT_FILEMODE_TREE); + } + + // Create builders for the new directories. + for (auto n = prefixLen; n < names.size(); ++n) + pushBuilder(names[n]); + }; + + bool prepareDirs(const std::vector & pathComponents, bool isDir) + { + std::span pathComponents2{pathComponents}; + + if (pathComponents2.size() <= componentsToStrip) return false; + pathComponents2 = pathComponents2.subspan(componentsToStrip); + + updateBuilders( + isDir + ? pathComponents2 + : pathComponents2.first(pathComponents2.size() - 1)); + + return true; + } + + void createRegularFile( + const Path & path, + std::function func) override + { + auto pathComponents = tokenizeString>(path, "/"); + if (!prepareDirs(pathComponents, false)) return; + + git_writestream * stream = nullptr; + if (git_blob_create_from_stream(&stream, *repo, nullptr)) + throw Error("creating a blob stream object: %s", git_error_last()->message); + + struct CRF : CreateRegularFileSink { + const Path & path; + GitFileSystemObjectSinkImpl & back; + git_writestream * stream; + bool executable = false; + CRF(const Path & path, GitFileSystemObjectSinkImpl & back, git_writestream * stream) + : path(path), back(back), stream(stream) + {} + void operator () (std::string_view data) override + { + if (stream->write(stream, data.data(), data.size())) + throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); + } + void isExecutable() override + { + executable = true; + } + } crf { path, *this, stream }; + func(crf); + + git_oid oid; + if (git_blob_create_from_stream_commit(&oid, stream)) + throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, + crf.executable + ? GIT_FILEMODE_BLOB_EXECUTABLE + : GIT_FILEMODE_BLOB); + } + + void createDirectory(const Path & path) override + { + auto pathComponents = tokenizeString>(path, "/"); + (void) prepareDirs(pathComponents, true); + } + + void createSymlink(const Path & path, const std::string & target) override + { + auto pathComponents = tokenizeString>(path, "/"); + if (!prepareDirs(pathComponents, false)) return; + + git_oid oid; + if (git_blob_create_from_buffer(&oid, *repo, target.c_str(), target.size())) + throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK); + } + + Hash sync() override { + updateBuilders({}); + + auto [oid, _name] = popBuilder(); + + return toHash(oid); + } +}; + ref GitRepoImpl::getRawAccessor(const Hash & rev) { auto self = ref(shared_from_this()); @@ -918,6 +916,11 @@ ref GitRepoImpl::getAccessor(const WorkdirInfo & wd, bool exportI } } +ref GitRepoImpl::getFileSystemObjectSink() +{ + return make_ref(ref(shared_from_this())); +} + std::vector> GitRepoImpl::getSubmodules(const Hash & rev, bool exportIgnore) { /* Read the .gitmodules files from this revision. */ @@ -951,4 +954,81 @@ ref getTarballCache() return make_ref(repoDir, true, true); } +} + +#include "tarfile.hh" +#include + +namespace nix { + +GitRepo::TarballInfo GitRepoImpl::importTarball(Source & source) +{ + TarArchive archive { source }; + + auto parseSink = getFileSystemObjectSink(); + + time_t lastModified = 0; + + for (;;) { + // FIXME: merge with extract_archive + struct archive_entry * entry; + int r = archive_read_next_header(archive.archive, &entry); + if (r == ARCHIVE_EOF) break; + auto path = archive_entry_pathname(entry); + if (!path) + throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); + if (r == ARCHIVE_WARN) + warn(archive_error_string(archive.archive)); + else + archive.check(r); + + lastModified = std::max(lastModified, archive_entry_mtime(entry)); + + switch (archive_entry_filetype(entry)) { + + case AE_IFDIR: + parseSink->createDirectory(path); + break; + + case AE_IFREG: { + parseSink->createRegularFile(path, [&](auto & crf) { + if (archive_entry_mode(entry) & S_IXUSR) + crf.isExecutable(); + + while (true) { + std::vector buf(128 * 1024); + auto n = archive_read_data(archive.archive, buf.data(), buf.size()); + if (n < 0) + throw Error("cannot read file '%s' from tarball", path); + if (n == 0) break; + crf(std::string_view { + (const char *) buf.data(), + (size_t) n, + }); + } + }); + + break; + } + + case AE_IFLNK: { + auto target = archive_entry_symlink(entry); + + parseSink->createSymlink(path, target); + + break; + } + + default: + throw Error("file '%s' in tarball has unsupported file type", path); + } + } + + return TarballInfo { + .treeHash = parseSink->sync(), + .lastModified = lastModified + }; +} + + } diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index b54559def..f82f62fc8 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -2,11 +2,20 @@ #include "filtering-input-accessor.hh" #include "input-accessor.hh" +#include "fs-sink.hh" namespace nix { namespace fetchers { struct PublicKey; } +struct GitFileSystemObjectSink : FileSystemObjectSink +{ + /** + * Flush builder and return a final Git hash. + */ + virtual Hash sync() = 0; +}; + struct GitRepo { virtual ~GitRepo() @@ -70,14 +79,14 @@ struct GitRepo time_t lastModified; }; - virtual TarballInfo importTarball(Source & source) = 0; - virtual bool hasObject(const Hash & oid) = 0; virtual ref getAccessor(const Hash & rev, bool exportIgnore) = 0; virtual ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError makeNotAllowedError) = 0; + virtual ref getFileSystemObjectSink() = 0; + virtual void fetch( const std::string & url, const std::string & refspec, @@ -90,6 +99,8 @@ struct GitRepo virtual void verifyCommit( const Hash & rev, const std::vector & publicKeys) = 0; + + virtual TarballInfo importTarball(Source & source) = 0; }; ref getTarballCache(); diff --git a/src/libutil/fs-sink.hh b/src/libutil/fs-sink.hh index 4dfb5b329..ae577819a 100644 --- a/src/libutil/fs-sink.hh +++ b/src/libutil/fs-sink.hh @@ -26,6 +26,8 @@ struct CreateRegularFileSink : Sink struct FileSystemObjectSink { + virtual ~FileSystemObjectSink() = default; + virtual void createDirectory(const Path & path) = 0; /** From ed24baaec4f3825ce538d1894ced63bfc82db7c8 Mon Sep 17 00:00:00 2001 From: John Ericson Date: Thu, 21 Dec 2023 04:28:06 -0500 Subject: [PATCH 5/6] Finish separating concerns with tarball cache There is no longer an `importTarball` method. Instead, there is a `unpackTarfileToSink` function (back in libutil). The caller can use thisw with the `getParseSink` method we added in the last commit easily enough. In addition, tarball cache functionality is separated from `git-utils` and moved into `tarball-cache`. This ensures we are separating mechanism and policy. --- src/libfetchers/git-utils.cc | 86 -------------------------------- src/libfetchers/git-utils.hh | 10 ---- src/libfetchers/github.cc | 15 ++++-- src/libfetchers/tarball-cache.cc | 13 +++++ src/libfetchers/tarball-cache.hh | 17 +++++++ src/libutil/tarfile.cc | 62 +++++++++++++++++++++++ src/libutil/tarfile.hh | 3 ++ 7 files changed, 107 insertions(+), 99 deletions(-) create mode 100644 src/libfetchers/tarball-cache.cc create mode 100644 src/libfetchers/tarball-cache.hh diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 980a5a4d7..42bf42de6 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -315,8 +315,6 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this return std::nullopt; } - TarballInfo importTarball(Source & source) override; - std::vector> getSubmodules(const Hash & rev, bool exportIgnore) override; std::string resolveSubmoduleUrl( @@ -947,88 +945,4 @@ std::vector> GitRepoImpl::getSubmodules return result; } -ref getTarballCache() -{ - static auto repoDir = std::filesystem::path(getCacheDir()) / "nix" / "tarball-cache"; - - return make_ref(repoDir, true, true); -} - -} - -#include "tarfile.hh" -#include - -namespace nix { - -GitRepo::TarballInfo GitRepoImpl::importTarball(Source & source) -{ - TarArchive archive { source }; - - auto parseSink = getFileSystemObjectSink(); - - time_t lastModified = 0; - - for (;;) { - // FIXME: merge with extract_archive - struct archive_entry * entry; - int r = archive_read_next_header(archive.archive, &entry); - if (r == ARCHIVE_EOF) break; - auto path = archive_entry_pathname(entry); - if (!path) - throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); - if (r == ARCHIVE_WARN) - warn(archive_error_string(archive.archive)); - else - archive.check(r); - - lastModified = std::max(lastModified, archive_entry_mtime(entry)); - - switch (archive_entry_filetype(entry)) { - - case AE_IFDIR: - parseSink->createDirectory(path); - break; - - case AE_IFREG: { - parseSink->createRegularFile(path, [&](auto & crf) { - if (archive_entry_mode(entry) & S_IXUSR) - crf.isExecutable(); - - while (true) { - std::vector buf(128 * 1024); - auto n = archive_read_data(archive.archive, buf.data(), buf.size()); - if (n < 0) - throw Error("cannot read file '%s' from tarball", path); - if (n == 0) break; - crf(std::string_view { - (const char *) buf.data(), - (size_t) n, - }); - } - }); - - break; - } - - case AE_IFLNK: { - auto target = archive_entry_symlink(entry); - - parseSink->createSymlink(path, target); - - break; - } - - default: - throw Error("file '%s' in tarball has unsupported file type", path); - } - } - - return TarballInfo { - .treeHash = parseSink->sync(), - .lastModified = lastModified - }; -} - - } diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index f82f62fc8..029d39741 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -73,12 +73,6 @@ struct GitRepo const std::string & url, const std::string & base) = 0; - struct TarballInfo - { - Hash treeHash; - time_t lastModified; - }; - virtual bool hasObject(const Hash & oid) = 0; virtual ref getAccessor(const Hash & rev, bool exportIgnore) = 0; @@ -99,10 +93,6 @@ struct GitRepo virtual void verifyCommit( const Hash & rev, const std::vector & publicKeys) = 0; - - virtual TarballInfo importTarball(Source & source) = 0; }; -ref getTarballCache(); - } diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index 1cfc142a5..8b3e6ff20 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -8,7 +8,9 @@ #include "fetchers.hh" #include "fetch-settings.hh" #include "tarball.hh" +#include "tarfile.hh" #include "git-utils.hh" +#include "tarball-cache.hh" #include #include @@ -191,7 +193,7 @@ struct GitArchiveInputScheme : InputScheme virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; - std::pair downloadArchive(ref store, Input input) const + std::pair downloadArchive(ref store, Input input) const { if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); @@ -218,7 +220,7 @@ struct GitArchiveInputScheme : InputScheme auto treeHash = getRevAttr(*treeHashAttrs, "treeHash"); auto lastModified = getIntAttr(*lastModifiedAttrs, "lastModified"); if (getTarballCache()->hasObject(treeHash)) - return {std::move(input), GitRepo::TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }}; + return {std::move(input), TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }}; else debug("Git tree with hash '%s' has disappeared from the cache, refetching...", treeHash.gitRev()); } @@ -233,7 +235,14 @@ struct GitArchiveInputScheme : InputScheme getFileTransfer()->download(std::move(req), sink); }); - auto tarballInfo = getTarballCache()->importTarball(*source); + TarArchive archive { *source }; + auto parseSink = getTarballCache()->getFileSystemObjectSink(); + auto lastModified = unpackTarfileToSink(archive, *parseSink); + + TarballInfo tarballInfo { + .treeHash = parseSink->sync(), + .lastModified = lastModified + }; cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}}); cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}}); diff --git a/src/libfetchers/tarball-cache.cc b/src/libfetchers/tarball-cache.cc new file mode 100644 index 000000000..bb2c51973 --- /dev/null +++ b/src/libfetchers/tarball-cache.cc @@ -0,0 +1,13 @@ +#include "tarball-cache.hh" +#include "users.hh" + +namespace nix::fetchers { + +ref getTarballCache() +{ + static auto repoDir = std::filesystem::path(getCacheDir()) / "nix" / "tarball-cache"; + + return GitRepo::openRepo(repoDir, true, true); +} + +} diff --git a/src/libfetchers/tarball-cache.hh b/src/libfetchers/tarball-cache.hh new file mode 100644 index 000000000..e1517038b --- /dev/null +++ b/src/libfetchers/tarball-cache.hh @@ -0,0 +1,17 @@ +#pragma once +///@file + +#include "ref.hh" +#include "git-utils.hh" + +namespace nix::fetchers { + +struct TarballInfo +{ + Hash treeHash; + time_t lastModified; +}; + +ref getTarballCache(); + +} diff --git a/src/libutil/tarfile.cc b/src/libutil/tarfile.cc index 187b3e948..3bb6694f8 100644 --- a/src/libutil/tarfile.cc +++ b/src/libutil/tarfile.cc @@ -132,4 +132,66 @@ void unpackTarfile(const Path & tarFile, const Path & destDir) extract_archive(archive, destDir); } +time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink) +{ + time_t lastModified = 0; + + for (;;) { + // FIXME: merge with extract_archive + struct archive_entry * entry; + int r = archive_read_next_header(archive.archive, &entry); + if (r == ARCHIVE_EOF) break; + auto path = archive_entry_pathname(entry); + if (!path) + throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); + if (r == ARCHIVE_WARN) + warn(archive_error_string(archive.archive)); + else + archive.check(r); + + lastModified = std::max(lastModified, archive_entry_mtime(entry)); + + switch (archive_entry_filetype(entry)) { + + case AE_IFDIR: + parseSink.createDirectory(path); + break; + + case AE_IFREG: { + parseSink.createRegularFile(path, [&](auto & crf) { + if (archive_entry_mode(entry) & S_IXUSR) + crf.isExecutable(); + + while (true) { + std::vector buf(128 * 1024); + auto n = archive_read_data(archive.archive, buf.data(), buf.size()); + if (n < 0) + throw Error("cannot read file '%s' from tarball", path); + if (n == 0) break; + crf(std::string_view { + (const char *) buf.data(), + (size_t) n, + }); + } + }); + + break; + } + + case AE_IFLNK: { + auto target = archive_entry_symlink(entry); + + parseSink.createSymlink(path, target); + + break; + } + + default: + throw Error("file '%s' in tarball has unsupported file type", path); + } + } + + return lastModified; +} + } diff --git a/src/libutil/tarfile.hh b/src/libutil/tarfile.hh index 237d18c31..6a9c42149 100644 --- a/src/libutil/tarfile.hh +++ b/src/libutil/tarfile.hh @@ -2,6 +2,7 @@ ///@file #include "serialise.hh" +#include "fs-sink.hh" #include namespace nix { @@ -29,4 +30,6 @@ void unpackTarfile(Source & source, const Path & destDir); void unpackTarfile(const Path & tarFile, const Path & destDir); +time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink); + } From 78b8db72b53b6657cbdaaac8ad6c0f99fb92ed10 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 15 Feb 2024 21:58:08 +0100 Subject: [PATCH 6/6] Remove tarball-cache.{hh,cc} TarballInfo is only used in github.cc, and getTarballCache() is a bit too trivial to have its own file. --- src/libfetchers/git-utils.cc | 7 +++++++ src/libfetchers/git-utils.hh | 2 ++ src/libfetchers/github.cc | 7 ++++++- src/libfetchers/tarball-cache.cc | 13 ------------- src/libfetchers/tarball-cache.hh | 17 ----------------- 5 files changed, 15 insertions(+), 31 deletions(-) delete mode 100644 src/libfetchers/tarball-cache.cc delete mode 100644 src/libfetchers/tarball-cache.hh diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 32f665aa0..4f034e9d4 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -986,4 +986,11 @@ std::vector> GitRepoImpl::getSubmodules return result; } +ref getTarballCache() +{ + static auto repoDir = std::filesystem::path(getCacheDir()) / "nix" / "tarball-cache"; + + return GitRepo::openRepo(repoDir, true, true); +} + } diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index 029d39741..5f68d26a7 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -95,4 +95,6 @@ struct GitRepo const std::vector & publicKeys) = 0; }; +ref getTarballCache(); + } diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index 8b3e6ff20..e6fbece13 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -10,7 +10,6 @@ #include "tarball.hh" #include "tarfile.hh" #include "git-utils.hh" -#include "tarball-cache.hh" #include #include @@ -193,6 +192,12 @@ struct GitArchiveInputScheme : InputScheme virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; + struct TarballInfo + { + Hash treeHash; + time_t lastModified; + }; + std::pair downloadArchive(ref store, Input input) const { if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); diff --git a/src/libfetchers/tarball-cache.cc b/src/libfetchers/tarball-cache.cc deleted file mode 100644 index bb2c51973..000000000 --- a/src/libfetchers/tarball-cache.cc +++ /dev/null @@ -1,13 +0,0 @@ -#include "tarball-cache.hh" -#include "users.hh" - -namespace nix::fetchers { - -ref getTarballCache() -{ - static auto repoDir = std::filesystem::path(getCacheDir()) / "nix" / "tarball-cache"; - - return GitRepo::openRepo(repoDir, true, true); -} - -} diff --git a/src/libfetchers/tarball-cache.hh b/src/libfetchers/tarball-cache.hh deleted file mode 100644 index e1517038b..000000000 --- a/src/libfetchers/tarball-cache.hh +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once -///@file - -#include "ref.hh" -#include "git-utils.hh" - -namespace nix::fetchers { - -struct TarballInfo -{ - Hash treeHash; - time_t lastModified; -}; - -ref getTarballCache(); - -}