diff --git a/src/libfetchers/attrs.cc b/src/libfetchers/attrs.cc index a565d19d4..b788c5948 100644 --- a/src/libfetchers/attrs.cc +++ b/src/libfetchers/attrs.cc @@ -104,4 +104,9 @@ std::map attrsToQuery(const Attrs & attrs) return query; } +Hash getRevAttr(const Attrs & attrs, const std::string & name) +{ + return Hash::parseAny(getStrAttr(attrs, name), HashAlgorithm::SHA1); +} + } diff --git a/src/libfetchers/attrs.hh b/src/libfetchers/attrs.hh index b9a2c824e..97a74bce0 100644 --- a/src/libfetchers/attrs.hh +++ b/src/libfetchers/attrs.hh @@ -39,4 +39,6 @@ bool getBoolAttr(const Attrs & attrs, const std::string & name); std::map attrsToQuery(const Attrs & attrs); +Hash getRevAttr(const Attrs & attrs, const std::string & name); /* Reads string attribute `name` from `attrs` and parses it with Hash::parseAny as a SHA-1 hash. */ + } diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 466bdc6c7..4f034e9d4 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -6,8 +6,8 @@ #include "finally.hh" #include "processes.hh" #include "signals.hh" - -#include +#include "users.hh" +#include "fs-sink.hh" #include #include @@ -28,6 +28,7 @@ #include #include #include +#include namespace std { @@ -356,6 +357,8 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError e) override; + ref getFileSystemObjectSink() override; + static int sidebandProgressCallback(const char * str, int len, void * payload) { auto act = (Activity *) payload; @@ -770,6 +773,154 @@ struct GitExportIgnoreInputAccessor : CachingFilteringInputAccessor { }; +struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink /* File-system-object sink that stores what it receives as Git blobs and trees in `repo`; sync() writes the root tree and returns its hash. */ +{ + ref repo; + + struct PendingDir + { + std::string name; + TreeBuilder builder; + }; + + std::vector pendingDirs; /* Stack of directories under construction: the constructor pushes the root with an empty name, and addToTree() always inserts into the last entry. */ + + size_t componentsToStrip = 1; /* Number of leading path components that prepareDirs() drops from every incoming path. */ + + void pushBuilder(std::string name) /* Creates a new libgit2 tree builder and pushes it on pendingDirs under `name`; throws Error if libgit2 fails. */ + { + git_treebuilder * b; + if (git_treebuilder_new(&b, *repo, nullptr)) + throw Error("creating a tree builder: %s", git_error_last()->message); + pendingDirs.push_back({ 
.name = std::move(name), .builder = TreeBuilder(b) }); + }; + + GitFileSystemObjectSinkImpl(ref repo) : repo(repo) + { + pushBuilder(""); + } + + std::pair popBuilder() /* Writes the last pending directory out as a Git tree object, pops it, and returns the tree oid together with the directory name. */ + { + assert(!pendingDirs.empty()); + auto pending = std::move(pendingDirs.back()); + git_oid oid; + if (git_treebuilder_write(&oid, pending.builder.get())) + throw Error("creating a tree object: %s", git_error_last()->message); + pendingDirs.pop_back(); + return {oid, pending.name}; + }; + + void addToTree(const std::string & name, const git_oid & oid, git_filemode_t mode) /* Inserts `oid` under `name` with file mode `mode` into the last pending tree builder. */ + { + assert(!pendingDirs.empty()); + auto & pending = pendingDirs.back(); + if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode)) + throw Error("adding a file to a tree builder: %s", git_error_last()->message); + }; + + void updateBuilders(std::span names) /* Brings the pending directories below the root in line with `names`: entries past the common prefix are written and added to their parent, then a builder is pushed for each remaining name. */ + { + // Find the common prefix of pendingDirs and names. + size_t prefixLen = 0; + for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen) + if (names[prefixLen] != pendingDirs[prefixLen + 1].name) + break; + + // Finish the builders that are not part of the common prefix. + for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) { + auto [oid, name] = popBuilder(); + addToTree(name, oid, GIT_FILEMODE_TREE); + } + + // Create builders for the new directories. + for (auto n = prefixLen; n < names.size(); ++n) + pushBuilder(names[n]); + }; + + bool prepareDirs(const std::vector & pathComponents, bool isDir) /* Drops the first componentsToStrip components, then calls updateBuilders() with the remaining path (isDir) or with its parent directory (otherwise). Returns false without touching the builders when nothing is left after stripping. */ + { + std::span pathComponents2{pathComponents}; + + if (pathComponents2.size() <= componentsToStrip) return false; + pathComponents2 = pathComponents2.subspan(componentsToStrip); + + updateBuilders( + isDir + ? 
pathComponents2 + : pathComponents2.first(pathComponents2.size() - 1)); + + return true; + } + + void createRegularFile( + const Path & path, + std::function func) override /* Opens a libgit2 blob write stream, lets `func` write the contents through a CreateRegularFileSink, commits the blob and adds it to the current tree as executable or plain blob. Returns early if prepareDirs() returns false. */ + { + auto pathComponents = tokenizeString>(path, "/"); + if (!prepareDirs(pathComponents, false)) return; + + git_writestream * stream = nullptr; + if (git_blob_create_from_stream(&stream, *repo, nullptr)) + throw Error("creating a blob stream object: %s", git_error_last()->message); + + struct CRF : CreateRegularFileSink { + const Path & path; + GitFileSystemObjectSinkImpl & back; + git_writestream * stream; + bool executable = false; + CRF(const Path & path, GitFileSystemObjectSinkImpl & back, git_writestream * stream) + : path(path), back(back), stream(stream) + {} + void operator () (std::string_view data) override + { + if (stream->write(stream, data.data(), data.size())) + throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); + } + void isExecutable() override + { + executable = true; + } + } crf { path, *this, stream }; + func(crf); /* NOTE(review): if func throws, `stream` is neither committed nor released in this function - confirm whether that leaks the write stream. */ + + git_oid oid; + if (git_blob_create_from_stream_commit(&oid, stream)) + throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, + crf.executable + ? 
GIT_FILEMODE_BLOB_EXECUTABLE + : GIT_FILEMODE_BLOB); + } + + void createDirectory(const Path & path) override /* Only opens the tree builders for `path`; the boolean result of prepareDirs() is deliberately discarded. */ + { + auto pathComponents = tokenizeString>(path, "/"); + (void) prepareDirs(pathComponents, true); + } + + void createSymlink(const Path & path, const std::string & target) override /* Stores `target` as a blob and adds it to the current tree with mode GIT_FILEMODE_LINK. */ + { + auto pathComponents = tokenizeString>(path, "/"); + if (!prepareDirs(pathComponents, false)) return; + + git_oid oid; + if (git_blob_create_from_buffer(&oid, *repo, target.c_str(), target.size())) + throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK); + } + + Hash sync() override { /* Finishes all nested directories, writes the root tree and returns its hash; pendingDirs is empty afterwards. */ + updateBuilders({}); + + auto [oid, _name] = popBuilder(); + + return toHash(oid); + } +}; + ref GitRepoImpl::getRawAccessor(const Hash & rev) { auto self = ref(shared_from_this()); @@ -804,6 +955,11 @@ ref GitRepoImpl::getAccessor(const WorkdirInfo & wd, bool exportI } } +ref GitRepoImpl::getFileSystemObjectSink() +{ + return make_ref(ref(shared_from_this())); +} + std::vector> GitRepoImpl::getSubmodules(const Hash & rev, bool exportIgnore) { /* Read the .gitmodules files from this revision. */ @@ -830,5 +986,11 @@ std::vector> GitRepoImpl::getSubmodules return result; } +ref getTarballCache() /* Opens, via GitRepo::openRepo, the repository located at nix/tarball-cache below getCacheDir(). */ +{ + static auto repoDir = std::filesystem::path(getCacheDir()) / "nix" / "tarball-cache"; + + return GitRepo::openRepo(repoDir, true, true); +} } diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index e55affb12..5f68d26a7 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -2,11 +2,20 @@ #include "filtering-input-accessor.hh" #include "input-accessor.hh" +#include "fs-sink.hh" namespace nix { namespace fetchers { struct PublicKey; } +struct GitFileSystemObjectSink : FileSystemObjectSink /* FileSystemObjectSink with an additional sync() step that yields a Git hash. */ +{ + /** + * Flush builder and return a final Git hash. 
+ */ + virtual Hash sync() = 0; +}; + struct GitRepo { virtual ~GitRepo() @@ -64,18 +73,14 @@ struct GitRepo const std::string & url, const std::string & base) = 0; - struct TarballInfo - { - Hash treeHash; - time_t lastModified; - }; - virtual bool hasObject(const Hash & oid) = 0; virtual ref getAccessor(const Hash & rev, bool exportIgnore) = 0; virtual ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError makeNotAllowedError) = 0; + virtual ref getFileSystemObjectSink() = 0; + virtual void fetch( const std::string & url, const std::string & refspec, @@ -90,4 +95,6 @@ struct GitRepo const std::vector & publicKeys) = 0; }; +ref getTarballCache(); + } diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index 498e41357..e6fbece13 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -8,6 +8,8 @@ #include "fetchers.hh" #include "fetch-settings.hh" #include "tarball.hh" +#include "tarfile.hh" +#include "git-utils.hh" #include #include @@ -180,49 +182,102 @@ struct GitArchiveInputScheme : InputScheme return headers; } - virtual Hash getRevFromRef(nix::ref store, const Input & input) const = 0; + struct RefInfo /* What getRevFromRef() returns: the revision hash plus an optional tree hash. */ + { + Hash rev; + std::optional treeHash; + }; + + virtual RefInfo getRevFromRef(nix::ref store, const Input & input) const = 0; virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; - std::pair fetch(ref store, const Input & _input) override + struct TarballInfo { - Input input(_input); + Hash treeHash; + time_t lastModified; + }; + std::pair downloadArchive(ref store, Input input) const /* Resolves `input` to a revision (defaulting the ref to HEAD), pins that rev in the input attrs, and returns the input together with the tree hash and lastModified of the archive, downloading it into the tarball cache unless a cached entry is still usable. */ + { if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); + std::optional upstreamTreeHash; + auto rev = input.getRev(); - if (!rev) rev = getRevFromRef(store, input); + if (!rev) { + auto refInfo = getRevFromRef(store, input); + rev = refInfo.rev; + upstreamTreeHash = refInfo.treeHash; + debug("HEAD revision for '%s' is %s", input.to_string(), refInfo.rev.gitRev()); + } 
input.attrs.erase("ref"); input.attrs.insert_or_assign("rev", rev->gitRev()); - Attrs lockedAttrs({ - {"type", "git-tarball"}, - {"rev", rev->gitRev()}, - }); + auto cache = getCache(); - if (auto res = getCache()->lookup(*store, lockedAttrs)) { - input.attrs.insert_or_assign("lastModified", getIntAttr(res->first, "lastModified")); - return {std::move(res->second), input}; + Attrs treeHashKey{{"_what", "gitRevToTreeHash"}, {"rev", rev->gitRev()}}; + Attrs lastModifiedKey{{"_what", "gitRevToLastModified"}, {"rev", rev->gitRev()}}; /* The tree hash and lastModified are cached under separate keys, both indexed by rev; the lookups below need both plus the tree object itself to count as a hit. */ + + if (auto treeHashAttrs = cache->lookup(treeHashKey)) { + if (auto lastModifiedAttrs = cache->lookup(lastModifiedKey)) { + auto treeHash = getRevAttr(*treeHashAttrs, "treeHash"); + auto lastModified = getIntAttr(*lastModifiedAttrs, "lastModified"); + if (getTarballCache()->hasObject(treeHash)) + return {std::move(input), TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }}; + else + debug("Git tree with hash '%s' has disappeared from the cache, refetching...", treeHash.gitRev()); + } } + /* Stream the tarball into the tarball cache. 
*/ auto url = getDownloadUrl(input); - auto result = downloadTarball(store, url.url, input.getName(), true, url.headers); + auto source = sinkToSource([&](Sink & sink) { + FileTransferRequest req(url.url); + req.headers = url.headers; + getFileTransfer()->download(std::move(req), sink); + }); - input.attrs.insert_or_assign("lastModified", uint64_t(result.lastModified)); + TarArchive archive { *source }; + auto parseSink = getTarballCache()->getFileSystemObjectSink(); + auto lastModified = unpackTarfileToSink(archive, *parseSink); /* unpackTarfileToSink() returns the largest mtime among the archive members. */ - getCache()->add( - *store, - lockedAttrs, - { - {"rev", rev->gitRev()}, - {"lastModified", uint64_t(result.lastModified)} - }, - result.storePath, - true); + TarballInfo tarballInfo { + .treeHash = parseSink->sync(), + .lastModified = lastModified + }; - return {result.storePath, input}; + cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}}); + cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}}); + + #if 0 /* NOTE(review): disabled. As written the condition is also true when upstreamTreeHash is empty (rev given by the caller, or no tree hash from getRevFromRef), and the warn() call would then dereference the empty optional. */ + if (upstreamTreeHash != tarballInfo.treeHash) + warn( + "Git tree hash mismatch for revision '%s' of '%s': " + "expected '%s', got '%s'. 
" + "This can happen if the Git repository uses submodules.", + rev->gitRev(), input.to_string(), upstreamTreeHash->gitRev(), tarballInfo.treeHash.gitRev()); + #endif + + return {std::move(input), tarballInfo}; + } + + std::pair, Input> getAccessor(ref store, const Input & _input) const override /* Runs downloadArchive(), records treeHash and lastModified in the input attrs, and returns an accessor for that tree from the tarball cache along with the updated input. */ + { + auto [input, tarballInfo] = downloadArchive(store, _input); + + input.attrs.insert_or_assign("treeHash", tarballInfo.treeHash.gitRev()); + input.attrs.insert_or_assign("lastModified", uint64_t(tarballInfo.lastModified)); + + auto accessor = getTarballCache()->getAccessor(tarballInfo.treeHash, false); + + accessor->setPathDisplay("«" + input.to_string() + "»"); + + accessor->fingerprint = input.getFingerprint(store); + + return {accessor, input}; } std::optional experimentalFeature() const override @@ -269,7 +324,7 @@ struct GitHubInputScheme : GitArchiveInputScheme return getStrAttr(input.attrs, "repo"); } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { auto host = getHost(input); auto url = fmt( @@ -284,9 +339,11 @@ struct GitHubInputScheme : GitArchiveInputScheme readFile( store->toRealPath( downloadFile(store, url, "source", false, headers).storePath))); - auto rev = Hash::parseAny(std::string { json["sha"] }, HashAlgorithm::SHA1); - debug("HEAD revision for '%s' is %s", url, rev.gitRev()); - return rev; + + return RefInfo { + .rev = Hash::parseAny(std::string { json["sha"] }, HashAlgorithm::SHA1), + .treeHash = Hash::parseAny(std::string { json["commit"]["tree"]["sha"] }, HashAlgorithm::SHA1) + }; } DownloadUrl getDownloadUrl(const Input & input) const override @@ -343,7 +400,7 @@ struct GitLabInputScheme : GitArchiveInputScheme return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1)); } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { auto host 
= maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com"); // See rate limiting note below @@ -356,9 +413,10 @@ struct GitLabInputScheme : GitArchiveInputScheme readFile( store->toRealPath( downloadFile(store, url, "source", false, headers).storePath))); - auto rev = Hash::parseAny(std::string(json[0]["id"]), HashAlgorithm::SHA1); - debug("HEAD revision for '%s' is %s", url, rev.gitRev()); - return rev; + + return RefInfo { + .rev = Hash::parseAny(std::string(json[0]["id"]), HashAlgorithm::SHA1) /* treeHash is left unset */ + }; } DownloadUrl getDownloadUrl(const Input & input) const override @@ -402,7 +460,7 @@ struct SourceHutInputScheme : GitArchiveInputScheme // Once it is implemented, however, should work as expected. } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { // TODO: In the future, when the sourcehut graphql API is implemented for mercurial // and with anonymous access, this method should use it instead. 
@@ -445,12 +503,12 @@ struct SourceHutInputScheme : GitArchiveInputScheme id = parsedLine->target; } - if(!id) + if (!id) throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref); - auto rev = Hash::parseAny(*id, HashAlgorithm::SHA1); - debug("HEAD revision for '%s' is %s", fmt("%s/%s", base_url, ref), rev.gitRev()); - return rev; + return RefInfo { + .rev = Hash::parseAny(*id, HashAlgorithm::SHA1) /* treeHash is left unset */ + }; } DownloadUrl getDownloadUrl(const Input & input) const override diff --git a/src/libutil/fs-sink.hh b/src/libutil/fs-sink.hh index 4dfb5b329..ae577819a 100644 --- a/src/libutil/fs-sink.hh +++ b/src/libutil/fs-sink.hh @@ -26,6 +26,8 @@ struct CreateRegularFileSink : Sink struct FileSystemObjectSink { + virtual ~FileSystemObjectSink() = default; /* Virtual so that implementations are destroyed correctly through a FileSystemObjectSink pointer. */ + virtual void createDirectory(const Path & path) = 0; /** diff --git a/src/libutil/tarfile.cc b/src/libutil/tarfile.cc index 187b3e948..3bb6694f8 100644 --- a/src/libutil/tarfile.cc +++ b/src/libutil/tarfile.cc @@ -132,4 +132,66 @@ void unpackTarfile(const Path & tarFile, const Path & destDir) extract_archive(archive, destDir); } +time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink) +{ + time_t lastModified = 0; + + for (;;) { + // FIXME: merge with extract_archive + struct archive_entry * entry; + int r = archive_read_next_header(archive.archive, &entry); + if (r == ARCHIVE_EOF) break; + auto path = archive_entry_pathname(entry); + if (!path) + throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); + if (r == ARCHIVE_WARN) + warn(archive_error_string(archive.archive)); + else + archive.check(r); + + lastModified = std::max(lastModified, archive_entry_mtime(entry)); + + switch (archive_entry_filetype(entry)) { + + case AE_IFDIR: + parseSink.createDirectory(path); + break; + + case AE_IFREG: { + parseSink.createRegularFile(path, [&](auto & crf) { + if (archive_entry_mode(entry) & S_IXUSR) + crf.isExecutable(); + + while (true) { + std::vector 
buf(128 * 1024); + auto n = archive_read_data(archive.archive, buf.data(), buf.size()); + if (n < 0) + throw Error("cannot read file '%s' from tarball", path); + if (n == 0) break; + crf(std::string_view { + (const char *) buf.data(), + (size_t) n, + }); + } + }); + + break; + } + + case AE_IFLNK: { + auto target = archive_entry_symlink(entry); + + parseSink.createSymlink(path, target); + + break; + } + + default: + throw Error("file '%s' in tarball has unsupported file type", path); + } + } + + return lastModified; +} + } diff --git a/src/libutil/tarfile.hh b/src/libutil/tarfile.hh index 237d18c31..6a9c42149 100644 --- a/src/libutil/tarfile.hh +++ b/src/libutil/tarfile.hh @@ -2,6 +2,7 @@ ///@file #include "serialise.hh" +#include "fs-sink.hh" #include namespace nix { @@ -29,4 +30,6 @@ void unpackTarfile(Source & source, const Path & destDir); void unpackTarfile(const Path & tarFile, const Path & destDir); +time_t unpackTarfileToSink(TarArchive & archive, FileSystemObjectSink & parseSink); + }