From cabee9815239af426cece729cb765810b8a716ce Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 20 Feb 2024 12:57:36 +0100 Subject: [PATCH] Tarball fetcher: Use the content-addressed Git cache Backported from the lazy-trees branch. --- src/libcmd/common-eval-args.cc | 6 +- src/libexpr/eval.cc | 7 +- src/libexpr/primops/fetchTree.cc | 3 +- src/libfetchers/git-utils.cc | 16 +++ src/libfetchers/git-utils.hh | 6 ++ src/libfetchers/tarball.cc | 168 ++++++++++++++++++------------- src/libfetchers/tarball.hh | 12 ++- src/libstore/filetransfer.cc | 29 ++++-- src/libstore/filetransfer.hh | 33 +++++- 9 files changed, 185 insertions(+), 95 deletions(-) diff --git a/src/libcmd/common-eval-args.cc b/src/libcmd/common-eval-args.cc index 58f04e225..444ff81c9 100644 --- a/src/libcmd/common-eval-args.cc +++ b/src/libcmd/common-eval-args.cc @@ -9,6 +9,7 @@ #include "store-api.hh" #include "command.hh" #include "tarball.hh" +#include "fetch-to-store.hh" namespace nix { @@ -167,8 +168,9 @@ Bindings * MixEvalArgs::getAutoArgs(EvalState & state) SourcePath lookupFileArg(EvalState & state, std::string_view s, const Path * baseDir) { if (EvalSettings::isPseudoUrl(s)) { - auto storePath = fetchers::downloadTarball( - state.store, EvalSettings::resolvePseudoUrl(s), "source", false).storePath; + auto accessor = fetchers::downloadTarball( + EvalSettings::resolvePseudoUrl(s)).accessor; + auto storePath = fetchToStore(*state.store, SourcePath(accessor), FetchMode::Copy); return state.rootPath(CanonPath(state.store->toRealPath(storePath))); } diff --git a/src/libexpr/eval.cc b/src/libexpr/eval.cc index 4919ac358..133d02f59 100644 --- a/src/libexpr/eval.cc +++ b/src/libexpr/eval.cc @@ -2794,10 +2794,11 @@ std::optional EvalState::resolveSearchPathPath(const SearchPath::Pa if (EvalSettings::isPseudoUrl(value)) { try { - auto storePath = fetchers::downloadTarball( - store, EvalSettings::resolvePseudoUrl(value), "source", false).storePath; + auto accessor = fetchers::downloadTarball( + EvalSettings::resolvePseudoUrl(value)).accessor; + auto storePath = fetchToStore(*store, SourcePath(accessor), FetchMode::Copy); res = { store->toRealPath(storePath) }; - } catch (FileTransferError & e) { + } catch (Error & e) { logWarning({ .msg = HintFmt("Nix search path entry '%1%' cannot be downloaded, ignoring", value) }); diff --git a/src/libexpr/primops/fetchTree.cc b/src/libexpr/primops/fetchTree.cc index 1997d5513..01a43e3fd 100644 --- a/src/libexpr/primops/fetchTree.cc +++ b/src/libexpr/primops/fetchTree.cc @@ -9,6 +9,7 @@ #include "tarball.hh" #include "url.hh" #include "value-to-json.hh" +#include "fetch-to-store.hh" #include #include @@ -473,7 +474,7 @@ static void fetch(EvalState & state, const PosIdx pos, Value * * args, Value & v // https://github.com/NixOS/nix/issues/4313 auto storePath = unpack - ? fetchers::downloadTarball(state.store, *url, name, (bool) expectedHash).storePath + ? fetchToStore(*state.store, fetchers::downloadTarball(*url).accessor, FetchMode::Copy, name) : fetchers::downloadFile(state.store, *url, name, (bool) expectedHash).storePath; if (expectedHash) { diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 4f034e9d4..f216b6e10 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -466,6 +466,22 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this else throw Error("Commit signature verification on commit %s failed: %s", rev.gitRev(), output); } + + Hash treeHashToNarHash(const Hash & treeHash) override + { + auto accessor = getAccessor(treeHash, false); + + fetchers::Attrs cacheKey({{"_what", "treeHashToNarHash"}, {"treeHash", treeHash.gitRev()}}); + + if (auto res = fetchers::getCache()->lookup(cacheKey)) + return Hash::parseAny(fetchers::getStrAttr(*res, "narHash"), HashAlgorithm::SHA256); + + auto narHash = accessor->hashPath(CanonPath::root); + + fetchers::getCache()->upsert(cacheKey, fetchers::Attrs({{"narHash", narHash.to_string(HashFormat::SRI, true)}})); + + return narHash; + } }; ref GitRepo::openRepo(const std::filesystem::path & path, bool create, bool bare) diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index 5f68d26a7..fbb2d947b 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -93,6 +93,12 @@ struct GitRepo virtual void verifyCommit( const Hash & rev, const std::vector & publicKeys) = 0; + + /** + * Given a Git tree hash, compute the hash of its NAR + * serialisation. This is memoised on-disk. + */ + virtual Hash treeHashToNarHash(const Hash & treeHash) = 0; }; ref getTarballCache(); diff --git a/src/libfetchers/tarball.cc b/src/libfetchers/tarball.cc index 3b7709440..e3b1fbe56 100644 --- a/src/libfetchers/tarball.cc +++ b/src/libfetchers/tarball.cc @@ -9,6 +9,9 @@ #include "types.hh" #include "split.hh" #include "posix-source-accessor.hh" +#include "fs-input-accessor.hh" +#include "store-api.hh" +#include "git-utils.hh" namespace nix::fetchers { @@ -57,10 +60,8 @@ DownloadFileResult downloadFile( throw; } - // FIXME: write to temporary file. Attrs infoAttrs({ {"etag", res.etag}, - {"url", res.effectiveUri}, }); if (res.immutableUrl) @@ -91,96 +92,98 @@ DownloadFileResult downloadFile( storePath = std::move(info.path); } - getCache()->add( - *store, - inAttrs, - infoAttrs, - *storePath, - locked); - - if (url != res.effectiveUri) + /* Cache metadata for all URLs in the redirect chain. */ + for (auto & url : res.urls) { + inAttrs.insert_or_assign("url", url); + infoAttrs.insert_or_assign("url", *res.urls.rbegin()); getCache()->add( *store, - { - {"type", "file"}, - {"url", res.effectiveUri}, - {"name", name}, - }, + inAttrs, infoAttrs, *storePath, locked); + } return { .storePath = std::move(*storePath), .etag = res.etag, - .effectiveUrl = res.effectiveUri, + .effectiveUrl = *res.urls.rbegin(), .immutableUrl = res.immutableUrl, }; } DownloadTarballResult downloadTarball( - ref store, const std::string & url, - const std::string & name, - bool locked, const Headers & headers) { Attrs inAttrs({ - {"type", "tarball"}, + {"_what", "tarballCache"}, {"url", url}, - {"name", name}, }); - auto cached = getCache()->lookupExpired(*store, inAttrs); + auto cached = getCache()->lookupExpired(inAttrs); + + auto attrsToResult = [&](const Attrs & infoAttrs) + { + auto treeHash = getRevAttr(infoAttrs, "treeHash"); + return DownloadTarballResult { + .treeHash = treeHash, + .lastModified = (time_t) getIntAttr(infoAttrs, "lastModified"), + .immutableUrl = maybeGetStrAttr(infoAttrs, "immutableUrl"), + .accessor = getTarballCache()->getAccessor(treeHash, false), + }; + }; + + if (cached && !getTarballCache()->hasObject(getRevAttr(cached->infoAttrs, "treeHash"))) + cached.reset(); if (cached && !cached->expired) - return { - .storePath = std::move(cached->storePath), - .lastModified = (time_t) getIntAttr(cached->infoAttrs, "lastModified"), - .immutableUrl = maybeGetStrAttr(cached->infoAttrs, "immutableUrl"), - }; + return attrsToResult(cached->infoAttrs); - auto res = downloadFile(store, url, name, locked, headers); + auto _res = std::make_shared>(); - std::optional unpackedStorePath; - time_t lastModified; - - if (cached && res.etag != "" && getStrAttr(cached->infoAttrs, "etag") == res.etag) { - unpackedStorePath = std::move(cached->storePath); - lastModified = getIntAttr(cached->infoAttrs, "lastModified"); - } else { - Path tmpDir = createTempDir(); - AutoDelete autoDelete(tmpDir, true); - unpackTarfile(store->toRealPath(res.storePath), tmpDir); - auto members = readDirectory(tmpDir); - if (members.size() != 1) - throw nix::Error("tarball '%s' contains an unexpected number of top-level files", url); - auto topDir = tmpDir + "/" + members.begin()->name; - lastModified = lstat(topDir).st_mtime; - PosixSourceAccessor accessor; - unpackedStorePath = store->addToStore(name, accessor, CanonPath { topDir }, FileIngestionMethod::Recursive, HashAlgorithm::SHA256, {}, defaultPathFilter, NoRepair); - } - - Attrs infoAttrs({ - {"lastModified", uint64_t(lastModified)}, - {"etag", res.etag}, + auto source = sinkToSource([&](Sink & sink) { + FileTransferRequest req(url); + req.expectedETag = cached ? getStrAttr(cached->infoAttrs, "etag") : ""; + getFileTransfer()->download(std::move(req), sink, + [_res](FileTransferResult r) + { + *_res->lock() = r; + }); }); - if (res.immutableUrl) - infoAttrs.emplace("immutableUrl", *res.immutableUrl); + // TODO: fall back to cached value if download fails. - getCache()->add( - *store, - inAttrs, - infoAttrs, - *unpackedStorePath, - locked); + /* Note: if the download is cached, `importTarball()` will receive + no data, which causes it to import an empty tarball. */ + TarArchive archive { *source }; + auto parseSink = getTarballCache()->getFileSystemObjectSink(); + auto lastModified = unpackTarfileToSink(archive, *parseSink); - return { - .storePath = std::move(*unpackedStorePath), - .lastModified = lastModified, - .immutableUrl = res.immutableUrl, - }; + auto res(_res->lock()); + + Attrs infoAttrs; + + if (res->cached) { + infoAttrs = cached->infoAttrs; + } else { + infoAttrs.insert_or_assign("etag", res->etag); + infoAttrs.insert_or_assign("treeHash", parseSink->sync().gitRev()); + infoAttrs.insert_or_assign("lastModified", uint64_t(lastModified)); + if (res->immutableUrl) + infoAttrs.insert_or_assign("immutableUrl", *res->immutableUrl); + } + + /* Insert a cache entry for every URL in the redirect chain. */ + for (auto & url : res->urls) { + inAttrs.insert_or_assign("url", url); + getCache()->upsert(inAttrs, infoAttrs); + } + + // FIXME: add a cache entry for immutableUrl? That could allow + // cache poisoning. + + return attrsToResult(infoAttrs); } // An input scheme corresponding to a curl-downloadable resource. @@ -198,6 +201,8 @@ struct CurlInputScheme : InputScheme virtual bool isValidURL(const ParsedURL & url, bool requireTree) const = 0; + static const std::set specialParams; + std::optional inputFromURL(const ParsedURL & _url, bool requireTree) const override { if (!isValidURL(_url, requireTree)) @@ -220,8 +225,12 @@ struct CurlInputScheme : InputScheme if (auto n = string2Int(*i)) input.attrs.insert_or_assign("revCount", *n); - url.query.erase("rev"); - url.query.erase("revCount"); + if (auto i = get(url.query, "lastModified")) + if (auto n = string2Int(*i)) + input.attrs.insert_or_assign("lastModified", *n); + + for (auto & param : allowedAttrs()) + url.query.erase(param); input.attrs.insert_or_assign("type", std::string { schemeName() }); input.attrs.insert_or_assign("url", url.to_string()); @@ -275,10 +284,20 @@ struct FileInputScheme : CurlInputScheme : (!requireTree && !hasTarballExtension(url.path))); } - std::pair fetch(ref store, const Input & input) override + std::pair, Input> getAccessor(ref store, const Input & _input) const override { + auto input(_input); + auto file = downloadFile(store, getStrAttr(input.attrs, "url"), input.getName(), false); - return {std::move(file.storePath), input}; + + auto narHash = store->queryPathInfo(file.storePath)->narHash; + input.attrs.insert_or_assign("narHash", narHash.to_string(HashFormat::SRI, true)); + + auto accessor = makeStorePathAccessor(store, file.storePath); + + accessor->setPathDisplay("«" + input.to_string() + "»"); + + return {accessor, input}; } }; @@ -296,11 +315,13 @@ struct TarballInputScheme : CurlInputScheme : (requireTree || hasTarballExtension(url.path))); } - std::pair fetch(ref store, const Input & _input) override + std::pair, Input> getAccessor(ref store, const Input & _input) const override { - Input input(_input); - auto url = getStrAttr(input.attrs, "url"); - auto result = downloadTarball(store, url, input.getName(), false); + auto input(_input); + + auto result = downloadTarball(getStrAttr(input.attrs, "url"), {}); + + result.accessor->setPathDisplay("«" + input.to_string() + "»"); if (result.immutableUrl) { auto immutableInput = Input::fromURL(*result.immutableUrl); @@ -314,7 +335,10 @@ struct TarballInputScheme : CurlInputScheme if (result.lastModified && !input.attrs.contains("lastModified")) input.attrs.insert_or_assign("lastModified", uint64_t(result.lastModified)); - return {result.storePath, std::move(input)}; + input.attrs.insert_or_assign("narHash", + getTarballCache()->treeHashToNarHash(result.treeHash).to_string(HashFormat::SRI, true)); + + return {result.accessor, input}; } }; diff --git a/src/libfetchers/tarball.hh b/src/libfetchers/tarball.hh index 9e6b50b31..77ad3bf09 100644 --- a/src/libfetchers/tarball.hh +++ b/src/libfetchers/tarball.hh @@ -2,11 +2,13 @@ #include "types.hh" #include "path.hh" +#include "hash.hh" #include namespace nix { class Store; +struct InputAccessor; } namespace nix::fetchers { @@ -28,16 +30,18 @@ DownloadFileResult downloadFile( struct DownloadTarballResult { - StorePath storePath; + Hash treeHash; time_t lastModified; std::optional immutableUrl; + ref accessor; }; +/** + * Download and import a tarball into the Git cache. The result is the + * Git tree hash of the root directory. + */ DownloadTarballResult downloadTarball( - ref store, const std::string & url, - const std::string & name, - bool locked, const Headers & headers = {}); } diff --git a/src/libstore/filetransfer.cc b/src/libstore/filetransfer.cc index ebfae346f..bab21bf51 100644 --- a/src/libstore/filetransfer.cc +++ b/src/libstore/filetransfer.cc @@ -106,6 +106,8 @@ struct curlFileTransfer : public FileTransfer this->result.data.append(data); }) { + result.urls.push_back(request.uri); + requestHeaders = curl_slist_append(requestHeaders, "Accept-Encoding: zstd, br, gzip, deflate, bzip2, xz"); if (!request.expectedETag.empty()) requestHeaders = curl_slist_append(requestHeaders, ("If-None-Match: " + request.expectedETag).c_str()); @@ -182,6 +184,14 @@ struct curlFileTransfer : public FileTransfer return ((TransferItem *) userp)->writeCallback(contents, size, nmemb); } + void appendCurrentUrl() + { + char * effectiveUriCStr = nullptr; + curl_easy_getinfo(req, CURLINFO_EFFECTIVE_URL, &effectiveUriCStr); + if (effectiveUriCStr && *result.urls.rbegin() != effectiveUriCStr) + result.urls.push_back(effectiveUriCStr); + } + size_t headerCallback(void * contents, size_t size, size_t nmemb) { size_t realSize = size * nmemb; @@ -196,6 +206,7 @@ struct curlFileTransfer : public FileTransfer statusMsg = trim(match.str(1)); acceptRanges = false; encoding = ""; + appendCurrentUrl(); } else { auto i = line.find(':'); @@ -360,14 +371,11 @@ struct curlFileTransfer : public FileTransfer { auto httpStatus = getHTTPStatus(); - char * effectiveUriCStr = nullptr; - curl_easy_getinfo(req, CURLINFO_EFFECTIVE_URL, &effectiveUriCStr); - if (effectiveUriCStr) - result.effectiveUri = effectiveUriCStr; - debug("finished %s of '%s'; curl status = %d, HTTP status = %d, body = %d bytes", request.verb(), request.uri, code, httpStatus, result.bodySize); + appendCurrentUrl(); + if (decompressionSink) { try { decompressionSink->finish(); @@ -779,7 +787,10 @@ FileTransferResult FileTransfer::upload(const FileTransferRequest & request) return enqueueFileTransfer(request).get(); } -void FileTransfer::download(FileTransferRequest && request, Sink & sink) +void FileTransfer::download( + FileTransferRequest && request, + Sink & sink, + std::function resultCallback) { /* Note: we can't call 'sink' via request.dataCallback, because that would cause the sink to execute on the fileTransfer @@ -829,11 +840,13 @@ void FileTransfer::download(FileTransferRequest && request, Sink & sink) }; enqueueFileTransfer(request, - {[_state](std::future fut) { + {[_state, resultCallback{std::move(resultCallback)}](std::future fut) { auto state(_state->lock()); state->quit = true; try { - fut.get(); + auto res = fut.get(); + if (resultCallback) + resultCallback(std::move(res)); } catch (...) { state->exc = std::current_exception(); } diff --git a/src/libstore/filetransfer.hh b/src/libstore/filetransfer.hh index a3b0dde1f..1c271cbec 100644 --- a/src/libstore/filetransfer.hh +++ b/src/libstore/filetransfer.hh @@ -75,14 +75,34 @@ struct FileTransferRequest struct FileTransferResult { + /** + * Whether this is a cache hit (i.e. the ETag supplied in the + * request is still valid). If so, `data` is empty. + */ bool cached = false; + + /** + * The ETag of the object. + */ std::string etag; - std::string effectiveUri; + + /** + * All URLs visited in the redirect chain. + */ + std::vector urls; + + /** + * The response body. + */ std::string data; + uint64_t bodySize = 0; - /* An "immutable" URL for this resource (i.e. one whose contents - will never change), as returned by the `Link: ; - rel="immutable"` header. */ + + /** + * An "immutable" URL for this resource (i.e. one whose contents + * will never change), as returned by the `Link: ; + * rel="immutable"` header. + */ std::optional immutableUrl; }; @@ -116,7 +136,10 @@ struct FileTransfer * Download a file, writing its data to a sink. The sink will be * invoked on the thread of the caller. */ - void download(FileTransferRequest && request, Sink & sink); + void download( + FileTransferRequest && request, + Sink & sink, + std::function resultCallback = {}); enum Error { NotFound, Forbidden, Misc, Transient, Interrupted }; };