Git fetcher: Improve submodule handling

Instead of making a complete copy of the repo, fetching the
submodules, and writing the result to the store (which is all
superexpensive), we now fetch the submodules recursively using the Git
fetcher, and return a union accessor that "mounts" the accessors for
the submodules on top of the root accessor.
This commit is contained in:
Eelco Dolstra 2023-10-27 18:39:00 +02:00
parent ee36a44bf2
commit d88106df24
6 changed files with 212 additions and 84 deletions

View file

@ -1,11 +1,13 @@
#include "git-utils.hh" #include "git-utils.hh"
#include "input-accessor.hh" #include "input-accessor.hh"
#include "cache.hh" #include "cache.hh"
#include "finally.hh"
#include <boost/core/span.hpp> #include <boost/core/span.hpp>
#include <git2/blob.h> #include <git2/blob.h>
#include <git2/commit.h> #include <git2/commit.h>
#include <git2/config.h>
#include <git2/describe.h> #include <git2/describe.h>
#include <git2/errors.h> #include <git2/errors.h>
#include <git2/global.h> #include <git2/global.h>
@ -14,6 +16,7 @@
#include <git2/remote.h> #include <git2/remote.h>
#include <git2/repository.h> #include <git2/repository.h>
#include <git2/status.h> #include <git2/status.h>
#include <git2/submodule.h>
#include <git2/tree.h> #include <git2/tree.h>
#include <unordered_set> #include <unordered_set>
@ -63,6 +66,8 @@ typedef std::unique_ptr<git_reference, Deleter<git_reference_free>> Reference;
typedef std::unique_ptr<git_describe_result, Deleter<git_describe_result_free>> DescribeResult; typedef std::unique_ptr<git_describe_result, Deleter<git_describe_result_free>> DescribeResult;
typedef std::unique_ptr<git_status_list, Deleter<git_status_list_free>> StatusList; typedef std::unique_ptr<git_status_list, Deleter<git_status_list_free>> StatusList;
typedef std::unique_ptr<git_remote, Deleter<git_remote_free>> Remote; typedef std::unique_ptr<git_remote, Deleter<git_remote_free>> Remote;
typedef std::unique_ptr<git_config, Deleter<git_config_free>> GitConfig;
typedef std::unique_ptr<git_config_iterator, Deleter<git_config_iterator_free>> ConfigIterator;
// A helper to ensure that we don't leak objects returned by libgit2. // A helper to ensure that we don't leak objects returned by libgit2.
template<typename T> template<typename T>
@ -256,6 +261,17 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
return std::nullopt; return std::nullopt;
} }
std::vector<Submodule> getSubmodules(const Hash & rev) override;
std::string resolveSubmoduleUrl(const std::string & url) override
{
git_buf buf = GIT_BUF_INIT;
if (git_submodule_resolve_url(&buf, *this, url.c_str()))
throw Error("resolving Git submodule URL '%s'", url);
Finally cleanup = [&]() { git_buf_dispose(&buf); };
return buf.ptr;
}
bool hasObject(const Hash & oid_) override bool hasObject(const Hash & oid_) override
{ {
auto oid = hashToOID(oid_); auto oid = hashToOID(oid_);
@ -400,6 +416,16 @@ struct GitInputAccessor : InputAccessor
return readBlob(path, true); return readBlob(path, true);
} }
/* Return the commit ID recorded in the tree entry at `path`. Throws
   Error if that entry is not of type GIT_OBJECT_COMMIT, i.e. is not
   a submodule. */
Hash getSubmoduleRev(const CanonPath & path)
{
    auto treeEntry = need(path);

    const bool isSubmodule = git_tree_entry_type(treeEntry) == GIT_OBJECT_COMMIT;
    if (!isSubmodule)
        throw Error("'%s' is not a submodule", showPath(path));

    return toHash(*git_tree_entry_id(treeEntry));
}
std::map<CanonPath, TreeEntry> lookupCache; std::map<CanonPath, TreeEntry> lookupCache;
/* Recursively look up 'path' relative to the root. */ /* Recursively look up 'path' relative to the root. */
@ -495,4 +521,56 @@ ref<InputAccessor> GitRepoImpl::getAccessor(const Hash & rev)
return make_ref<GitInputAccessor>(ref<GitRepoImpl>(shared_from_this()), rev); return make_ref<GitInputAccessor>(ref<GitRepoImpl>(shared_from_this()), rev);
} }
std::vector<GitRepoImpl::Submodule> GitRepoImpl::getSubmodules(const Hash & rev)
{
/* Read the .gitmodules files from this revision. */
CanonPath modulesFile(".gitmodules");
auto accessor = getAccessor(rev);
if (!accessor->pathExists(modulesFile)) return {};
/* Parse it. */
auto configS = accessor->readFile(modulesFile);
auto [fdTemp, pathTemp] = createTempFile("nix-git-submodules");
writeFull(fdTemp.get(), configS);
GitConfig config;
if (git_config_open_ondisk(Setter(config), pathTemp.c_str()))
throw Error("parsing .gitmodules file: %s", git_error_last()->message);
ConfigIterator it;
if (git_config_iterator_glob_new(Setter(it), config.get(), "^submodule\\..*\\.(path|url|branch)$"))
throw Error("iterating over .gitmodules: %s", git_error_last()->message);
std::map<std::string, std::string> entries;
while (true) {
git_config_entry * entry = nullptr;
if (auto err = git_config_next(&entry, it.get())) {
if (err == GIT_ITEROVER) break;
throw Error("iterating over .gitmodules: %s", git_error_last()->message);
}
entries.emplace(entry->name + 10, entry->value);
}
std::vector<Submodule> result;
for (auto & [key, value] : entries) {
if (!hasSuffix(key, ".path")) continue;
std::string key2(key, 0, key.size() - 5);
auto path = CanonPath(value);
auto rev = accessor.dynamic_pointer_cast<GitInputAccessor>()->getSubmoduleRev(path);
result.push_back(Submodule {
.path = path,
.url = entries[key2 + ".url"],
.branch = entries[key2 + ".branch"],
.rev = rev,
});
}
return result;
}
} }

View file

@ -38,6 +38,18 @@ struct GitRepo
/* Get the ref that HEAD points to. */ /* Get the ref that HEAD points to. */
virtual std::optional<std::string> getWorkdirRef() = 0; virtual std::optional<std::string> getWorkdirRef() = 0;
/* Description of one submodule, as returned by getSubmodules(). */
struct Submodule
{
/* Location of the submodule inside the repository (the
   `submodule.<name>.path` setting from `.gitmodules`). */
CanonPath path;
/* The `submodule.<name>.url` setting, as written in `.gitmodules`;
   pass it to resolveSubmoduleUrl() to resolve it against the
   repository. */
std::string url;
/* The `submodule.<name>.branch` setting; empty if not set. */
std::string branch;
/* The commit ID recorded for the submodule in the tree. */
Hash rev;
};
/* Get the submodules declared in the `.gitmodules` file of revision
   `rev`. */
virtual std::vector<Submodule> getSubmodules(const Hash & rev) = 0;
/* Resolve a submodule URL taken from `.gitmodules` against this
   repository. */
virtual std::string resolveSubmoduleUrl(const std::string & url) = 0;
struct TarballInfo struct TarballInfo
{ {
Hash treeHash; Hash treeHash;

View file

@ -8,6 +8,7 @@
#include "util.hh" #include "util.hh"
#include "git.hh" #include "git.hh"
#include "fs-input-accessor.hh" #include "fs-input-accessor.hh"
#include "union-input-accessor.hh"
#include "git-utils.hh" #include "git-utils.hh"
#include "fetch-settings.hh" #include "fetch-settings.hh"
@ -134,11 +135,6 @@ std::optional<std::string> readHeadCached(const std::string & actualUrl)
return std::nullopt; return std::nullopt;
} }
bool isNotDotGitDirectory(const Path & path)
{
return baseNameOf(path) != ".git";
}
} // end namespace } // end namespace
struct GitInputScheme : InputScheme struct GitInputScheme : InputScheme
@ -413,7 +409,7 @@ struct GitInputScheme : InputScheme
std::string name = input.getName(); std::string name = input.getName();
auto makeResult2 = [&](const Attrs & infoAttrs, ref<InputAccessor> accessor) -> std::pair<ref<InputAccessor>, Input> auto makeResult = [&](const Attrs & infoAttrs, ref<InputAccessor> accessor) -> std::pair<ref<InputAccessor>, Input>
{ {
assert(input.getRev()); assert(input.getRev());
assert(!origRev || origRev == input.getRev()); assert(!origRev || origRev == input.getRev());
@ -424,18 +420,6 @@ struct GitInputScheme : InputScheme
return {accessor, std::move(input)}; return {accessor, std::move(input)};
}; };
auto makeResult = [&](const Attrs & infoAttrs, const StorePath & storePath) -> std::pair<ref<InputAccessor>, Input>
{
// FIXME: remove?
//input.attrs.erase("narHash");
auto narHash = store->queryPathInfo(storePath)->narHash;
input.attrs.insert_or_assign("narHash", narHash.to_string(HashFormat::SRI, true));
auto accessor = makeStorePathAccessor(store, storePath, makeNotAllowedError(repoInfo.url));
return makeResult2(infoAttrs, accessor);
};
auto originalRef = input.getRef(); auto originalRef = input.getRef();
auto ref = originalRef ? *originalRef : getDefaultRef(repoInfo); auto ref = originalRef ? *originalRef : getDefaultRef(repoInfo);
input.attrs.insert_or_assign("ref", ref); input.attrs.insert_or_assign("ref", ref);
@ -542,66 +526,39 @@ struct GitInputScheme : InputScheme
printTalkative("using revision %s of repo '%s'", rev.gitRev(), repoInfo.url); printTalkative("using revision %s of repo '%s'", rev.gitRev(), repoInfo.url);
if (!repoInfo.submodules) { auto repo = GitRepo::openRepo(CanonPath(repoDir));
auto accessor = GitRepo::openRepo(CanonPath(repoDir))->getAccessor(rev);
return makeResult2(infoAttrs, accessor); auto accessor = repo->getAccessor(rev);
/* If the repo has submodules, fetch them and return a union
input accessor consisting of the accessor for the top-level
repo and the accessors for the submodules. */
if (repoInfo.submodules) {
std::map<CanonPath, nix::ref<InputAccessor>> mounts;
for (auto & submodule : repo->getSubmodules(rev)) {
auto resolved = repo->resolveSubmoduleUrl(submodule.url);
debug("Git submodule %s: %s %s %s -> %s",
submodule.path, submodule.url, submodule.branch, submodule.rev.gitRev(), resolved);
fetchers::Attrs attrs;
attrs.insert_or_assign("type", "git");
attrs.insert_or_assign("url", resolved);
if (submodule.branch != "")
attrs.insert_or_assign("ref", submodule.branch);
attrs.insert_or_assign("rev", submodule.rev.gitRev());
auto submoduleInput = fetchers::Input::fromAttrs(std::move(attrs));
auto [submoduleAccessor, submoduleInput2] =
submoduleInput.scheme->getAccessor(store, submoduleInput);
mounts.insert_or_assign(submodule.path, submoduleAccessor);
}
if (!mounts.empty()) {
mounts.insert_or_assign(CanonPath::root, accessor);
accessor = makeUnionInputAccessor(std::move(mounts));
}
} }
else { return makeResult(infoAttrs, accessor);
// FIXME: use libgit2
Path tmpDir = createTempDir();
AutoDelete delTmpDir(tmpDir, true);
PathFilter filter = defaultPathFilter;
Activity act(*logger, lvlChatty, actUnknown, fmt("copying Git tree '%s' to the store", input.to_string()));
Path tmpGitDir = createTempDir();
AutoDelete delTmpGitDir(tmpGitDir, true);
runProgram("git", true, { "-c", "init.defaultBranch=" + gitInitialBranch, "init", tmpDir, "--separate-git-dir", tmpGitDir });
{
// TODO: repoDir might lack the ref (it only checks if rev
// exists, see FIXME above) so use a big hammer and fetch
// everything to ensure we get the rev.
Activity act(*logger, lvlTalkative, actUnknown, fmt("making temporary clone of '%s'", repoDir));
runProgram("git", true, { "-C", tmpDir, "fetch", "--quiet", "--force",
"--update-head-ok", "--", repoDir, "refs/*:refs/*" }, {}, true);
}
runProgram("git", true, { "-C", tmpDir, "checkout", "--quiet", rev.gitRev() });
/* Ensure that we use the correct origin for fetching
submodules. This matters for submodules with relative
URLs. */
if (repoInfo.isLocal) {
writeFile(tmpGitDir + "/config", readFile(repoDir + "/" + repoInfo.gitDir + "/config"));
/* Restore the config.bare setting we may have just
copied erroneously from the user's repo. */
runProgram("git", true, { "-C", tmpDir, "config", "core.bare", "false" });
} else
runProgram("git", true, { "-C", tmpDir, "config", "remote.origin.url", repoInfo.url });
/* As an optimisation, copy the modules directory of the
source repo if it exists. */
auto modulesPath = repoDir + "/" + repoInfo.gitDir + "/modules";
if (pathExists(modulesPath)) {
Activity act(*logger, lvlTalkative, actUnknown, fmt("copying submodules of '%s'", repoInfo.url));
runProgram("cp", true, { "-R", "--", modulesPath, tmpGitDir + "/modules" });
}
{
Activity act(*logger, lvlTalkative, actUnknown, fmt("fetching submodules of '%s'", repoInfo.url));
runProgram("git", true, { "-C", tmpDir, "submodule", "--quiet", "update", "--init", "--recursive" }, {}, true);
}
filter = isNotDotGitDirectory;
auto storePath = store->addToStore(name, tmpDir, FileIngestionMethod::Recursive, htSHA256, filter);
return makeResult(infoAttrs, std::move(storePath));
}
} }
std::pair<ref<InputAccessor>, Input> getAccessorFromWorkdir( std::pair<ref<InputAccessor>, Input> getAccessorFromWorkdir(

View file

@ -0,0 +1,80 @@
#include "union-input-accessor.hh"
namespace nix {
/* An accessor composed of several accessors, each "mounted" at a
   path. Every operation is forwarded to the accessor mounted at the
   nearest parent of the requested path, using the path relative to
   that mount point. */
struct UnionInputAccessor : InputAccessor
{
    /* Mount point -> accessor serving everything below it. */
    std::map<CanonPath, ref<InputAccessor>> mounts;

    UnionInputAccessor(std::map<CanonPath, ref<InputAccessor>> _mounts)
        : mounts(std::move(_mounts))
    {
        // Currently we require a root filesystem. This could be relaxed.
        assert(mounts.contains(CanonPath::root));

        // FIXME: should check that every mount point exists. Or we
        // could return dummy parent directories automatically.
    }

    std::string readFile(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->readFile(target.second);
    }

    bool pathExists(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->pathExists(target.second);
    }

    Stat lstat(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->lstat(target.second);
    }

    DirEntries readDirectory(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->readDirectory(target.second);
    }

    std::string readLink(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->readLink(target.second);
    }

    std::string showPath(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->showPath(target.second);
    }

    /* Find the nearest parent of `path` that is a mount point, and
       return its accessor together with `path` re-expressed relative
       to that mount point. */
    std::pair<ref<InputAccessor>, CanonPath> resolve(CanonPath path)
    {
        // Components stripped from the end of `path`, innermost first.
        std::vector<std::string> stripped;

        auto mount = mounts.find(path);
        while (mount == mounts.end()) {
            // The root is always mounted, so we can't run out of parents.
            assert(!path.isRoot());
            stripped.push_back(std::string(*path.baseName()));
            path.pop();
            mount = mounts.find(path);
        }

        // Re-assemble the stripped components in their original order.
        auto subpath = CanonPath::root;
        for (auto c = stripped.rbegin(); c != stripped.rend(); ++c)
            subpath.push(*c);

        return {mount->second, std::move(subpath)};
    }
};
/* Construct a UnionInputAccessor from `mounts` and return it as a
   plain InputAccessor. */
ref<InputAccessor> makeUnionInputAccessor(std::map<CanonPath, ref<InputAccessor>> mounts)
{
    auto unionAccessor = make_ref<UnionInputAccessor>(std::move(mounts));
    return unionAccessor;
}
}

View file

@ -0,0 +1,9 @@
#pragma once
#include "input-accessor.hh"
namespace nix {
/* Create an accessor that forwards each path to the accessor in
   `mounts` whose mount point is the nearest parent of that path.
   `mounts` must contain an entry for the root path. */
ref<InputAccessor> makeUnionInputAccessor(std::map<CanonPath, ref<InputAccessor>> mounts);
}

View file

@ -118,11 +118,3 @@ cloneRepo=$TEST_ROOT/a/b/gitSubmodulesClone # NB /a/b to make the relative path
git clone $rootRepo $cloneRepo git clone $rootRepo $cloneRepo
pathIndirect=$(nix eval --raw --expr "(builtins.fetchGit { url = file://$cloneRepo; rev = \"$rev2\"; submodules = true; }).outPath") pathIndirect=$(nix eval --raw --expr "(builtins.fetchGit { url = file://$cloneRepo; rev = \"$rev2\"; submodules = true; }).outPath")
[[ $pathIndirect = $pathWithRelative ]] [[ $pathIndirect = $pathWithRelative ]]
# Test that if the clone has the submodule already, we're not fetching
# it again.
git -C $cloneRepo submodule update --init
rm $TEST_HOME/.cache/nix/fetcher-cache*
rm -rf $subRepo
pathSubmoduleGone=$(nix eval --raw --expr "(builtins.fetchGit { url = file://$cloneRepo; rev = \"$rev2\"; submodules = true; }).outPath")
[[ $pathSubmoduleGone = $pathWithRelative ]]