From a8629de827e4d5a67372614727ce6fcc26423f8c Mon Sep 17 00:00:00 2001
From: Eelco Dolstra
Date: Tue, 9 Oct 2007 22:14:27 +0000
Subject: [PATCH] * New command `nix-store --optimise' to reduce Nix store disk
 space usage by finding identical files in the store and hard-linking them
 to each other.  It typically reduces the size of the store by something
 like 25-35%.  This is what the optimise-store.pl script did, but the new
 command is faster and more correct (it's safe wrt garbage collection and
 concurrent builds).

---
 scripts/optimise-store.pl   |  91 --------------------------
 src/libstore/local-store.cc | 155 ++++++++++++++++++++++++++++++++++--
 src/libstore/local-store.hh |  19 ++++++
 src/nix-store/help.txt      |   1 +
 src/nix-store/nix-store.cc  |  54 ++++++++++++++-
 5 files changed, 221 insertions(+), 99 deletions(-)
 delete mode 100755 scripts/optimise-store.pl

diff --git a/scripts/optimise-store.pl b/scripts/optimise-store.pl
deleted file mode 100755
index 41557e6d1..000000000
--- a/scripts/optimise-store.pl
+++ /dev/null
@@ -1,91 +0,0 @@
-#! /usr/bin/perl -w
-
-use strict;
-use File::Basename;
-
-
-my @paths = ("/nix/store");
-
-
-print "hashing...\n";
-
-my $hashList = "/tmp/nix-optimise-hash-list";
-
-system("find @paths -type f -print0 | xargs -0 md5sum -- > $hashList") == 0
-    or die "cannot hash store files";
-
-
-print "sorting by hash...\n";
-
-system("sort $hashList > $hashList.sorted") == 0
-    or die "cannot sort list";
-
-
-sub atomicLink {
-    my $target = shift;
-    my $new = shift;
-    my $tmpNew = "${new}_optimise.$$";
-
-    # Make the directory writable temporarily.
-    my $dir = dirname $new;
-    my @st = stat $dir or die;
-
-    chmod ($st[2] | 0200, $dir) or die "cannot make `$dir' writable: $!";
-
-    link $target, $tmpNew or die "cannot create hard link `$tmpNew': $!";
-
-    rename $tmpNew, $new or die "cannot rename `$tmpNew' to `$new': $!";
-
-    chmod ($st[2], $dir) or die "cannot restore permission on `$dir': $!";
-    utime ($st[8], $st[9], $dir) or die "cannot restore timestamp on `$dir': $!";
-}
-
-
-print "hard-linking...\n";
-
-open LIST, "<$hashList.sorted" or die;
-
-my $prevFile;
-my $prevHash;
-my $prevInode;
-my $prevExec;
-
-my $totalSpace = 0;
-my $savedSpace = 0;
-
-while (<LIST>) {
-    /^([0-9a-f]*)\s+(.*)$/ or die;
-    my $curFile = $2;
-    my $curHash = $1;
-
-    my @st = stat $curFile or die;
-    next if ($st[2] & 0222) != 0; # skip writable files
-
-    my $fileSize = $st[7];
-    $totalSpace += $fileSize;
-    my $isExec = ($st[2] & 0111) == 0111;
-
-    if (defined $prevHash && $curHash eq $prevHash
-        && $prevExec == $isExec)
-    {
-
-        if ($st[1] != $prevInode) {
-            print "$curFile = $prevFile\n";
-            atomicLink $prevFile, $curFile;
-            $savedSpace += $fileSize;
-        }
-
-    } else {
-        $prevFile = $curFile;
-        $prevHash = $curHash;
-        $prevInode = $st[1];
-        $prevExec = ($st[2] & 0111) == 0111;
-    }
-}
-
-print "total space = $totalSpace\n";
-print "saved space = $savedSpace\n";
-my $savings = ($savedSpace / $totalSpace) * 100.0;
-print "savings = $savings %\n";
-
-close LIST;
diff --git a/src/libstore/local-store.cc b/src/libstore/local-store.cc
index 4378f0ba6..c77ab3c6c 100644
--- a/src/libstore/local-store.cc
+++ b/src/libstore/local-store.cc
@@ -174,7 +174,7 @@ void copyPath(const Path & src, const Path & dst, PathFilter & filter)
 }
 
 
-static void _canonicalisePathMetaData(const Path & path)
+static void _canonicalisePathMetaData(const Path & path, bool recurse)
 {
     checkInterrupt();
 
@@ -223,17 +223,17 @@ static void _canonicalisePathMetaData(const Path & path)
 
     }
 
-    if (S_ISDIR(st.st_mode)) {
+    if (recurse && S_ISDIR(st.st_mode)) {
         Strings names = readDirectory(path);
         for (Strings::iterator i = names.begin(); i != names.end(); ++i)
-            _canonicalisePathMetaData(path + "/" + *i);
+            _canonicalisePathMetaData(path + "/" + *i, true);
     }
 }
 
 
 void canonicalisePathMetaData(const Path & path)
 {
-    _canonicalisePathMetaData(path);
+    _canonicalisePathMetaData(path, true);
 
     /* On platforms that don't have lchown(), the top-level path can't
        be a symlink, since we can't change its ownership. */
@@ -625,7 +625,7 @@ void LocalStore::exportPath(const Path & path, bool sign,
        consistent metadata. */
     Transaction txn(nixDB);
     addTempRoot(path);
-    if (!isValidPath(path))
+    if (!isValidPathTxn(txn, path))
         throw Error(format("path `%1%' is not valid") % path);
 
     HashAndWriteSink hashAndWriteSink(sink);
@@ -950,6 +950,151 @@ void verifyStore(bool checkContents)
 }
 
 
+typedef std::map<Hash, std::pair<Path, ino_t> > HashToPath;
+
+
+/* Add or remove write permission on `path'.  Enabling only grants
+   write permission to the owner; disabling removes it for owner,
+   group and others. */
+static void toggleWritable(const Path & path, bool writable)
+{
+    struct stat st;
+    if (lstat(path.c_str(), &st))
+        throw SysError(format("getting attributes of path `%1%'") % path);
+
+    mode_t mode = st.st_mode;
+    if (writable) mode |= S_IWUSR;
+    else mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
+
+    if (chmod(path.c_str(), mode) == -1)
+        throw SysError(format("changing writability of `%1%'") % path);
+}
+
+
+/* Recursively process `path': hash every regular file and symlink,
+   remember the first path seen for each hash in `hashToPath', and
+   replace every later file with the same hash by a hard link to
+   that first path.  `stats' is updated as we go.  If `dryRun' is
+   true, nothing on disk is changed, but `stats' still counts the
+   links that would have been made. */
+static void hashAndLink(bool dryRun, HashToPath & hashToPath,
+    OptimiseStats & stats, const Path & path)
+{
+    struct stat st;
+    if (lstat(path.c_str(), &st))
+        throw SysError(format("getting attributes of path `%1%'") % path);
+
+    /* Sometimes SNAFUs can cause files in the Nix store to be
+       modified, in particular when running programs as root under
+       NixOS (example: $fontconfig/var/cache being modified).  Skip
+       those files. */
+    if (S_ISREG(st.st_mode) && (st.st_mode & S_IWUSR)) {
+        printMsg(lvlError, format("skipping suspicious writable file `%1%'") % path);
+        return;
+    }
+
+    /* We can hard link regular files and symlinks. */
+    if (S_ISREG(st.st_mode) || S_ISLNK(st.st_mode)) {
+
+        /* Hash the file.  Note that hashPath() returns the hash over
+           the NAR serialisation, which includes the execute bit on
+           the file.  Thus, executable and non-executable files with
+           the same contents *won't* be linked (which is good because
+           otherwise the permissions would be screwed up).
+
+           Also note that if `path' is a symlink, then we're hashing
+           the contents of the symlink (i.e. the result of
+           readlink()), not the contents of the target (which may not
+           even exist). */
+        Hash hash = hashPath(htSHA256, path);
+        stats.totalFiles++;
+        printMsg(lvlDebug, format("`%1%' has hash `%2%'") % path % printHash(hash));
+
+        std::pair<Path, ino_t> prevPath = hashToPath[hash];
+
+        if (prevPath.first == "") {
+            hashToPath[hash] = std::pair<Path, ino_t>(path, st.st_ino);
+            return;
+        }
+
+        /* Yes!  We've seen a file with the same contents.  Replace
+           the current file with a hard link to that file. */
+        stats.sameContents++;
+        if (prevPath.second == st.st_ino) {
+            printMsg(lvlDebug, format("`%1%' is already linked to `%2%'") % path % prevPath.first);
+            return;
+        }
+
+        if (!dryRun) {
+
+            printMsg(lvlTalkative, format("linking `%1%' to `%2%'") % path % prevPath.first);
+
+            Path tempLink = (format("%1%.tmp-%2%-%3%")
+                % path % getpid() % rand()).str();
+
+            /* NOTE(review): if `path' is itself a top-level store
+               path, dirOf(path) is the store directory; confirm that
+               toggling and re-canonicalising it is intended. */
+            toggleWritable(dirOf(path), true);
+
+            bool tempLinkCreated = false;
+            try {
+                if (link(prevPath.first.c_str(), tempLink.c_str()) == -1)
+                    throw SysError(format("cannot link `%1%' to `%2%'")
+                        % tempLink % prevPath.first);
+                tempLinkCreated = true;
+
+                /* Atomically replace the old file with the new hard
+                   link. */
+                if (rename(tempLink.c_str(), path.c_str()) == -1)
+                    throw SysError(format("cannot rename `%1%' to `%2%'")
+                        % tempLink % path);
+            } catch (...) {
+                /* Don't leave a stray temporary link or a writable
+                   directory behind if linking failed. */
+                if (tempLinkCreated) unlink(tempLink.c_str());
+                _canonicalisePathMetaData(dirOf(path), false);
+                throw;
+            }
+
+            /* Make the directory read-only again and reset its timestamp
+               back to 0. */
+            _canonicalisePathMetaData(dirOf(path), false);
+
+        } else
+            printMsg(lvlTalkative, format("would link `%1%' to `%2%'") % path % prevPath.first);
+
+        stats.filesLinked++;
+        stats.bytesFreed += st.st_size;
+    }
+
+    if (S_ISDIR(st.st_mode)) {
+        Strings names = readDirectory(path);
+        for (Strings::iterator i = names.begin(); i != names.end(); ++i)
+            hashAndLink(dryRun, hashToPath, stats, path + "/" + *i);
+    }
+}
+
+
+/* Hard-link identical files found in any valid store path,
+   accumulating the results in `stats'.  If `dryRun' is true, the
+   store is left untouched and only `stats' is computed. */
+void LocalStore::optimiseStore(bool dryRun, OptimiseStats & stats)
+{
+    HashToPath hashToPath;
+
+    Paths paths;
+    nixDB.enumTable(noTxn, dbValidPaths, paths);
+
+    for (Paths::iterator i = paths.begin(); i != paths.end(); ++i) {
+        addTempRoot(*i);
+        if (!isValidPath(*i)) continue; /* path was GC'ed, probably */
+        startNest(nest, lvlChatty, format("hashing files in `%1%'") % *i);
+        hashAndLink(dryRun, hashToPath, stats, *i);
+    }
+}
+
+
 /* Upgrade from schema 1 (Nix <= 0.7) to schema 2 (Nix >= 0.8). */
 static void upgradeStore07()
 {
diff --git a/src/libstore/local-store.hh b/src/libstore/local-store.hh
index 8bd37bc0a..6c366167f 100644
--- a/src/libstore/local-store.hh
+++ b/src/libstore/local-store.hh
@@ -21,6 +21,20 @@ const int nixSchemaVersion = 4;
 extern string drvsLogDir;
 
 
+struct OptimiseStats
+{
+    unsigned long totalFiles;       /* regular files and symlinks hashed */
+    unsigned long sameContents;     /* files whose hash was seen before */
+    unsigned long filesLinked;      /* files replaced by a hard link */
+    unsigned long long bytesFreed;  /* sum of st_size of the replaced files */
+    OptimiseStats()
+    {
+        totalFiles = sameContents = filesLinked = 0;
+        bytesFreed = 0;
+    }
+};
+
+
 class LocalStore : public StoreAPI
 {
 private:
@@ -83,6 +97,11 @@ public:
 
     void collectGarbage(GCAction action, const PathSet & pathsToDelete,
         bool ignoreLiveness, PathSet & result, unsigned long long & bytesFreed);
+
+    /* Optimise the disk space usage of the Nix store by hard-linking
+       files with the same contents.  If `dryRun' is true, only
+       compute `stats' without modifying the store. */
+    void optimiseStore(bool dryRun, OptimiseStats & stats);
 };
 
 
diff --git a/src/nix-store/help.txt b/src/nix-store/help.txt
index 0662f6796..14b83a06c 100644
--- a/src/nix-store/help.txt
+++ b/src/nix-store/help.txt
@@ -21,6 +21,7 @@ Operations:
 
   --init: initialise the Nix database
   --verify: verify Nix structures
+  --optimise: optimise the Nix store by hard-linking identical files
 
   --version: output version information
   --help: display help
diff --git a/src/nix-store/nix-store.cc b/src/nix-store/nix-store.cc
index 176dc39f9..678ce2ae9 100644
--- a/src/nix-store/nix-store.cc
+++ b/src/nix-store/nix-store.cc
@@ -466,6 +466,14 @@ static void opCheckValidity(Strings opFlags, Strings opArgs)
 }
 
 
+/* Render a byte count as "N bytes (M.MM MiB)". */
+static string showBytes(unsigned long long bytes)
+{
+    return (format("%d bytes (%.2f MiB)")
+        % bytes % (bytes / (1024.0 * 1024.0))).str();
+}
+
+
 struct PrintFreed
 {
     bool show, dryRun;
@@ -477,9 +485,9 @@ struct PrintFreed
         if (show)
             cout << format(
                 (dryRun
-                    ? "%d bytes would be freed (%.2f MiB)\n"
-                    : "%d bytes freed (%.2f MiB)\n"))
-                % bytesFreed % (bytesFreed / (1024.0 * 1024.0));
+                    ? "%1% would be freed\n"
+                    : "%1% freed\n"))
+                % showBytes(bytesFreed);
     }
 };
 
@@ -614,6 +622,44 @@ static void opVerify(Strings opFlags, Strings opArgs)
 }
 
 
+
+/* Print a one-line summary of the optimiser's statistics. */
+static void showOptimiseStats(OptimiseStats & stats)
+{
+    printMsg(lvlError,
+        format("%1% freed by hard-linking %2% files; there are %3% files with equal contents out of %4% files in total")
+        % showBytes(stats.bytesFreed)
+        % stats.filesLinked
+        % stats.sameContents
+        % stats.totalFiles);
+}
+
+
+/* Optimise the disk space usage of the Nix store by hard-linking
+   files with the same contents. */
+static void opOptimise(Strings opFlags, Strings opArgs)
+{
+    if (!opArgs.empty())
+        throw UsageError("no arguments expected");
+
+    for (Strings::iterator i = opFlags.begin();
+         i != opFlags.end(); ++i)
+        throw UsageError(format("unknown flag `%1%'") % *i);
+
+    LocalStore * store2(dynamic_cast<LocalStore *>(store.get()));
+    if (!store2) throw Error("you don't have sufficient rights to use --optimise");
+
+    OptimiseStats stats;
+    try {
+        store2->optimiseStore(false, stats); /* not a dry run */
+    } catch (...) {
+        showOptimiseStats(stats);
+        throw;
+    }
+    showOptimiseStats(stats);
+}
+
+
 /* Scan the arguments; find the operation, set global flags, put all
    other flags in a list, and put all other arguments in another
    list. */
@@ -659,6 +705,8 @@ void run(Strings args)
             op = opInit;
         else if (arg == "--verify")
             op = opVerify;
+        else if (arg == "--optimise")
+            op = opOptimise;
         else if (arg == "--add-root") {
             if (i == args.end())
                 throw UsageError("`--add-root requires an argument");