diff --git a/dvc/api.py b/dvc/api.py index b1ac0a0f..238b7f56 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -28,8 +28,8 @@ def get_url(path, repo=None, rev=None, remote=None): cloud = info["repo"].cloud dvc_path = _repo.fs.path.relpath(fs_path, info["repo"].root_dir) - md5 = info["repo"].dvcfs.info(dvc_path)["md5"] - return cloud.get_url_for(remote, checksum=md5) + sha256 = info["repo"].dvcfs.info(dvc_path)["sha256"] + return cloud.get_url_for(remote, checksum=sha256) def open( # noqa, pylint: disable=redefined-builtin diff --git a/dvc/data/stage.py b/dvc/data/stage.py index 4ab026dd..7151761a 100644 --- a/dvc/data/stage.py +++ b/dvc/data/stage.py @@ -10,7 +10,7 @@ from dvc.hash_info import HashInfo from dvc.ignore import DvcIgnore from dvc.objects.file import HashFile from dvc.progress import Tqdm -from dvc.utils import file_md5, is_exec +from dvc.utils import file_sha256, is_exec from .db.reference import ReferenceObjectDB from .meta import Meta @@ -67,8 +67,8 @@ def _get_file_hash(fs_path, fs, name): elif hasattr(fs, name): func = getattr(fs, name) hash_value = func(fs_path) - elif name == "md5": - hash_value = file_md5(fs_path, fs) + elif name == "sha256": + hash_value = file_sha256(fs_path, fs) else: raise NotImplementedError @@ -98,7 +98,7 @@ def _stage_file(fs_path, fs, name, odb=None, upload_odb=None, dry_run=False): state = odb.state if odb else None meta, hash_info = get_file_hash(fs_path, fs, name, state=state) if upload_odb and not dry_run: - assert odb and name == "md5" + assert odb and name == "sha256" return _upload_file(fs_path, fs, odb, upload_odb) if dry_run: @@ -124,7 +124,7 @@ def _build_objects( else: walk_iterator = fs.find(fs_path) with Tqdm( - unit="md5", + unit="sha256", desc="Computing file/dir hashes (only done once)", disable=no_progress_bar, ) as pbar: @@ -269,9 +269,9 @@ def _load_from_state(odb, staging, fs_path, fs, name): def _stage_external_tree_info(odb, tree, name): # NOTE: used only for external outputs. 
Initial reasoning was to be # able to validate .dir files right in the workspace (e.g. check s3 - # etag), but could be dropped for manual validation with regular md5, + # etag), but could be dropped for manual validation with regular sha256, # that would be universal for all clouds. - assert odb and name != "md5" + assert odb and name != "sha256" odb.add(tree.fs_path, tree.fs, tree.hash_info) raw = odb.get(tree.hash_info) @@ -330,7 +330,7 @@ def stage( **kwargs, ) logger.debug("staged tree '%s'", obj) - if name != "md5": + if name != "sha256": obj = _stage_external_tree_info(odb, obj, name) else: _, meta, obj = _stage_file( diff --git a/dvc/data/tree.py b/dvc/data/tree.py index 25e29d81..9bbb64b4 100644 --- a/dvc/data/tree.py +++ b/dvc/data/tree.py @@ -65,7 +65,7 @@ class Tree(HashFile): if hash_info: self.hash_info = hash_info else: - _, self.hash_info = get_file_hash(fs_path, memfs, "md5") + _, self.hash_info = get_file_hash(fs_path, memfs, "sha256") assert self.hash_info.value self.hash_info.value += ".dir" diff --git a/dvc/fs/dvc.py b/dvc/fs/dvc.py index cbe45713..9dedc753 100644 --- a/dvc/fs/dvc.py +++ b/dvc/fs/dvc.py @@ -21,7 +21,7 @@ class DvcFileSystem(FileSystem): # pylint:disable=abstract-method sep = os.sep scheme = "local" - PARAM_CHECKSUM = "md5" + PARAM_CHECKSUM = "sha256" def __init__(self, **kwargs): super().__init__(**kwargs) @@ -56,7 +56,7 @@ class DvcFileSystem(FileSystem): # pylint:disable=abstract-method if info["type"] == "directory": raise IsADirectoryError - value = info.get("md5") + value = info.get("sha256") if not value: raise FileNotFoundError @@ -216,7 +216,7 @@ class DvcFileSystem(FileSystem): # pylint:disable=abstract-method def checksum(self, path): info = self.info(path) - md5 = info.get("md5") - if md5: - return md5 + sha256 = info.get("sha256") + if sha256: + return sha256 raise NotImplementedError diff --git a/dvc/fs/local.py b/dvc/fs/local.py index 8bbfa212..bae885ee 100644 --- a/dvc/fs/local.py +++ b/dvc/fs/local.py @@ -16,7 
+16,7 @@ class LocalFileSystem(FileSystem): sep = os.sep scheme = Schemes.LOCAL - PARAM_CHECKSUM = "md5" + PARAM_CHECKSUM = "sha256" PARAM_PATH = "path" TRAVERSE_PREFIX_LEN = 2 diff --git a/dvc/fs/memory.py b/dvc/fs/memory.py index 32d4402a..6d44f520 100644 --- a/dvc/fs/memory.py +++ b/dvc/fs/memory.py @@ -9,7 +9,7 @@ from .fsspec_wrapper import FSSpecWrapper class MemoryFileSystem(FSSpecWrapper): # pylint:disable=abstract-method scheme = Schemes.MEMORY - PARAM_CHECKSUM = "md5" + PARAM_CHECKSUM = "sha256" TRAVERSE_PREFIX_LEN = 2 DEFAULT_BLOCKSIZE = 4096 diff --git a/dvc/fs/repo.py b/dvc/fs/repo.py index ba4258a6..2454fa2b 100644 --- a/dvc/fs/repo.py +++ b/dvc/fs/repo.py @@ -36,7 +36,7 @@ class RepoFileSystem(FileSystem): # pylint:disable=abstract-method sep = os.sep scheme = "local" - PARAM_CHECKSUM = "md5" + PARAM_CHECKSUM = "sha256" PARAM_REPO_URL = "repo_url" PARAM_REPO_ROOT = "repo_root" PARAM_REV = "rev" diff --git a/dvc/fs/ssh.py b/dvc/fs/ssh.py index ba069b1d..709753c2 100644 --- a/dvc/fs/ssh.py +++ b/dvc/fs/ssh.py @@ -32,7 +32,7 @@ class SSHFileSystem(FSSpecWrapper): REQUIRES = {"sshfs": "sshfs"} DEFAULT_PORT = 22 - PARAM_CHECKSUM = "md5" + PARAM_CHECKSUM = "sha256" @classmethod def _strip_protocol(cls, path: str) -> str: diff --git a/dvc/lock.py b/dvc/lock.py index 3360001c..706a1f10 100644 --- a/dvc/lock.py +++ b/dvc/lock.py @@ -181,7 +181,7 @@ class HardlinkLock(flufl.lock.Lock, LockBase): if self._tmp_dir is not None: # Under Windows file path length is limited so we hash it - filename = hashlib.md5(self._claimfile.encode()).hexdigest() + filename = hashlib.sha256(self._claimfile.encode()).hexdigest() self._claimfile = os.path.join(self._tmp_dir, filename + ".lock") diff --git a/dvc/objects/db.py b/dvc/objects/db.py index a30c2c6f..56c84b41 100644 --- a/dvc/objects/db.py +++ b/dvc/objects/db.py @@ -288,7 +288,7 @@ class ObjectDB: returned. NOTE: For large remotes the list of hashes will be very - big(e.g. 
100M entries, md5 for each is 32 bytes, so ~3200Mb list) + big(e.g. 100M entries, sha256 for each is 64 bytes, so ~6400Mb list) and we don't really need all of it at the same time, so it makes sense to use a generator to gradually iterate over it, without keeping all of it in memory. diff --git a/dvc/output.py b/dvc/output.py index 13fd8e73..429a17e1 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -54,7 +54,7 @@ CASE_SENSITIVE_CHECKSUM_SCHEMA = Any( # NOTE: currently there are only 3 possible checksum names: # -# 1) md5 (LOCAL, SSH); +# 1) sha256 (LOCAL, SSH); # 2) etag (S3, GS, OSS, AZURE, HTTP); # 3) checksum (HDFS); # @@ -808,7 +808,7 @@ class Output: odb, from_info, from_fs, - "md5", + "sha256", upload=upload, jobs=jobs, no_progress_bar=no_progress_bar, diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index fba275f0..ed515b64 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -112,7 +112,7 @@ class Repo: def _get_database_dir(self, db_name): # NOTE: by default, store SQLite-based remote indexes and state's - # `links` and `md5s` caches in the repository itself to avoid any + # `links` and `sha256s` caches in the repository itself to avoid any # possible state corruption in 'shared cache dir' scenario, but allow # user to override this through config when, say, the repository is # located on a mounted volume — see diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index f6b6920f..26f4f4a4 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -140,7 +140,7 @@ def _output_paths(repo, targets): repo.odb.local, output.fs_path, repo.odb.local.fs, - "md5", + "sha256", dry_run=True, dvcignore=output.dvcignore, ) diff --git a/dvc/repo/imp_url.py b/dvc/repo/imp_url.py index aa8ec83b..c92cfa7b 100644 --- a/dvc/repo/imp_url.py +++ b/dvc/repo/imp_url.py @@ -78,7 +78,7 @@ def imp_url( remote_odb = self.cloud.get_remote_odb(remote, "import-url") stage.outs[0].transfer(url, odb=remote_odb, jobs=jobs) stage.save_deps() - stage.md5 = stage.compute_md5() + 
stage.sha256 = stage.compute_sha256() else: stage.run(jobs=jobs) diff --git a/dvc/repo/index.py b/dvc/repo/index.py index ccf667b0..a781747e 100644 --- a/dvc/repo/index.py +++ b/dvc/repo/index.py @@ -13,7 +13,7 @@ from typing import ( from funcy import cached_property, nullcontext -from dvc.utils import dict_md5 +from dvc.utils import dict_sha256 if TYPE_CHECKING: from networkx import DiGraph @@ -287,7 +287,7 @@ class Index: Currently, it is unique to the platform (windows vs posix). """ - return dict_md5(self.dumpd()) + return dict_sha256(self.dumpd()) if __name__ == "__main__": diff --git a/dvc/stage/__init__.py b/dvc/stage/__init__.py index c4a1e443..ed2568c5 100644 --- a/dvc/stage/__init__.py +++ b/dvc/stage/__init__.py @@ -28,7 +28,7 @@ from .utils import ( check_missing_outputs, check_no_externals, check_stage_path, - compute_md5, + compute_sha256, fill_stage_dependencies, fill_stage_outputs, get_dump, @@ -131,7 +131,7 @@ class Stage(params.StageParams): wdir=os.curdir, deps=None, outs=None, - md5=None, + sha256=None, locked=False, # backward compatibility frozen=False, always_changed=False, @@ -151,7 +151,7 @@ class Stage(params.StageParams): self.wdir = wdir self.outs = outs self.deps = deps - self.md5 = md5 + self.sha256 = sha256 self.frozen = locked or frozen self.always_changed = always_changed self._stage_text = stage_text @@ -345,7 +345,7 @@ class Stage(params.StageParams): return False def changed_stage(self): - changed = self.md5 != self.compute_md5() + changed = self.sha256 != self.compute_sha256() if changed: logger.debug(self._changed_stage_entry()) return changed @@ -353,7 +353,7 @@ class Stage(params.StageParams): @rwlocked(read=["deps", "outs"]) def changed(self): is_changed = ( - # Short-circuit order: stage md5 is fast, + # Short-circuit order: stage sha256 is fast, # deps are expected to change self.changed_stage() or self.changed_deps() @@ -443,19 +443,19 @@ class Stage(params.StageParams): def dumpd(self): return get_dump(self) - def 
compute_md5(self): - # `dvc add`ed files don't need stage md5 + def compute_sha256(self): + # `dvc add`ed files don't need stage sha256 if self.is_data_source and not (self.is_import or self.is_repo_import): m = None else: - m = compute_md5(self) - logger.debug(f"Computed {self} md5: '{m}'") + m = compute_sha256(self) + logger.debug(f"Computed {self} sha256: '{m}'") return m def save(self, allow_missing=False): self.save_deps(allow_missing=allow_missing) self.save_outs(allow_missing=allow_missing) - self.md5 = self.compute_md5() + self.sha256 = self.compute_sha256() self.repo.stage_cache.save(self) @@ -488,7 +488,7 @@ class Stage(params.StageParams): return [str(entry) for entry in entries if entry.workspace_status()] def _changed_stage_entry(self): - return f"'md5' of {self} changed." + return f"'sha256' of {self} changed." def changed_entries(self): changed_deps = self._changed_entries(self.deps) diff --git a/dvc/stage/params.py b/dvc/stage/params.py index c43a75b1..961a8168 100644 --- a/dvc/stage/params.py +++ b/dvc/stage/params.py @@ -1,5 +1,5 @@ class StageParams: - PARAM_MD5 = "md5" + PARAM_MD5 = "sha256" PARAM_CMD = "cmd" PARAM_WDIR = "wdir" PARAM_DEPS = "deps" diff --git a/dvc/stage/utils.py b/dvc/stage/utils.py index a48b8ef9..0c36d256 100644 --- a/dvc/stage/utils.py +++ b/dvc/stage/utils.py @@ -168,26 +168,26 @@ def check_missing_outputs(stage): raise MissingDataSource(paths) -def compute_md5(stage): +def compute_sha256(stage): from dvc.output import Output - from ..utils import dict_md5 + from ..utils import dict_sha256 d = stage.dumpd() - # Remove md5 and meta, these should not affect stage md5 + # Remove sha256 and meta, these should not affect stage sha256 d.pop(stage.PARAM_MD5, None) d.pop(stage.PARAM_META, None) d.pop(stage.PARAM_DESC, None) # Ignore the wdir default value. In this case DVC file w/o - # wdir has the same md5 as a file with the default value specified. + # wdir has the same sha256 as a file with the default value specified. 
# It's important for backward compatibility with pipelines that # didn't have WDIR in their DVC files. if d.get(stage.PARAM_WDIR) == ".": del d[stage.PARAM_WDIR] - return dict_md5( + return dict_sha256( d, exclude=[ stage.PARAM_LOCKED, # backward compatibility @@ -222,7 +222,7 @@ def get_dump(stage): key: value for key, value in { stage.PARAM_DESC: stage.desc, - stage.PARAM_MD5: stage.md5, + stage.PARAM_MD5: stage.sha256, stage.PARAM_CMD: stage.cmd, stage.PARAM_WDIR: resolve_wdir(stage.wdir, stage.path), stage.PARAM_FROZEN: stage.frozen, diff --git a/dvc/state.py b/dvc/state.py index a1463a23..d2a78fa0 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -63,13 +63,13 @@ class State(StateBase): # pylint: disable=too-many-instance-attributes "disk_pickle_protocol": 4, } self.links = Cache(directory=os.path.join(tmp_dir, "links"), **config) - self.md5s = Cache(directory=os.path.join(tmp_dir, "md5s"), **config) + self.sha256s = Cache(directory=os.path.join(tmp_dir, "sha256s"), **config) def close(self): - self.md5s.close() + self.sha256s.close() self.links.close() - @with_diskcache(name="md5s") + @with_diskcache(name="sha256s") def save(self, path, fs, hash_info): """Save hash for the specified path info. @@ -92,9 +92,9 @@ class State(StateBase): # pylint: disable=too-many-instance-attributes hash_info.value, ) - self.md5s[inode] = (mtime, str(size), hash_info.value) + self.sha256s[inode] = (mtime, str(size), hash_info.value) - @with_diskcache(name="md5s") + @with_diskcache(name="sha256s") def get(self, path, fs): """Gets the hash for the specified path info. Hash will be retrieved from the state database if available. 
@@ -118,12 +118,12 @@ class State(StateBase): # pylint: disable=too-many-instance-attributes inode = get_inode(path) - value = self.md5s.get(inode) + value = self.sha256s.get(inode) if not value or value[0] != mtime or value[1] != str(size): return None, None - return Meta(size=size), HashInfo("md5", value[2]) + return Meta(size=size), HashInfo("sha256", value[2]) @with_diskcache(name="links") def save_link(self, path, fs): diff --git a/dvc/testing/test_workspace.py b/dvc/testing/test_workspace.py index f6225a2f..486442e7 100644 --- a/dvc/testing/test_workspace.py +++ b/dvc/testing/test_workspace.py @@ -12,14 +12,14 @@ class TestImport: assert dvc.status() == {} @pytest.fixture - def stage_md5(self): + def stage_sha256(self): pytest.skip() @pytest.fixture - def dir_md5(self): + def dir_sha256(self): pytest.skip() - def test_import_dir(self, tmp_dir, dvc, workspace, stage_md5, dir_md5): + def test_import_dir(self, tmp_dir, dvc, workspace, stage_sha256, dir_sha256): from dvc.data.db import ODBManager workspace.gen( @@ -43,17 +43,17 @@ class TestImport: assert dvc.status() == {} - if stage_md5 is not None and dir_md5 is not None: + if stage_sha256 is not None and dir_sha256 is not None: assert (tmp_dir / "dir.dvc").read_text() == ( - f"md5: {stage_md5}\n" + f"sha256: {stage_sha256}\n" "frozen: true\n" "deps:\n" - f"- md5: {dir_md5}\n" + f"- sha256: {dir_sha256}\n" " size: 11\n" " nfiles: 2\n" " path: remote://workspace/dir\n" "outs:\n" - "- md5: b6dcab6ccd17ca0a8bf4a215a37d14cc.dir\n" + "- sha256: b6dcab6ccd17ca0a8bf4a215a37d14cc.dir\n" " size: 11\n" " nfiles: 2\n" " path: dir\n" diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index b2388287..d7062bde 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -25,7 +25,7 @@ def dos2unix(data): return data.replace(b"\r\n", b"\n") -def _fobj_md5(fobj, hash_md5, binary, progress_func=None): +def _fobj_sha256(fobj, hash_sha256, binary, progress_func=None): while True: data = fobj.read(LOCAL_CHUNK_SIZE) 
if not data: @@ -36,24 +36,24 @@ def _fobj_md5(fobj, hash_md5, binary, progress_func=None): else: chunk = dos2unix(data) - hash_md5.update(chunk) + hash_sha256.update(chunk) if progress_func: progress_func(len(data)) -def file_md5(fname, fs): -    """get the (md5 hexdigest, md5 digest) of a file""" +def file_sha256(fname, fs): +    """Return the sha256 hexdigest of a file.""" from dvc.istextfile import istextfile from dvc.progress import Tqdm - hash_md5 = hashlib.md5() + hash_sha256 = hashlib.sha256() binary = not istextfile(fname, fs=fs) size = fs.getsize(fname) or 0 no_progress_bar = True if size >= LARGE_FILE_SIZE: no_progress_bar = False msg = ( - f"Computing md5 for a large file '{fname}'. " + f"Computing sha256 for a large file '{fname}'. " "This is only done once." ) logger.info(msg) @@ -66,9 +66,9 @@ def file_md5(fname, fs): leave=False, ) as pbar: with fs.open(fname, "rb") as fobj: - _fobj_md5(fobj, hash_md5, binary, pbar.update) + _fobj_sha256(fobj, hash_sha256, binary, pbar.update) - return hash_md5.hexdigest() + return hash_sha256.hexdigest() def bytes_hash(byts, typ): @@ -98,8 +98,6 @@ return bytes_hash(byts, typ) -def dict_md5(d, **kwargs): - return dict_hash(d, "md5", **kwargs) def dict_sha256(d, **kwargs): diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index c12ce400..7d719177 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING from dvc.exceptions import DvcException from dvc.system import System -from dvc.utils import dict_md5 +from dvc.utils import dict_sha256 if TYPE_CHECKING: from dvc.types import StrPath @@ -51,7 +51,7 @@ def get_mtime_and_size(path, fs, dvcignore=None): # We track file changes and moves, which cannot be detected with simply # max(mtime(f) for f in non_ignored_files) - mtime = dict_md5(files_mtimes) + mtime = dict_sha256(files_mtimes) else: base_stat = fs.info(path) 
size = base_stat["size"] diff --git a/dvc/utils/stream.py b/dvc/utils/stream.py index a0a7ac8f..7da46934 100644 --- a/dvc/utils/stream.py +++ b/dvc/utils/stream.py @@ -10,11 +10,11 @@ from dvc.utils import dos2unix class HashedStreamReader(io.IOBase): - PARAM_CHECKSUM = "md5" + PARAM_CHECKSUM = "sha256" def __init__(self, fobj): self.fobj = fobj - self.md5 = hashlib.md5() + self.sha256 = hashlib.sha256() self.total_read = 0 self.is_text_file = None super().__init__() @@ -40,11 +40,11 @@ class HashedStreamReader(io.IOBase): data = dos2unix(chunk) else: data = chunk - self.md5.update(data) + self.sha256.update(data) self.total_read += len(data) return chunk @property def hash_info(self): - return HashInfo(self.PARAM_CHECKSUM, self.md5.hexdigest()) + return HashInfo(self.PARAM_CHECKSUM, self.sha256.hexdigest()) diff --git a/scripts/innosetup/dvc.ico.dvc b/scripts/innosetup/dvc.ico.dvc index e8ca30f5..78b76603 100644 --- a/scripts/innosetup/dvc.ico.dvc +++ b/scripts/innosetup/dvc.ico.dvc @@ -1,3 +1,3 @@ outs: -- md5: 90104d9e83cfb825cf45507e90aadd27 +- sha256: 90104d9e83cfb825cf45507e90aadd27 path: dvc.ico diff --git a/scripts/innosetup/dvc_left.bmp.dvc b/scripts/innosetup/dvc_left.bmp.dvc index be60334b..c97e16f8 100644 --- a/scripts/innosetup/dvc_left.bmp.dvc +++ b/scripts/innosetup/dvc_left.bmp.dvc @@ -1,3 +1,3 @@ outs: -- md5: 9106cda08aa427e73492389a0f17c72d +- sha256: 9106cda08aa427e73492389a0f17c72d path: dvc_left.bmp diff --git a/scripts/innosetup/dvc_up.bmp.dvc b/scripts/innosetup/dvc_up.bmp.dvc index 7fb5ae55..59df4a87 100644 --- a/scripts/innosetup/dvc_up.bmp.dvc +++ b/scripts/innosetup/dvc_up.bmp.dvc @@ -1,3 +1,3 @@ outs: -- md5: 94614d6650e062655f9f77507dc9c1f2 +- sha256: 94614d6650e062655f9f77507dc9c1f2 path: dvc_up.bmp diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 43c2f3c0..33e6f368 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -35,7 +35,7 @@ from dvc.stage.exceptions import ( ) from dvc.system import System 
from dvc.testing.test_workspace import TestAdd -from dvc.utils import LARGE_DIR_SIZE, file_md5, relpath +from dvc.utils import LARGE_DIR_SIZE, file_sha256, relpath from dvc.utils.fs import path_isin from dvc.utils.serialize import YAMLFileCorruptedError, load_yaml from tests.basic_env import TestDvc @@ -44,7 +44,7 @@ from tests.utils import get_gitignore_content def test_add(tmp_dir, dvc): (stage,) = tmp_dir.dvc_gen({"foo": "foo"}) - md5 = file_md5("foo", dvc.fs) + sha256 = file_sha256("foo", dvc.fs) assert stage is not None @@ -53,13 +53,13 @@ def test_add(tmp_dir, dvc): assert len(stage.outs) == 1 assert len(stage.deps) == 0 assert stage.cmd is None - assert stage.outs[0].hash_info == HashInfo("md5", md5) - assert stage.md5 is None + assert stage.outs[0].hash_info == HashInfo("sha256", sha256) + assert stage.sha256 is None assert (tmp_dir / "foo.dvc").parse() == { "outs": [ { - "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "sha256": "acbd18db4cc2f85cedef654fccc4a4d8", "path": "foo", "size": 3, } @@ -77,7 +77,7 @@ def test_add_executable(tmp_dir, dvc): assert (tmp_dir / "foo.dvc").parse() == { "outs": [ { - "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "sha256": "acbd18db4cc2f85cedef654fccc4a4d8", "path": "foo", "size": 3, "isexec": True, @@ -295,7 +295,7 @@ def test_add_filtered_files_in_dir( class TestAddExternal(TestAdd): @pytest.fixture def hash_name(self): - return "md5" + return "sha256" @pytest.fixture def hash_value(self): @@ -316,7 +316,7 @@ def test_add_external_relpath(tmp_dir, dvc, local_cloud): dvc.add(rel, external=True) assert (tmp_dir / "file.dvc").read_text() == ( "outs:\n" - "- md5: 8c7dd922ad47494fc02c388e12c00eac\n" + "- sha256: 8c7dd922ad47494fc02c388e12c00eac\n" " size: 4\n" f" path: {rel}\n" ) @@ -378,7 +378,7 @@ class TestDoubleAddUnchanged(TestDvc): def test_should_update_state_entry_for_file_after_add(mocker, dvc, tmp_dir): - file_md5_counter = mocker.spy(dvc_module.data.stage, "file_md5") + file_sha256_counter = 
mocker.spy(dvc_module.data.stage, "file_sha256") tmp_dir.gen("foo", "foo") ret = main(["config", "cache.type", "copy"]) @@ -386,30 +386,30 @@ def test_should_update_state_entry_for_file_after_add(mocker, dvc, tmp_dir): ret = main(["add", "foo"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 1 + assert file_sha256_counter.mock.call_count == 1 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 1 + assert file_sha256_counter.mock.call_count == 1 ret = main(["run", "--single-stage", "-d", "foo", "echo foo"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 1 + assert file_sha256_counter.mock.call_count == 1 os.rename("foo", "foo.back") ret = main(["checkout"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 1 + assert file_sha256_counter.mock.call_count == 1 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 1 + assert file_sha256_counter.mock.call_count == 1 def test_should_update_state_entry_for_directory_after_add( mocker, dvc, tmp_dir ): - file_md5_counter = mocker.spy(dvc_module.data.stage, "file_md5") + file_sha256_counter = mocker.spy(dvc_module.data.stage, "file_sha256") tmp_dir.gen({"data/data": "foo", "data/data_sub/sub_data": "foo"}) @@ -418,27 +418,27 @@ def test_should_update_state_entry_for_directory_after_add( ret = main(["add", "data"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 3 + assert file_sha256_counter.mock.call_count == 3 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 3 + assert file_sha256_counter.mock.call_count == 3 ls = "dir" if os.name == "nt" else "ls" ret = main( ["run", "--single-stage", "-d", "data", "{} {}".format(ls, "data")] ) assert ret == 0 - assert file_md5_counter.mock.call_count == 3 + assert file_sha256_counter.mock.call_count == 3 os.rename("data", "data" + ".back") ret = main(["checkout"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 3 + assert 
file_sha256_counter.mock.call_count == 3 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 3 + assert file_sha256_counter.mock.call_count == 3 class TestAddCommit(TestDvc): @@ -915,7 +915,7 @@ def test_add_preserve_meta(tmp_dir, dvc): outs: - path: foo # out comment desc: out desc - md5: acbd18db4cc2f85cedef654fccc4a4d8 + sha256: acbd18db4cc2f85cedef654fccc4a4d8 size: 3 meta: some metadata """ diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 865e8d2e..12620ec9 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -986,7 +986,7 @@ def test_checkout_dir_compat(tmp_dir, dvc): textwrap.dedent( f"""\ outs: - - md5: {stage.outs[0].hash_info.value} + - sha256: {stage.outs[0].hash_info.value} path: data """ ), diff --git a/tests/func/test_commit.py b/tests/func/test_commit.py index afa7bec2..808c0f3d 100644 --- a/tests/func/test_commit.py +++ b/tests/func/test_commit.py @@ -60,12 +60,12 @@ def test_commit_with_deps(tmp_dir, dvc, run_copy, run_kw): assert not stage.outs[0].changed_cache() -def test_commit_changed_md5(tmp_dir, dvc): +def test_commit_changed_sha256(tmp_dir, dvc): tmp_dir.gen({"file": "file content"}) (stage,) = dvc.add("file", no_commit=True) stage_file_content = (tmp_dir / stage.path).parse() - stage_file_content["md5"] = "1111111111" + stage_file_content["sha256"] = "1111111111" (tmp_dir / stage.path).dump(stage_file_content) clean_staging() @@ -74,7 +74,7 @@ def test_commit_changed_md5(tmp_dir, dvc): dvc.commit(stage.path) dvc.commit(stage.path, force=True) - assert "md5" not in (tmp_dir / stage.path).parse() + assert "sha256" not in (tmp_dir / stage.path).parse() def test_commit_no_exec(tmp_dir, dvc): diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index eea0e549..d9a6de16 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -132,7 +132,7 @@ def test_warn_on_outdated_stage(tmp_dir, dvc, local_remote, caplog): 
stage_file_path = stage.relpath content = (tmp_dir / stage_file_path).parse() - del content["outs"][0]["md5"] + del content["outs"][0]["sha256"] (tmp_dir / stage_file_path).dump(content) with caplog.at_level(logging.WARNING, logger="dvc"): @@ -149,7 +149,7 @@ def test_warn_on_outdated_stage(tmp_dir, dvc, local_remote, caplog): def test_hash_recalculation(mocker, dvc, tmp_dir, local_remote): tmp_dir.gen({"foo": "foo"}) - test_file_md5 = mocker.spy(dvc_module.data.stage, "file_md5") + test_file_sha256 = mocker.spy(dvc_module.data.stage, "file_sha256") ret = main(["config", "cache.type", "hardlink"]) assert ret == 0 ret = main(["add", "foo"]) @@ -158,7 +158,7 @@ def test_hash_recalculation(mocker, dvc, tmp_dir, local_remote): assert ret == 0 ret = main(["run", "--single-stage", "-d", "foo", "echo foo"]) assert ret == 0 - assert test_file_md5.mock.call_count == 1 + assert test_file_sha256.mock.call_count == 1 def test_missing_cache(tmp_dir, dvc, local_remote, caplog): @@ -174,8 +174,8 @@ def test_missing_cache(tmp_dir, dvc, local_remote, caplog): "Some of the cache files do not exist " "neither locally nor on remote. 
Missing cache files:\n" ) - foo = "name: bar, md5: 37b51d194a7513e45b56f6524f2d51f2\n" - bar = "name: foo, md5: acbd18db4cc2f85cedef654fccc4a4d8\n" + foo = "name: bar, sha256: 37b51d194a7513e45b56f6524f2d51f2\n" + bar = "name: foo, sha256: acbd18db4cc2f85cedef654fccc4a4d8\n" caplog.clear() dvc.push() @@ -211,7 +211,7 @@ def test_verify_hashes( remove("dir") remove(dvc.odb.local.cache_dir) - hash_spy = mocker.spy(dvc_module.data.stage, "file_md5") + hash_spy = mocker.spy(dvc_module.data.stage, "file_sha256") dvc.pull() assert hash_spy.call_count == 0 diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 976facc4..c5a794a1 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -9,7 +9,7 @@ from dvc.utils.fs import remove def digest(text): - return hashlib.md5(bytes(text, "utf-8")).hexdigest() + return hashlib.sha256(bytes(text, "utf-8")).hexdigest() def test_no_scm(tmp_dir, dvc): diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 068a20c1..1b6ae70d 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -212,7 +212,7 @@ def test_subrepos_are_ignored(tmp_dir, erepo_dir): repo.odb.local, os.path.join(repo.root_dir, "dir"), repo.repo_fs, - "md5", + "sha256", dvcignore=repo.dvcignore, ) transfer( diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index c2c33046..d74d41c6 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -22,8 +22,8 @@ class TestGC(TestDvcGit): self.dvc.add(self.FOO) self.dvc.add(self.DATA_DIR) self.good_cache = [ - self.dvc.odb.local.hash_to_path(md5) - for md5 in self.dvc.odb.local.all() + self.dvc.odb.local.hash_to_path(sha256) + for sha256 in self.dvc.odb.local.all() ] self.bad_cache = [] diff --git a/tests/func/test_import_url.py b/tests/func/test_import_url.py index 5868716b..309bf74e 100644 --- a/tests/func/test_import_url.py +++ b/tests/func/test_import_url.py @@ -120,11 +120,11 @@ def test_import_url_with_no_exec(tmp_dir, dvc, 
erepo_dir): class TestImport(_TestImport): @pytest.fixture - def stage_md5(self): + def stage_sha256(self): return "dc24e1271084ee317ac3c2656fb8812b" @pytest.fixture - def dir_md5(self): + def dir_sha256(self): return "b6dcab6ccd17ca0a8bf4a215a37d14cc.dir" @pytest.fixture @@ -155,15 +155,15 @@ def test_import_url_preserve_meta(tmp_dir, dvc): desc: top desc deps: - path: foo # dep comment - md5: acbd18db4cc2f85cedef654fccc4a4d8 + sha256: acbd18db4cc2f85cedef654fccc4a4d8 size: 3 outs: - path: bar # out comment desc: out desc - md5: acbd18db4cc2f85cedef654fccc4a4d8 + sha256: acbd18db4cc2f85cedef654fccc4a4d8 size: 3 meta: some metadata - md5: be7ade0aa89cc8d56e320867a9de9740 + sha256: be7ade0aa89cc8d56e320867a9de9740 frozen: true """ ) @@ -229,7 +229,7 @@ def test_import_url_to_remote_directory(tmp_dir, dvc, workspace, local_remote): for file_part in file_parts: with open( - local_remote.hash_to_path(file_part["md5"]), encoding="utf-8" + local_remote.hash_to_path(file_part["sha256"]), encoding="utf-8" ) as fobj: assert fobj.read() == file_part["relpath"] @@ -263,7 +263,7 @@ def test_import_url_to_remote_status(tmp_dir, dvc, local_cloud, local_remote): local_cloud.gen("foo", "foo") stage = dvc.imp_url(str(local_cloud / "foo"), to_remote=True) - assert stage.md5 is not None + assert stage.sha256 is not None status = dvc.status() assert status["foo.dvc"] == [{"changed outs": {"foo": "not in cache"}}] diff --git a/tests/func/test_install.py b/tests/func/test_install.py index ee6fde29..e7f4d6d8 100644 --- a/tests/func/test_install.py +++ b/tests/func/test_install.py @@ -6,7 +6,7 @@ import pytest from git import GitCommandError from dvc.exceptions import DvcException -from dvc.utils import file_md5 +from dvc.utils import file_sha256 from tests.func.parsing.test_errors import escape_ansi @@ -76,7 +76,7 @@ class TestInstall: conf["core"]["remote"] = "store" tmp_dir.dvc_gen("file", "file_content", "commit message") - file_checksum = file_md5("file", dvc.fs) + file_checksum = 
file_sha256("file", dvc.fs) expected_storage_path = ( storage_path / file_checksum[:2] / file_checksum[2:] ) @@ -117,7 +117,7 @@ def test_merge_driver_no_ancestor(tmp_dir, scm, dvc): assert (tmp_dir / "data").read_text() == {"bar": "bar"} assert (tmp_dir / "data.dvc").read_text() == ( "outs:\n" - "- md5: 5ea40360f5b4ec688df672a4db9c17d1.dir\n" + "- sha256: 5ea40360f5b4ec688df672a4db9c17d1.dir\n" " size: 6\n" " nfiles: 2\n" " path: data\n" @@ -154,7 +154,7 @@ def test_merge_driver(tmp_dir, scm, dvc): assert (tmp_dir / "data").read_text() == {"master": "master", "two": "two"} assert (tmp_dir / "data.dvc").read_text() == ( "outs:\n" - "- md5: 839ef9371606817569c1ee0e5f4ed233.dir\n" + "- sha256: 839ef9371606817569c1ee0e5f4ed233.dir\n" " size: 12\n" " nfiles: 3\n" " path: data\n" diff --git a/tests/func/test_lockfile.py b/tests/func/test_lockfile.py index eefeb210..4e1f1fcb 100644 --- a/tests/func/test_lockfile.py +++ b/tests/func/test_lockfile.py @@ -48,12 +48,12 @@ def test_deps_outs_are_sorted_by_path(tmp_dir, dvc, run_head): # lock stage key order: assert list(lock.keys()) == ["cmd", "deps", "outs"] - # `path` key appear first and then the `md5` + # `path` key appear first and then the `sha256` assert all( - list(dep.keys()) == ["path", "md5", "size"] for dep in lock["deps"] + list(dep.keys()) == ["path", "sha256", "size"] for dep in lock["deps"] ) assert all( - list(out.keys()) == ["path", "md5", "size"] for out in lock["outs"] + list(out.keys()) == ["path", "sha256", "size"] for out in lock["outs"] ) # deps are always sorted by the file path naming @@ -167,7 +167,7 @@ def test_params_dump(tmp_dir, dvc, run_head): def v1_repo_lock(tmp_dir, dvc): """Generates a repo having v1 format lockfile""" size = 5 if os.name == "nt" else 4 - hi = HashInfo(name="md5", value="c157a79031e1c40f85931829bc5fc552") + hi = HashInfo(name="sha256", value="c157a79031e1c40f85931829bc5fc552") v1_lockdata = { "foo": {"cmd": "echo foo"}, "bar": { diff --git a/tests/func/test_merge_driver.py 
b/tests/func/test_merge_driver.py index 113984f9..218e524a 100644 --- a/tests/func/test_merge_driver.py +++ b/tests/func/test_merge_driver.py @@ -118,11 +118,11 @@ def test_merge_different_output_options(tmp_dir, dvc, caplog): (tmp_dir / "ancestor").touch() (tmp_dir / "our").write_text( - "outs:\n- md5: f123456789.dir\n path: path\n" + "outs:\n- sha256: f123456789.dir\n path: path\n" ) (tmp_dir / "their").write_text( - "outs:\n- md5: f987654321.dir\n path: path\n cache: false\n" + "outs:\n- sha256: f987654321.dir\n path: path\n cache: false\n" ) assert ( @@ -149,10 +149,10 @@ def test_merge_file(tmp_dir, dvc, caplog): (tmp_dir / "ancestor").touch() (tmp_dir / "our").write_text( - "outs:\n- md5: f123456789.dir\n path: path\n" + "outs:\n- sha256: f123456789.dir\n path: path\n" ) - (tmp_dir / "their").write_text("outs:\n- md5: f987654321\n path: path\n") + (tmp_dir / "their").write_text("outs:\n- sha256: f987654321\n path: path\n") assert ( main( @@ -179,13 +179,13 @@ def test_merge_non_dvc_add(tmp_dir, dvc, caplog): (tmp_dir / "our").write_text( "outs:\n" - "- md5: f123456789.dir\n" + "- sha256: f123456789.dir\n" " path: path\n" - "- md5: ff123456789.dir\n" + "- sha256: ff123456789.dir\n" " path: another\n" ) - (tmp_dir / "their").write_text("outs:\n- md5: f987654321\n path: path\n") + (tmp_dir / "their").write_text("outs:\n- sha256: f987654321\n path: path\n") assert ( main( diff --git a/tests/func/test_move.py b/tests/func/test_move.py index aad2003a..3e28e628 100644 --- a/tests/func/test_move.py +++ b/tests/func/test_move.py @@ -261,7 +261,7 @@ def test_move_meta(tmp_dir, dvc): assert res == textwrap.dedent( """\ outs: - - md5: acbd18db4cc2f85cedef654fccc4a4d8 + - sha256: acbd18db4cc2f85cedef654fccc4a4d8 size: 3 path: bar meta: diff --git a/tests/func/test_odb.py b/tests/func/test_odb.py index ea548e00..2a85d4b4 100644 --- a/tests/func/test_odb.py +++ b/tests/func/test_odb.py @@ -12,17 +12,17 @@ from dvc.utils import relpath def test_cache(tmp_dir, dvc): - 
cache1_md5 = "123" - cache2_md5 = "234" + cache1_sha256 = "123" + cache2_sha256 = "234" cache1 = os.path.join( dvc.odb.local.cache_dir, - cache1_md5[0:2], - cache1_md5[2:], + cache1_sha256[0:2], + cache1_sha256[2:], ) cache2 = os.path.join( dvc.odb.local.cache_dir, - cache2_md5[0:2], - cache2_md5[2:], + cache2_sha256[0:2], + cache2_sha256[2:], ) tmp_dir.gen({cache1: "1", cache2: "2"}) @@ -31,13 +31,13 @@ def test_cache(tmp_dir, dvc): odb = ODBManager(dvc) - md5_list = list(odb.local.all()) - assert len(md5_list) == 2 - assert cache1_md5 in md5_list - assert cache2_md5 in md5_list + sha256_list = list(odb.local.all()) + assert len(sha256_list) == 2 + assert cache1_sha256 in sha256_list + assert cache2_sha256 in sha256_list - odb_cache1 = odb.local.hash_to_path(cache1_md5) - odb_cache2 = odb.local.hash_to_path(cache2_md5) + odb_cache1 = odb.local.hash_to_path(cache1_sha256) + odb_cache2 = odb.local.hash_to_path(cache2_sha256) assert os.fspath(odb_cache1) == cache1 assert os.fspath(odb_cache2) == cache2 @@ -49,13 +49,13 @@ def test_cache_load_bad_dir_cache(tmp_dir, dvc): fname = os.fspath(dvc.odb.local.hash_to_path(dir_hash)) tmp_dir.gen({fname: "not,json"}) with pytest.raises(ObjectFormatError): - load(dvc.odb.local, HashInfo("md5", dir_hash)) + load(dvc.odb.local, HashInfo("sha256", dir_hash)) dir_hash = "234.dir" fname = os.fspath(dvc.odb.local.hash_to_path(dir_hash)) tmp_dir.gen({fname: '{"a": "b"}'}) with pytest.raises(ObjectFormatError): - load(dvc.odb.local, HashInfo("md5", dir_hash)) + load(dvc.odb.local, HashInfo("sha256", dir_hash)) def test_external_cache_dir(tmp_dir, dvc, make_tmp_dir): diff --git a/tests/func/test_remote.py b/tests/func/test_remote.py index ff844ed7..5cbb8fe9 100644 --- a/tests/func/test_remote.py +++ b/tests/func/test_remote.py @@ -147,19 +147,19 @@ def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc): path = (tmp_dir / "data").fs_path tree = Tree.from_list( - [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}] + 
[{"relpath": "1", "sha256": "1"}, {"relpath": "2", "sha256": "2"}] ) tree.digest() with patch("dvc.data.stage._stage_tree", return_value=(None, tree)): - _, _, obj = stage(dvc.odb.local, path, dvc.odb.local.fs, "md5") + _, _, obj = stage(dvc.odb.local, path, dvc.odb.local.fs, "sha256") hash1 = obj.hash_info tree = Tree.from_list( - [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}] + [{"sha256": "1", "relpath": "1"}, {"sha256": "2", "relpath": "2"}] ) tree.digest() with patch("dvc.data.stage._stage_tree", return_value=(None, tree)): - _, _, obj = stage(dvc.odb.local, path, dvc.odb.local.fs, "md5") + _, _, obj = stage(dvc.odb.local, path, dvc.odb.local.fs, "sha256") hash2 = obj.hash_info assert hash1 == hash2 @@ -245,7 +245,7 @@ def test_remote_modify_local_on_repo_config(tmp_dir, dvc): def test_external_dir_resource_on_no_cache(tmp_dir, dvc, tmp_path_factory): # https://github.com/iterative/dvc/issues/2647, is some situations - # (external dir dependency) cache is required to calculate dir md5 + # (external dir dependency) cache is required to calculate dir sha256 external_dir = tmp_path_factory.mktemp("external_dir") file = external_dir / "file" diff --git a/tests/func/test_repo_index.py b/tests/func/test_repo_index.py index 22826a78..875a1a7f 100644 --- a/tests/func/test_repo_index.py +++ b/tests/func/test_repo_index.py @@ -269,17 +269,17 @@ def test_used_objs(tmp_dir, scm, dvc, run_copy, rev): expected_objs = [ HashInfo( - name="md5", + name="sha256", value="acbd18db4cc2f85cedef654fccc4a4d8", obj_name="bar", ), HashInfo( - name="md5", + name="sha256", value="8c7dd922ad47494fc02c388e12c00eac", obj_name="dir/subdir/file", ), HashInfo( - name="md5", + name="sha256", value="d28c9e28591aeb7e303dc6772ffa6f6b.dir", obj_name="dir", ), diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index 4426e9aa..d0a62183 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -19,7 +19,7 @@ from dvc.output import Output from dvc.stage import 
Stage from dvc.stage.exceptions import StageFileDoesNotExistError from dvc.system import System -from dvc.utils import file_md5, relpath +from dvc.utils import file_sha256, relpath from dvc.utils.fs import remove from dvc.utils.serialize import dump_yaml, load_yaml from tests.basic_env import TestDvc @@ -654,7 +654,7 @@ class TestReproDataSource(TestReproChangedData): self.assertTrue(filecmp.cmp(self.FOO, self.BAR, shallow=False)) self.assertEqual( - stages[0].outs[0].hash_info.value, file_md5(self.BAR, self.dvc.fs) + stages[0].outs[0].hash_info.value, file_sha256(self.BAR, self.dvc.fs) ) @@ -1127,21 +1127,21 @@ def test_dvc_formatting_retained(tmp_dir, dvc, run_copy): (tmp_dir / "foo").write_text("new foo") dvc.reproduce("foo_copy.dvc", force=True) - def _hide_md5(text): - return re.sub(r"\b[a-f0-9]{32}\b", "", text) + def _hide_sha256(text): + return re.sub(r"\b[a-f0-9]{32}\b", "", text) def _hide_size(text): return re.sub(r"size: [0-9]*\b", "size: ", text) def _mask(text): - return _hide_size(_hide_md5(text)) + return _hide_size(_hide_sha256(text)) assert _mask(stage_text) == _mask(stage_path.read_text()) def _format_dvc_line(line): - # Add line comment for all cache and md5 keys - if "cache:" in line or "md5:" in line: + # Add line comment for all cache and sha256 keys + if "cache:" in line or "sha256:" in line: return line + " # line comment" # Format command as one word per line if line.startswith("cmd: "): diff --git a/tests/func/test_run_multistage.py b/tests/func/test_run_multistage.py index f83b7e18..569a86de 100644 --- a/tests/func/test_run_multistage.py +++ b/tests/func/test_run_multistage.py @@ -355,7 +355,7 @@ def test_run_external_outputs( dvc, local_workspace, ): - hash_name = "md5" + hash_name = "sha256" foo_hash = "acbd18db4cc2f85cedef654fccc4a4d8" bar_hash = "37b51d194a7513e45b56f6524f2d51f2" diff --git a/tests/func/test_run_single_stage.py b/tests/func/test_run_single_stage.py index a4db9b13..62ea01f3 100644 --- 
a/tests/func/test_run_single_stage.py +++ b/tests/func/test_run_single_stage.py @@ -30,7 +30,7 @@ from dvc.stage.exceptions import ( StagePathOutsideError, ) from dvc.system import System -from dvc.utils import file_md5 +from dvc.utils import file_sha256 from dvc.utils.serialize import load_yaml from tests.basic_env import TestDvc, TestDvcGit @@ -60,7 +60,7 @@ class TestRun(TestDvc): self.assertEqual(len(stage.outs), len(outs + outs_no_cache)) self.assertEqual(stage.outs[0].fspath, outs[0]) self.assertEqual( - stage.outs[0].hash_info.value, file_md5(self.FOO, self.dvc.fs) + stage.outs[0].hash_info.value, file_sha256(self.FOO, self.dvc.fs) ) self.assertTrue(stage.path, fname) @@ -990,20 +990,20 @@ def test_run_force_preserves_comments_and_meta(tmp_dir, dvc, run_copy): cmd: python copy.py foo bar deps: - path: copy.py - md5: 90c27dd80b698fe766f0c3ee0b6b9729 + sha256: 90c27dd80b698fe766f0c3ee0b6b9729 size: {code_size} - path: foo - md5: acbd18db4cc2f85cedef654fccc4a4d8 + sha256: acbd18db4cc2f85cedef654fccc4a4d8 size: 3 outs: # comment preserved - path: bar desc: out desc - md5: acbd18db4cc2f85cedef654fccc4a4d8 + sha256: acbd18db4cc2f85cedef654fccc4a4d8 size: 3 meta: name: copy-foo-bar - md5: be659ce4a33cebb85d4e8e1335d394ad + sha256: be659ce4a33cebb85d4e8e1335d394ad """ ) @@ -1014,18 +1014,18 @@ def test_run_force_preserves_comments_and_meta(tmp_dir, dvc, run_copy): cmd: python copy.py foo1 bar1 deps: - path: foo1 - md5: 299a0be4a5a79e6a59fdd251b19d78bb + sha256: 299a0be4a5a79e6a59fdd251b19d78bb size: 4 - path: copy.py - md5: 90c27dd80b698fe766f0c3ee0b6b9729 + sha256: 90c27dd80b698fe766f0c3ee0b6b9729 size: {code_size} outs: # comment preserved - path: bar1 - md5: 299a0be4a5a79e6a59fdd251b19d78bb + sha256: 299a0be4a5a79e6a59fdd251b19d78bb size: 4 meta: name: copy-foo-bar - md5: 9e725b11cb393e6a7468369fa50328b7 + sha256: 9e725b11cb393e6a7468369fa50328b7 """ ) diff --git a/tests/func/test_stage.py b/tests/func/test_stage.py index 99908d8b..4c2a7bc9 100644 --- 
a/tests/func/test_stage.py +++ b/tests/func/test_stage.py @@ -78,8 +78,8 @@ class TestReload(TestDvc): d = load_yaml(stage.relpath) # NOTE: checking that reloaded stage didn't change its checksum - md5 = "11111111111111111111111111111111" - d[stage.PARAM_MD5] = md5 + sha256 = "11111111111111111111111111111111" + d[stage.PARAM_MD5] = sha256 dump_yaml(stage.relpath, d) dvcfile = SingleStageFile(self.dvc, stage.relpath) @@ -89,7 +89,7 @@ class TestReload(TestDvc): dvcfile.dump(stage) d = load_yaml(stage.relpath) - self.assertEqual(d[stage.PARAM_MD5], md5) + self.assertEqual(d[stage.PARAM_MD5], sha256) class TestDefaultWorkingDirectory(TestDvc): @@ -154,7 +154,7 @@ class TestExternalRemoteResolution(TestDvc): assert os.path.exists("movie.txt") -def test_md5_ignores_comments(tmp_dir, dvc): +def test_sha256_ignores_comments(tmp_dir, dvc): (stage,) = tmp_dir.dvc_gen("foo", "foo content") with open(stage.path, "a", encoding="utf-8") as f: diff --git a/tests/func/test_state.py b/tests/func/test_state.py index 173821a6..d7eab49e 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -4,13 +4,13 @@ import re from dvc.hash_info import HashInfo from dvc.repo import Repo from dvc.state import State -from dvc.utils import file_md5 +from dvc.utils import file_sha256 def test_state(tmp_dir, dvc): tmp_dir.gen("foo", "foo content") path = tmp_dir / "foo" - hash_info = HashInfo("md5", file_md5(path, dvc.fs)) + hash_info = HashInfo("sha256", file_sha256(path, dvc.fs)) state = State(dvc.root_dir, dvc.tmp_dir, dvc.dvcignore) @@ -22,7 +22,7 @@ def test_state(tmp_dir, dvc): assert state.get(path, dvc.fs) == (None, None) - hash_info = HashInfo("md5", file_md5(path, dvc.fs)) + hash_info = HashInfo("sha256", file_sha256(path, dvc.fs)) state.save(path, dvc.fs, hash_info) assert state.get(path, dvc.fs)[1] == hash_info diff --git a/tests/func/test_utils.py b/tests/func/test_utils.py index 026357af..eaa0abfb 100644 --- a/tests/func/test_utils.py +++ b/tests/func/test_utils.py @@ 
-2,14 +2,14 @@ from dvc import utils from dvc.fs.local import LocalFileSystem -def test_file_md5_crlf(tmp_dir): +def test_file_sha256_crlf(tmp_dir): fs = LocalFileSystem() tmp_dir.gen("cr", b"a\nb\nc") tmp_dir.gen("crlf", b"a\r\nb\r\nc") - assert utils.file_md5("cr", fs) == utils.file_md5("crlf", fs) + assert utils.file_sha256("cr", fs) == utils.file_sha256("crlf", fs) -def test_dict_md5(): +def test_dict_sha256(): d = { "cmd": "python code.py foo file1", "locked": "true", @@ -18,18 +18,18 @@ def test_dict_md5(): "path": "file1", "metric": {"type": "raw"}, "cache": False, - "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "sha256": "acbd18db4cc2f85cedef654fccc4a4d8", } ], "deps": [ - {"path": "foo", "md5": "acbd18db4cc2f85cedef654fccc4a4d8"}, - {"path": "code.py", "md5": "d05447644b89960913c7eee5fd776adb"}, + {"path": "foo", "sha256": "acbd18db4cc2f85cedef654fccc4a4d8"}, + {"path": "code.py", "sha256": "d05447644b89960913c7eee5fd776adb"}, ], } - md5 = "8b263fa05ede6c3145c164829be694b4" + sha256 = "8b263fa05ede6c3145c164829be694b4" - assert md5 == utils.dict_md5(d, exclude=["metric", "locked"]) + assert sha256 == utils.dict_sha256(d, exclude=["metric", "locked"]) def test_boxify(): diff --git a/tests/unit/fs/test_dvc.py b/tests/unit/fs/test_dvc.py index 4dd7a9c0..fb94bc63 100644 --- a/tests/unit/fs/test_dvc.py +++ b/tests/unit/fs/test_dvc.py @@ -221,7 +221,7 @@ def test_isdvc(tmp_dir, dvc): def test_get_hash_file(tmp_dir, dvc): tmp_dir.dvc_gen({"foo": "foo"}) fs = DvcFileSystem(repo=dvc) - assert fs.info("foo")["md5"] == "acbd18db4cc2f85cedef654fccc4a4d8" + assert fs.info("foo")["sha256"] == "acbd18db4cc2f85cedef654fccc4a4d8" def test_get_hash_dir(tmp_dir, dvc, mocker): @@ -232,7 +232,7 @@ def test_get_hash_dir(tmp_dir, dvc, mocker): ) fs = DvcFileSystem(repo=dvc) get_file_hash_spy = mocker.spy(dvc_module.data.stage, "get_file_hash") - assert fs.info("dir")["md5"] == "8761c4e9acad696bee718615e23e22db.dir" + assert fs.info("dir")["sha256"] == 
"8761c4e9acad696bee718615e23e22db.dir" assert not get_file_hash_spy.called @@ -242,15 +242,15 @@ def test_get_hash_granular(tmp_dir, dvc): ) fs = DvcFileSystem(repo=dvc) subdir = os.path.join("dir", "subdir") - assert fs.info(subdir).get("md5") is None - _, _, obj = stage(dvc.odb.local, subdir, fs, "md5", dry_run=True) + assert fs.info(subdir).get("sha256") is None + _, _, obj = stage(dvc.odb.local, subdir, fs, "sha256", dry_run=True) assert obj.hash_info == HashInfo( - "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir" + "sha256", "af314506f1622d107e0ed3f14ec1a3b5.dir" ) data = os.path.join(subdir, "data") - assert fs.info(data)["md5"] == "8d777f385d3dfec8815d20f7496026dc" - _, _, obj = stage(dvc.odb.local, data, fs, "md5", dry_run=True) - assert obj.hash_info == HashInfo("md5", "8d777f385d3dfec8815d20f7496026dc") + assert fs.info(data)["sha256"] == "8d777f385d3dfec8815d20f7496026dc" + _, _, obj = stage(dvc.odb.local, data, fs, "sha256", dry_run=True) + assert obj.hash_info == HashInfo("sha256", "8d777f385d3dfec8815d20f7496026dc") def test_get_hash_dirty_file(tmp_dir, dvc): @@ -259,9 +259,9 @@ def test_get_hash_dirty_file(tmp_dir, dvc): fs = DvcFileSystem(repo=dvc) expected = "8c7dd922ad47494fc02c388e12c00eac" - assert fs.info("file").get("md5") == expected - _, _, obj = stage(dvc.odb.local, "file", fs, "md5", dry_run=True) - assert obj.hash_info == HashInfo("md5", expected) + assert fs.info("file").get("sha256") == expected + _, _, obj = stage(dvc.odb.local, "file", fs, "sha256", dry_run=True) + assert obj.hash_info == HashInfo("sha256", expected) def test_get_hash_dirty_dir(tmp_dir, dvc): @@ -270,6 +270,6 @@ def test_get_hash_dirty_dir(tmp_dir, dvc): fs = DvcFileSystem(repo=dvc) expected = "5ea40360f5b4ec688df672a4db9c17d1.dir" - assert fs.info("dir").get("md5") == expected - _, _, obj = stage(dvc.odb.local, "dir", fs, "md5", dry_run=True) - assert obj.hash_info == HashInfo("md5", expected) + assert fs.info("dir").get("sha256") == expected + _, _, obj = 
stage(dvc.odb.local, "dir", fs, "sha256", dry_run=True) + assert obj.hash_info == HashInfo("sha256", expected) diff --git a/tests/unit/fs/test_repo.py b/tests/unit/fs/test_repo.py index 24b241fc..40b7ee89 100644 --- a/tests/unit/fs/test_repo.py +++ b/tests/unit/fs/test_repo.py @@ -508,11 +508,11 @@ def test_get_hash_cached_file(tmp_dir, dvc, mocker): tmp_dir.dvc_gen({"foo": "foo"}) fs = RepoFileSystem(repo=dvc) expected = "acbd18db4cc2f85cedef654fccc4a4d8" - assert fs.info((tmp_dir / "foo").fs_path).get("md5") is None - _, _, obj = stage(dvc.odb.local, (tmp_dir / "foo").fs_path, fs, "md5") - assert obj.hash_info == HashInfo("md5", expected) + assert fs.info((tmp_dir / "foo").fs_path).get("sha256") is None + _, _, obj = stage(dvc.odb.local, (tmp_dir / "foo").fs_path, fs, "sha256") + assert obj.hash_info == HashInfo("sha256", expected) (tmp_dir / "foo").unlink() - assert fs.info((tmp_dir / "foo").fs_path)["md5"] == expected + assert fs.info((tmp_dir / "foo").fs_path)["sha256"] == expected def test_get_hash_cached_dir(tmp_dir, dvc, mocker): @@ -521,17 +521,17 @@ def test_get_hash_cached_dir(tmp_dir, dvc, mocker): ) fs = RepoFileSystem(repo=dvc) expected = "8761c4e9acad696bee718615e23e22db.dir" - assert fs.info((tmp_dir / "dir").fs_path).get("md5") is None - _, _, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "md5") + assert fs.info((tmp_dir / "dir").fs_path).get("sha256") is None + _, _, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "sha256") assert obj.hash_info == HashInfo( - "md5", "8761c4e9acad696bee718615e23e22db.dir" + "sha256", "8761c4e9acad696bee718615e23e22db.dir" ) shutil.rmtree(tmp_dir / "dir") - assert fs.info((tmp_dir / "dir").fs_path)["md5"] == expected - _, _, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "md5") + assert fs.info((tmp_dir / "dir").fs_path)["sha256"] == expected + _, _, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "sha256") assert obj.hash_info == HashInfo( - "md5", 
"8761c4e9acad696bee718615e23e22db.dir" + "sha256", "8761c4e9acad696bee718615e23e22db.dir" ) @@ -541,17 +541,17 @@ def test_get_hash_cached_granular(tmp_dir, dvc, mocker): ) fs = RepoFileSystem(repo=dvc) subdir = tmp_dir / "dir" / "subdir" - assert fs.info(subdir.fs_path).get("md5") is None - _, _, obj = stage(dvc.odb.local, subdir.fs_path, fs, "md5") + assert fs.info(subdir.fs_path).get("sha256") is None + _, _, obj = stage(dvc.odb.local, subdir.fs_path, fs, "sha256") assert obj.hash_info == HashInfo( - "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir" + "sha256", "af314506f1622d107e0ed3f14ec1a3b5.dir" ) - assert fs.info((subdir / "data").fs_path).get("md5") is None - _, _, obj = stage(dvc.odb.local, (subdir / "data").fs_path, fs, "md5") - assert obj.hash_info == HashInfo("md5", "8d777f385d3dfec8815d20f7496026dc") + assert fs.info((subdir / "data").fs_path).get("sha256") is None + _, _, obj = stage(dvc.odb.local, (subdir / "data").fs_path, fs, "sha256") + assert obj.hash_info == HashInfo("sha256", "8d777f385d3dfec8815d20f7496026dc") (tmp_dir / "dir" / "subdir" / "data").unlink() assert ( - fs.info((subdir / "data").fs_path)["md5"] + fs.info((subdir / "data").fs_path)["sha256"] == "8d777f385d3dfec8815d20f7496026dc" ) @@ -570,9 +570,9 @@ def test_get_hash_mixed_dir(tmp_dir, scm, dvc): clean_staging() fs = RepoFileSystem(repo=dvc) - _, _, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "md5") + _, _, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "sha256") assert obj.hash_info == HashInfo( - "md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir" + "sha256", "e1d9e8eae5374860ae025ec84cfd85c7.dir" ) @@ -582,19 +582,19 @@ def test_get_hash_dirty_file(tmp_dir, dvc): from dvc.objects.errors import ObjectFormatError tmp_dir.dvc_gen("file", "file") - file_hash_info = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac") + file_hash_info = HashInfo("sha256", "8c7dd922ad47494fc02c388e12c00eac") (tmp_dir / "file").write_text("something") - something_hash_info = 
HashInfo("md5", "437b930db84b8079c2dd804a71936b5f") + something_hash_info = HashInfo("sha256", "437b930db84b8079c2dd804a71936b5f") clean_staging() # file is modified in workspace # get_file_hash(file) should return workspace hash, not DVC cached hash fs = RepoFileSystem(repo=dvc) - assert fs.info((tmp_dir / "file").fs_path).get("md5") is None + assert fs.info((tmp_dir / "file").fs_path).get("sha256") is None staging, _, obj = stage( - dvc.odb.local, (tmp_dir / "file").fs_path, fs, "md5" + dvc.odb.local, (tmp_dir / "file").fs_path, fs, "sha256" ) assert obj.hash_info == something_hash_info check(staging, obj) @@ -606,15 +606,15 @@ def test_get_hash_dirty_file(tmp_dir, dvc): check(staging, obj) # get_file_hash(file) should return DVC cached hash - assert fs.info((tmp_dir / "file").fs_path)["md5"] == file_hash_info.value + assert fs.info((tmp_dir / "file").fs_path)["sha256"] == file_hash_info.value _, hash_info = get_file_hash( - (tmp_dir / "file").fs_path, fs, "md5", state=dvc.state + (tmp_dir / "file").fs_path, fs, "sha256", state=dvc.state ) assert hash_info == file_hash_info # tmp_dir/file can be staged even though it is missing in workspace since # repofs will use the DVC cached hash (and refer to the local cache object) - _, _, obj = stage(dvc.odb.local, (tmp_dir / "file").fs_path, fs, "md5") + _, _, obj = stage(dvc.odb.local, (tmp_dir / "file").fs_path, fs, "sha256") assert obj.hash_info == file_hash_info @@ -624,9 +624,9 @@ def test_get_hash_dirty_dir(tmp_dir, dvc): clean_staging() fs = RepoFileSystem(repo=dvc) - _, meta, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "md5") + _, meta, obj = stage(dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "sha256") assert obj.hash_info == HashInfo( - "md5", "ba75a2162ca9c29acecb7957105a0bc2.dir" + "sha256", "ba75a2162ca9c29acecb7957105a0bc2.dir" ) assert meta.nfiles == 3 diff --git a/tests/unit/objects/db/test_local.py b/tests/unit/objects/db/test_local.py index 57b452bc..2f46ff53 100644 --- 
a/tests/unit/objects/db/test_local.py +++ b/tests/unit/objects/db/test_local.py @@ -17,8 +17,8 @@ def test_status_download_optimization(mocker, dvc): odb = LocalObjectDB(LocalFileSystem(), os.getcwd()) obj_ids = { - HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8"), - HashInfo("md5", "37b51d194a7513e45b56f6524f2d51f2"), + HashInfo("sha256", "acbd18db4cc2f85cedef654fccc4a4d8"), + HashInfo("sha256", "37b51d194a7513e45b56f6524f2d51f2"), } local_exists = [hash_info.value for hash_info in obj_ids] @@ -94,7 +94,7 @@ def test_staging_file(tmp_dir, dvc): local_odb = dvc.odb.local staging_odb, _, obj = stage( - local_odb, (tmp_dir / "foo").fs_path, fs, "md5" + local_odb, (tmp_dir / "foo").fs_path, fs, "sha256" ) assert not local_odb.exists(obj.hash_info) @@ -122,7 +122,7 @@ def test_staging_dir(tmp_dir, dvc): local_odb = dvc.odb.local staging_odb, _, obj = stage( - local_odb, (tmp_dir / "dir").fs_path, fs, "md5" + local_odb, (tmp_dir / "dir").fs_path, fs, "sha256" ) assert not local_odb.exists(obj.hash_info) diff --git a/tests/unit/objects/test_tree.py b/tests/unit/objects/test_tree.py index f765a663..9dd53ae0 100644 --- a/tests/unit/objects/test_tree.py +++ b/tests/unit/objects/test_tree.py @@ -13,57 +13,57 @@ from dvc.hash_info import HashInfo ([], {}), ( [ - {"md5": "def", "relpath": "zzz"}, - {"md5": "123", "relpath": "foo"}, - {"md5": "abc", "relpath": "aaa"}, - {"md5": "456", "relpath": "bar"}, + {"sha256": "def", "relpath": "zzz"}, + {"sha256": "123", "relpath": "foo"}, + {"sha256": "abc", "relpath": "aaa"}, + {"sha256": "456", "relpath": "bar"}, ], { - ("zzz",): (None, HashInfo("md5", "def")), - ("foo",): (None, HashInfo("md5", "123")), - ("bar",): (None, HashInfo("md5", "456")), - ("aaa",): (None, HashInfo("md5", "abc")), + ("zzz",): (None, HashInfo("sha256", "def")), + ("foo",): (None, HashInfo("sha256", "123")), + ("bar",): (None, HashInfo("sha256", "456")), + ("aaa",): (None, HashInfo("sha256", "abc")), }, ), ( [ - {"md5": "123", "relpath": "dir/b"}, - {"md5": 
"456", "relpath": "dir/z"}, - {"md5": "789", "relpath": "dir/a"}, - {"md5": "abc", "relpath": "b"}, - {"md5": "def", "relpath": "a"}, - {"md5": "ghi", "relpath": "z"}, - {"md5": "jkl", "relpath": "dir/subdir/b"}, - {"md5": "mno", "relpath": "dir/subdir/z"}, - {"md5": "pqr", "relpath": "dir/subdir/a"}, + {"sha256": "123", "relpath": "dir/b"}, + {"sha256": "456", "relpath": "dir/z"}, + {"sha256": "789", "relpath": "dir/a"}, + {"sha256": "abc", "relpath": "b"}, + {"sha256": "def", "relpath": "a"}, + {"sha256": "ghi", "relpath": "z"}, + {"sha256": "jkl", "relpath": "dir/subdir/b"}, + {"sha256": "mno", "relpath": "dir/subdir/z"}, + {"sha256": "pqr", "relpath": "dir/subdir/a"}, ], { ("dir", "b"): ( None, - HashInfo("md5", "123"), + HashInfo("sha256", "123"), ), ("dir", "z"): ( None, - HashInfo("md5", "456"), + HashInfo("sha256", "456"), ), ("dir", "a"): ( None, - HashInfo("md5", "789"), + HashInfo("sha256", "789"), ), - ("b",): (None, HashInfo("md5", "abc")), - ("a",): (None, HashInfo("md5", "def")), - ("z",): (None, HashInfo("md5", "ghi")), + ("b",): (None, HashInfo("sha256", "abc")), + ("a",): (None, HashInfo("sha256", "def")), + ("z",): (None, HashInfo("sha256", "ghi")), ("dir", "subdir", "b"): ( None, - HashInfo("md5", "jkl"), + HashInfo("sha256", "jkl"), ), ("dir", "subdir", "z"): ( None, - HashInfo("md5", "mno"), + HashInfo("sha256", "mno"), ), ("dir", "subdir", "a"): ( None, - HashInfo("md5", "pqr"), + HashInfo("sha256", "pqr"), ), }, ), @@ -81,19 +81,19 @@ def test_list(lst, trie_dict): ({}, 0), ( { - ("a",): (Meta(size=1), HashInfo("md5", "abc")), - ("b",): (Meta(size=2), HashInfo("md5", "def")), - ("c",): (Meta(size=3), HashInfo("md5", "ghi")), - ("dir", "foo"): (Meta(size=4), HashInfo("md5", "jkl")), - ("dir", "bar"): (Meta(size=5), HashInfo("md5", "mno")), - ("dir", "baz"): (Meta(size=6), HashInfo("md5", "pqr")), + ("a",): (Meta(size=1), HashInfo("sha256", "abc")), + ("b",): (Meta(size=2), HashInfo("sha256", "def")), + ("c",): (Meta(size=3), 
HashInfo("sha256", "ghi")), + ("dir", "foo"): (Meta(size=4), HashInfo("sha256", "jkl")), + ("dir", "bar"): (Meta(size=5), HashInfo("sha256", "mno")), + ("dir", "baz"): (Meta(size=6), HashInfo("sha256", "pqr")), }, 6, ), ( { - ("a",): (Meta(size=1), HashInfo("md5", "abc")), - ("b",): (Meta(), HashInfo("md5", "def")), + ("a",): (Meta(size=1), HashInfo("sha256", "abc")), + ("b",): (Meta(), HashInfo("sha256", "def")), }, 2, ), @@ -110,15 +110,15 @@ def test_nfiles(trie_dict, nfiles): [ {}, { - ("a",): (None, HashInfo("md5", "abc")), - ("b",): (None, HashInfo("md5", "def")), - ("c",): (None, HashInfo("md5", "ghi")), - ("dir", "foo"): (None, HashInfo("md5", "jkl")), - ("dir", "bar"): (None, HashInfo("md5", "mno")), - ("dir", "baz"): (None, HashInfo("md5", "pqr")), - ("dir", "subdir", "1"): (None, HashInfo("md5", "stu")), - ("dir", "subdir", "2"): (None, HashInfo("md5", "vwx")), - ("dir", "subdir", "3"): (None, HashInfo("md5", "yz")), + ("a",): (None, HashInfo("sha256", "abc")), + ("b",): (None, HashInfo("sha256", "def")), + ("c",): (None, HashInfo("sha256", "ghi")), + ("dir", "foo"): (None, HashInfo("sha256", "jkl")), + ("dir", "bar"): (None, HashInfo("sha256", "mno")), + ("dir", "baz"): (None, HashInfo("sha256", "pqr")), + ("dir", "subdir", "1"): (None, HashInfo("sha256", "stu")), + ("dir", "subdir", "2"): (None, HashInfo("sha256", "vwx")), + ("dir", "subdir", "3"): (None, HashInfo("sha256", "yz")), }, ], ) @@ -135,63 +135,63 @@ def test_items(trie_dict): [ ({}, {}, {}, {}), ( - {("foo",): HashInfo("md5", "123")}, + {("foo",): HashInfo("sha256", "123")}, { - ("foo",): HashInfo("md5", "123"), - ("bar",): HashInfo("md5", "345"), + ("foo",): HashInfo("sha256", "123"), + ("bar",): HashInfo("sha256", "345"), }, { - ("foo",): HashInfo("md5", "123"), - ("baz",): HashInfo("md5", "678"), + ("foo",): HashInfo("sha256", "123"), + ("baz",): HashInfo("sha256", "678"), }, { - ("foo",): HashInfo("md5", "123"), - ("bar",): HashInfo("md5", "345"), - ("baz",): HashInfo("md5", "678"), + 
("foo",): HashInfo("sha256", "123"), + ("bar",): HashInfo("sha256", "345"), + ("baz",): HashInfo("sha256", "678"), }, ), ( { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), }, { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), - ("subdir", "bar"): HashInfo("md5", "678"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), + ("subdir", "bar"): HashInfo("sha256", "678"), }, { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), - ("subdir", "baz"): HashInfo("md5", "91011"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), + ("subdir", "baz"): HashInfo("sha256", "91011"), }, { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), - ("subdir", "bar"): HashInfo("md5", "678"), - ("subdir", "baz"): HashInfo("md5", "91011"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), + ("subdir", "bar"): HashInfo("sha256", "678"), + ("subdir", "baz"): HashInfo("sha256", "91011"), }, ), ( {}, - {("foo",): HashInfo("md5", "123")}, - {("bar",): HashInfo("md5", "456")}, + {("foo",): HashInfo("sha256", "123")}, + {("bar",): HashInfo("sha256", "456")}, { - ("foo",): HashInfo("md5", "123"), - ("bar",): HashInfo("md5", "456"), + ("foo",): HashInfo("sha256", "123"), + ("bar",): HashInfo("sha256", "456"), }, ), ( {}, {}, - {("bar",): HashInfo("md5", "123")}, - {("bar",): HashInfo("md5", "123")}, + {("bar",): HashInfo("sha256", "123")}, + {("bar",): HashInfo("sha256", "123")}, ), ( {}, - {("bar",): HashInfo("md5", "123")}, + {("bar",): HashInfo("sha256", "123")}, {}, - {("bar",): HashInfo("md5", "123")}, + {("bar",): HashInfo("sha256", "123")}, ), ], ) diff --git a/tests/unit/output/test_local.py b/tests/unit/output/test_local.py index 2cb3ce14..ee8f961d 
100644 --- a/tests/unit/output/test_local.py +++ b/tests/unit/output/test_local.py @@ -64,12 +64,12 @@ class TestGetFilesNumber(TestDvc): def test_return_multiple_for_dir(self): o = self._get_output() - o.hash_info = HashInfo("md5", "12345678.dir") + o.hash_info = HashInfo("sha256", "12345678.dir") o.meta = Meta(nfiles=2) self.assertEqual(2, o.get_files_number()) @patch.object(Output, "is_dir_checksum", False) def test_return_1_on_single_file_cache(self): o = self._get_output() - o.hash_info = HashInfo("md5", "12345678") + o.hash_info = HashInfo("sha256", "12345678") self.assertEqual(1, o.get_files_number()) diff --git a/tests/unit/output/test_output.py b/tests/unit/output/test_output.py index 46e892b1..0bdcd3fd 100644 --- a/tests/unit/output/test_output.py +++ b/tests/unit/output/test_output.py @@ -29,7 +29,7 @@ def test_save_missing(dvc, mocker): ( "3cc286c534a71504476da009ed174423", "3cc286c534a71504476da009ed174423", - ), # md5 + ), # sha256 ( "d41d8cd98f00b204e9800998ecf8427e-38", "d41d8cd98f00b204e9800998ecf8427e-38", diff --git a/tests/unit/repo/test_repo.py b/tests/unit/repo/test_repo.py index 1528ca6b..93e094c9 100644 --- a/tests/unit/repo/test_repo.py +++ b/tests/unit/repo/test_repo.py @@ -48,8 +48,8 @@ def test_used_objs(tmp_dir, dvc, path): tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}}) expected = { - HashInfo("md5", "70922d6bf66eb073053a82f77d58c536.dir"), - HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac"), + HashInfo("sha256", "70922d6bf66eb073053a82f77d58c536.dir"), + HashInfo("sha256", "8c7dd922ad47494fc02c388e12c00eac"), } used = set() diff --git a/tests/unit/stage/test_loader_pipeline_file.py b/tests/unit/stage/test_loader_pipeline_file.py index 84847ed2..696b3302 100644 --- a/tests/unit/stage/test_loader_pipeline_file.py +++ b/tests/unit/stage/test_loader_pipeline_file.py @@ -20,8 +20,8 @@ def stage_data(): def lock_data(): return { "cmd": "command", - "deps": [{"path": "foo", "md5": "foo_checksum"}], - "outs": 
[{"path": "bar", "md5": "bar_checksum"}], + "deps": [{"path": "foo", "sha256": "foo_checksum"}], + "outs": [{"path": "bar", "sha256": "bar_checksum"}], } @@ -35,8 +35,8 @@ def test_fill_from_lock_deps_outs(dvc, lock_data): StageLoader.fill_from_lock(stage, lock_data) - assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum") - assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum") + assert stage.deps[0].hash_info == HashInfo("sha256", "foo_checksum") + assert stage.outs[0].hash_info == HashInfo("sha256", "bar_checksum") def test_fill_from_lock_outs_isexec(dvc): @@ -48,12 +48,12 @@ def test_fill_from_lock_outs_isexec(dvc): stage, { "cmd": "command", - "outs": [{"path": "foo", "md5": "foo_checksum", "isexec": True}], + "outs": [{"path": "foo", "sha256": "foo_checksum", "isexec": True}], }, ) assert stage.outs[0].def_path == "foo" - assert stage.outs[0].hash_info == HashInfo("md5", "foo_checksum") + assert stage.outs[0].hash_info == HashInfo("sha256", "foo_checksum") assert stage.outs[0].meta.isexec @@ -118,8 +118,8 @@ def test_fill_from_lock_missing_checksums(dvc, lock_data): StageLoader.fill_from_lock(stage, lock_data) - assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum") - assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum") + assert stage.deps[0].hash_info == HashInfo("sha256", "foo_checksum") + assert stage.outs[0].hash_info == HashInfo("sha256", "bar_checksum") assert not stage.deps[1].hash_info and not stage.outs[1].hash_info @@ -134,7 +134,7 @@ def test_fill_from_lock_use_appropriate_checksum(dvc, lock_data): lock_data["deps"] = [{"path": "s3://dvc-temp/foo", "etag": "e-tag"}] StageLoader.fill_from_lock(stage, lock_data) assert stage.deps[0].hash_info == HashInfo("etag", "e-tag") - assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum") + assert stage.outs[0].hash_info == HashInfo("sha256", "bar_checksum") def test_fill_from_lock_with_missing_sections(dvc, lock_data): @@ -145,12 +145,12 @@ def 
test_fill_from_lock_with_missing_sections(dvc, lock_data): del lock["deps"] StageLoader.fill_from_lock(stage, lock) assert not stage.deps[0].hash_info - assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum") + assert stage.outs[0].hash_info == HashInfo("sha256", "bar_checksum") lock = deepcopy(lock_data) del lock["outs"] StageLoader.fill_from_lock(stage, lock) - assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum") + assert stage.deps[0].hash_info == HashInfo("sha256", "foo_checksum") assert not stage.outs[0].hash_info @@ -173,9 +173,9 @@ def test_load_stage(dvc, stage_data, lock_data): assert stage.cmd == "command" assert stage.path == os.path.abspath(PIPELINE_FILE) assert stage.deps[0].def_path == "foo" - assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum") + assert stage.deps[0].hash_info == HashInfo("sha256", "foo_checksum") assert stage.outs[0].def_path == "bar" - assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum") + assert stage.outs[0].hash_info == HashInfo("sha256", "bar_checksum") def test_load_stage_cmd_with_list(dvc, stage_data, lock_data): @@ -210,8 +210,8 @@ def test_load_stage_with_params(dvc, stage_data, lock_data): assert deps[0].def_path == "foo" and stage.outs[0].def_path == "bar" assert params[0].def_path == "params.yaml" assert params[0].hash_info == HashInfo("params", {"lorem": "ipsum"}) - assert deps[0].hash_info == HashInfo("md5", "foo_checksum") - assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum") + assert deps[0].hash_info == HashInfo("sha256", "foo_checksum") + assert stage.outs[0].hash_info == HashInfo("sha256", "bar_checksum") @pytest.mark.parametrize("typ", ["metrics", "plots"]) @@ -221,7 +221,7 @@ def test_load_stage_with_metrics_and_plots(dvc, stage_data, lock_data, typ): stage = StageLoader.load_stage(dvcfile, "stage-1", stage_data, lock_data) assert stage.outs[0].def_path == "bar" - assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum") + assert 
stage.outs[0].hash_info == HashInfo("sha256", "bar_checksum") def test_load_changed_command(dvc, stage_data, lock_data): diff --git a/tests/unit/stage/test_serialize_pipeline_lock.py b/tests/unit/stage/test_serialize_pipeline_lock.py index 968b3183..846c2c62 100644 --- a/tests/unit/stage/test_serialize_pipeline_lock.py +++ b/tests/unit/stage/test_serialize_pipeline_lock.py @@ -31,11 +31,11 @@ def test_lock(dvc): def test_lock_deps(dvc): stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs) - stage.deps[0].hash_info = HashInfo("md5", "md-five") + stage.deps[0].hash_info = HashInfo("sha256", "md-five") assert to_single_stage_lockfile(stage) == OrderedDict( [ ("cmd", "command"), - ("deps", [OrderedDict([("path", "input"), ("md5", "md-five")])]), + ("deps", [OrderedDict([("path", "input"), ("sha256", "md-five")])]), ] ) @@ -44,16 +44,16 @@ def test_lock_deps_order(dvc): stage = create_stage( PipelineStage, dvc, deps=["input1", "input0"], **kwargs ) - stage.deps[0].hash_info = HashInfo("md5", "md-one1") - stage.deps[1].hash_info = HashInfo("md5", "md-zer0") + stage.deps[0].hash_info = HashInfo("sha256", "md-one1") + stage.deps[1].hash_info = HashInfo("sha256", "md-zer0") assert to_single_stage_lockfile(stage) == OrderedDict( [ ("cmd", "command"), ( "deps", [ - OrderedDict([("path", "input0"), ("md5", "md-zer0")]), - OrderedDict([("path", "input1"), ("md5", "md-one1")]), + OrderedDict([("path", "input0"), ("sha256", "md-zer0")]), + OrderedDict([("path", "input1"), ("sha256", "md-one1")]), ], ), ] @@ -123,11 +123,11 @@ def test_lock_params_no_values_filled(dvc): @pytest.mark.parametrize("typ", ["plots", "metrics", "outs"]) def test_lock_outs(dvc, typ): stage = create_stage(PipelineStage, dvc, **{typ: ["input"]}, **kwargs) - stage.outs[0].hash_info = HashInfo("md5", "md-five") + stage.outs[0].hash_info = HashInfo("sha256", "md-five") assert to_single_stage_lockfile(stage) == OrderedDict( [ ("cmd", "command"), - ("outs", [OrderedDict([("path", "input"), ("md5", 
"md-five")])]), + ("outs", [OrderedDict([("path", "input"), ("sha256", "md-five")])]), ] ) @@ -135,7 +135,7 @@ def test_lock_outs(dvc, typ): @pytest.mark.parametrize("typ", ["plots", "metrics", "outs"]) def test_lock_outs_isexec(dvc, typ): stage = create_stage(PipelineStage, dvc, **{typ: ["input"]}, **kwargs) - stage.outs[0].hash_info = HashInfo("md5", "md-five") + stage.outs[0].hash_info = HashInfo("sha256", "md-five") stage.outs[0].meta.isexec = True assert to_single_stage_lockfile(stage) == OrderedDict( [ @@ -146,7 +146,7 @@ def test_lock_outs_isexec(dvc, typ): OrderedDict( [ ("path", "input"), - ("md5", "md-five"), + ("sha256", "md-five"), ("isexec", True), ] ) @@ -161,16 +161,16 @@ def test_lock_outs_order(dvc, typ): stage = create_stage( PipelineStage, dvc, **{typ: ["input1", "input0"]}, **kwargs ) - stage.outs[0].hash_info = HashInfo("md5", "md-one1") - stage.outs[1].hash_info = HashInfo("md5", "md-zer0") + stage.outs[0].hash_info = HashInfo("sha256", "md-one1") + stage.outs[1].hash_info = HashInfo("sha256", "md-zer0") assert to_single_stage_lockfile(stage) == OrderedDict( [ ("cmd", "command"), ( "outs", [ - OrderedDict([("path", "input0"), ("md5", "md-zer0")]), - OrderedDict([("path", "input1"), ("md5", "md-one1")]), + OrderedDict([("path", "input0"), ("sha256", "md-zer0")]), + OrderedDict([("path", "input1"), ("sha256", "md-one1")]), ], ), ] @@ -181,7 +181,7 @@ def test_dump_nondefault_hash(dvc): stage = create_stage( PipelineStage, dvc, deps=["s3://dvc-temp/file"], **kwargs ) - stage.deps[0].hash_info = HashInfo("md5", "value") + stage.deps[0].hash_info = HashInfo("sha256", "value") assert to_single_stage_lockfile(stage) == OrderedDict( [ ("cmd", "command"), @@ -189,7 +189,7 @@ def test_dump_nondefault_hash(dvc): "deps", [ OrderedDict( - [("path", "s3://dvc-temp/file"), ("md5", "value")] + [("path", "s3://dvc-temp/file"), ("sha256", "value")] ) ], ), @@ -208,23 +208,23 @@ def test_order(dvc): ) params, deps = split_params_deps(stage) - deps[0].hash_info = 
HashInfo("md5", "md-five") + deps[0].hash_info = HashInfo("sha256", "md-five") params[0].hash_info = HashInfo("params", {"foo-param": "value"}) - stage.outs[0].hash_info = HashInfo("md5", "md5-output") + stage.outs[0].hash_info = HashInfo("sha256", "sha256-output") assert to_single_stage_lockfile(stage) == OrderedDict( [ ("cmd", "command"), - ("deps", [{"path": "input", "md5": "md-five"}]), + ("deps", [{"path": "input", "sha256": "md-five"}]), ("params", {"params.yaml": {"foo-param": "value"}}), - ("outs", [{"path": "output", "md5": "md5-output"}]), + ("outs", [{"path": "output", "sha256": "sha256-output"}]), ] ) def test_to_lockfile(dvc): stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs) - stage.deps[0].hash_info = HashInfo("md5", "md-five") + stage.deps[0].hash_info = HashInfo("sha256", "md-five") entry = to_lockfile(stage) assert len(entry) == 1 _Schema(LOCKFILE_STAGES_SCHEMA)(entry) @@ -232,7 +232,7 @@ def test_to_lockfile(dvc): "something": OrderedDict( [ ("cmd", "command"), - ("deps", [{"path": "input", "md5": "md-five"}]), + ("deps", [{"path": "input", "sha256": "md-five"}]), ] ) } diff --git a/tests/unit/stage/test_stage.py b/tests/unit/stage/test_stage.py index f564448a..fb6ac3d2 100644 --- a/tests/unit/stage/test_stage.py +++ b/tests/unit/stage/test_stage.py @@ -10,10 +10,10 @@ from dvc.stage import Stage from dvc.stage.exceptions import StageUpdateError TEST_STAGE_DICT = { - "md5": "123456", + "sha256": "123456", "cmd": "mycmd", - "outs": [{"path": "a", "md5": "123456789"}], - "deps": [{"path": "b", "md5": "987654321"}], + "outs": [{"path": "a", "sha256": "123456789"}], + "deps": [{"path": "b", "sha256": "987654321"}], } @@ -21,7 +21,7 @@ def test_stage_checksum(mocker): stage = Stage(None, "path", cmd="mycmd") mocker.patch.object(stage, "dumpd", return_value=TEST_STAGE_DICT) - assert stage.compute_md5() == "e9521a22111493406ea64a88cda63e0b" + assert stage.compute_sha256() == "e9521a22111493406ea64a88cda63e0b" def 
test_wdir_default_ignored(mocker): @@ -29,7 +29,7 @@ def test_wdir_default_ignored(mocker): d = dict(TEST_STAGE_DICT, wdir=".") mocker.patch.object(stage, "dumpd", return_value=d) - assert stage.compute_md5() == "e9521a22111493406ea64a88cda63e0b" + assert stage.compute_sha256() == "e9521a22111493406ea64a88cda63e0b" def test_wdir_non_default_is_not_ignored(mocker): @@ -37,7 +37,7 @@ def test_wdir_non_default_is_not_ignored(mocker): d = dict(TEST_STAGE_DICT, wdir="..") mocker.patch.object(stage, "dumpd", return_value=d) - assert stage.compute_md5() == "2ceba15e87f6848aa756502c1e6d24e9" + assert stage.compute_sha256() == "2ceba15e87f6848aa756502c1e6d24e9" def test_meta_ignored(mocker): @@ -45,7 +45,7 @@ def test_meta_ignored(mocker): d = dict(TEST_STAGE_DICT, meta={"author": "Suor"}) mocker.patch.object(stage, "dumpd", return_value=d) - assert stage.compute_md5() == "e9521a22111493406ea64a88cda63e0b" + assert stage.compute_sha256() == "e9521a22111493406ea64a88cda63e0b" def test_path_conversion(dvc): diff --git a/tests/unit/test_lockfile.py b/tests/unit/test_lockfile.py index ff42a775..831f9c45 100644 --- a/tests/unit/test_lockfile.py +++ b/tests/unit/test_lockfile.py @@ -31,8 +31,8 @@ def test_stage_dump_with_deps_and_outs(tmp_dir, dvc): data = { "s1": { "cmd": "command", - "deps": [{"md5": "1.txt", "path": "checksum"}], - "outs": [{"md5": "2.txt", "path": "checksum"}], + "deps": [{"sha256": "1.txt", "path": "checksum"}], + "outs": [{"sha256": "2.txt", "path": "checksum"}], } } (tmp_dir / "path.lock").dump(data) @@ -70,11 +70,11 @@ def test_load_when_lockfile_does_not_exist(tmp_dir, dvc): "s1": { "cmd": "command", "outs": [ - {"md5": "checksum", "path": "path", "random": "value"} + {"sha256": "checksum", "path": "path", "random": "value"} ], } }, - {"s1": {"cmd": "command", "deps": [{"md5": "checksum"}]}}, + {"s1": {"cmd": "command", "deps": [{"sha256": "checksum"}]}}, ], ) def test_load_when_lockfile_is_corrupted(tmp_dir, dvc, corrupt_data): diff --git 
a/tests/unit/utils/test_stream.py b/tests/unit/utils/test_stream.py index 2a80c3f0..ce454733 100644 --- a/tests/unit/utils/test_stream.py +++ b/tests/unit/utils/test_stream.py @@ -2,7 +2,7 @@ import pytest from dvc.fs.local import LocalFileSystem from dvc.istextfile import DEFAULT_CHUNK_SIZE, istextfile -from dvc.utils import file_md5 +from dvc.utils import file_sha256 from dvc.utils.stream import HashedStreamReader @@ -22,7 +22,7 @@ def test_hashed_stream_reader(tmp_dir): assert stream_reader.read(1) == b"o" assert stream_reader.tell() == 3 - hex_digest = file_md5(foo, LocalFileSystem()) + hex_digest = file_sha256(foo, LocalFileSystem()) assert stream_reader.is_text_file assert hex_digest == stream_reader.hash_info.value @@ -46,7 +46,7 @@ def test_hashed_stream_reader_as_chunks(tmp_dir): assert stream_reader.tell() == actual_size == total_read - hex_digest = file_md5(foo, LocalFileSystem()) + hex_digest = file_sha256(foo, LocalFileSystem()) assert not stream_reader.is_text_file assert hex_digest == stream_reader.hash_info.value @@ -68,7 +68,7 @@ def test_hashed_stream_reader_compatibility(tmp_dir, contents): stream_reader.read(chunk_size) local_fs = LocalFileSystem() - hex_digest = file_md5(data, local_fs) + hex_digest = file_sha256(data, local_fs) assert stream_reader.is_text_file is istextfile(data, local_fs) assert stream_reader.hash_info.value == hex_digest diff --git a/tests/unit/utils/test_utils.py b/tests/unit/utils/test_utils.py index a4800b46..7066b63c 100644 --- a/tests/unit/utils/test_utils.py +++ b/tests/unit/utils/test_utils.py @@ -6,7 +6,7 @@ import pytest from dvc.fs.local import LocalFileSystem from dvc.utils import ( dict_sha256, - file_md5, + file_sha256, fix_env, parse_target, relpath, @@ -83,11 +83,11 @@ def test_fix_env_pyenv(path, orig): assert fix_env(env)["PATH"] == orig -def test_file_md5(tmp_dir): +def test_file_sha256(tmp_dir): tmp_dir.gen("foo", "foo content") fs = LocalFileSystem() - assert file_md5("foo", fs) == file_md5("foo", fs) + 
assert file_sha256("foo", fs) == file_sha256("foo", fs) def test_tmp_fname():