commit d7d093fcb91b0d21faf36dbf62924f23b45abb9b Author: Max Date: Sat Dec 17 14:23:59 2022 +0100 md5 to sha256 for 2.17.0 diff --git a/src/dvc_data/build.py b/src/dvc_data/build.py index 3656ca5..3837763 100644 --- a/src/dvc_data/build.py +++ b/src/dvc_data/build.py @@ -63,7 +63,7 @@ def _build_file(path, fs, name, odb=None, upload_odb=None, dry_run=False): state = odb.state if odb else None meta, hash_info = hash_file(path, fs, name, state=state) if upload_odb and not dry_run: - assert odb and name == "md5" + assert odb and name == "sha256" return _upload_file(path, fs, odb, upload_odb) oid = hash_info.value @@ -195,9 +195,9 @@ def _get_staging(odb: "HashFileDB") -> "ReferenceHashFileDB": def _build_external_tree_info(odb, tree, name): # NOTE: used only for external outputs. Initial reasoning was to be # able to validate .dir files right in the workspace (e.g. check s3 - # etag), but could be dropped for manual validation with regular md5, + # etag), but could be dropped for manual validation with regular sha256, # that would be universal for all clouds. - assert odb and name != "md5" + assert odb and name != "sha256" oid = tree.hash_info.value odb.add(tree.path, tree.fs, oid) @@ -253,7 +253,7 @@ def build( **kwargs, ) logger.debug("built tree '%s'", obj) - if name != "md5": + if name != "sha256": obj = _build_external_tree_info(odb, obj, name) else: meta, obj = _build_file( diff --git a/src/dvc_data/cli.py b/src/dvc_data/cli.py index 2348875..ece639a 100644 --- a/src/dvc_data/cli.py +++ b/src/dvc_data/cli.py @@ -29,8 +29,8 @@ from dvc_data.diff import ROOT from dvc_data.diff import diff as _diff from dvc_data.hashfile.db import HashFileDB from dvc_data.hashfile.hash import algorithms_available -from dvc_data.hashfile.hash import file_md5 as _file_md5 -from dvc_data.hashfile.hash import fobj_md5 as _fobj_md5 +from dvc_data.hashfile.hash import file_sha256 as _file_sha256 +from dvc_data.hashfile.hash import fobj_sha256 as _fobj_sha256 from dvc_data.hashfile.hash_info import HashInfo from dvc_data.hashfile.obj import HashFile from dvc_data.hashfile.state import State @@ -93,7 +93,7 @@ app = Application( @app.command(name="hash", help="Compute checksum of the file") def hash_file( file: Path = file_type, - name: HashEnum = typer.Option("md5", "-n", "--name"), + name: HashEnum = typer.Option("sha256", "-n", "--name"), progress: bool = typer.Option(False, "--progress", "-p"), text: Optional[bool] = typer.Option(None, "--text/--binary", "-t/-b"), ): @@ -108,9 +108,9 @@ def hash_file( with callback: if path == "-": fobj = callback.wrap_attr(sys.stdin.buffer) - hash_value = _fobj_md5(fobj, text=text, name=hash_name) + hash_value = _fobj_sha256(fobj, text=text, name=hash_name) else: - hash_value = _file_md5( + hash_value = _file_sha256( path, name=hash_name, callback=callback, text=text ) print(hash_name, hash_value, sep=": ") @@ -262,7 +262,7 @@ def build( fs = MemoryFileSystem() fs.put_file(sys.stdin.buffer, fs_path) - object_store, _, obj = _build(odb, fs_path, fs, name="md5") + object_store, _, obj = _build(odb, fs_path, fs, name="sha256") if write: _transfer( object_store, @@ -285,7 +285,7 @@ def ls(oid: str = typer.Argument(..., allow_dash=True)): odb = get_odb() oid = from_shortoid(odb, oid) try: - tree = Tree.load(odb, HashInfo("md5", oid)) + tree = Tree.load(odb, HashInfo("sha256", oid)) except ObjectFormatError as exc: typer.echo(exc, err=True) raise typer.Exit(1) from exc @@ -454,7 +454,7 @@ def apply_op(odb, obj, application): ) fs = LocalFileSystem() - _, meta, new_obj = _build(odb, path, fs, "md5") + _, meta, new_obj = _build(odb, path, fs, "sha256") odb.add(path, fs, new_obj.hash_info.value, hardlink=False) return obj.add(new, meta, new_obj.hash_info) diff --git a/src/dvc_data/fs.py b/src/dvc_data/fs.py index c972981..ac45ad3 100644 --- a/src/dvc_data/fs.py +++ b/src/dvc_data/fs.py @@ -47,7 +47,7 @@ class DataFileSystem(AbstractFileSystem): # pylint:disable=abstract-method if info["type"] == "directory": raise IsADirectoryError - value = info.get("md5") + value = info.get("sha256") if not value: raise FileNotFoundError @@ -142,7 +142,7 @@ class DataFileSystem(AbstractFileSystem): # pylint:disable=abstract-method def checksum(self, path): info = self.info(path) - md5 = info.get("md5") - if md5: - return md5 + sha256 = info.get("sha256") + if sha256: + return sha256 raise NotImplementedError diff --git a/src/dvc_data/hashfile/hash.py b/src/dvc_data/hashfile/hash.py index 9bef01d..03f731c 100644 --- a/src/dvc_data/hashfile/hash.py +++ b/src/dvc_data/hashfile/hash.py @@ -42,7 +42,7 @@ class HashStreamFile(io.IOBase): def __init__( self, fobj: BinaryIO, - hash_name: str = "md5", + hash_name: str = "sha256", text: Optional[bool] = None, ) -> None: self.fobj = fobj @@ -77,11 +77,11 @@ class HashStreamFile(io.IOBase): return self.hasher.name -def fobj_md5( +def fobj_sha256( fobj: BinaryIO, chunk_size: int = 2**20, text: Optional[bool] = None, - name="md5", + name="sha256", ) -> str: # ideally, we want the heuristics to be applied in a similar way, # regardless of the size of the first chunk, @@ -95,17 +95,17 @@ def fobj_md5( return stream.hash_value -def file_md5( +def file_sha256( fname: "AnyFSPath", fs: "FileSystem" = localfs, callback: "Callback" = DEFAULT_CALLBACK, text: Optional[bool] = None, - name: str = "md5", + name: str = "sha256", ) -> str: size = fs.size(fname) or 0 callback.set_size(size) with fs.open(fname, "rb") as fobj: - return fobj_md5(callback.wrap_attr(fobj), text=text, name=name) + return fobj_sha256(callback.wrap_attr(fobj), text=text, name=name) def _adapt_info(info: Dict[str, Any], scheme: str) -> Dict[str, Any]: @@ -139,8 +139,8 @@ def _hash_file( func = getattr(fs, name) return str(func(path)), info - if name == "md5": - return file_md5(path, fs, callback=callback), info + if name == "sha256": + return file_sha256(path, fs, callback=callback), info raise NotImplementedError @@ -162,7 +162,7 @@ class LargeFileHashingCallback(TqdmCallback): if self.size and self.size > self.LARGE_FILE_SIZE: if not self._logged: logger.info( - f"Computing md5 for a large file '{self.fname}'. " + f"Computing sha256 for a large file '{self.fname}'. " "This is only done once." ) self._logged = True diff --git a/src/dvc_data/hashfile/utils.py b/src/dvc_data/hashfile/utils.py index ea2da9c..b1e7726 100644 --- a/src/dvc_data/hashfile/utils.py +++ b/src/dvc_data/hashfile/utils.py @@ -38,7 +38,7 @@ def get_mtime_and_size( # We track file changes and moves, which cannot be detected with simply # max(mtime(f) for f in non_ignored_files) - hasher = hashlib.md5() + hasher = hashlib.sha256() hasher.update(json.dumps(files_mtimes, sort_keys=True).encode("utf-8")) mtime = hasher.hexdigest() return mtime, size diff --git a/src/dvc_data/objects/tree.py b/src/dvc_data/objects/tree.py index 4f11fa4..7c8b417 100644 --- a/src/dvc_data/objects/tree.py +++ b/src/dvc_data/objects/tree.py @@ -81,7 +81,7 @@ class Tree(HashFile): memfs.pipe_file(path, self.as_bytes()) self.fs = memfs self.path = path - _, self.hash_info = hash_file(path, memfs, "md5") + _, self.hash_info = hash_file(path, memfs, "sha256") assert self.hash_info.value self.hash_info.value += ".dir" self.oid = self.hash_info.value diff --git a/tests/hashfile/test_hash.py b/tests/hashfile/test_hash.py index ca920d8..59bf765 100644 --- a/tests/hashfile/test_hash.py +++ b/tests/hashfile/test_hash.py @@ -2,21 +2,21 @@ from os import fspath from dvc_objects.fs import LocalFileSystem -from dvc_data.hashfile.hash import file_md5 +from dvc_data.hashfile.hash import file_sha256 -def test_file_md5(tmp_path): +def test_file_sha256(tmp_path): foo = tmp_path / "foo" foo.write_text("foo content", encoding="utf8") fs = LocalFileSystem() - assert file_md5(fspath(foo), fs) == file_md5(fspath(foo), fs) + assert file_sha256(fspath(foo), fs) == file_sha256(fspath(foo), fs) -def test_file_md5_crlf(tmp_path): +def test_file_sha256_crlf(tmp_path): fs = LocalFileSystem() cr = tmp_path / "cr" crlf = tmp_path / "crlf" cr.write_bytes(b"a\nb\nc") crlf.write_bytes(b"a\r\nb\r\nc") - assert file_md5(fspath(cr), fs) == file_md5(fspath(crlf), fs) + assert file_sha256(fspath(cr), fs) == file_sha256(fspath(crlf), fs) diff --git a/tests/hashfile/test_hash_stream.py b/tests/hashfile/test_hash_stream.py index a003a29..e67b7c1 100644 --- a/tests/hashfile/test_hash_stream.py +++ b/tests/hashfile/test_hash_stream.py @@ -3,7 +3,7 @@ from os import fspath import pytest from dvc_objects.fs import LocalFileSystem -from dvc_data.hashfile.hash import HashStreamFile, file_md5 +from dvc_data.hashfile.hash import HashStreamFile, file_sha256 from dvc_data.hashfile.istextfile import DEFAULT_CHUNK_SIZE, istextfile @@ -23,7 +23,7 @@ def test_hashed_stream_reader(tmp_path): assert stream_reader.read(1) == b"o" assert stream_reader.tell() == 3 - hex_digest = file_md5(fspath(foo), LocalFileSystem()) + hex_digest = file_sha256(fspath(foo), LocalFileSystem()) assert stream_reader.is_text assert hex_digest == stream_reader.hash_value @@ -46,7 +46,7 @@ def test_hashed_stream_reader_as_chunks(tmp_path): assert stream_reader.tell() == actual_size == total_read - hex_digest = file_md5(fspath(foo), LocalFileSystem()) + hex_digest = file_sha256(fspath(foo), LocalFileSystem()) assert not stream_reader.is_text assert hex_digest == stream_reader.hash_value @@ -68,7 +68,7 @@ def test_hashed_stream_reader_compatibility(tmp_path, contents): stream_reader.read(chunk_size) local_fs = LocalFileSystem() - hex_digest = file_md5(fspath(data), local_fs) + hex_digest = file_sha256(fspath(data), local_fs) assert stream_reader.is_text is istextfile(fspath(data), local_fs) assert stream_reader.hash_value == hex_digest diff --git a/tests/hashfile/test_obj.py b/tests/hashfile/test_obj.py index 01e9fc2..6c47b3c 100644 --- a/tests/hashfile/test_obj.py +++ b/tests/hashfile/test_obj.py @@ -3,7 +3,7 @@ from dvc_data.hashfile.obj import HashFile def test_obj(tmp_upath): - hash_info = HashInfo("md5", "123456") + hash_info = HashInfo("sha256", "123456") obj = HashFile(tmp_upath, tmp_upath.fs, hash_info) assert obj.path == tmp_upath assert obj.fs == tmp_upath.fs diff --git a/tests/objects/test_tree.py b/tests/objects/test_tree.py index 6c514ba..611a72f 100644 --- a/tests/objects/test_tree.py +++ b/tests/objects/test_tree.py @@ -13,57 +13,57 @@ from dvc_data.objects.tree import Tree, _merge ([], {}), ( [ - {"md5": "def", "relpath": "zzz"}, - {"md5": "123", "relpath": "foo"}, - {"md5": "abc", "relpath": "aaa"}, - {"md5": "456", "relpath": "bar"}, + {"sha256": "def", "relpath": "zzz"}, + {"sha256": "123", "relpath": "foo"}, + {"sha256": "abc", "relpath": "aaa"}, + {"sha256": "456", "relpath": "bar"}, ], { - ("zzz",): (None, HashInfo("md5", "def")), - ("foo",): (None, HashInfo("md5", "123")), - ("bar",): (None, HashInfo("md5", "456")), - ("aaa",): (None, HashInfo("md5", "abc")), + ("zzz",): (None, HashInfo("sha256", "def")), + ("foo",): (None, HashInfo("sha256", "123")), + ("bar",): (None, HashInfo("sha256", "456")), + ("aaa",): (None, HashInfo("sha256", "abc")), }, ), ( [ - {"md5": "123", "relpath": "dir/b"}, - {"md5": "456", "relpath": "dir/z"}, - {"md5": "789", "relpath": "dir/a"}, - {"md5": "abc", "relpath": "b"}, - {"md5": "def", "relpath": "a"}, - {"md5": "ghi", "relpath": "z"}, - {"md5": "jkl", "relpath": "dir/subdir/b"}, - {"md5": "mno", "relpath": "dir/subdir/z"}, - {"md5": "pqr", "relpath": "dir/subdir/a"}, + {"sha256": "123", "relpath": "dir/b"}, + {"sha256": "456", "relpath": "dir/z"}, + {"sha256": "789", "relpath": "dir/a"}, + {"sha256": "abc", "relpath": "b"}, + {"sha256": "def", "relpath": "a"}, + {"sha256": "ghi", "relpath": "z"}, + {"sha256": "jkl", "relpath": "dir/subdir/b"}, + {"sha256": "mno", "relpath": "dir/subdir/z"}, + {"sha256": "pqr", "relpath": "dir/subdir/a"}, ], { ("dir", "b"): ( None, - HashInfo("md5", "123"), + HashInfo("sha256", "123"), ), ("dir", "z"): ( None, - HashInfo("md5", "456"), + HashInfo("sha256", "456"), ), ("dir", "a"): ( None, - HashInfo("md5", "789"), + HashInfo("sha256", "789"), ), - ("b",): (None, HashInfo("md5", "abc")), - ("a",): (None, HashInfo("md5", "def")), - ("z",): (None, HashInfo("md5", "ghi")), + ("b",): (None, HashInfo("sha256", "abc")), + ("a",): (None, HashInfo("sha256", "def")), + ("z",): (None, HashInfo("sha256", "ghi")), ("dir", "subdir", "b"): ( None, - HashInfo("md5", "jkl"), + HashInfo("sha256", "jkl"), ), ("dir", "subdir", "z"): ( None, - HashInfo("md5", "mno"), + HashInfo("sha256", "mno"), ), ("dir", "subdir", "a"): ( None, - HashInfo("md5", "pqr"), + HashInfo("sha256", "pqr"), ), }, ), @@ -81,19 +81,19 @@ def test_list(lst, trie_dict): ({}, 0), ( { - ("a",): (Meta(size=1), HashInfo("md5", "abc")), - ("b",): (Meta(size=2), HashInfo("md5", "def")), - ("c",): (Meta(size=3), HashInfo("md5", "ghi")), - ("dir", "foo"): (Meta(size=4), HashInfo("md5", "jkl")), - ("dir", "bar"): (Meta(size=5), HashInfo("md5", "mno")), - ("dir", "baz"): (Meta(size=6), HashInfo("md5", "pqr")), + ("a",): (Meta(size=1), HashInfo("sha256", "abc")), + ("b",): (Meta(size=2), HashInfo("sha256", "def")), + ("c",): (Meta(size=3), HashInfo("sha256", "ghi")), + ("dir", "foo"): (Meta(size=4), HashInfo("sha256", "jkl")), + ("dir", "bar"): (Meta(size=5), HashInfo("sha256", "mno")), + ("dir", "baz"): (Meta(size=6), HashInfo("sha256", "pqr")), }, 6, ), ( { - ("a",): (Meta(size=1), HashInfo("md5", "abc")), - ("b",): (Meta(), HashInfo("md5", "def")), + ("a",): (Meta(size=1), HashInfo("sha256", "abc")), + ("b",): (Meta(), HashInfo("sha256", "def")), }, 2, ), @@ -110,15 +110,15 @@ def test_nfiles(trie_dict, nfiles): [ {}, { - ("a",): (None, HashInfo("md5", "abc")), - ("b",): (None, HashInfo("md5", "def")), - ("c",): (None, HashInfo("md5", "ghi")), - ("dir", "foo"): (None, HashInfo("md5", "jkl")), - ("dir", "bar"): (None, HashInfo("md5", "mno")), - ("dir", "baz"): (None, HashInfo("md5", "pqr")), - ("dir", "subdir", "1"): (None, HashInfo("md5", "stu")), - ("dir", "subdir", "2"): (None, HashInfo("md5", "vwx")), - ("dir", "subdir", "3"): (None, HashInfo("md5", "yz")), + ("a",): (None, HashInfo("sha256", "abc")), + ("b",): (None, HashInfo("sha256", "def")), + ("c",): (None, HashInfo("sha256", "ghi")), + ("dir", "foo"): (None, HashInfo("sha256", "jkl")), + ("dir", "bar"): (None, HashInfo("sha256", "mno")), + ("dir", "baz"): (None, HashInfo("sha256", "pqr")), + ("dir", "subdir", "1"): (None, HashInfo("sha256", "stu")), + ("dir", "subdir", "2"): (None, HashInfo("sha256", "vwx")), + ("dir", "subdir", "3"): (None, HashInfo("sha256", "yz")), }, ], ) @@ -135,63 +135,63 @@ def test_items(trie_dict): [ ({}, {}, {}, {}), ( - {("foo",): HashInfo("md5", "123")}, + {("foo",): HashInfo("sha256", "123")}, { - ("foo",): HashInfo("md5", "123"), - ("bar",): HashInfo("md5", "345"), + ("foo",): HashInfo("sha256", "123"), + ("bar",): HashInfo("sha256", "345"), }, { - ("foo",): HashInfo("md5", "123"), - ("baz",): HashInfo("md5", "678"), + ("foo",): HashInfo("sha256", "123"), + ("baz",): HashInfo("sha256", "678"), }, { - ("foo",): HashInfo("md5", "123"), - ("bar",): HashInfo("md5", "345"), - ("baz",): HashInfo("md5", "678"), + ("foo",): HashInfo("sha256", "123"), + ("bar",): HashInfo("sha256", "345"), + ("baz",): HashInfo("sha256", "678"), }, ), ( { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), }, { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), - ("subdir", "bar"): HashInfo("md5", "678"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), + ("subdir", "bar"): HashInfo("sha256", "678"), }, { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), - ("subdir", "baz"): HashInfo("md5", "91011"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), + ("subdir", "baz"): HashInfo("sha256", "91011"), }, { - ("common",): HashInfo("md5", "123"), - ("subdir", "foo"): HashInfo("md5", "345"), - ("subdir", "bar"): HashInfo("md5", "678"), - ("subdir", "baz"): HashInfo("md5", "91011"), + ("common",): HashInfo("sha256", "123"), + ("subdir", "foo"): HashInfo("sha256", "345"), + ("subdir", "bar"): HashInfo("sha256", "678"), + ("subdir", "baz"): HashInfo("sha256", "91011"), }, ), ( {}, - {("foo",): HashInfo("md5", "123")}, - {("bar",): HashInfo("md5", "456")}, + {("foo",): HashInfo("sha256", "123")}, + {("bar",): HashInfo("sha256", "456")}, { - ("foo",): HashInfo("md5", "123"), - ("bar",): HashInfo("md5", "456"), + ("foo",): HashInfo("sha256", "123"), + ("bar",): HashInfo("sha256", "456"), }, ), ( {}, {}, - {("bar",): HashInfo("md5", "123")}, - {("bar",): HashInfo("md5", "123")}, + {("bar",): HashInfo("sha256", "123")}, + {("bar",): HashInfo("sha256", "123")}, ), ( {}, - {("bar",): HashInfo("md5", "123")}, + {("bar",): HashInfo("sha256", "123")}, {}, - {("bar",): HashInfo("md5", "123")}, + {("bar",): HashInfo("sha256", "123")}, ), ], ) diff --git a/tests/test_index.py b/tests/test_index.py index c6404fa..635bf66 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -17,8 +17,8 @@ def odb(tmp_upath_factory, as_filesystem): data = tmp_upath_factory.mktemp() / "data.dir" data.write_bytes( - b'[{"md5": "c157a79031e1c40f85931829bc5fc552", "relpath": "bar"}, ' - b'{"md5": "258622b1688250cb619f3c9ccaefb7eb", "relpath": "baz"}]' + b'[{"sha256": "c157a79031e1c40f85931829bc5fc552", "relpath": "bar"}, ' + b'{"sha256": "258622b1688250cb619f3c9ccaefb7eb", "relpath": "baz"}]' ) bar = tmp_upath_factory.mktemp() / "bar" @@ -46,13 +46,13 @@ def test_fs(tmp_upath, odb, as_filesystem): ("foo",): DataIndexEntry( odb=odb, hash_info=HashInfo( - name="md5", value="d3b07384d113edec49eaa6238ad5ff00" + name="sha256", value="d3b07384d113edec49eaa6238ad5ff00" ), ), ("data",): DataIndexEntry( odb=odb, hash_info=HashInfo( - name="md5", + name="sha256", value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", ), ), @@ -80,22 +80,22 @@ def test_build(tmp_upath, odb, as_filesystem): }, ) build(index, tmp_upath, as_filesystem(tmp_upath.fs)) - assert index[("foo",)].hash_info.name == "md5" + assert index[("foo",)].hash_info.name == "sha256" assert ( index[("foo",)].hash_info.value == "d3b07384d113edec49eaa6238ad5ff00" ) assert index[("foo",)].odb == odb - assert index[("data",)].hash_info.name == "md5" + assert index[("data",)].hash_info.name == "sha256" assert ( index[("data",)].hash_info.value == "1f69c66028c35037e8bf67e5bc4ceb6a.dir" ) - assert index[("data", "bar")].hash_info.name == "md5" + assert index[("data", "bar")].hash_info.name == "sha256" assert ( index[("data", "bar")].hash_info.value == "c157a79031e1c40f85931829bc5fc552" ) - assert index[("data", "baz")].hash_info.name == "md5" + assert index[("data", "baz")].hash_info.name == "sha256" assert ( index[("data", "baz")].hash_info.value == "258622b1688250cb619f3c9ccaefb7eb" @@ -108,13 +108,13 @@ def test_checkout(tmp_upath, odb, as_filesystem): ("foo",): DataIndexEntry( odb=odb, hash_info=HashInfo( - name="md5", value="d3b07384d113edec49eaa6238ad5ff00" + name="sha256", value="d3b07384d113edec49eaa6238ad5ff00" ), ), ("data",): DataIndexEntry( odb=odb, hash_info=HashInfo( - name="md5", + name="sha256", value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", ), ),