Skip to content

output: deprecate dos2unix md5 hashing #9517

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dvc/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def __call__(self, data):
Optional("check_update", default=True): Bool,
"site_cache_dir": str,
"machine": Lower,
Optional("legacy_md5", default=True): Bool,
},
"cache": {
"local": str,
Expand Down
13 changes: 13 additions & 0 deletions dvc/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,12 @@ def cache_path(self):
self.cache.oid_to_path(self.hash_info.value)
)

@cached_property
def _is_text(self) -> Optional[bool]:
    """Return the ``text`` argument forwarded to the hashing/build calls.

    Reads ``core.legacy_md5`` from the repo config; the option is treated
    as enabled when the key is absent.

    Returns:
        ``None`` when legacy md5 hashing is enabled, ``False`` otherwise.
    """
    # NOTE(review): presumably ``None`` lets the hashing layer decide on its
    # own whether to apply dos2unix text handling, while ``False`` forces
    # plain binary hashing -- confirm against the ``build`` implementation.
    legacy_md5 = self.repo.config["core"].get("legacy_md5", True)
    return None if legacy_md5 else False

def get_hash(self):
    """Return the hash info from ``_get_hash_meta``, discarding the meta."""
    _meta, hash_info = self._get_hash_meta()
    return hash_info
Expand All @@ -531,6 +537,7 @@ def _get_hash_meta(self):
self.hash_name,
ignore=self.dvcignore,
dry_run=not self.use_cache,
text=self._is_text,
)
return meta, obj.hash_info

Expand Down Expand Up @@ -671,6 +678,7 @@ def save(self) -> None:
self.fs,
self.hash_name,
ignore=self.dvcignore,
text=self._is_text,
)
else:
_, self.meta, self.obj = build(
Expand All @@ -680,6 +688,7 @@ def save(self) -> None:
self.hash_name,
ignore=self.dvcignore,
dry_run=True,
text=self._is_text,
)
if not self.IS_DEPENDENCY:
logger.debug("Output '%s' doesn't use cache. Skipping saving.", self)
Expand Down Expand Up @@ -727,6 +736,7 @@ def commit(self, filter_info=None, relink=True) -> None:
self.fs,
self.hash_name,
ignore=self.dvcignore,
text=self._is_text,
)
otransfer(
staging,
Expand Down Expand Up @@ -758,6 +768,7 @@ def _commit_granular_dir(self, filter_info, hardlink) -> Optional["HashFile"]:
self.fs,
self.hash_name,
ignore=self.dvcignore,
text=self._is_text,
)
assert isinstance(obj, Tree)
save_obj = obj.filter(prefix)
Expand Down Expand Up @@ -973,6 +984,7 @@ def transfer(
"md5",
upload=upload,
no_progress_bar=no_progress_bar,
text=self._is_text,
)
otransfer(
staging,
Expand Down Expand Up @@ -1305,6 +1317,7 @@ def add( # noqa: C901
self.hash_name,
ignore=self.dvcignore,
dry_run=not self.use_cache,
text=self._is_text,
)
except FileNotFoundError as exc:
if self.fs_path == path:
Expand Down
3 changes: 2 additions & 1 deletion dvc/repo/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def save_imports(
logger.warning(str(DataSourceChanged(f"{dep.stage} ({dep})")))

data_view = unfetched.data["repo"]
is_text = None if repo.config["core"].get("legacy_md5", True) else False
if len(data_view):
cache = repo.cache.local
if not cache.fs.exists(cache.path):
Expand All @@ -111,7 +112,7 @@ def save_imports(
unit="files",
) as cb:
checkout(data_view, tmpdir, cache.fs, callback=cb, storage="data")
md5(data_view)
md5(data_view, text=is_text)
save(data_view, odb=cache, hardlink=True)

downloaded.update(
Expand Down
5 changes: 3 additions & 2 deletions dvc/repo/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@ def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False): # noqa:

config = Config.init(dvc_dir)

if no_scm:
with config.edit() as conf:
with config.edit() as conf:
conf["core"]["legacy_md5"] = False
if no_scm:
conf["core"]["no_scm"] = True

dvcignore = init_dvcignore(root_dir)
Expand Down
22 changes: 22 additions & 0 deletions tests/func/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pytest

import dvc.output as output_module
import dvc_data
from dvc.cachemgr import CacheManager
from dvc.cli import main
Expand Down Expand Up @@ -1113,3 +1114,24 @@ def test_add_updates_to_cloud_versioning_dir(tmp_dir, dvc):
}
]
}


@pytest.mark.parametrize(
    "legacy_md5,expected_hash",
    [
        (False, "3dbec9c1b92200eb56349835275e00b9"),
        (True, "f47c75614087a8dd938ba4acff252494"),
    ],
)
def test_add_legacy_md5(tmp_dir, dvc, mocker, legacy_md5, expected_hash):
    """Check that ``core.legacy_md5`` controls how ``dvc add`` hashes a file.

    A file with CRLF line endings is added under both settings. The test
    asserts that every ``build`` call receives ``text=None`` when the option
    is enabled and ``text=False`` when it is disabled, and that the resulting
    md5 matches the expected value for that mode.
    """
    with dvc.config.edit() as conf:
        conf["core"]["legacy_md5"] = legacy_md5
    expected_text = None if legacy_md5 else False
    # CRLF content, so the two parametrized modes yield different digests.
    tmp_dir.gen("foo", "foo\r\nbar\r\n")

    # Spy on the ``build`` name bound in ``dvc.output`` to inspect its kwargs.
    build_spy = mocker.spy(output_module, "build")
    (stage,) = dvc.add("foo")
    build_spy.assert_called()
    for _args, kwargs in build_spy.call_args_list:
        assert kwargs["text"] is expected_text
    assert stage.outs[0].hash_info == HashInfo("md5", expected_hash)
5 changes: 5 additions & 0 deletions tests/func/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,8 @@ def test_init_when_ignored_by_git(tmp_dir, scm, caplog):
)
in caplog.text
)


def test_init_legacy_md5(scm):
    """Check that a freshly initialized repo has ``core.legacy_md5`` set to False."""
    with DvcRepo.init() as repo:
        assert repo.config["core"].get("legacy_md5") is False