Skip to content

git: don't use multithreading for computhing dir hash #4083

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,7 @@ def stat(self, path):
if self.exists(path):
return self.tree.stat(path)
raise FileNotFoundError

@property
def hash_jobs(self):
return self.tree.hash_jobs
5 changes: 3 additions & 2 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,15 +323,16 @@ def save_info(self, path_info, tree=None, **kwargs):
self.PARAM_CHECKSUM: self.get_hash(path_info, tree=tree, **kwargs)
}

def _calculate_hashes(self, file_infos, tree):
@staticmethod
def _calculate_hashes(file_infos, tree):
file_infos = list(file_infos)
with Tqdm(
total=len(file_infos),
unit="md5",
desc="Computing file/dir hashes (only done once)",
) as pbar:
worker = pbar.wrap_fn(tree.get_file_hash)
with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor:
with ThreadPoolExecutor(max_workers=tree.hash_jobs) as executor:
tasks = executor.map(worker, file_infos)
hashes = dict(zip(file_infos, tasks))
return hashes
Expand Down
4 changes: 4 additions & 0 deletions dvc/repo/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,3 +421,7 @@ def copytree(self, top, dest):
src = root / fname
with self.open(src, mode="rb") as fobj:
copy_fobj_to_file(fobj, dest / fname)

@property
def hash_jobs(self):
return self.repo.tree.hash_jobs
6 changes: 6 additions & 0 deletions dvc/scm/git/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,9 @@ def to_ctime(git_time):
to_ctime(entry.ctime),
)
)

@property
def hash_jobs(self):
# NOTE: gitpython is not threadsafe. See
# https://github.com/iterative/dvc/issues/4079
return 1
7 changes: 7 additions & 0 deletions dvc/scm/tree.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import os
import stat
from multiprocessing import cpu_count

from funcy import cached_property


class BaseTree:
Expand Down Expand Up @@ -83,6 +86,10 @@ def isexec(self, path):
def stat(path):
return os.stat(path)

@cached_property
def hash_jobs(self):
return max(1, min(4, cpu_count() // 2))


def is_working_tree(tree):
return isinstance(tree, WorkingTree) or isinstance(
Expand Down