Skip to content

Commit d77af81

Browse files
authored
optimize local status (#3867)
* remote: remove unpacked dir We've done a lot of optimizations lately, which made the unpacked dir trick obsolete. * local: save us a stat call in is_protected * local: speed up checksum_to_path ~x5.5 * tree: walk: don't use os.path.join It is ~5.5 times slower than joining by hand. * dvcignore: optimize matching x10 On a repo with a dvcignore with 1 pattern and a directory with 400K files, `dvc status` now takes ~8 sec instead of ~30 sec. To achieve that, we make some assumptions about the path formats that we are dealing with, so we can use simpler logic instead of using the very slow `relpath`, `abspath`, etc. on every entry in a directory. It is also clear that CleanTree behavior is inconsistent (even tests expect very different outputs from it), so we will need to look into this later.
1 parent c5920b0 commit d77af81

File tree

7 files changed

+32
-158
lines changed

7 files changed

+32
-158
lines changed

dvc/ignore.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from pathspec.patterns import GitWildMatchPattern
77

88
from dvc.scm.tree import BaseTree
9-
from dvc.utils import relpath
109

1110
logger = logging.getLogger(__name__)
1211

@@ -35,12 +34,19 @@ def __call__(self, root, dirs, files):
3534
return dirs, files
3635

3736
def matches(self, dirname, basename):
38-
abs_path = os.path.join(dirname, basename)
39-
rel_path = relpath(abs_path, self.dirname)
40-
41-
if os.pardir + os.sep in rel_path:
37+
# NOTE: `relpath` is too slow, so we have to assume that both
38+
# `dirname` and `self.dirname` are relative or absolute together.
39+
prefix = self.dirname + os.sep
40+
if dirname == self.dirname:
41+
path = basename
42+
elif dirname.startswith(prefix):
43+
rel = dirname[len(prefix) :]
44+
# NOTE: `os.path.join` is ~x5.5 slower
45+
path = f"{rel}{os.sep}{basename}"
46+
else:
4247
return False
43-
return self.ignore_spec.match_file(rel_path)
48+
49+
return self.ignore_spec.match_file(path)
4450

4551
def __hash__(self):
4652
return hash(self.ignore_file_path)
@@ -135,7 +141,9 @@ def isexec(self, path):
135141

136142
def walk(self, top, topdown=True):
137143
for root, dirs, files in self.tree.walk(top, topdown):
138-
dirs[:], files[:] = self.dvcignore(root, dirs, files)
144+
dirs[:], files[:] = self.dvcignore(
145+
os.path.abspath(root), dirs, files
146+
)
139147

140148
yield root, dirs, files
141149

dvc/remote/base.py

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,7 @@ def all(self, jobs=None, name=None):
831831

832832
@index_locked
833833
def gc(self, named_cache, jobs=None):
834-
used = self.extract_used_local_checksums(named_cache)
834+
used = set(named_cache.scheme_keys("local"))
835835

836836
if self.scheme != "":
837837
used.update(named_cache.scheme_keys(self.scheme))
@@ -847,6 +847,7 @@ def gc(self, named_cache, jobs=None):
847847
continue
848848
path_info = self.checksum_to_path_info(checksum)
849849
if self.is_dir_checksum(checksum):
850+
# backward compatibility
850851
self._remove_unpacked_dir(checksum)
851852
self.remove(path_info)
852853
removed = True
@@ -905,11 +906,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None):
905906
if self.changed_cache_file(checksum):
906907
return True
907908

908-
if not (path_info and filter_info) and not self._changed_unpacked_dir(
909-
checksum
910-
):
911-
return False
912-
913909
for entry in self.get_dir_cache(checksum):
914910
entry_checksum = entry[self.PARAM_CHECKSUM]
915911

@@ -921,9 +917,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None):
921917
if self.changed_cache_file(entry_checksum):
922918
return True
923919

924-
if not (path_info and filter_info):
925-
self._update_unpacked_dir(checksum)
926-
927920
return False
928921

929922
def changed_cache(self, checksum, path_info=None, filter_info=None):
@@ -1362,19 +1355,5 @@ def get_files_number(self, path_info, checksum, filter_info):
13621355
def unprotect(path_info):
13631356
pass
13641357

1365-
def _get_unpacked_dir_names(self, checksums):
1366-
return set()
1367-
1368-
def extract_used_local_checksums(self, named_cache):
1369-
used = set(named_cache.scheme_keys("local"))
1370-
unpacked = self._get_unpacked_dir_names(used)
1371-
return used | unpacked
1372-
1373-
def _changed_unpacked_dir(self, checksum):
1374-
return True
1375-
1376-
def _update_unpacked_dir(self, checksum):
1377-
pass
1378-
13791358
def _remove_unpacked_dir(self, checksum):
13801359
pass

dvc/remote/local.py

Lines changed: 11 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,12 @@ def cache_path(self):
9292
return os.path.abspath(self.cache_dir)
9393

9494
def checksum_to_path(self, checksum):
95-
return os.path.join(self.cache_path, checksum[0:2], checksum[2:])
95+
# NOTE: `self.cache_path` is already normalized so we can simply use
96+
# `os.sep` instead of `os.path.join`. This results in this helper
97+
# being ~5.5 times faster.
98+
return (
99+
f"{self.cache_path}{os.sep}{checksum[0:2]}{os.sep}{checksum[2:]}"
100+
)
96101

97102
def list_cache_paths(self, prefix=None, progress_callback=None):
98103
assert self.path_info is not None
@@ -695,70 +700,15 @@ def protect(self, path_info):
695700
if actual != mode:
696701
raise
697702

698-
def _get_unpacked_dir_path_info(self, checksum):
699-
info = self.checksum_to_path_info(checksum)
700-
return info.with_name(info.name + self.UNPACKED_DIR_SUFFIX)
701-
702703
def _remove_unpacked_dir(self, checksum):
703-
path_info = self._get_unpacked_dir_path_info(checksum)
704+
info = self.checksum_to_path_info(checksum)
705+
path_info = info.with_name(info.name + self.UNPACKED_DIR_SUFFIX)
704706
self.remove(path_info)
705707

706-
def _path_info_changed(self, path_info):
707-
if self.exists(path_info) and self.state.get(path_info):
708-
return False
709-
return True
710-
711-
def _update_unpacked_dir(self, checksum):
712-
unpacked_dir_info = self._get_unpacked_dir_path_info(checksum)
713-
714-
if not self._path_info_changed(unpacked_dir_info):
715-
return
716-
717-
self.remove(unpacked_dir_info)
718-
719-
try:
720-
dir_info = self.get_dir_cache(checksum)
721-
self._create_unpacked_dir(checksum, dir_info, unpacked_dir_info)
722-
except DvcException:
723-
logger.warning(f"Could not create '{unpacked_dir_info}'")
724-
725-
self.remove(unpacked_dir_info)
726-
727-
def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info):
728-
self.makedirs(unpacked_dir_info)
729-
730-
for entry in Tqdm(dir_info, desc="Creating unpacked dir", unit="file"):
731-
entry_cache_info = self.checksum_to_path_info(
732-
entry[self.PARAM_CHECKSUM]
733-
)
734-
relative_path = entry[self.PARAM_RELPATH]
735-
# In shared cache mode some cache files might not be owned by the
736-
# user, so we need to use symlinks because, unless
737-
# /proc/sys/fs/protected_hardlinks is disabled, the user is not
738-
# allowed to create hardlinks to files that he doesn't own.
739-
link_types = ["hardlink", "symlink"]
740-
self._link(
741-
entry_cache_info, unpacked_dir_info / relative_path, link_types
742-
)
743-
744-
self.state.save(unpacked_dir_info, checksum)
745-
746-
def _changed_unpacked_dir(self, checksum):
747-
status_unpacked_dir_info = self._get_unpacked_dir_path_info(checksum)
748-
749-
return not self.state.get(status_unpacked_dir_info)
750-
751-
def _get_unpacked_dir_names(self, checksums):
752-
unpacked = set()
753-
for c in checksums:
754-
if self.is_dir_checksum(c):
755-
unpacked.add(c + self.UNPACKED_DIR_SUFFIX)
756-
return unpacked
757-
758708
def is_protected(self, path_info):
759-
if not self.exists(path_info):
709+
try:
710+
mode = os.stat(path_info).st_mode
711+
except FileNotFoundError:
760712
return False
761713

762-
mode = os.stat(path_info).st_mode
763-
764714
return stat.S_IMODE(mode) == self.CACHE_MODE

dvc/scm/tree.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ def walk(self, top, topdown=True):
3232
def walk_files(self, top):
3333
for root, _, files in self.walk(top):
3434
for file in files:
35-
yield os.path.join(root, file)
35+
# NOTE: os.path.join is ~5.5 times slower
36+
yield f"{root}{os.sep}{file}"
3637

3738

3839
class WorkingTree(BaseTree):

tests/func/remote/test_local.py

Lines changed: 0 additions & 65 deletions
This file was deleted.

tests/func/test_gc.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ def test_gc_no_unpacked_dir(tmp_dir, dvc):
219219
dir_stages[0].outs[0].cache_path + LocalRemote.UNPACKED_DIR_SUFFIX
220220
)
221221

222+
# older (pre 1.0) versions of dvc used to generate this dir
223+
shutil.copytree("dir", unpackeddir)
222224
assert os.path.exists(unpackeddir)
223225

224226
dvc.gc(force=True, workspace=True)

tests/unit/test_ignore.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def test_ignore_from_file_should_filter_dirs_and_files():
6969
),
7070
("dont_ignore.txt", ["dont_ignore"], False),
7171
("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False),
72-
("../../../something.txt", ["**/something.txt"], False),
7372
],
7473
)
7574
def test_match_ignore_from_file(

0 commit comments

Comments
 (0)