diff --git a/dvc/ignore.py b/dvc/ignore.py index 94a72b72b9..b9d4de35af 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -139,7 +139,5 @@ def walk(self, top, topdown=True): yield root, dirs, files - def walk_files(self, top): - for root, _, files in self.walk(top): - for file in files: - yield os.path.join(root, file) + def stat(self, path): + return self.tree.stat(path) diff --git a/dvc/scm/git/tree.py b/dvc/scm/git/tree.py index 316204a2de..9b48d4a8a9 100644 --- a/dvc/scm/git/tree.py +++ b/dvc/scm/git/tree.py @@ -1,6 +1,7 @@ import errno import io import os +import stat from dvc.exceptions import DvcException from dvc.scm.tree import BaseTree @@ -87,7 +88,9 @@ def git_object_by_path(self, path): import git path = relpath(os.path.realpath(path), self.git.working_dir) - assert path.split(os.sep, 1)[0] != ".." + if path.split(os.sep, 1)[0] == "..": + # path points outside of git repository + return None try: tree = self.git.tree(self.rev) @@ -138,3 +141,40 @@ def walk(self, top, topdown=True): raise OSError(errno.ENOENT, "No such file") yield from self._walk(tree, topdown=topdown) + + def isexec(self, path): + if not self.exists(path): + return False + + mode = self.stat(path).st_mode + return mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + + def stat(self, path): + import git + + def to_ctime(git_time): + sec, nano_sec = git_time + return sec + nano_sec / 1000000000 + + obj = self.git_object_by_path(path) + if obj is None: + raise OSError(errno.ENOENT, "No such file") + entry = git.index.IndexEntry.from_blob(obj) + + # os.stat_result takes a tuple in the form: + # (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime) + return os.stat_result( + ( + entry.mode, + entry.inode, + entry.dev, + 0, + entry.uid, + entry.gid, + entry.size, + # git index has no atime equivalent, use mtime + to_ctime(entry.mtime), + to_ctime(entry.mtime), + to_ctime(entry.ctime), + ) + ) diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index ea4967cccf..f65d8eefb8 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -29,6 +29,11 @@ def walk(self, top, topdown=True): - it could raise exceptions, there is no onerror argument """ + def walk_files(self, top): + for root, _, files in self.walk(top): + for file in files: + yield os.path.join(root, file) + class WorkingTree(BaseTree): """Proxies the repo file access methods to working tree files""" @@ -42,6 +47,8 @@ def tree_root(self): def open(self, path, mode="r", encoding="utf-8"): """Open file and return a stream.""" + if "b" in mode: + encoding = None return open(path, mode=mode, encoding=encoding) def exists(self, path): @@ -76,6 +83,10 @@ def isexec(self, path): mode = os.stat(path).st_mode return mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + @staticmethod + def stat(path): + return os.stat(path) + def is_working_tree(tree): return isinstance(tree, WorkingTree) or isinstance( diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 3d45698c2d..3055371682 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -27,15 +27,44 @@ def dos2unix(data): return data.replace(b"\r\n", b"\n") -def file_md5(fname): +def _fobj_md5(fobj, hash_md5, binary, progress_func=None): + while True: + data = fobj.read(LOCAL_CHUNK_SIZE) + if not data: + break + + if binary: + chunk = data + else: + chunk = dos2unix(data) + + hash_md5.update(chunk) + if progress_func: + progress_func(len(data)) + + +def file_md5(fname, tree=None): """ get the (md5 hexdigest, md5 digest) of a file """ from dvc.progress import Tqdm from dvc.istextfile import istextfile - if os.path.exists(fname): + if tree: + exists_func = tree.exists + stat_func = tree.stat + open_func = tree.open + # assume we don't need to run dos2unix when comparing git blobs + binary = True + else: + exists_func = os.path.exists + stat_func = os.stat + open_func = open + binary = False + + if exists_func(fname): hash_md5 = hashlib.md5() - binary = not istextfile(fname) - size = os.path.getsize(fname) + if not binary: + binary = not istextfile(fname) + size = stat_func(fname).st_size no_progress_bar = True if size >= LARGE_FILE_SIZE: no_progress_bar = False @@ -52,19 +81,8 @@ def file_md5(fname): bytes=True, leave=False, ) as pbar: - with open(fname, "rb") as fobj: - while True: - data = fobj.read(LOCAL_CHUNK_SIZE) - if not data: - break - - if binary: - chunk = data - else: - chunk = dos2unix(data) - - hash_md5.update(chunk) - pbar.update(len(data)) + with open_func(fname, "rb") as fobj: + _fobj_md5(fobj, hash_md5, binary, pbar.update) return (hash_md5.hexdigest(), hash_md5.digest()) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index 57a8864d5b..1fcb8880e5 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -9,7 +9,6 @@ from shortuuid import uuid from dvc.exceptions import DvcException -from dvc.scm.tree import is_working_tree from dvc.system import System from dvc.utils import dict_md5 @@ -33,14 +32,12 @@ def get_inode(path): def get_mtime_and_size(path, tree): - if os.path.isdir(path): - assert is_working_tree(tree) - + if tree.isdir(path): size = 0 files_mtimes = {} for file_path in tree.walk_files(path): try: - stats = os.stat(file_path) + stats = tree.stat(file_path) except OSError as exc: # NOTE: broken symlink case. if exc.errno != errno.ENOENT: @@ -53,7 +50,7 @@ def get_mtime_and_size(path, tree): # max(mtime(f) for f in non_ignored_files) mtime = dict_md5(files_mtimes) else: - base_stat = os.stat(path) + base_stat = tree.stat(path) size = base_stat.st_size mtime = base_stat.st_mtime mtime = int(nanotime.timestamp(mtime)) @@ -175,7 +172,6 @@ def copyfile(src, dest, no_progress_bar=False, name=None): """Copy file with progress bar""" from dvc.exceptions import DvcException from dvc.progress import Tqdm - from dvc.system import System name = name if name else os.path.basename(dest) total = os.stat(src).st_size @@ -202,6 +198,12 @@ def copyfile(src, dest, no_progress_bar=False, name=None): fdest_wrapped.write(buf) +def copy_fobj_to_file(fsrc, dest): + """Copy contents of open file object to destination path.""" + with open(dest, "wb+") as fdest: + shutil.copyfileobj(fsrc, fdest) + + def walk_files(directory): for root, _, files in os.walk(directory): for f in files: diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index c8ed92271f..f69e31d28d 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -14,6 +14,7 @@ from dvc.utils.fs import ( BasePathNotInCheckedPathException, contains_symlink_up_to, + copy_fobj_to_file, copyfile, get_inode, get_mtime_and_size, @@ -260,5 +261,15 @@ def test_copyfile(path, tmp_dir): assert filecmp.cmp(src_info, dest_info, shallow=False) +def test_copy_fobj_to_file(tmp_dir): + tmp_dir.gen({"foo": "foo content"}) + src = tmp_dir / "foo" + dest = "path" + + with open(src, "rb") as fobj: + copy_fobj_to_file(fobj, dest) + assert filecmp.cmp(src, dest) + + def test_walk_files(tmp_dir): assert list(walk_files(".")) == list(walk_files(tmp_dir))