Skip to content

Commit b88b3b8

Browse files
authored
utils: support use of (GitTree) tree file objects in utils (#3819)
* GitTree: add stat() function
* BaseTree: make walk_files a base tree method
* WorkingTree: add stat() method (wrapper for os.stat)
* utils.fs: support non-working tree for get_mtime_and_size
* utils.fs: add copy_fobj_to_file
* utils: support using tree.open in file_md5
1 parent 81d04f8 commit b88b3b8

File tree

6 files changed

+109
-29
lines changed

6 files changed

+109
-29
lines changed

dvc/ignore.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,5 @@ def walk(self, top, topdown=True):
139139

140140
yield root, dirs, files
141141

142-
def walk_files(self, top):
143-
for root, _, files in self.walk(top):
144-
for file in files:
145-
yield os.path.join(root, file)
142+
def stat(self, path):
143+
return self.tree.stat(path)

dvc/scm/git/tree.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import errno
22
import io
33
import os
4+
import stat
45

56
from dvc.exceptions import DvcException
67
from dvc.scm.tree import BaseTree
@@ -87,7 +88,9 @@ def git_object_by_path(self, path):
8788
import git
8889

8990
path = relpath(os.path.realpath(path), self.git.working_dir)
90-
assert path.split(os.sep, 1)[0] != ".."
91+
if path.split(os.sep, 1)[0] == "..":
92+
# path points outside of git repository
93+
return None
9194

9295
try:
9396
tree = self.git.tree(self.rev)
@@ -138,3 +141,40 @@ def walk(self, top, topdown=True):
138141
raise OSError(errno.ENOENT, "No such file")
139142

140143
yield from self._walk(tree, topdown=topdown)
144+
145+
def isexec(self, path):
146+
if not self.exists(path):
147+
return False
148+
149+
mode = self.stat(path).st_mode
150+
return mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
151+
152+
def stat(self, path):
153+
import git
154+
155+
def to_ctime(git_time):
156+
sec, nano_sec = git_time
157+
return sec + nano_sec / 1000000000
158+
159+
obj = self.git_object_by_path(path)
160+
if obj is None:
161+
raise OSError(errno.ENOENT, "No such file")
162+
entry = git.index.IndexEntry.from_blob(obj)
163+
164+
# os.stat_result takes a tuple in the form:
165+
# (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime)
166+
return os.stat_result(
167+
(
168+
entry.mode,
169+
entry.inode,
170+
entry.dev,
171+
0,
172+
entry.uid,
173+
entry.gid,
174+
entry.size,
175+
# git index has no atime equivalent, use mtime
176+
to_ctime(entry.mtime),
177+
to_ctime(entry.mtime),
178+
to_ctime(entry.ctime),
179+
)
180+
)

dvc/scm/tree.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ def walk(self, top, topdown=True):
2929
- it could raise exceptions, there is no onerror argument
3030
"""
3131

32+
def walk_files(self, top):
33+
for root, _, files in self.walk(top):
34+
for file in files:
35+
yield os.path.join(root, file)
36+
3237

3338
class WorkingTree(BaseTree):
3439
"""Proxies the repo file access methods to working tree files"""
@@ -42,6 +47,8 @@ def tree_root(self):
4247

4348
def open(self, path, mode="r", encoding="utf-8"):
4449
"""Open file and return a stream."""
50+
if "b" in mode:
51+
encoding = None
4552
return open(path, mode=mode, encoding=encoding)
4653

4754
def exists(self, path):
@@ -76,6 +83,10 @@ def isexec(self, path):
7683
mode = os.stat(path).st_mode
7784
return mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
7885

86+
@staticmethod
87+
def stat(path):
88+
return os.stat(path)
89+
7990

8091
def is_working_tree(tree):
8192
return isinstance(tree, WorkingTree) or isinstance(

dvc/utils/__init__.py

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,44 @@ def dos2unix(data):
2727
return data.replace(b"\r\n", b"\n")
2828

2929

30-
def file_md5(fname):
30+
def _fobj_md5(fobj, hash_md5, binary, progress_func=None):
31+
while True:
32+
data = fobj.read(LOCAL_CHUNK_SIZE)
33+
if not data:
34+
break
35+
36+
if binary:
37+
chunk = data
38+
else:
39+
chunk = dos2unix(data)
40+
41+
hash_md5.update(chunk)
42+
if progress_func:
43+
progress_func(len(data))
44+
45+
46+
def file_md5(fname, tree=None):
3147
""" get the (md5 hexdigest, md5 digest) of a file """
3248
from dvc.progress import Tqdm
3349
from dvc.istextfile import istextfile
3450

35-
if os.path.exists(fname):
51+
if tree:
52+
exists_func = tree.exists
53+
stat_func = tree.stat
54+
open_func = tree.open
55+
# assume we don't need to run dos2unix when comparing git blobs
56+
binary = True
57+
else:
58+
exists_func = os.path.exists
59+
stat_func = os.stat
60+
open_func = open
61+
binary = False
62+
63+
if exists_func(fname):
3664
hash_md5 = hashlib.md5()
37-
binary = not istextfile(fname)
38-
size = os.path.getsize(fname)
65+
if not binary:
66+
binary = not istextfile(fname)
67+
size = stat_func(fname).st_size
3968
no_progress_bar = True
4069
if size >= LARGE_FILE_SIZE:
4170
no_progress_bar = False
@@ -52,19 +81,8 @@ def file_md5(fname):
5281
bytes=True,
5382
leave=False,
5483
) as pbar:
55-
with open(fname, "rb") as fobj:
56-
while True:
57-
data = fobj.read(LOCAL_CHUNK_SIZE)
58-
if not data:
59-
break
60-
61-
if binary:
62-
chunk = data
63-
else:
64-
chunk = dos2unix(data)
65-
66-
hash_md5.update(chunk)
67-
pbar.update(len(data))
84+
with open_func(fname, "rb") as fobj:
85+
_fobj_md5(fobj, hash_md5, binary, pbar.update)
6886

6987
return (hash_md5.hexdigest(), hash_md5.digest())
7088

dvc/utils/fs.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from shortuuid import uuid
1010

1111
from dvc.exceptions import DvcException
12-
from dvc.scm.tree import is_working_tree
1312
from dvc.system import System
1413
from dvc.utils import dict_md5
1514

@@ -33,14 +32,12 @@ def get_inode(path):
3332

3433
def get_mtime_and_size(path, tree):
3534

36-
if os.path.isdir(path):
37-
assert is_working_tree(tree)
38-
35+
if tree.isdir(path):
3936
size = 0
4037
files_mtimes = {}
4138
for file_path in tree.walk_files(path):
4239
try:
43-
stats = os.stat(file_path)
40+
stats = tree.stat(file_path)
4441
except OSError as exc:
4542
# NOTE: broken symlink case.
4643
if exc.errno != errno.ENOENT:
@@ -53,7 +50,7 @@ def get_mtime_and_size(path, tree):
5350
# max(mtime(f) for f in non_ignored_files)
5451
mtime = dict_md5(files_mtimes)
5552
else:
56-
base_stat = os.stat(path)
53+
base_stat = tree.stat(path)
5754
size = base_stat.st_size
5855
mtime = base_stat.st_mtime
5956
mtime = int(nanotime.timestamp(mtime))
@@ -175,7 +172,6 @@ def copyfile(src, dest, no_progress_bar=False, name=None):
175172
"""Copy file with progress bar"""
176173
from dvc.exceptions import DvcException
177174
from dvc.progress import Tqdm
178-
from dvc.system import System
179175

180176
name = name if name else os.path.basename(dest)
181177
total = os.stat(src).st_size
@@ -202,6 +198,12 @@ def copyfile(src, dest, no_progress_bar=False, name=None):
202198
fdest_wrapped.write(buf)
203199

204200

201+
def copy_fobj_to_file(fsrc, dest):
202+
"""Copy contents of open file object to destination path."""
203+
with open(dest, "wb+") as fdest:
204+
shutil.copyfileobj(fsrc, fdest)
205+
206+
205207
def walk_files(directory):
206208
for root, _, files in os.walk(directory):
207209
for f in files:

tests/unit/utils/test_fs.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from dvc.utils.fs import (
1515
BasePathNotInCheckedPathException,
1616
contains_symlink_up_to,
17+
copy_fobj_to_file,
1718
copyfile,
1819
get_inode,
1920
get_mtime_and_size,
@@ -260,5 +261,15 @@ def test_copyfile(path, tmp_dir):
260261
assert filecmp.cmp(src_info, dest_info, shallow=False)
261262

262263

264+
def test_copy_fobj_to_file(tmp_dir):
265+
tmp_dir.gen({"foo": "foo content"})
266+
src = tmp_dir / "foo"
267+
dest = "path"
268+
269+
with open(src, "rb") as fobj:
270+
copy_fobj_to_file(fobj, dest)
271+
assert filecmp.cmp(src, dest)
272+
273+
263274
def test_walk_files(tmp_dir):
264275
assert list(walk_files(".")) == list(walk_files(tmp_dir))

0 commit comments

Comments
 (0)