Skip to content

utils: support use of (GitTree) tree file objects in utils #3819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,5 @@ def walk(self, top, topdown=True):

yield root, dirs, files

def walk_files(self, top):
for root, _, files in self.walk(top):
for file in files:
yield os.path.join(root, file)
def stat(self, path):
return self.tree.stat(path)
42 changes: 41 additions & 1 deletion dvc/scm/git/tree.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import errno
import io
import os
import stat

from dvc.exceptions import DvcException
from dvc.scm.tree import BaseTree
Expand Down Expand Up @@ -87,7 +88,9 @@ def git_object_by_path(self, path):
import git

path = relpath(os.path.realpath(path), self.git.working_dir)
assert path.split(os.sep, 1)[0] != ".."
if path.split(os.sep, 1)[0] == "..":
# path points outside of git repository
return None

try:
tree = self.git.tree(self.rev)
Expand Down Expand Up @@ -138,3 +141,40 @@ def walk(self, top, topdown=True):
raise OSError(errno.ENOENT, "No such file")

yield from self._walk(tree, topdown=topdown)

def isexec(self, path):
    """Return whether ``path`` has any execute permission bit set.

    A path for which ``self.exists`` is false is reported as not
    executable instead of raising.

    Returns:
        bool: True when the user, group or other execute bit is set in
        the mode reported by ``self.stat(path)``; False otherwise.
    """
    if not self.exists(path):
        return False

    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    # Normalise the masked mode to a real bool so both return paths
    # yield the same type (the raw mask would be an int such as 0o111).
    return bool(self.stat(path).st_mode & exec_bits)

def stat(self, path):
    """Build an ``os.stat_result`` for ``path`` from its git object.

    The object returned by ``git_object_by_path`` is wrapped in a
    ``git.index.IndexEntry`` and that entry's fields are used to fill
    the stat tuple. The link count is always reported as 0.

    Raises:
        OSError: with ``errno.ENOENT`` when ``git_object_by_path``
            returns None for ``path``.
    """
    import git

    def as_seconds(timestamp):
        # Unpacked as a (seconds, nanoseconds) pair and folded into a
        # single float number of seconds.
        whole, nanos = timestamp
        return whole + nanos / 1000000000

    git_obj = self.git_object_by_path(path)
    if git_obj is None:
        raise OSError(errno.ENOENT, "No such file")

    index_entry = git.index.IndexEntry.from_blob(git_obj)

    # os.stat_result expects its fields in this order:
    # (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime)
    identity = (
        index_entry.mode,
        index_entry.inode,
        index_entry.dev,
        0,
        index_entry.uid,
        index_entry.gid,
        index_entry.size,
    )
    modified = as_seconds(index_entry.mtime)
    # git index has no atime equivalent, so mtime fills both slots
    times = (modified, modified, as_seconds(index_entry.ctime))
    return os.stat_result(identity + times)
11 changes: 11 additions & 0 deletions dvc/scm/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def walk(self, top, topdown=True):
- it could raise exceptions, there is no onerror argument
"""

def walk_files(self, top):
    """Yield the path of every file reported by ``self.walk(top)``.

    Each yielded path is the walked directory joined with the file
    name, in the order ``walk`` produces them.
    """
    for dirpath, _subdirs, filenames in self.walk(top):
        yield from (os.path.join(dirpath, name) for name in filenames)


class WorkingTree(BaseTree):
"""Proxies the repo file access methods to working tree files"""
Expand All @@ -42,6 +47,8 @@ def tree_root(self):

def open(self, path, mode="r", encoding="utf-8"):
    """Open file and return a stream.

    ``encoding`` is dropped for binary modes (any ``mode`` containing
    "b"), because the builtin ``open`` does not accept an encoding
    argument in binary mode.
    """
    effective_encoding = None if "b" in mode else encoding
    return open(path, mode=mode, encoding=effective_encoding)

def exists(self, path):
Expand Down Expand Up @@ -76,6 +83,10 @@ def isexec(self, path):
mode = os.stat(path).st_mode
return mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)

@staticmethod
def stat(path):
    """Return the result of ``os.stat(path)``."""
    return os.stat(path)


def is_working_tree(tree):
return isinstance(tree, WorkingTree) or isinstance(
Expand Down
52 changes: 35 additions & 17 deletions dvc/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,44 @@ def dos2unix(data):
return data.replace(b"\r\n", b"\n")


def file_md5(fname):
def _fobj_md5(fobj, hash_md5, binary, progress_func=None):
    """Feed the contents of ``fobj`` into ``hash_md5`` chunk by chunk.

    Args:
        fobj: open file object; read ``LOCAL_CHUNK_SIZE`` at a time
            until a read returns an empty result.
        hash_md5: hash object whose ``update`` receives every chunk.
        binary: when true chunks are hashed as read; otherwise each
            chunk goes through ``dos2unix`` first.
        progress_func: optional callable invoked after every chunk with
            the number of units read (before any conversion).
    """
    # NOTE(review): dos2unix is applied to each chunk on its own, so a
    # "\r\n" pair split across two reads is left unconverted.
    data = fobj.read(LOCAL_CHUNK_SIZE)
    while data:
        hash_md5.update(data if binary else dos2unix(data))
        if progress_func:
            progress_func(len(data))
        data = fobj.read(LOCAL_CHUNK_SIZE)


def file_md5(fname, tree=None):
""" get the (md5 hexdigest, md5 digest) of a file """
from dvc.progress import Tqdm
from dvc.istextfile import istextfile

if os.path.exists(fname):
if tree:
exists_func = tree.exists
stat_func = tree.stat
open_func = tree.open
# assume we don't need to run dos2unix when comparing git blobs
binary = True
else:
exists_func = os.path.exists
stat_func = os.stat
open_func = open
binary = False

if exists_func(fname):
hash_md5 = hashlib.md5()
binary = not istextfile(fname)
size = os.path.getsize(fname)
if not binary:
binary = not istextfile(fname)
size = stat_func(fname).st_size
no_progress_bar = True
if size >= LARGE_FILE_SIZE:
no_progress_bar = False
Expand All @@ -52,19 +81,8 @@ def file_md5(fname):
bytes=True,
leave=False,
) as pbar:
with open(fname, "rb") as fobj:
while True:
data = fobj.read(LOCAL_CHUNK_SIZE)
if not data:
break

if binary:
chunk = data
else:
chunk = dos2unix(data)

hash_md5.update(chunk)
pbar.update(len(data))
with open_func(fname, "rb") as fobj:
_fobj_md5(fobj, hash_md5, binary, pbar.update)

return (hash_md5.hexdigest(), hash_md5.digest())

Expand Down
16 changes: 9 additions & 7 deletions dvc/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from shortuuid import uuid

from dvc.exceptions import DvcException
from dvc.scm.tree import is_working_tree
from dvc.system import System
from dvc.utils import dict_md5

Expand All @@ -33,14 +32,12 @@ def get_inode(path):

def get_mtime_and_size(path, tree):

if os.path.isdir(path):
assert is_working_tree(tree)

if tree.isdir(path):
size = 0
files_mtimes = {}
for file_path in tree.walk_files(path):
try:
stats = os.stat(file_path)
stats = tree.stat(file_path)
except OSError as exc:
# NOTE: broken symlink case.
if exc.errno != errno.ENOENT:
Expand All @@ -53,7 +50,7 @@ def get_mtime_and_size(path, tree):
# max(mtime(f) for f in non_ignored_files)
mtime = dict_md5(files_mtimes)
else:
base_stat = os.stat(path)
base_stat = tree.stat(path)
size = base_stat.st_size
mtime = base_stat.st_mtime
mtime = int(nanotime.timestamp(mtime))
Expand Down Expand Up @@ -175,7 +172,6 @@ def copyfile(src, dest, no_progress_bar=False, name=None):
"""Copy file with progress bar"""
from dvc.exceptions import DvcException
from dvc.progress import Tqdm
from dvc.system import System

name = name if name else os.path.basename(dest)
total = os.stat(src).st_size
Expand All @@ -202,6 +198,12 @@ def copyfile(src, dest, no_progress_bar=False, name=None):
fdest_wrapped.write(buf)


def copy_fobj_to_file(fsrc, dest):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we will probably want progress bar support for this once we are saving files from git tree to local cache, but I think it may make more sense to have the pbar instance in the higher level save() method.

So at that point we can add a pbar update callback parameter to this method (and do buffered read/write instead of using shutil.copyfileobj as needed).

"""Copy contents of open file object to destination path."""
with open(dest, "wb+") as fdest:
shutil.copyfileobj(fsrc, fdest)


def walk_files(directory):
for root, _, files in os.walk(directory):
for f in files:
Expand Down
11 changes: 11 additions & 0 deletions tests/unit/utils/test_fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from dvc.utils.fs import (
BasePathNotInCheckedPathException,
contains_symlink_up_to,
copy_fobj_to_file,
copyfile,
get_inode,
get_mtime_and_size,
Expand Down Expand Up @@ -260,5 +261,15 @@ def test_copyfile(path, tmp_dir):
assert filecmp.cmp(src_info, dest_info, shallow=False)


def test_copy_fobj_to_file(tmp_dir):
    """copy_fobj_to_file writes the bytes of an open stream to ``dest``."""
    tmp_dir.gen({"foo": "foo content"})
    src = tmp_dir / "foo"
    dest = "path"

    with open(src, "rb") as fobj:
        copy_fobj_to_file(fobj, dest)

    # shallow=False forces a byte-by-byte comparison; the default shallow
    # check accepts two files whose os.stat signatures (type, size, mtime)
    # match without reading their contents. test_copyfile compares the
    # same way.
    assert filecmp.cmp(src, dest, shallow=False)


def test_walk_files(tmp_dir):
assert list(walk_files(".")) == list(walk_files(tmp_dir))