Skip to content

import: allow downloading regular files/dirs tracked by git #2889

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Dec 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions dvc/dependency/repo.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
from __future__ import unicode_literals

import copy
import os
from contextlib import contextmanager
from dvc.utils.compat import FileNotFoundError

from funcy import merge

from .local import DependencyLOCAL
from dvc.external_repo import external_repo
from dvc.utils.compat import str
from dvc.exceptions import OutputNotFoundError
from dvc.exceptions import PathMissingError
from dvc.utils.fs import fs_copy


class DependencyREPO(DependencyLOCAL):
Expand Down Expand Up @@ -72,10 +77,42 @@ def fetch(self):

return out

@staticmethod
def _is_git_file(repo, path):
    """Tell whether ``path`` should be treated as a plain (non-cached) file.

    Args:
        repo: object exposing ``find_out_by_relpath(path)``.
        path: path to look up inside ``repo``.

    Returns:
        ``False`` for absolute paths. For relative paths, ``True`` when
        the lookup raises ``OutputNotFoundError`` or the output found
        has a falsy ``use_cache``; ``False`` otherwise.
    """
    # Absolute paths are never looked up in the repo.
    if os.path.isabs(path):
        return False

    try:
        found = repo.find_out_by_relpath(path)
        not_cached = not found.use_cache
    except OutputNotFoundError:
        # No output is registered for this path; presumably it is a
        # regular file handled by git -- TODO confirm against callers.
        return True
    return not_cached

def _copy_if_git_file(self, to_path):
    """Copy ``self.def_path`` out of the external repo when it is a git file.

    Args:
        to_path: destination path; it is made absolute before copying.

    Returns:
        ``False`` when ``_is_git_file`` rejects the path (nothing is
        copied), ``True`` after the path has been copied with ``fs_copy``.
    """
    rel_src = self.def_path
    local_cache_dir = self.repo.cache.local.cache_dir
    with self._make_repo(cache_dir=local_cache_dir) as ext_repo:
        if not self._is_git_file(ext_repo, rel_src):
            return False

        # The source lives under the external repo's root directory.
        fs_copy(
            os.path.join(ext_repo.root_dir, rel_src),
            os.path.abspath(to_path),
        )
    return True

def download(self, to):
out = self.fetch()
to.info = copy.copy(out.info)
to.checkout()
try:
if self._copy_if_git_file(to.fspath):
return
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a note: not setting `to.info` here (as we do further down) is fine, since git files are tiny and the hash will be computed later by the output itself.


out = self.fetch()
to.info = copy.copy(out.info)
to.checkout()
except (FileNotFoundError):
raise PathMissingError(
self.def_path, self.def_repo[self.PARAM_URL]
)

def update(self):
with self._make_repo(rev_lock=None) as repo:
Expand Down
9 changes: 9 additions & 0 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,3 +344,12 @@ def __init__(self, path, external_repo_path, external_repo_url):
class HTTPError(DvcException):
def __init__(self, code, reason):
super(HTTPError, self).__init__("'{} {}'".format(code, reason))


class PathMissingError(DvcException):
    """Raised when a requested path cannot be found in a target repository.

    Args:
        path: the path that was requested.
        repo: the target repository (its URL or path) shown in the message.
    """

    def __init__(self, path, repo):
        # Fixed the misspelling "neighther" in the user-facing message.
        msg = (
            "The path '{}' does not exist in the target repository '{}'"
            " neither as an output nor a git-handled file."
        )
        super(PathMissingError, self).__init__(msg.format(path, repo))
21 changes: 3 additions & 18 deletions dvc/repo/get.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import os
import shutil
from dvc.utils.compat import FileNotFoundError

import shortuuid
Expand All @@ -10,12 +9,14 @@
NotDvcRepoError,
OutputNotFoundError,
UrlNotDvcRepoError,
PathMissingError,
)
from dvc.external_repo import external_repo
from dvc.path_info import PathInfo
from dvc.stage import Stage
from dvc.utils import resolve_output
from dvc.utils.fs import remove
from dvc.utils.fs import fs_copy

logger = logging.getLogger(__name__)

Expand All @@ -28,15 +29,6 @@ def __init__(self):
)


class PathMissingError(DvcException):
    """Raised when a requested path cannot be found in a target repository.

    Args:
        path: the path that was requested.
        repo: the target repository (its URL or path) shown in the message.
    """

    def __init__(self, path, repo):
        # Fixed the misspelling "neighther" in the user-facing message.
        msg = (
            "The path '{}' does not exist in the target repository '{}'"
            " neither as an output nor a git-handled file."
        )
        super(PathMissingError, self).__init__(msg.format(path, repo))


@staticmethod
def get(url, path, out=None, rev=None):
out = resolve_output(path, out)
Expand Down Expand Up @@ -76,7 +68,7 @@ def get(url, path, out=None, rev=None):
if os.path.isabs(path):
raise FileNotFoundError

_copy(os.path.join(repo.root_dir, path), out)
fs_copy(os.path.join(repo.root_dir, path), out)

except (OutputNotFoundError, FileNotFoundError):
raise PathMissingError(path, url)
Expand All @@ -94,10 +86,3 @@ def _get_cached(repo, output, out):
# This might happen when pull haven't really pulled all the files
if failed:
raise FileNotFoundError


def _copy(src, dst):
    """Copy ``src`` to ``dst``.

    A directory is copied recursively with ``shutil.copytree``; anything
    else is copied with ``shutil.copy2``.
    """
    if not os.path.isdir(src):
        shutil.copy2(src, dst)
        return
    shutil.copytree(src, dst)
7 changes: 7 additions & 0 deletions dvc/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
logger = logging.getLogger(__name__)


def fs_copy(src, dst):
    """Copy ``src`` to ``dst``, whether it is a file or a directory.

    Args:
        src: path of the file or directory to copy.
        dst: destination path.

    A directory is copied recursively with ``shutil.copytree``; anything
    else is copied with ``shutil.copy2``.
    """
    copier = shutil.copytree if os.path.isdir(src) else shutil.copy2
    copier(src, dst)


def get_inode(path):
inode = System.inode(path)
logger.debug("Path {} inode {}".format(path, inode))
Expand Down
62 changes: 59 additions & 3 deletions tests/func/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@
from mock import patch

from dvc.config import Config
from dvc.exceptions import DownloadError, NoOutputInExternalRepoError
from dvc.exceptions import DownloadError
from dvc.exceptions import PathMissingError
from dvc.exceptions import NoOutputInExternalRepoError
from dvc.stage import Stage
from dvc.system import System
from dvc.utils import makedirs
from dvc.utils.compat import fspath
from tests.utils import trees_equal


Expand All @@ -27,6 +30,34 @@ def test_import(git, dvc_repo, erepo):
assert git.git.check_ignore(dst)


def test_import_git_file(erepo_dir, tmp_dir, dvc, scm):
    """Import a file generated via ``scm_gen`` in the external repo.

    Checks that the imported path is a file, that its content matches the
    source byte for byte, and that ``git check-ignore`` reports it.
    """
    source_name = "some_file"
    target_name = "some_file_imported"

    erepo_dir.scm_gen({source_name: "hello"}, commit="add a regular file")

    tmp_dir.dvc.imp(fspath(erepo_dir), source_name, target_name)

    imported = tmp_dir / target_name
    assert imported.is_file()

    original = fspath(erepo_dir / source_name)
    assert filecmp.cmp(original, fspath(imported), shallow=False)
    assert tmp_dir.scm.repo.git.check_ignore(fspath(imported))


def test_import_git_dir(erepo_dir, tmp_dir, dvc, scm):
    """Import a directory generated via ``scm_gen`` in the external repo.

    Checks that the imported path is a directory, compares both trees with
    ``trees_equal``, and checks that ``git check-ignore`` reports it.
    """
    source_dir = "some_directory"
    target_dir = "some_directory_imported"

    layout = {source_dir: {"file.txt": "hello"}}
    erepo_dir.scm_gen(layout, commit="add a dir")

    tmp_dir.dvc.imp(fspath(erepo_dir), source_dir, target_dir)

    imported = tmp_dir / target_dir
    assert imported.is_dir()
    # NOTE(review): the result of trees_equal is not asserted here;
    # presumably the helper asserts internally -- confirm in tests.utils.
    trees_equal(fspath(erepo_dir / source_dir), fspath(imported))
    assert tmp_dir.scm.repo.git.check_ignore(fspath(imported))


def test_import_dir(git, dvc_repo, erepo):
src = erepo.DATA_DIR
dst = erepo.DATA_DIR + "_imported"
Expand All @@ -39,6 +70,28 @@ def test_import_dir(git, dvc_repo, erepo):
assert git.git.check_ignore(dst)


def test_import_non_cached(erepo_dir, tmp_dir, dvc, scm):
    """Import an output that the external repo declares via ``outs_no_cache``.

    The output is produced by a ``dvc run`` stage, added and committed
    through the external repo's scm, then imported. The imported file must
    exist, match the source byte for byte, and be reported by
    ``git check-ignore``.
    """
    source_name = "non_cached_output"
    target_name = source_name + "_imported"

    command = "echo hello > {}".format(source_name)
    erepo_dir.dvc.run(
        cmd=command,
        outs_no_cache=[source_name],
        cwd=fspath(erepo_dir),
    )

    source_path = fspath(erepo_dir / source_name)
    erepo_dir.scm.add([source_path])
    erepo_dir.scm.commit("add a non-cached output")

    tmp_dir.dvc.imp(fspath(erepo_dir), source_name, target_name)

    imported = tmp_dir / target_name
    assert imported.is_file()
    assert filecmp.cmp(
        fspath(erepo_dir / source_name), fspath(imported), shallow=False
    )
    # The relative name (not the full path) is passed to check_ignore here.
    assert tmp_dir.scm.repo.git.check_ignore(target_name)


def test_import_rev(git, dvc_repo, erepo):
src = "version"
dst = src
Expand Down Expand Up @@ -156,6 +209,9 @@ def test_pull_non_workspace(git, dvc_repo, erepo):
assert os.path.exists(stage.outs[0].cache_path)


def test_import_non_existing(dvc_repo, erepo):
def test_import_non_existing(erepo_dir, tmp_dir, dvc):
with pytest.raises(PathMissingError):
tmp_dir.dvc.imp(fspath(erepo_dir), "invalid_output")
# https://github.com/iterative/dvc/pull/2837#discussion_r352123053
with pytest.raises(NoOutputInExternalRepoError):
dvc_repo.imp(erepo.root_dir, "invalid_output")
tmp_dir.dvc.imp(fspath(erepo_dir), "/root/", "root")