Skip to content

Commit 26a9702

Browse files
authored
Merge pull request #2837 from danihodovic/feat/2515
get: copy/download files tracked by Git
2 parents 237704e + 29b34ba commit 26a9702

File tree

4 files changed

+131
-10
lines changed

4 files changed

+131
-10
lines changed

dvc/command/get.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def run(self):
3333

3434

3535
def add_parser(subparsers, parent_parser):
36-
GET_HELP = "Download data from DVC repository."
36+
GET_HELP = "Download/copy files or directories from DVC repository."
3737
get_parser = subparsers.add_parser(
3838
"get",
3939
parents=[parent_parser],
@@ -44,9 +44,14 @@ def add_parser(subparsers, parent_parser):
4444
get_parser.add_argument(
4545
"url", help="URL of Git repository with DVC project to download from."
4646
)
47-
get_parser.add_argument("path", help="Path to data within DVC repository.")
4847
get_parser.add_argument(
49-
"-o", "--out", nargs="?", help="Destination path to put data to."
48+
"path", help="Path to a file or directory within a DVC repository."
49+
)
50+
get_parser.add_argument(
51+
"-o",
52+
"--out",
53+
nargs="?",
54+
help="Destination path to copy/download files to.",
5055
)
5156
get_parser.add_argument(
5257
"--rev", nargs="?", help="DVC repository git revision."

dvc/exceptions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,12 @@ def __init__(self, path, cause=None):
268268
)
269269

270270

271+
class PathOutsideRepoError(DvcException):
    """Error raised when a requested path is not found in the target repo.

    Args:
        path: the path that was requested.
        repo: identifier of the repository that was searched; it is only
            interpolated into the error message.
    """

    def __init__(self, path, repo):
        message = (
            "The path '{}' does not exist in the target repository '{}'."
        ).format(path, repo)
        super(PathOutsideRepoError, self).__init__(message)
275+
276+
271277
class DvcIgnoreInCollectedDirError(DvcException):
272278
def __init__(self, ignore_dirname):
273279
super(DvcIgnoreInCollectedDirError, self).__init__(

dvc/repo/get.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,39 @@
11
import logging
22
import os
3+
import shutil
34

45
import shortuuid
56

67
from dvc.exceptions import GetDVCFileError
78
from dvc.exceptions import NotDvcRepoError
89
from dvc.exceptions import OutputNotFoundError
910
from dvc.exceptions import UrlNotDvcRepoError
11+
from dvc.exceptions import PathOutsideRepoError
1012
from dvc.external_repo import external_repo
1113
from dvc.path_info import PathInfo
1214
from dvc.stage import Stage
1315
from dvc.state import StateNoop
1416
from dvc.utils import resolve_output
1517
from dvc.utils.fs import remove
18+
from dvc.utils.compat import FileNotFoundError
1619

1720
logger = logging.getLogger(__name__)
1821

1922

23+
def _copy_git_file(repo, src, dst, repo_url):
    """Copy a plain (not DVC-cached) file or directory out of a repository.

    Args:
        repo: object exposing ``root_dir``, the root of the checked-out
            repository.
        src: path of the file or directory, joined onto ``repo.root_dir``.
        dst: destination path; made absolute before copying.
        repo_url: repository identifier, used only in the error message.

    Raises:
        PathOutsideRepoError: if a relative ``src`` resolves to a location
            outside ``repo.root_dir`` (e.g. via ``..`` components), or if
            the source file does not exist.

    Errors raised by ``shutil.copytree`` for the directory case are not
    translated and propagate unchanged.
    """
    src_full_path = os.path.join(repo.root_dir, src)
    dst_full_path = os.path.abspath(dst)

    # A relative path must stay inside the repository: without this check
    # something like "../../file" would silently copy an unrelated local
    # file. Absolute paths are passed through unchanged, as before
    # (os.path.join returns an absolute second argument as-is).
    if not os.path.isabs(src):
        root = os.path.abspath(repo.root_dir)
        resolved = os.path.abspath(src_full_path)
        root_prefix = root.rstrip(os.sep) + os.sep
        if resolved != root and not resolved.startswith(root_prefix):
            raise PathOutsideRepoError(src, repo_url)

    if os.path.isdir(src_full_path):
        shutil.copytree(src_full_path, dst_full_path)
        return

    try:
        shutil.copy2(src_full_path, dst_full_path)
    except FileNotFoundError:
        raise PathOutsideRepoError(src, repo_url)
35+
36+
2037
@staticmethod
2138
def get(url, path, out=None, rev=None):
2239
out = resolve_output(path, out)
@@ -49,16 +66,31 @@ def get(url, path, out=None, rev=None):
4966
# the same cache file might be used a few times in a directory.
5067
repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]
5168

52-
o = repo.find_out_by_relpath(path)
69+
output = None
70+
output_error = None
71+
72+
try:
73+
output = repo.find_out_by_relpath(path)
74+
except OutputNotFoundError as ex:
75+
output_error = ex
76+
77+
is_git_file = output_error and not os.path.isabs(path)
78+
is_not_cached = output and not output.use_cache
79+
80+
if is_git_file or is_not_cached:
81+
_copy_git_file(repo, path, out, url)
82+
return
83+
84+
if output_error:
85+
raise OutputNotFoundError(path)
86+
5387
with repo.state:
54-
repo.cloud.pull(o.get_used_cache())
55-
o.path_info = PathInfo(os.path.abspath(out))
56-
with o.repo.state:
57-
o.checkout()
88+
repo.cloud.pull(output.get_used_cache())
89+
output.path_info = PathInfo(os.path.abspath(out))
90+
with output.repo.state:
91+
output.checkout()
5892

5993
except NotDvcRepoError:
6094
raise UrlNotDvcRepoError(url)
61-
except OutputNotFoundError:
62-
raise OutputNotFoundError(path)
6395
finally:
6496
remove(tmp_dir)

tests/func/test_get.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,13 @@
99
from dvc.config import Config
1010
from dvc.exceptions import GetDVCFileError
1111
from dvc.exceptions import UrlNotDvcRepoError
12+
from dvc.exceptions import OutputNotFoundError
13+
from dvc.exceptions import PathOutsideRepoError
1214
from dvc.repo import Repo
1315
from dvc.system import System
1416
from dvc.utils import makedirs
1517
from dvc.utils.compat import fspath
18+
from dvc.utils import fspath_py35
1619
from tests.utils import trees_equal
1720

1821

@@ -38,6 +41,36 @@ def test_get_repo_dir(erepo):
3841
trees_equal(src, dst)
3942

4043

44+
def test_get_regular_file(erepo):
    """``Repo.get`` copies a file that is committed to Git directly."""
    name = "some_file"
    imported = "some_file_imported"
    tracked = os.path.join(erepo.root_dir, name)

    # The file is only added and committed through the SCM wrapper.
    erepo.create(tracked, "hello")
    erepo.dvc.scm.add([tracked])
    erepo.dvc.scm.commit("add a regular file")

    Repo.get(erepo.root_dir, name, imported)

    assert os.path.exists(imported)
    assert os.path.isfile(imported)
    # Byte-for-byte comparison against the file in the source repository.
    assert filecmp.cmp(tracked, imported, shallow=False)
57+
58+
59+
def test_get_regular_dir(erepo):
    """``Repo.get`` copies a directory that is committed to Git directly."""
    dirname = "some_directory"
    imported = "some_directory_imported"
    tracked_dir = os.path.join(erepo.root_dir, dirname)
    tracked_file = os.path.join(erepo.root_dir, dirname, "file.txt")

    # Only the contained file is added and committed through the SCM wrapper.
    erepo.create(tracked_file, "hello")
    erepo.dvc.scm.add([tracked_file])
    erepo.dvc.scm.commit("add a regular dir")

    Repo.get(erepo.root_dir, dirname, imported)

    assert os.path.exists(imported)
    assert os.path.isdir(imported)
    # Tree comparison is delegated to the shared test helper.
    trees_equal(tracked_dir, imported)
72+
73+
4174
def test_cache_type_is_properly_overridden(erepo):
4275
erepo.dvc.config.set(
4376
Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink"
@@ -77,6 +110,51 @@ def test_get_a_dvc_file(erepo):
77110
Repo.get(erepo.root_dir, "some_file.dvc")
78111

79112

113+
# https://github.com/iterative/dvc/pull/2837#discussion_r352123053
114+
def test_get_full_dvc_path(erepo):
    """``Repo.get`` accepts the absolute path of data added from outside.

    A file is created in a separate temporary directory, added with
    ``dvc.add`` while the working directory is the repository root, and then
    requested from ``Repo.get`` by its absolute path.
    """
    external_data_dir = erepo.mkdtemp()
    external_data = os.path.join(external_data_dir, "ext_data")
    with open(external_data, "w+") as fobj:
        fobj.write("ext_data")

    cur_dir = os.getcwd()
    os.chdir(erepo.root_dir)
    try:
        erepo.dvc.add(external_data)
        erepo.dvc.scm.add(["ext_data.dvc"])
        erepo.dvc.scm.commit("add external data")
    finally:
        # Restore the working directory even if adding or committing fails,
        # so a failure here cannot leave the process inside the repository.
        os.chdir(cur_dir)

    Repo.get(erepo.root_dir, external_data, "ext_data_imported")
    assert os.path.isfile("ext_data_imported")
    assert filecmp.cmp(external_data, "ext_data_imported", shallow=False)
130+
131+
132+
def test_non_cached_output(tmp_path, erepo):
    """``Repo.get`` fetches an output declared with ``outs_no_cache``."""
    fname = "non_cached_file"

    # Produce the output from inside the repository and commit both the
    # output and its stage file through the SCM wrapper.
    os.chdir(erepo.root_dir)
    erepo.dvc.run(
        cmd="echo hello > non_cached_file", outs_no_cache=[fname]
    )
    erepo.dvc.scm.add([fname, "non_cached_file.dvc"])
    erepo.dvc.scm.commit("add non-cached output")

    # Fetch into a separate directory; no explicit output path is given.
    os.chdir(fspath_py35(tmp_path))
    Repo.get(erepo.root_dir, fname)

    original = os.path.join(erepo.root_dir, fname)
    assert os.path.isfile(fname)
    assert filecmp.cmp(original, fname, shallow=False)
145+
146+
147+
# https://github.com/iterative/dvc/pull/2837#discussion_r352123053
148+
def test_fails_with_files_outside_repo(erepo):
    """Requesting the absolute path "/root/" raises OutputNotFoundError."""
    outside_path = "/root/"
    with pytest.raises(OutputNotFoundError):
        Repo.get(erepo.root_dir, outside_path)
151+
152+
153+
def test_fails_with_non_existing_files(erepo):
    """A relative path missing from the repo raises PathOutsideRepoError."""
    missing_path = "file_does_not_exist"
    with pytest.raises(PathOutsideRepoError):
        Repo.get(erepo.root_dir, missing_path)
156+
157+
80158
@pytest.mark.parametrize("dname", [".", "dir", "dir/subdir"])
81159
def test_get_to_dir(dname, erepo):
82160
src = erepo.FOO

0 commit comments

Comments
 (0)