Skip to content

Commit a4b3c6d

Browse files
authored
erepo: --rev BRANCH: shallow clone (#4246)
* erepo: use shallow clones when rev is branch or tag name
* tests: test erepo shallow clone behavior
* erepo: fix detached head after shallow cloning a tag
* move unshallow logic into its own function
* erepo: default rev should be origin HEAD not local HEAD
* use functools.partial
* review fixes
* comment shallow clone behavior in erepo
* don't duplicate unnecessary fetch/pull call
1 parent b77ce02 commit a4b3c6d

File tree

3 files changed

+122
-19
lines changed

3 files changed

+122
-19
lines changed

dvc/external_repo.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from dvc.path_info import PathInfo
2020
from dvc.repo import Repo
2121
from dvc.repo.tree import RepoTree
22+
from dvc.scm.base import CloneError
2223
from dvc.scm.git import Git
2324
from dvc.tree.local import LocalTree
2425
from dvc.utils.fs import remove
@@ -31,7 +32,10 @@ def external_repo(url, rev=None, for_write=False):
3132
logger.debug("Creating external repo %s@%s", url, rev)
3233
path = _cached_clone(url, rev, for_write=for_write)
3334
if not rev:
34-
rev = "HEAD"
35+
# Local HEAD points to the tip of whatever branch we first cloned from
36+
# (which may not be the default branch), use origin/HEAD here to get
37+
# the tip of the default branch
38+
rev = "refs/remotes/origin/HEAD"
3539
try:
3640
repo = ExternalRepo(path, url, rev, for_write=for_write)
3741
except NotDvcRepoError:
@@ -59,7 +63,7 @@ def external_repo(url, rev=None, for_write=False):
5963

6064
def clean_repos():
6165
# Outside code should not see cache while we are removing
62-
paths = list(CLONES.values()) + list(CACHE_DIRS.values())
66+
paths = [path for path, _ in CLONES.values()] + list(CACHE_DIRS.values())
6367
CLONES.clear()
6468
CACHE_DIRS.clear()
6569

@@ -251,10 +255,10 @@ def _cached_clone(url, rev, for_write=False):
251255
"""
252256
# even if we have already cloned this repo, we may need to
253257
# fetch/fast-forward to get specified rev
254-
clone_path = _clone_default_branch(url, rev)
258+
clone_path, shallow = _clone_default_branch(url, rev, for_write=for_write)
255259

256260
if not for_write and (url) in CLONES:
257-
return CLONES[url]
261+
return CLONES[url][0]
258262

259263
# Copy to a new dir to keep the clone clean
260264
repo_path = tempfile.mkdtemp("dvc-erepo")
@@ -265,36 +269,73 @@ def _cached_clone(url, rev, for_write=False):
265269
if for_write:
266270
_git_checkout(repo_path, rev)
267271
else:
268-
CLONES[url] = repo_path
272+
CLONES[url] = (repo_path, shallow)
269273
return repo_path
270274

271275

272276
@wrap_with(threading.Lock())
273-
def _clone_default_branch(url, rev):
277+
def _clone_default_branch(url, rev, for_write=False):
274278
"""Get or create a clean clone of the url.
275279
276280
The clone is refreshed with git pull unless rev is a known sha.
277281
"""
278-
clone_path = CLONES.get(url)
282+
clone_path, shallow = CLONES.get(url, (None, False))
279283

280284
git = None
281285
try:
282286
if clone_path:
283287
git = Git(clone_path)
284288
# Do not pull for known shas, branches and tags might move
285289
if not Git.is_sha(rev) or not git.has_rev(rev):
286-
logger.debug("erepo: git pull %s", url)
287-
git.pull()
290+
if shallow:
291+
# If we are missing a rev in a shallow clone, fallback to
292+
# a full (unshallowed) clone. Since fetching specific rev
293+
# SHAs is only available in certain git versions, if we
294+
# need to reference multiple specific revs for a
295+
# given repo URL it is easier/safer for us to work with
296+
# full clones in this case.
297+
logger.debug("erepo: unshallowing clone for '%s'", url)
298+
_unshallow(git)
299+
shallow = False
300+
CLONES[url] = (clone_path, shallow)
301+
else:
302+
logger.debug("erepo: git pull '%s'", url)
303+
git.pull()
288304
else:
289-
logger.debug("erepo: git clone %s to a temporary dir", url)
305+
logger.debug("erepo: git clone '%s' to a temporary dir", url)
290306
clone_path = tempfile.mkdtemp("dvc-clone")
291-
git = Git.clone(url, clone_path)
292-
CLONES[url] = clone_path
307+
if not for_write and rev and not Git.is_sha(rev):
308+
# If rev is a tag or branch name try shallow clone first
309+
try:
310+
git = Git.clone(url, clone_path, shallow_branch=rev)
311+
shallow = True
312+
logger.debug(
313+
"erepo: using shallow clone for branch '%s'", rev
314+
)
315+
except CloneError:
316+
pass
317+
if not git:
318+
git = Git.clone(url, clone_path)
319+
shallow = False
320+
CLONES[url] = (clone_path, shallow)
293321
finally:
294322
if git:
295323
git.close()
296324

297-
return clone_path
325+
return clone_path, shallow
326+
327+
328+
def _unshallow(git):
329+
if git.repo.head.is_detached:
330+
# If this is a detached head (i.e. we shallow cloned a tag) switch to
331+
# the default branch
332+
origin_refs = git.repo.remotes["origin"].refs
333+
ref = origin_refs["HEAD"].reference
334+
branch_name = ref.name.split("/")[-1]
335+
branch = git.repo.create_head(branch_name, ref)
336+
branch.set_tracking_branch(ref)
337+
branch.checkout()
338+
git.pull(unshallow=True)
298339

299340

300341
def _git_checkout(repo_path, rev):

dvc/scm/git.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import shlex
6+
from functools import partial
67

78
import yaml
89
from funcy import cached_property
@@ -92,7 +93,7 @@ def root_dir(self):
9293
return self.repo.working_tree_dir
9394

9495
@staticmethod
95-
def clone(url, to_path, rev=None):
96+
def clone(url, to_path, rev=None, shallow_branch=None):
9697
import git
9798

9899
ld_key = "LD_LIBRARY_PATH"
@@ -109,14 +110,23 @@ def clone(url, to_path, rev=None):
109110
env[ld_key] = ""
110111

111112
try:
113+
if shallow_branch is not None and os.path.exists(url):
114+
# git disables --depth for local clones unless file:// url
115+
# scheme is used
116+
url = f"file://{url}"
112117
with TqdmGit(desc="Cloning", unit="obj") as pbar:
113-
tmp_repo = git.Repo.clone_from(
118+
clone_from = partial(
119+
git.Repo.clone_from,
114120
url,
115121
to_path,
116122
env=env, # needed before we can fix it in __init__
117123
no_single_branch=True,
118124
progress=pbar.update_git,
119125
)
126+
if shallow_branch is None:
127+
tmp_repo = clone_from()
128+
else:
129+
tmp_repo = clone_from(branch=shallow_branch, depth=1)
120130
tmp_repo.close()
121131
except git.exc.GitCommandError as exc: # pylint: disable=no-member
122132
raise CloneError(url, to_path) from exc
@@ -250,8 +260,8 @@ def checkout(self, branch, create_new=False):
250260
else:
251261
self.repo.git.checkout(branch)
252262

253-
def pull(self):
254-
infos = self.repo.remote().pull()
263+
def pull(self, **kwargs):
264+
infos = self.repo.remote().pull(**kwargs)
255265
for info in infos:
256266
if info.flags & info.ERROR:
257267
raise SCMError(f"pull failed: {info.note}")

tests/func/test_external_repo.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import os
22

3-
from mock import patch
3+
from mock import ANY, patch
44

5-
from dvc.external_repo import external_repo
5+
from dvc.external_repo import CLONES, external_repo
66
from dvc.path_info import PathInfo
77
from dvc.scm.git import Git
88
from dvc.tree.local import LocalTree
@@ -121,3 +121,55 @@ def test_relative_remote(erepo_dir, tmp_dir):
121121
assert os.path.isdir(repo.config["remote"]["upstream"]["url"])
122122
with repo.open_by_relpath("file") as fd:
123123
assert fd.read() == "contents"
124+
125+
126+
def test_shallow_clone_branch(erepo_dir):
127+
with erepo_dir.chdir():
128+
with erepo_dir.branch("branch", new=True):
129+
erepo_dir.dvc_gen("file", "branch", commit="create file on branch")
130+
erepo_dir.dvc_gen("file", "master", commit="create file on master")
131+
132+
url = os.fspath(erepo_dir)
133+
134+
with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
135+
with external_repo(url, rev="branch") as repo:
136+
with repo.open_by_relpath("file") as fd:
137+
assert fd.read() == "branch"
138+
139+
mock_clone.assert_called_with(url, ANY, shallow_branch="branch")
140+
_, shallow = CLONES[url]
141+
assert shallow
142+
143+
with external_repo(url) as repo:
144+
with repo.open_by_relpath("file") as fd:
145+
assert fd.read() == "master"
146+
147+
assert mock_clone.call_count == 1
148+
_, shallow = CLONES[url]
149+
assert not shallow
150+
151+
152+
def test_shallow_clone_tag(erepo_dir):
153+
with erepo_dir.chdir():
154+
erepo_dir.dvc_gen("file", "foo", commit="init")
155+
erepo_dir.scm.tag("v1")
156+
erepo_dir.dvc_gen("file", "bar", commit="update file")
157+
158+
url = os.fspath(erepo_dir)
159+
160+
with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
161+
with external_repo(url, rev="v1") as repo:
162+
with repo.open_by_relpath("file") as fd:
163+
assert fd.read() == "foo"
164+
165+
mock_clone.assert_called_with(url, ANY, shallow_branch="v1")
166+
_, shallow = CLONES[url]
167+
assert shallow
168+
169+
with external_repo(url, rev="master") as repo:
170+
with repo.open_by_relpath("file") as fd:
171+
assert fd.read() == "bar"
172+
173+
assert mock_clone.call_count == 1
174+
_, shallow = CLONES[url]
175+
assert not shallow

0 commit comments

Comments
 (0)