Skip to content

erepo: --rev BRANCH: shallow clone #4246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 54 additions & 13 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from dvc.path_info import PathInfo
from dvc.repo import Repo
from dvc.repo.tree import RepoTree
from dvc.scm.base import CloneError
from dvc.scm.git import Git
from dvc.tree.local import LocalTree
from dvc.utils.fs import remove
Expand All @@ -31,7 +32,10 @@ def external_repo(url, rev=None, for_write=False):
logger.debug("Creating external repo %s@%s", url, rev)
path = _cached_clone(url, rev, for_write=for_write)
if not rev:
rev = "HEAD"
# Local HEAD points to the tip of whatever branch we first cloned from
# (which may not be the default branch), use origin/HEAD here to get
# the tip of the default branch
rev = "refs/remotes/origin/HEAD"
Copy link
Contributor Author

@pmrowla pmrowla Jul 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since rev=None is supposed to mean tip on the default branch (master), we want to use origin/HEAD, not HEAD from our local clone working copy. Local HEAD points to the tip of whatever branch we first cloned from (which may not be the default branch), but here we want the tip of the default branch.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be happy to see such things commented about in dvc code.

try:
repo = ExternalRepo(path, url, rev, for_write=for_write)
except NotDvcRepoError:
Expand Down Expand Up @@ -59,7 +63,7 @@ def external_repo(url, rev=None, for_write=False):

def clean_repos():
# Outside code should not see cache while we are removing
paths = list(CLONES.values()) + list(CACHE_DIRS.values())
paths = [path for path, _ in CLONES.values()] + list(CACHE_DIRS.values())
CLONES.clear()
CACHE_DIRS.clear()

Expand Down Expand Up @@ -251,10 +255,10 @@ def _cached_clone(url, rev, for_write=False):
"""
# even if we have already cloned this repo, we may need to
# fetch/fast-forward to get specified rev
clone_path = _clone_default_branch(url, rev)
clone_path, shallow = _clone_default_branch(url, rev, for_write=for_write)

if not for_write and (url) in CLONES:
return CLONES[url]
return CLONES[url][0]

# Copy to a new dir to keep the clone clean
repo_path = tempfile.mkdtemp("dvc-erepo")
Expand All @@ -265,36 +269,73 @@ def _cached_clone(url, rev, for_write=False):
if for_write:
_git_checkout(repo_path, rev)
else:
CLONES[url] = repo_path
CLONES[url] = (repo_path, shallow)
return repo_path


@wrap_with(threading.Lock())
def _clone_default_branch(url, rev):
def _clone_default_branch(url, rev, for_write=False):
"""Get or create a clean clone of the url.

The cloned is reactualized with git pull unless rev is a known sha.
"""
clone_path = CLONES.get(url)
clone_path, shallow = CLONES.get(url, (None, False))

git = None
try:
if clone_path:
git = Git(clone_path)
# Do not pull for known shas, branches and tags might move
if not Git.is_sha(rev) or not git.has_rev(rev):
logger.debug("erepo: git pull %s", url)
git.pull()
if shallow:
# If we are missing a rev in a shallow clone, fallback to
# a full (unshallowed) clone. Since fetching specific rev
# SHAs is only available in certain git versions, if we
# have need to reference multiple specific revs for a
# given repo URL it is easier/safer for us to work with
# full clones in this case.
logger.debug("erepo: unshallowing clone for '%s'", url)
_unshallow(git)
shallow = False
CLONES[url] = (clone_path, shallow)
else:
logger.debug("erepo: git pull '%s'", url)
git.pull()
else:
logger.debug("erepo: git clone %s to a temporary dir", url)
logger.debug("erepo: git clone '%s' to a temporary dir", url)
clone_path = tempfile.mkdtemp("dvc-clone")
git = Git.clone(url, clone_path)
CLONES[url] = clone_path
if not for_write and rev and not Git.is_sha(rev):
# If rev is a tag or branch name try shallow clone first
try:
git = Git.clone(url, clone_path, shallow_branch=rev)
shallow = True
logger.debug(
"erepo: using shallow clone for branch '%s'", rev
)
except CloneError:
pass
if not git:
git = Git.clone(url, clone_path)
shallow = False
CLONES[url] = (clone_path, shallow)
finally:
if git:
git.close()

return clone_path
return clone_path, shallow


def _unshallow(git):
if git.repo.head.is_detached:
# If this is a detached head (i.e. we shallow cloned a tag) switch to
# the default branch
origin_refs = git.repo.remotes["origin"].refs
ref = origin_refs["HEAD"].reference
branch_name = ref.name.split("/")[-1]
branch = git.repo.create_head(branch_name, ref)
branch.set_tracking_branch(ref)
branch.checkout()
git.pull(unshallow=True)


def _git_checkout(repo_path, rev):
Expand Down
18 changes: 14 additions & 4 deletions dvc/scm/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import shlex
from functools import partial

import yaml
from funcy import cached_property
Expand Down Expand Up @@ -92,7 +93,7 @@ def root_dir(self):
return self.repo.working_tree_dir

@staticmethod
def clone(url, to_path, rev=None):
def clone(url, to_path, rev=None, shallow_branch=None):
import git

ld_key = "LD_LIBRARY_PATH"
Expand All @@ -109,14 +110,23 @@ def clone(url, to_path, rev=None):
env[ld_key] = ""

try:
if shallow_branch is not None and os.path.exists(url):
# git disables --depth for local clones unless file:// url
# scheme is used
url = f"file://{url}"
with TqdmGit(desc="Cloning", unit="obj") as pbar:
tmp_repo = git.Repo.clone_from(
clone_from = partial(
git.Repo.clone_from,
url,
to_path,
env=env, # needed before we can fix it in __init__
no_single_branch=True,
progress=pbar.update_git,
)
if shallow_branch is None:
tmp_repo = clone_from()
else:
tmp_repo = clone_from(branch=shallow_branch, depth=1)
Comment on lines 117 to +129
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems essentially the same method as #3585 - just checking if @Suor has the same concern here as with https://github.com/iterative/dvc/pull/3585/files#r402728411

Copy link
Contributor Author

@pmrowla pmrowla Jul 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case the caller has to explicitly specify that they want a shallow clone now, rather than us making a choice in Git.clone based on the rev parameter (which we don't use when cloning from erepo).

tmp_repo.close()
except git.exc.GitCommandError as exc: # pylint: disable=no-member
raise CloneError(url, to_path) from exc
Expand Down Expand Up @@ -250,8 +260,8 @@ def checkout(self, branch, create_new=False):
else:
self.repo.git.checkout(branch)

def pull(self):
infos = self.repo.remote().pull()
def pull(self, **kwargs):
infos = self.repo.remote().pull(**kwargs)
for info in infos:
if info.flags & info.ERROR:
raise SCMError(f"pull failed: {info.note}")
Expand Down
56 changes: 54 additions & 2 deletions tests/func/test_external_repo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os

from mock import patch
from mock import ANY, patch

from dvc.external_repo import external_repo
from dvc.external_repo import CLONES, external_repo
from dvc.path_info import PathInfo
from dvc.scm.git import Git
from dvc.tree.local import LocalTree
Expand Down Expand Up @@ -121,3 +121,55 @@ def test_relative_remote(erepo_dir, tmp_dir):
assert os.path.isdir(repo.config["remote"]["upstream"]["url"])
with repo.open_by_relpath("file") as fd:
assert fd.read() == "contents"


def test_shallow_clone_branch(erepo_dir):
with erepo_dir.chdir():
with erepo_dir.branch("branch", new=True):
erepo_dir.dvc_gen("file", "branch", commit="create file on branch")
erepo_dir.dvc_gen("file", "master", commit="create file on master")

url = os.fspath(erepo_dir)

with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
with external_repo(url, rev="branch") as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "branch"

mock_clone.assert_called_with(url, ANY, shallow_branch="branch")
_, shallow = CLONES[url]
assert shallow

with external_repo(url) as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "master"

assert mock_clone.call_count == 1
_, shallow = CLONES[url]
assert not shallow


def test_shallow_clone_tag(erepo_dir):
with erepo_dir.chdir():
erepo_dir.dvc_gen("file", "foo", commit="init")
erepo_dir.scm.tag("v1")
erepo_dir.dvc_gen("file", "bar", commit="update file")

url = os.fspath(erepo_dir)

with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
with external_repo(url, rev="v1") as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "foo"

mock_clone.assert_called_with(url, ANY, shallow_branch="v1")
_, shallow = CLONES[url]
assert shallow

with external_repo(url, rev="master") as repo:
with repo.open_by_relpath("file") as fd:
assert fd.read() == "bar"

assert mock_clone.call_count == 1
_, shallow = CLONES[url]
assert not shallow