Skip to content

checkout: show summary by default and introduce --show-changes flag #3401

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 81 additions & 7 deletions dvc/command/checkout.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,86 @@
import argparse
import logging

import colorama

from dvc.command.base import append_doc_link
from dvc.command.base import CmdBase
from dvc.exceptions import CheckoutError

logger = logging.getLogger(__name__)


def _human_join(words):
words = list(words)
if not words:
return ""

return (
"{before} and {after}".format(
before=", ".join(words[:-1]), after=words[-1],
)
if len(words) > 1
else words[0]
)


def log_summary(stats):
states = ("added", "deleted", "modified")
summary = (
"{} {}".format(len(stats[state]), state)
for state in states
if stats.get(state)
)
logger.info(_human_join(summary))


def log_changes(stats):
colors = [
("modified", colorama.Fore.YELLOW,),
("added", colorama.Fore.GREEN),
("deleted", colorama.Fore.RED,),
]

for state, color in colors:
entries = stats.get(state)

if not entries:
continue

for entry in entries:
logger.info(
"{color}{state}{nc}{spacing}{entry}".format(
color=color,
state=state[0].upper(),
nc=colorama.Fore.RESET,
spacing="\t",
entry=entry,
)
)


class CmdCheckout(CmdBase):
def run(self):
self.repo.checkout(
targets=self.args.targets,
with_deps=self.args.with_deps,
force=self.args.force,
relink=self.args.relink,
recursive=self.args.recursive,
)
stats, exc = None, None
try:
stats = self.repo.checkout(
targets=self.args.targets,
with_deps=self.args.with_deps,
force=self.args.force,
relink=self.args.relink,
recursive=self.args.recursive,
)
except CheckoutError as _exc:
exc = _exc
stats = exc.stats

if self.args.summary:
log_summary(stats)
else:
log_changes(stats)

if exc:
raise exc
return 0


Expand All @@ -26,6 +94,12 @@ def add_parser(subparsers, parent_parser):
help=CHECKOUT_HELP,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
checkout_parser.add_argument(
"--summary",
action="store_true",
default=False,
help="Show summary of the changes.",
)
checkout_parser.add_argument(
"-d",
"--with-deps",
Expand Down
12 changes: 8 additions & 4 deletions dvc/command/data_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

from dvc.command.base import append_doc_link
from dvc.command.base import CmdBase
from dvc.exceptions import DvcException
from dvc.command.checkout import log_summary
from dvc.exceptions import DvcException, CheckoutError


logger = logging.getLogger(__name__)
Expand All @@ -19,7 +20,7 @@ def check_up_to_date(cls, processed_files_count):
class CmdDataPull(CmdDataBase):
def run(self):
try:
processed_files_count = self.repo.pull(
stats = self.repo.pull(
targets=self.args.targets,
jobs=self.args.jobs,
remote=self.args.remote,
Expand All @@ -29,10 +30,13 @@ def run(self):
force=self.args.force,
recursive=self.args.recursive,
)
except DvcException:
log_summary(stats)
except (CheckoutError, DvcException) as exc:
log_summary(getattr(exc, "stats", {}))
logger.exception("failed to pull data from the cloud")
return 1
self.check_up_to_date(processed_files_count)

self.check_up_to_date(stats["downloaded"])
return 0


Expand Down
6 changes: 4 additions & 2 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,12 @@ def __init__(self, amount):


class CheckoutError(DvcException):
def __init__(self, target_infos):
def __init__(self, target_infos, stats=None):
self.target_infos = target_infos
self.stats = stats
targets = [str(t) for t in target_infos]
m = (
"Checkout failed for following targets:\n {}\nDid you "
"Checkout failed for following targets:\n{}\nDid you "
"forget to fetch?".format("\n".join(targets))
)
super().__init__(m)
Expand Down
10 changes: 5 additions & 5 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from dvc.compat import fspath
from dvc.repo import Repo
from dvc.config import NoRemoteError, NotDvcRepoError
from dvc.exceptions import NoRemoteInExternalRepoError
from dvc.exceptions import NoRemoteInExternalRepoError, CheckoutError
from dvc.exceptions import OutputNotFoundError, NoOutputInExternalRepoError
from dvc.exceptions import FileMissingError, PathMissingError
from dvc.utils.fs import remove, fs_copy, move
Expand Down Expand Up @@ -106,14 +106,14 @@ def _pull_cached(self, out, path_info, dest):
if out.changed_cache(filter_info=src):
self.cloud.pull(out.get_used_cache(filter_info=src))

failed = out.checkout(filter_info=src)
try:
out.checkout(filter_info=src)
except CheckoutError:
raise FileNotFoundError

move(src, dest)
remove(tmp)

if failed:
raise FileNotFoundError

@wrap_with(threading.Lock())
def _set_cache_dir(self):
try:
Expand Down
35 changes: 27 additions & 8 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import dvc.prompt as prompt
from dvc.exceptions import (
CheckoutError,
DvcException,
ConfirmRemoveError,
DvcIgnoreInCollectedDirError,
Expand Down Expand Up @@ -851,20 +852,24 @@ def safe_remove(self, path_info, force=False):
self.remove(path_info)

def _checkout_file(
self, path_info, checksum, force, progress_callback=None
self, path_info, checksum, force, progress_callback=None, relink=False
):
"""The file is changed we need to checkout a new copy"""
added, modified = True, False
cache_info = self.checksum_to_path_info(checksum)
if self.exists(path_info):
logger.debug("data '{}' will be replaced.", path_info)
self.safe_remove(path_info, force=force)
added, modified = False, True

self.link(cache_info, path_info)
self.state.save_link(path_info)
self.state.save(path_info, checksum)
if progress_callback:
progress_callback(str(path_info))

return added, modified and not relink

def makedirs(self, path_info):
"""Optional: Implement only if the remote needs to create
directories before copying/linking/moving data
Expand All @@ -879,9 +884,11 @@ def _checkout_dir(
relink=False,
filter_info=None,
):
added, modified = False, False
# Create dir separately so that dir is created
# even if there are no files in it
if not self.exists(path_info):
added = True
self.makedirs(path_info)

dir_info = self.get_dir_cache(checksum)
Expand All @@ -899,27 +906,36 @@ def _checkout_dir(

entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum}
if relink or self.changed(entry_info, entry_checksum_info):
modified = True
self.safe_remove(entry_info, force=force)
self.link(entry_cache_info, entry_info)
self.state.save(entry_info, entry_checksum)
if progress_callback:
progress_callback(str(entry_info))

self._remove_redundant_files(path_info, dir_info, force)
modified = (
self._remove_redundant_files(path_info, dir_info, force)
or modified
)

self.state.save_link(path_info)
self.state.save(path_info, checksum)

# relink is not modified, assume it as nochange
return added, not added and modified and not relink

def _remove_redundant_files(self, path_info, dir_info, force):
existing_files = set(self.walk_files(path_info))

needed_files = {
path_info / entry[self.PARAM_RELPATH] for entry in dir_info
}

for path in existing_files - needed_files:
redundant_files = existing_files - needed_files
for path in redundant_files:
self.safe_remove(path, force)

return bool(redundant_files)

def checkout(
self,
path_info,
Expand Down Expand Up @@ -966,12 +982,14 @@ def checkout(
self.path_info, checksum, filter_info
),
)
return failed
if failed:
raise CheckoutError([failed])
return

logger.debug("Checking out '{}' with cache '{}'.", path_info, checksum)

self._checkout(
path_info, checksum, force, progress_callback, relink, filter_info
return self._checkout(
path_info, checksum, force, progress_callback, relink, filter_info,
)

def _checkout(
Expand All @@ -985,8 +1003,9 @@ def _checkout(
):
if not self.is_dir_checksum(checksum):
return self._checkout_file(
path_info, checksum, force, progress_callback=progress_callback
path_info, checksum, force, progress_callback, relink
)

return self._checkout_dir(
path_info, checksum, force, progress_callback, relink, filter_info
)
Expand Down
55 changes: 39 additions & 16 deletions dvc/repo/checkout.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,31 @@
import logging
import os

from dvc.compat import fspath
from dvc.exceptions import CheckoutError
from dvc.exceptions import CheckoutErrorSuggestGit
from dvc.progress import Tqdm

from dvc.utils import relpath

logger = logging.getLogger(__name__)


def _cleanup_unused_links(repo):
def _get_unused_links(repo):
used = [
out.fspath
for stage in repo.stages
for out in stage.outs
if out.scheme == "local"
]
repo.state.remove_unused_links(used)
return repo.state.get_unused_links(used)


def _fspath_dir(path, root):
if not os.path.exists(str(path)):
return str(path)

path = relpath(fspath(path), root)
return os.path.join(path, "") if os.path.isdir(path) else path


def get_all_files_numbers(pairs):
Expand All @@ -34,9 +44,19 @@ def _checkout(
):
from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

unused = []
stats = {
"added": [],
"deleted": [],
"modified": [],
"failed": [],
}
if not targets:
targets = [None]
_cleanup_unused_links(self)
unused = _get_unused_links(self)

stats["deleted"] = [_fspath_dir(u, self.root_dir) for u in unused]
self.state.remove_links(unused)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this not under if? Looks fragile this way, we shouldn't call remove_links() at all if a particular target is specified.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's empty if there's no argument.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I need to check if something is a dir before deleting.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checkout in dvc:

  • checks out particular thing(s) if that things (target) is specified
  • if no target is specified it checks out everything and removes anything previously checked out and having no references now.

I.e. if there is target we don't try to change the rest of the workspace, so .remove_links() should not be called. The logic of your code is different.

Copy link
Collaborator Author

@skshetry skshetry Mar 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If targets are specified, unused will be empty and hence, nothing will get removed. If your concern is regarding calling .remove_links() at all, I don't think there's any reason to be defensive.


pairs = set()
for target in targets:
Expand All @@ -52,20 +72,23 @@ def _checkout(
raise CheckoutErrorSuggestGit(target) from exc

total = get_all_files_numbers(pairs)
if total == 0:
logger.info("Nothing to do")
failed = []
with Tqdm(
total=total, unit="file", desc="Checkout", disable=total == 0
) as pbar:
for stage, filter_info in pairs:
failed.extend(
stage.checkout(
force=force,
progress_callback=pbar.update_desc,
relink=relink,
filter_info=filter_info,
)
result = stage.checkout(
force=force,
progress_callback=pbar.update_desc,
relink=relink,
filter_info=filter_info,
)
if failed:
raise CheckoutError(failed)
for data in ["failed", "added", "modified"]:
stats[data].extend(
_fspath_dir(path, self.root_dir) for path in result[data]
)

if stats.get("failed"):
raise CheckoutError(stats["failed"], stats)

del stats["failed"]
return stats
Loading