Skip to content

Brancher refactor #1709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dvc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def _remove_version_file():
if (
os.getenv("APPVEYOR_REPO_TAG", "").lower() != "true"
and os.getenv("TRAVIS_TAG", "") == ""
and os.getenv("DVC_TEST", "").lower() != "true"
):
__version__ = _update_version_file()
else: # pragma: no cover
Expand Down
27 changes: 16 additions & 11 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import unicode_literals

from dvc.utils.compat import str

import os
import dvc.prompt as prompt
import dvc.logger as logger
Expand Down Expand Up @@ -29,6 +27,7 @@ class Repo(object):
from dvc.repo.gc import gc
from dvc.repo.commit import commit
from dvc.repo.pkg import install_pkg
from dvc.repo.brancher import brancher

def __init__(self, root_dir=None):
from dvc.config import Config
Expand All @@ -39,6 +38,7 @@ def __init__(self, root_dir=None):
from dvc.data_cloud import DataCloud
from dvc.updater import Updater
from dvc.repo.metrics import Metrics
from dvc.repo.tree import WorkingTree

root_dir = self.find_root(root_dir)

Expand All @@ -47,6 +47,8 @@ def __init__(self, root_dir=None):

self.config = Config(self.dvc_dir)

self.tree = WorkingTree()

self.scm = SCM(self.root_dir, repo=self)
self.lock = Lock(self.dvc_dir)
# NOTE: storing state and link_state in the repository itself to avoid
Expand Down Expand Up @@ -133,7 +135,7 @@ def _check_cyclic_graph(graph):
raise CyclicGraphError(cycles[0])

def _get_pipeline(self, node):
pipelines = list(filter(lambda g: node in g.nodes(), self.pipelines()))
pipelines = [i for i in self.pipelines() if i.has_node(node)]
assert len(pipelines) == 1
return pipelines[0]

Expand All @@ -147,11 +149,10 @@ def collect(self, target, with_deps=False):

node = os.path.relpath(stage.path, self.root_dir)
G = self._get_pipeline(node)
stages = nx.get_node_attributes(G, "stage")

ret = [stage]
ret = []
for n in nx.dfs_postorder_nodes(G, node):
ret.append(stages[n])
ret.append(G.node[n]["stage"])

return ret

Expand Down Expand Up @@ -244,7 +245,7 @@ def used_cache(
cache["ssh"] = []
cache["azure"] = []

for branch in self.scm.brancher(
for branch in self.brancher(
all_branches=all_branches, all_tags=all_tags
):
if target:
Expand Down Expand Up @@ -361,10 +362,10 @@ def stages(self, from_directory=None, check_dag=True):

stages = []
outs = []
for root, dirs, files in os.walk(str(from_directory)):
for root, dirs, files in self.tree.walk(from_directory):
for fname in files:
path = os.path.join(root, fname)
if not Stage.is_stage_file(path):
if not Stage.is_valid_filename(path):
continue
stage = Stage.load(self, path)
for out in stage.outs:
Expand Down Expand Up @@ -397,10 +398,14 @@ def active_stages(self, from_directory=None):

def find_outs_by_path(self, path, outs=None, recursive=False):
if not outs:
outs = [out for stage in self.stages() for out in stage.outs]
# there is no `from_directory=path` argument because some data
# files might be generated to an upper level, and so it is
# needed to look at all the files (project root_dir)
stages = self.stages()
outs = [out for stage in stages for out in stage.outs]

abs_path = os.path.abspath(path)
is_dir = os.path.isdir(abs_path)
is_dir = self.tree.isdir(abs_path)

def func(out):
if out.path == abs_path:
Expand Down
56 changes: 56 additions & 0 deletions dvc/repo/brancher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
def brancher( # noqa: E302
self, branches=None, all_branches=False, tags=None, all_tags=False
):
"""Generator that iterates over specified revisions.

Args:
branches (list): a list of branches to iterate over.
all_branches (bool): iterate over all available branches.
tags (list): a list of tags to iterate over.
all_tags (bool): iterate over all available tags.

Yields:
str: the display name for the currently selected tree, it could be:
- a git revision identifier
- empty string it there is no branches to iterate over
- "Working Tree" if there are uncommited changes in the SCM repo
"""
if not any([branches, all_branches, tags, all_tags]):
yield ""
return

saved_tree = self.tree
revs = []

scm = self.scm

if self.scm.is_dirty():
from dvc.repo.tree import WorkingTree

self.tree = WorkingTree()
yield "Working Tree"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This breaks tests that expect previous behaviour of yielding "" if it is not from any branch. Could we continue doing that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops. Not this one actually, but the same thing in line 16. I did run the tests many times, but not after the line 16 change which I added at the last moment :-/.

I think it is better to clarify in the docstring that what the brancher yeilds is not a branch names (instead it is just the abstract name of the currently selected tree), and adjust the tests for this behavior. What do you think? @efiop @shcheklein

Copy link
Contributor Author

@ei-grad ei-grad Mar 21, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIR it is currently used only to display the name for metrics in different branches in dvc metrics show -a output.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh... and in dvc push output, where it is reduntant for sure...

Copy link
Contributor Author

@ei-grad ei-grad Mar 21, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think it is still ok to leave the "Working Tree" string for iteration with --all-branches, but yield an empty string when there is no things to iterate over.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ei-grad "index" is even more confusing IMHO :) Empty string feels nice, but maybe I'm just too used to it 🙂

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the docstring to about what the brancher could yeild.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so, this is about printing a tree name when we do some output, right? First, I would make a tree itself return the name, no need then to yield anything and pass it around, we will be able to access it from the repo, the same way we access the current object. Is it possible? Second, in the FsTree we can return "Working Tree" or "*" or "workspace", or whatever :) Btw, i'm more or less fine with the current solution as well, it's not ideal but it's not hard to fix anyway. As for the empty string. @efiop does it mean we will have to do something like if not branch: name "Working Tree" further along, in multiple place? I would try to avoid that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shcheklein No, I don't see why we even need "Working Tree" anywhere.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In dvc metrics show -a when we print output we need a way to distinguish "workspace" from an actual branch. May be some other places, @ei-grad should know better than me by now :)


if all_branches:
branches = scm.list_branches()

if all_tags:
tags = scm.list_tags()

if branches is None:
revs.extend([scm.active_branch()])
else:
revs.extend(branches)

if tags is not None:
revs.extend(tags)

# NOTE: it might be a good idea to wrap this loop in try/finally block
# to don't leave the tree on some unexpected branch after the
# `brancher()`, but this could cause problems on exception handling
# code which might expect the tree on which exception was raised to
# stay in place. This behavior is a subject to change.
for rev in revs:
self.tree = scm.get_tree(rev)
yield rev

self.tree = saved_tree
4 changes: 2 additions & 2 deletions dvc/repo/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import dvc.logger as logger
from dvc.repo import Repo
from dvc.scm import SCM, Base
from dvc.scm import SCM, NoSCM
from dvc.config import Config
from dvc.exceptions import InitError

Expand Down Expand Up @@ -56,7 +56,7 @@ def init(root_dir=os.curdir, no_scm=False, force=False):
root_dir = os.path.abspath(root_dir)
dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)
scm = SCM(root_dir)
if type(scm) == Base and not no_scm:
if isinstance(scm, NoSCM) and not no_scm:
raise InitError(
"{repo} is not tracked by any supported scm tool (e.g. git). "
"Use '--no-scm' if you don't want to use any scm.".format(
Expand Down
49 changes: 29 additions & 20 deletions dvc/repo/metrics/show.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,30 +66,23 @@ def _read_typed_metric(typ, xpath, fd):
return ret


def _read_metric(path, typ=None, xpath=None, branch=None):
ret = None
if not os.path.exists(path):
return ret

def _read_metric(fd, typ=None, xpath=None, rel_path=None, branch=None):
typ = typ.lower().strip() if typ else typ
xpath = xpath.strip() if xpath else xpath
try:
with open(path, "r") as fd:
if not xpath:
ret = fd.read().strip()
else:
ret = _read_typed_metric(typ, xpath, fd)
if xpath:
return _read_typed_metric(typ, xpath.strip(), fd)
else:
return fd.read().strip()
# Json path library has to be replaced or wrapped in
# order to fix this too broad except clause.
except Exception:
logger.warning(
"unable to read metric in '{}' in branch '{}'".format(
path, branch
rel_path, branch
),
parse_exception=True,
)

return ret
return None


def _collect_metrics(self, path, recursive, typ, xpath, branch):
Expand Down Expand Up @@ -123,16 +116,33 @@ def _collect_metrics(self, path, recursive, typ, xpath, branch):
return res


def _read_metrics_filesystem(path, typ, xpath, rel_path, branch):
if not os.path.exists(path):
return None
with open(path, "r") as fd:
return _read_metric(
fd, typ=typ, xpath=xpath, rel_path=rel_path, branch=branch
)


def _read_metrics(self, metrics, branch):
res = {}
for out, typ, xpath in metrics:
assert out.scheme == "local"
if out.use_cache:
path = self.cache.local.get(out.checksum)
metric = _read_metrics_filesystem(
self.cache.local.get(out.checksum),
typ=typ,
xpath=xpath,
rel_path=out.rel_path,
branch=branch,
)
else:
path = out.path
fd = self.tree.open(out.path)
metric = _read_metric(
fd, typ=typ, xpath=xpath, rel_path=out.rel_path, branch=branch
)

metric = _read_metric(path, typ=typ, xpath=xpath, branch=branch)
if not metric:
continue

Expand All @@ -151,9 +161,8 @@ def show(
recursive=False,
):
res = {}
for branch in self.scm.brancher(
all_branches=all_branches, all_tags=all_tags
):

for branch in self.brancher(all_branches=all_branches, all_tags=all_tags):
entries = _collect_metrics(self, path, recursive, typ, xpath, branch)
metrics = _read_metrics(self, entries, branch)
if metrics:
Expand Down
65 changes: 65 additions & 0 deletions dvc/repo/tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os

from dvc.utils.compat import open


class BaseTree(object):
"""Abstract class to represent access to files"""

def open(self, path):
"""Open file and return a stream."""

def exists(self, path):
"""Test whether a path exists."""

def isdir(self, path):
"""Return true if the pathname refers to an existing directory."""

def isfile(self, path):
"""Test whether a path is a regular file"""

def walk(self, top, topdown=True):
"""Directory tree generator.

See `os.walk` for the docs. Differences:
- no support for symlinks
- it could raise exceptions, there is no onerror argument
"""


class WorkingTree(BaseTree):
"""Proxies the repo file access methods to working tree files"""

def open(self, path):
"""Open file and return a stream."""
return open(path, encoding="utf-8")

def exists(self, path):
"""Test whether a path exists."""
return os.path.exists(path)

def isdir(self, path):
"""Return true if the pathname refers to an existing directory."""
return os.path.isdir(path)

def isfile(self, path):
"""Test whether a path is a regular file"""
return os.path.isfile(path)

def walk(self, top, topdown=True):
"""Directory tree generator.

See `os.walk` for the docs. Differences:
- no support for symlinks
- it could raise exceptions, there is no onerror argument
"""

def onerror(e):
raise e

for root, dirs, files in os.walk(
top, topdown=topdown, onerror=onerror
):
if topdown:
dirs[:] = [i for i in dirs if i not in (".git", ".hg", ".dvc")]
yield os.path.normpath(root), dirs, files
8 changes: 7 additions & 1 deletion dvc/scm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
import dvc


# just a sugar to point that this is an actual implementation for a dvc
# project under no SCM control
class NoSCM(Base):
pass


def SCM(root_dir, repo=None): # pylint: disable=invalid-name
"""Returns SCM instance that corresponds to a repo at the specified
path.
Expand All @@ -22,7 +28,7 @@ def SCM(root_dir, repo=None): # pylint: disable=invalid-name
if Git.is_repo(root_dir) or Git.is_submodule(root_dir):
return Git(root_dir, repo=repo)

return Base(root_dir, repo=repo)
return NoSCM(root_dir, repo=repo)


def scm_context(method):
Expand Down
Loading