-
Notifications
You must be signed in to change notification settings - Fork 1.2k
repro: support glob/foreach-group to run at once through CLI #4976
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
skshetry
merged 7 commits into
iterative:master
from
skshetry:foreach-group-repro-and-regex-filter
Nov 27, 2020
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
f9bc69f
repro: support regex/foreach-group to run at once
skshetry 4ec61d1
Use `tree.isdir` rather than `os.path.isdir`
skshetry c0fc735
Use glob rather than regex
skshetry 734f21f
Update dvc/command/repro.py
skshetry b3b05c4
s/regex/glob
skshetry cfe33c8
disable glob on `collect_granular`
skshetry ff15c11
add tests for `collect` and `collect_granular`
skshetry File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,14 @@ | ||
import logging | ||
import os | ||
from contextlib import contextmanager | ||
from contextlib import contextmanager, suppress | ||
from functools import wraps | ||
from typing import TYPE_CHECKING | ||
from typing import TYPE_CHECKING, List | ||
|
||
from funcy import cached_property, cat | ||
from git import InvalidGitRepositoryError | ||
|
||
from dvc.config import Config | ||
from dvc.dvcfile import PIPELINE_FILE, Dvcfile, is_valid_filename | ||
from dvc.dvcfile import PIPELINE_FILE, is_valid_filename | ||
from dvc.exceptions import FileMissingError | ||
from dvc.exceptions import IsADirectoryError as DvcIsADirectoryError | ||
from dvc.exceptions import ( | ||
|
@@ -17,6 +17,7 @@ | |
OutputNotFoundError, | ||
) | ||
from dvc.path_info import PathInfo | ||
from dvc.repo.stage import StageLoad | ||
from dvc.scm import Base | ||
from dvc.scm.base import SCMError | ||
from dvc.tree.repo import RepoTree | ||
|
@@ -28,6 +29,9 @@ | |
from .trie import build_outs_trie | ||
|
||
if TYPE_CHECKING: | ||
from networkx import DiGraph | ||
|
||
from dvc.stage import Stage | ||
from dvc.tree.base import BaseTree | ||
|
||
|
||
|
@@ -165,6 +169,7 @@ def __init__( | |
|
||
self.cache = Cache(self) | ||
self.cloud = DataCloud(self) | ||
self.stage = StageLoad(self) | ||
|
||
if scm or not self.dvc_dir: | ||
self.lock = LockNoop() | ||
|
@@ -270,25 +275,6 @@ def _ignore(self): | |
|
||
self.scm.ignore_list(flist) | ||
|
||
def get_stage(self, path=None, name=None): | ||
if not path: | ||
path = PIPELINE_FILE | ||
logger.debug("Assuming '%s' to be a stage inside '%s'", name, path) | ||
|
||
dvcfile = Dvcfile(self, path) | ||
return dvcfile.stages[name] | ||
|
||
def get_stages(self, path=None, name=None): | ||
if not path: | ||
path = PIPELINE_FILE | ||
logger.debug("Assuming '%s' to be a stage inside '%s'", name, path) | ||
|
||
if name: | ||
return [self.get_stage(path, name)] | ||
|
||
dvcfile = Dvcfile(self, path) | ||
return list(dvcfile.stages.values()) | ||
|
||
def check_modified_graph(self, new_stages): | ||
"""Generate graph including the new stage to check for errors""" | ||
# Building graph might be costly for the ones with many DVC-files, | ||
|
@@ -306,79 +292,105 @@ def check_modified_graph(self, new_stages): | |
if not getattr(self, "_skip_graph_checks", False): | ||
build_graph(self.stages + new_stages) | ||
|
||
def _collect_inside(self, path, graph): | ||
@staticmethod | ||
def _collect_inside(path: str, graph: "DiGraph"): | ||
import networkx as nx | ||
|
||
stages = nx.dfs_postorder_nodes(graph) | ||
return [stage for stage in stages if path_isin(stage.path, path)] | ||
|
||
def collect( | ||
self, target=None, with_deps=False, recursive=False, graph=None | ||
self, | ||
target: str = None, | ||
with_deps: bool = False, | ||
recursive: bool = False, | ||
graph: "DiGraph" = None, | ||
accept_group: bool = False, | ||
glob: bool = False, | ||
): | ||
if not target: | ||
return list(graph) if graph else self.stages | ||
|
||
if recursive and os.path.isdir(target): | ||
if recursive and self.tree.isdir(target): | ||
return self._collect_inside( | ||
os.path.abspath(target), graph or self.graph | ||
) | ||
|
||
path, name = parse_target(target) | ||
stages = self.get_stages(path, name) | ||
stages = self.stage.from_target( | ||
target, accept_group=accept_group, glob=glob | ||
) | ||
if not with_deps: | ||
return stages | ||
|
||
return self._collect_stages_with_deps(stages, graph=graph) | ||
|
||
def _collect_stages_with_deps( | ||
self, stages: List["Stage"], graph: "DiGraph" = None | ||
): | ||
res = set() | ||
for stage in stages: | ||
res.update(self._collect_pipeline(stage, graph=graph)) | ||
return res | ||
|
||
def _collect_pipeline(self, stage, graph=None): | ||
def _collect_pipeline(self, stage: "Stage", graph: "DiGraph" = None): | ||
import networkx as nx | ||
|
||
pipeline = get_pipeline(get_pipelines(graph or self.graph), stage) | ||
return nx.dfs_postorder_nodes(pipeline, stage) | ||
|
||
def _collect_from_default_dvcfile(self, target): | ||
dvcfile = Dvcfile(self, PIPELINE_FILE) | ||
if dvcfile.exists(): | ||
return dvcfile.stages.get(target) | ||
|
||
def collect_granular( | ||
self, target=None, with_deps=False, recursive=False, graph=None | ||
def _collect_specific_target( | ||
self, target: str, with_deps: bool, recursive: bool, accept_group: bool | ||
): | ||
""" | ||
Priority is in the order of following in case of ambiguity: | ||
- .dvc file or .yaml file | ||
- dir if recursive and directory exists | ||
- stage_name | ||
- output file | ||
""" | ||
if not target: | ||
return [(stage, None) for stage in self.stages] | ||
|
||
# Optimization: do not collect the graph for a specific target | ||
file, name = parse_target(target) | ||
stages = [] | ||
|
||
# Optimization: do not collect the graph for a specific target | ||
if not file: | ||
# parsing is ambiguous when it does not have a colon | ||
# or if it's not a dvcfile, as it can be a stage name | ||
# in `dvc.yaml` or, an output in a stage. | ||
logger.debug( | ||
"Checking if stage '%s' is in '%s'", target, PIPELINE_FILE | ||
) | ||
if not (recursive and os.path.isdir(target)): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
stage = self._collect_from_default_dvcfile(target) | ||
if stage: | ||
stages = ( | ||
self._collect_pipeline(stage) if with_deps else [stage] | ||
if not ( | ||
recursive and self.tree.isdir(target) | ||
) and self.tree.exists(PIPELINE_FILE): | ||
with suppress(StageNotFound): | ||
stages = self.stage.load_all( | ||
PIPELINE_FILE, target, accept_group=accept_group | ||
) | ||
if with_deps: | ||
stages = self._collect_stages_with_deps(stages) | ||
|
||
elif not with_deps and is_valid_filename(file): | ||
stages = self.get_stages(file, name) | ||
stages = self.stage.load_all( | ||
file, name, accept_group=accept_group, | ||
) | ||
|
||
return stages, file, name | ||
|
||
def collect_granular( | ||
self, | ||
target: str = None, | ||
with_deps: bool = False, | ||
recursive: bool = False, | ||
graph: "DiGraph" = None, | ||
accept_group: bool = False, | ||
): | ||
""" | ||
Priority is in the order of following in case of ambiguity: | ||
- .dvc file or .yaml file | ||
- dir if recursive and directory exists | ||
- stage_name | ||
- output file | ||
""" | ||
if not target: | ||
return [(stage, None) for stage in self.stages] | ||
|
||
stages, file, _ = self._collect_specific_target( | ||
target, with_deps, recursive, accept_group | ||
) | ||
if not stages: | ||
if not (recursive and os.path.isdir(target)): | ||
if not (recursive and self.tree.isdir(target)): | ||
try: | ||
(out,) = self.find_outs_by_path(target, strict=False) | ||
filter_info = PathInfo(os.path.abspath(target)) | ||
|
@@ -387,7 +399,13 @@ def collect_granular( | |
pass | ||
|
||
try: | ||
stages = self.collect(target, with_deps, recursive, graph) | ||
stages = self.collect( | ||
target, | ||
with_deps, | ||
recursive, | ||
graph, | ||
accept_group=accept_group, | ||
) | ||
except StageFileDoesNotExistError as exc: | ||
# collect() might try to use `target` as a stage name | ||
# and throw error that dvc.yaml does not exist, whereas it | ||
|
@@ -498,7 +516,9 @@ def _collect_stages(self): | |
|
||
for root, dirs, files in self.tree.walk(self.root_dir): | ||
for file_name in filter(is_valid_filename, files): | ||
new_stages = self.get_stages(os.path.join(root, file_name)) | ||
new_stages = self.stage.load_file( | ||
os.path.join(root, file_name) | ||
) | ||
stages.extend(new_stages) | ||
outs.update( | ||
out.fspath | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggestion for better naming would be appreciated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
StageResolver
? Also, I think that naming the objectstage
while we have a class of this name will get confusing and some point. Maybestage_load
orstage_resolver
will be ok, depending on what name we choose in the end.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@pared, will make it
StageLoader
, and change the other one toMultiStageLoader
.stage_load.load_one()
looks a bit odd to me. :) But, I understand your concern.