Skip to content

add: support simple wildcards #4864

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions dvc/command/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def run(self):
no_commit=self.args.no_commit,
fname=self.args.file,
external=self.args.external,
glob=self.args.glob,
)

except DvcException:
Expand Down Expand Up @@ -57,6 +58,12 @@ def add_parser(subparsers, parent_parser):
default=False,
help="Allow targets that are outside of the DVC repository.",
)
parser.add_argument(
"--glob",
action="store_true",
default=False,
help="Allows targets containing shell-style wildcards.",
)
Comment on lines +61 to +66
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi! A few questions (related to documenting this per iterative/dvc.org/issues/1928):

  • What does "glob" abbreviate or stand for?
  • Why is a flag needed to allow/enable wildcards?
  • Is there a short description of what "shell-style" means exactly? Or a link to some Linux doc, Python package, or something specifying what can be done in terms of wildcards?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's more natural to use wildcards in the default mode without any flag, just like what we do in Unix. And it's better to extract the glob to some common function that could be used in all commands.

Copy link
Contributor

@efiop efiop Nov 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@karajan1001 I personally wasn't ready to enable this by default right away as I'm not sure it won't affect negatively some existing use cases 🙁 Once we're ready for it, it will be as simple as turning that flag on by default. For PS users --glob will be a temporary inconvenience, but at least the functionality is being slowly filled up.

@jorgeorpinel Maybe just forget about the doc ticket for it for now, we'll be adding more stuff for it in the future.

What does "glob" abbreviate or stand for?

https://docs.python.org/3/library/glob.html

Why is a flag needed to allow/enable wildcards?

Usually, your shell handles the expansion and it will still do that. Our --glob is useful for programmatic use or in shells that don't support expansion natively (e.g. powershell). I'm personally not ready to enable it by default right now.

Is there a short description of what "shell-style" means exactly? Or a link to some Linux doc, Python package, or something specifying what can be done in terms of wildcards?

https://docs.python.org/3/library/glob.html

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, just wanted to understand this. The initial docs PR is ready for merge.

parser.add_argument(
"--file",
help="Specify name of the DVC-file this command will generate.",
Expand Down
35 changes: 29 additions & 6 deletions dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@
@locked
@scm_context
def add(
repo, targets, recursive=False, no_commit=False, fname=None, external=False
repo,
targets,
recursive=False,
no_commit=False,
fname=None,
external=False,
glob=False,
):
if recursive and fname:
raise RecursiveAddingWhileUsingFilename()
Expand Down Expand Up @@ -57,7 +63,12 @@ def add(
)

stages = _create_stages(
repo, sub_targets, fname, pbar=pbar, external=external
repo,
sub_targets,
fname,
pbar=pbar,
external=external,
glob=glob,
)

try:
Expand Down Expand Up @@ -149,15 +160,27 @@ def _find_all_targets(repo, target, recursive):
return [target]


def _create_stages(repo, targets, fname, pbar=None, external=False):
def _create_stages(
repo, targets, fname, pbar=None, external=False, glob=False
):
from glob import iglob

from dvc.stage import Stage, create_stage

stages = []
if glob:
expanded_targets = [
exp_target
for target in targets
for exp_target in iglob(target, recursive=True)
]
else:
expanded_targets = targets

stages = []
for out in Tqdm(
targets,
expanded_targets,
desc="Creating DVC-files",
disable=len(targets) < LARGE_DIR_SIZE,
disable=len(expanded_targets) < LARGE_DIR_SIZE,
unit="file",
):
path, wdir, out = resolve_paths(repo, out)
Expand Down
71 changes: 71 additions & 0 deletions tests/func/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,77 @@ def test_add_file_in_dir(tmp_dir, dvc):
assert stage.outs[0].def_path == "subdata"


@pytest.mark.parametrize(
    "target, expected_def_paths, expected_rel_paths",
    [
        (
            os.path.join("dir", "subdir", "subdata*"),
            ["subdata", "subdata123"],
            [
                os.path.join("dir", "subdir", "subdata") + ".dvc",
                os.path.join("dir", "subdir", "subdata123") + ".dvc",
            ],
        ),
        (
            os.path.join("dir", "subdir", "?subdata"),
            ["esubdata", "isubdata"],
            [
                os.path.join("dir", "subdir", "esubdata") + ".dvc",
                os.path.join("dir", "subdir", "isubdata") + ".dvc",
            ],
        ),
        (
            os.path.join("dir", "subdir", "[aiou]subdata"),
            ["isubdata"],
            [os.path.join("dir", "subdir", "isubdata") + ".dvc"],
        ),
        (
            os.path.join("dir", "**", "subdata*"),
            ["subdata", "subdata123", "subdata4", "subdata5"],
            [
                os.path.join("dir", "subdir", "subdata") + ".dvc",
                os.path.join("dir", "subdir", "subdata123") + ".dvc",
                os.path.join("dir", "anotherdir", "subdata4") + ".dvc",
                os.path.join("dir", "subdata5") + ".dvc",
            ],
        ),
    ],
)
def test_add_filtered_files_in_dir(
    tmp_dir, dvc, target, expected_def_paths, expected_rel_paths
):
    """Check that ``dvc.add(..., glob=True)`` expands wildcard targets.

    Each parametrized case supplies a wildcard ``target`` together with the
    output def paths and stage file relpaths that the expansion should
    produce. One stage per matched file is expected, each with no deps and
    exactly one output.
    """
    tmp_dir.gen(
        {
            "dir": {
                "subdir": {
                    "subdata": "subdata content",
                    "esubdata": "extra subdata content",
                    "isubdata": "i subdata content",
                    "subdata123": "subdata content 123",
                },
                "anotherdir": {
                    "subdata4": "subdata 4 content",
                    "esubdata": "extra 2 subdata content",
                },
                "subdata5": "subdata 5 content",
            }
        }
    )

    stages = dvc.add(target, glob=True)

    assert len(stages) == len(expected_def_paths)
    for stage in stages:
        assert stage is not None
        assert len(stage.deps) == 0
        assert len(stage.outs) == 1

        # Current dir should not be taken into account
        assert stage.wdir == os.path.dirname(stage.path)

    # Compare the complete collections rather than per-item membership, so
    # that a duplicated match cannot mask a missing one. Sorting makes the
    # comparison independent of the order in which matches were expanded.
    assert sorted(stage.relpath for stage in stages) == sorted(
        expected_rel_paths
    )
    assert sorted(stage.outs[0].def_path for stage in stages) == sorted(
        expected_def_paths
    )


@pytest.mark.parametrize(
"workspace, hash_name, hash_value",
[
Expand Down