Skip to content

dvc: drop cached external outputs #9570

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 0 additions & 11 deletions dvc/cachemgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ def _get_odb(

class CacheManager:
CACHE_DIR = "cache"
CLOUD_SCHEMES = [
Schemes.S3,
Schemes.GS,
Schemes.SSH,
Schemes.HDFS,
Schemes.WEBHDFS,
]
FILES_DIR = "files"

def __init__(self, repo):
Expand Down Expand Up @@ -91,16 +84,12 @@ def _init_odb(self, schemes):
)

def __getattr__(self, name):
if name not in self._odb and name in self.CLOUD_SCHEMES:
self._init_odb([name])

try:
return self._odb[name]
except KeyError as exc:
raise AttributeError from exc

def by_scheme(self):
self._init_odb(self.CLOUD_SCHEMES)
yield from self._odb.items()

@property
Expand Down
9 changes: 0 additions & 9 deletions dvc/commands/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ def validate_args(self) -> None:
invalid_opt = "--glob option"
elif args.no_commit:
invalid_opt = "--no-commit option"
elif args.external:
invalid_opt = "--external option"
else:
message = "{option} can't be used without --to-remote"
if args.remote:
Expand All @@ -49,7 +47,6 @@ def run(self):
self.repo.add(
self.args.targets,
no_commit=self.args.no_commit,
external=self.args.external,
glob=self.args.glob,
out=self.args.out,
remote=self.args.remote,
Expand Down Expand Up @@ -82,12 +79,6 @@ def add_parser(subparsers, parent_parser):
default=False,
help="Don't put files/directories into cache.",
)
parser.add_argument(
"--external",
action="store_true",
default=False,
help="Allow targets that are outside of the DVC repository.",
)
parser.add_argument(
"--glob",
action="store_true",
Expand Down
14 changes: 7 additions & 7 deletions dvc/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,13 @@ def __call__(self, data):
"machine": Lower,
},
"cache": {
"local": str,
"s3": str,
"gs": str,
"hdfs": str,
"webhdfs": str,
"ssh": str,
"azure": str,
"local": str, # obsoleted
"s3": str, # obsoleted
"gs": str, # obsoleted
"hdfs": str, # obsoleted
"webhdfs": str, # obsoleted
"ssh": str, # obsoleted
"azure": str, # obsoleted
# This is for default local cache
"dir": str,
**LOCAL_COMMON,
Expand Down
15 changes: 0 additions & 15 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,21 +299,6 @@ def __init__(self, url):
super().__init__(f"The path '{url}' does not exist")


class RemoteCacheRequiredError(DvcException):
def __init__(self, scheme, fs_path):
super().__init__(
(
"Current operation was unsuccessful because '{}' requires "
"existing cache on '{}' remote. See {} for information on how "
"to set up remote cache."
).format(
fs_path,
scheme,
format_link("https://man.dvc.org/config#cache"),
)
)


class IsADirectoryError(DvcException): # noqa,pylint:disable=redefined-builtin
"""Raised when a file operation is requested on a directory."""

Expand Down
12 changes: 3 additions & 9 deletions dvc/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
ConfirmRemoveError,
DvcException,
MergeError,
RemoteCacheRequiredError,
)
from dvc.utils.objects import cached_property
from dvc_data.hashfile import check as ocheck
Expand Down Expand Up @@ -526,14 +525,9 @@ def use_scm_ignore(self):
def cache(self):
from dvc.cachemgr import LEGACY_HASH_NAMES

if self.is_in_repo:
odb_name = "legacy" if self.hash_name in LEGACY_HASH_NAMES else "repo"
else:
odb_name = self.protocol
odb = getattr(self.repo.cache, odb_name)
if self.use_cache and odb is None:
raise RemoteCacheRequiredError(self.fs.protocol, self.fs_path)
return odb
assert self.is_in_repo
odb_name = "legacy" if self.hash_name in LEGACY_HASH_NAMES else "repo"
return getattr(self.repo.cache, odb_name)

@property
def local_cache(self):
Expand Down
4 changes: 0 additions & 4 deletions dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def find_targets(
def get_or_create_stage(
repo: "Repo",
target: str,
external: bool = False,
out: Optional[str] = None,
to_remote: bool = False,
force: bool = False,
Expand All @@ -79,7 +78,6 @@ def get_or_create_stage(
fname=path,
wdir=wdir,
outs=[out],
external=external,
force=force,
)
return StageInfo(stage, output_exists=False)
Expand Down Expand Up @@ -199,7 +197,6 @@ def add(
repo: "Repo",
targets: Union["StrOrBytesPath", Iterator["StrOrBytesPath"]],
no_commit: bool = False,
external: bool = False,
glob: bool = False,
out: Optional[str] = None,
remote: Optional[str] = None,
Expand All @@ -215,7 +212,6 @@ def add(
target: get_or_create_stage(
repo,
target,
external=external,
out=out,
to_remote=to_remote,
force=force,
Expand Down
5 changes: 2 additions & 3 deletions dvc/stage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class RawData:
generated_from: Optional[str] = None


def create_stage(cls: Type[_T], repo, path, external=False, **kwargs) -> _T:
def create_stage(cls: Type[_T], repo, path, **kwargs) -> _T:
from dvc.dvcfile import check_dvcfile_path

wdir = os.path.abspath(kwargs.get("wdir", None) or os.curdir)
Expand All @@ -101,8 +101,7 @@ def create_stage(cls: Type[_T], repo, path, external=False, **kwargs) -> _T:

stage = loads_from(cls, repo, path, wdir, kwargs)
fill_stage_outputs(stage, **kwargs)
if not external:
check_no_externals(stage)
check_no_externals(stage)
fill_stage_dependencies(
stage, **project(kwargs, ["deps", "erepo", "params", "fs_config"])
)
Expand Down
10 changes: 7 additions & 3 deletions dvc/stage/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@ def _can_hash(stage):
return False

for out in stage.outs:
if out.protocol != "local" or not out.def_path or out.persist:
if (
out.protocol != "local"
or not out.def_path
or out.persist
or not out.is_in_repo
):
return False

return True
Expand Down Expand Up @@ -112,7 +117,6 @@ def _create_stage(self, cache, wdir=None):
cmd=cache["cmd"],
wdir=wdir,
outs=[out["path"] for out in cache["outs"]],
external=True,
)
StageLoader.fill_from_lock(stage, cache)
return stage
Expand All @@ -137,7 +141,7 @@ def _uncached_outs(self, stage, cache):
# NOTE: using copy link to make it look like a git-tracked file
with self._cache_type_copy():
for out in cached_stage.outs:
if out.def_path in outs_no_cache:
if out.def_path in outs_no_cache and out.is_in_repo:
yield out

def save(self, stage):
Expand Down
20 changes: 8 additions & 12 deletions dvc/stage/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,28 +87,24 @@ def fill_stage_dependencies(stage, deps=None, erepo=None, params=None, fs_config


def check_no_externals(stage):
from urllib.parse import urlparse

from dvc.utils import format_link

# NOTE: preventing users from accidentally using external outputs. See
# https://github.com/iterative/dvc/issues/1545 for more details.

def _is_external(out):
# NOTE: in case of `remote://` notation, the user clearly knows that
# this is an advanced feature and so we shouldn't error-out.
if out.is_in_repo or urlparse(out.def_path).scheme == "remote":
def _is_cached_external(out):
if out.is_in_repo or not out.use_cache:
return False
return True

outs = [str(out) for out in stage.outs if _is_external(out)]
outs = [str(out) for out in stage.outs if _is_cached_external(out)]
if not outs:
return

str_outs = ", ".join(outs)
link = format_link("https://dvc.org/doc/user-guide/managing-external-data")
link = format_link(
"https://dvc.org/doc/user-guide/pipelines/external-dependencies-and-outputs"
)
raise StageExternalOutputsError(
f"Output(s) outside of DVC project: {str_outs}. See {link} for more info."
f"Cached output(s) outside of DVC project: {str_outs}. "
f"See {link} for more info."
)


Expand Down
51 changes: 0 additions & 51 deletions dvc/testing/workspace_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,57 +153,6 @@ def test_import_no_download(self, tmp_dir, dvc, remote_version_aware):
assert dvc.status() == {}


class TestAdd:
@pytest.fixture
def hash_name(self):
pytest.skip()

@pytest.fixture
def hash_value(self):
pytest.skip()

@pytest.fixture
def dir_hash_value(self):
pytest.skip()

def test_add(self, tmp_dir, dvc, workspace, hash_name, hash_value):
from dvc.stage.exceptions import StageExternalOutputsError

workspace.gen("file", "file")

with pytest.raises(StageExternalOutputsError):
dvc.add(workspace.url)

dvc.add("remote://workspace/file")
assert (tmp_dir / "file.dvc").read_text() == (
"outs:\n"
f"- {hash_name}: {hash_value}\n"
" size: 4\n"
" hash: md5\n"
" path: remote://workspace/file\n"
)
assert (workspace / "file").read_text() == "file"
assert (
workspace / "cache" / "files" / "md5" / hash_value[:2] / hash_value[2:]
).read_text() == "file"

assert dvc.status() == {}

# pylint: disable-next=unused-argument
def test_add_dir(self, tmp_dir, dvc, workspace, hash_name, dir_hash_value):
workspace.gen({"dir": {"file": "file", "subdir": {"subfile": "subfile"}}})

dvc.add("remote://workspace/dir")
assert (
workspace
/ "cache"
/ "files"
/ "md5"
/ dir_hash_value[:2]
/ dir_hash_value[2:]
).is_file()


def match_files(fs, entries, expected):
entries_content = {(fs.path.normpath(d["path"]), d["isdir"]) for d in entries}
expected_content = {(fs.path.normpath(d["path"]), d["isdir"]) for d in expected}
Expand Down
Loading