Skip to content

tests serializing of stage to lockfile #3988

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 35 additions & 29 deletions dvc/stage/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,51 +63,56 @@ def _serialize_outs(outputs: List[BaseOutput]):
return outs, metrics, plots


def _serialize_params(params: List[ParamsDependency]):
"""Return two types of values from stage:

`keys` - which is list of params without values, used in a pipeline file

which is in the shape of:
['lr', 'train', {'params2.yaml': ['lr']}]
def _serialize_params_keys(params):
"""
Returns the following format of data:
['lr', 'train', {'params2.yaml': ['lr']}]

`key_vals` - which is list of params with values, used in a lockfile
which is in the shape of:
{'params.yaml': {'lr': '1', 'train': 2}, {'params2.yaml': {'lr': '1'}}
The output is sorted, with keys of params from default params file being
at the first, and then followed by entry of other files in lexicographic
order. The keys of those custom files are also sorted in the same order.
"""
keys = []
key_vals = OrderedDict()

for param_dep in sort_by_path(params):
dump = param_dep.dumpd()
path, params = dump[PARAM_PATH], dump[PARAM_PARAMS]
if isinstance(params, dict):
k = sorted(params.keys())
if not k:
continue
key_vals[path] = OrderedDict([(key, params[key]) for key in k])
else:
assert isinstance(params, list)
# no params values available here, entry will be skipped for lock
k = sorted(params)
assert isinstance(params, (dict, list))
# when on no_exec, params are not filled and are saved as list
k = sorted(params.keys() if isinstance(params, dict) else params)
if not k:
continue

# params from default file is always kept at the start of the `params:`
if path == DEFAULT_PARAMS_FILE:
keys = k + keys
if key_vals:
key_vals.move_to_end(path, last=False)
else:
# if it's not a default file, change the shape
# to: {path: k}
keys.append({path: k})
return keys, key_vals
return keys


def _serialize_params_values(params: List[ParamsDependency]):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Splitted serialization of params to serializing keys and values. Was getting too complex to understand.

"""Returns output of following format, used for lockfile:
{'params.yaml': {'lr': '1', 'train': 2}, {'params2.yaml': {'lr': '1'}}

Default params file are always kept at the start, followed by others in
alphabetical order. The param values are sorted too(not recursively though)
"""
key_vals = OrderedDict()
for param_dep in sort_by_path(params):
dump = param_dep.dumpd()
path, params = dump[PARAM_PATH], dump[PARAM_PARAMS]
if isinstance(params, dict):
kv = [(key, params[key]) for key in sorted(params.keys())]
key_vals[path] = OrderedDict(kv)
if path == DEFAULT_PARAMS_FILE:
key_vals.move_to_end(path, last=False)
return key_vals


def to_pipeline_file(stage: "PipelineStage"):
wdir = resolve_wdir(stage.wdir, stage.path)
params, deps = split_params_deps(stage)
deps = sorted([d.def_path for d in deps])
params, _ = _serialize_params(params)
params = _serialize_params_keys(params)

outs, metrics, plots = _serialize_outs(stage.outs)
res = [
Expand Down Expand Up @@ -143,10 +148,11 @@ def to_single_stage_lockfile(stage: "Stage") -> dict:
]
for items in [deps, stage.outs]
]
params = _serialize_params_values(params)
if deps:
res[PARAM_DEPS] = deps
if params:
_, res[PARAM_PARAMS] = _serialize_params(params)
res[PARAM_PARAMS] = params
if outs:
res[PARAM_OUTS] = outs

Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@ include_trailing_comma=true
known_first_party=dvc,tests
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,google,grandalf,mock,mockssh,moto,nanotime,networkx,packaging,paramiko,pathspec,pytest,requests,ruamel,setuptools,shortuuid,tqdm,voluptuous,yaml,zc
line_length=79
force_grid_wrap=0
use_parentheses=True
Comment on lines +17 to +18
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why I added this is because, isort adds a \ on hanging indent for imports:

from dvc.stage.serialize import \
    to_single_stage_lockfile as _to_single_stage_lockfile

Whereas black formats it into this:
vs:

from dvc.stage.serialize import (
    to_single_stage_lockfile as _to_single_stage_lockfile
)

multi_line_output=3
208 changes: 208 additions & 0 deletions tests/unit/stage/test_serialize_pipeline_lock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
from collections import OrderedDict

import pytest
from voluptuous import Schema as _Schema

from dvc.dvcfile import PIPELINE_FILE
from dvc.schema import LOCK_FILE_STAGE_SCHEMA, LOCKFILE_SCHEMA
from dvc.stage import PipelineStage, create_stage
from dvc.stage.serialize import DEFAULT_PARAMS_FILE, to_lockfile
from dvc.stage.serialize import (
to_single_stage_lockfile as _to_single_stage_lockfile,
)
from dvc.stage.utils import split_params_deps

kwargs = {"name": "something", "cmd": "command", "path": PIPELINE_FILE}
Schema = _Schema(LOCK_FILE_STAGE_SCHEMA)


def to_single_stage_lockfile(stage):
"""Validate schema on each serialization."""
e = _to_single_stage_lockfile(stage)
assert Schema(e)
return e


def test_lock(dvc):
stage = create_stage(PipelineStage, dvc, **kwargs)
assert to_single_stage_lockfile(stage) == {"cmd": "command"}


def test_lock_deps(dvc):
stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
stage.deps[0].info = {"md5": "md-five"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
("deps", [OrderedDict([("path", "input"), ("md5", "md-five")])]),
]
)
Comment on lines +34 to +39
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the order is very important for the lockfile, most of the tests use ugly OrderedDict.



def test_lock_deps_order(dvc):
stage = create_stage(
PipelineStage, dvc, deps=["input1", "input0"], **kwargs
)
stage.deps[0].info = {"md5": "md-one1"}
stage.deps[1].info = {"md5": "md-zer0"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
(
"deps",
[
OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
OrderedDict([("path", "input1"), ("md5", "md-one1")]),
],
),
]
)


def test_lock_params(dvc):
stage = create_stage(
PipelineStage, dvc, params=["lorem.ipsum", "abc"], **kwargs
)
stage.deps[0].info = {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}
assert to_single_stage_lockfile(stage)["params"][
DEFAULT_PARAMS_FILE
] == OrderedDict([("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})])


def test_lock_params_file_sorted(dvc):
stage = create_stage(
PipelineStage,
dvc,
params=[
"lorem.ipsum",
"abc",
{"myparams.yaml": ["foo", "foobar"]},
{"a-params-file.yaml": ["bar", "barr"]},
],
**kwargs
)
stage.deps[0].info = {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}
stage.deps[1].info = {"foo": ["f", "o", "o"], "foobar": "foobar"}
stage.deps[2].info = {"bar": ["b", "a", "r"], "barr": "barr"}
assert to_single_stage_lockfile(stage)["params"] == OrderedDict(
[
(
DEFAULT_PARAMS_FILE,
OrderedDict(
[("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})]
),
),
(
"a-params-file.yaml",
OrderedDict([("bar", ["b", "a", "r"]), ("barr", "barr")]),
),
(
"myparams.yaml",
OrderedDict([("foo", ["f", "o", "o"]), ("foobar", "foobar")]),
),
]
)


def test_lock_params_no_values_filled(dvc):
stage = create_stage(
PipelineStage, dvc, params=["lorem.ipsum", "abc"], **kwargs
)
assert to_single_stage_lockfile(stage) == {"cmd": "command"}


@pytest.mark.parametrize("typ", ["plots", "metrics", "outs"])
def test_lock_outs(dvc, typ):
stage = create_stage(PipelineStage, dvc, **{typ: ["input"]}, **kwargs)
stage.outs[0].info = {"md5": "md-five"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
("outs", [OrderedDict([("path", "input"), ("md5", "md-five")])]),
]
)


@pytest.mark.parametrize("typ", ["plots", "metrics", "outs"])
def test_lock_outs_order(dvc, typ):
stage = create_stage(
PipelineStage, dvc, **{typ: ["input1", "input0"]}, **kwargs
)
stage.outs[0].info = {"md5": "md-one1"}
stage.outs[1].info = {"md5": "md-zer0"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
(
"outs",
[
OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
OrderedDict([("path", "input1"), ("md5", "md-one1")]),
],
),
]
)


def test_dump_appropriate_checksums(dvc):
stage = create_stage(
PipelineStage, dvc, deps=["s3://dvc-temp/file"], **kwargs
)
stage.deps[0].info = {"etag": "is-it-etag", "md5": "or-md5?"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
(
"deps",
[
OrderedDict(
[
("path", "s3://dvc-temp/file"),
("etag", "is-it-etag"),
]
)
],
),
]
)


def test_order(dvc):
stage = create_stage(
PipelineStage,
dvc,
deps=["input"],
outs=["output"],
params=["foo-param"],
**kwargs
)
params, deps = split_params_deps(stage)

deps[0].info = {"md5": "md-five"}
params[0].info = {"foo-param": "value"}
stage.outs[0].info = {"md5": "md5-output"}

assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
("deps", [{"path": "input", "md5": "md-five"}]),
("params", {"params.yaml": {"foo-param": "value"}}),
("outs", [{"path": "output", "md5": "md5-output"}]),
]
)


def test_to_lockfile(dvc):
stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
stage.deps[0].info = {"md5": "md-five"}
entry = to_lockfile(stage)
assert len(entry) == 1
_Schema(LOCKFILE_SCHEMA)(entry)
assert entry == {
"something": OrderedDict(
[
("cmd", "command"),
("deps", [{"path": "input", "md5": "md-five"}]),
]
)
}
10 changes: 4 additions & 6 deletions tests/unit/stage/test_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import signal
import subprocess
import threading
from unittest import TestCase

import mock
import pytest
Expand Down Expand Up @@ -50,12 +49,11 @@ def test_meta_ignored():
assert stage.compute_md5() == "e9521a22111493406ea64a88cda63e0b"


class TestPathConversion(TestCase):
def test(self):
stage = Stage(None, "path")
def test_path_conversion(dvc):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Converted to pytest format.

stage = Stage(dvc, "path")

stage.wdir = os.path.join("..", "..")
self.assertEqual(stage.dumpd()["wdir"], "../..")
stage.wdir = os.path.join("..", "..")
assert stage.dumpd()["wdir"] == "../.."


def test_stage_update(mocker):
Expand Down