tests serializing of stage to lockfile (#3988)

skshetry · web-flow · commit afc183f4e79a · 2020-06-09T15:48:17.000+03:00
* tests: serialize stage for lockfile

* config: make isort/black compatible

* convert class based tests to pytest
diff --git a/dvc/stage/serialize.py b/dvc/stage/serialize.py
@@ -63,51 +63,56 @@ def _serialize_outs(outputs: List[BaseOutput]):
     return outs, metrics, plots
 
 
-def _serialize_params(params: List[ParamsDependency]):
-    """Return two types of values from stage:
-
-    `keys` - which is list of params without values, used in a pipeline file
-
-    which is in the shape of:
-        ['lr', 'train', {'params2.yaml': ['lr']}]
+def _serialize_params_keys(params):
+    """
+    Returns the following format of data:
+     ['lr', 'train', {'params2.yaml': ['lr']}]
 
-    `key_vals` - which is list of params with values, used in a lockfile
-    which is in the shape of:
-        {'params.yaml': {'lr': '1', 'train': 2}, {'params2.yaml': {'lr': '1'}}
+    The output is sorted, with keys of params from default params file being
+    at the first, and then followed by entry of other files in lexicographic
+    order. The keys of those custom files are also sorted in the same order.
     """
     keys = []
-    key_vals = OrderedDict()
-
     for param_dep in sort_by_path(params):
         dump = param_dep.dumpd()
         path, params = dump[PARAM_PATH], dump[PARAM_PARAMS]
-        if isinstance(params, dict):
-            k = sorted(params.keys())
-            if not k:
-                continue
-            key_vals[path] = OrderedDict([(key, params[key]) for key in k])
-        else:
-            assert isinstance(params, list)
-            # no params values available here, entry will be skipped for lock
-            k = sorted(params)
+        assert isinstance(params, (dict, list))
+        # when on no_exec, params are not filled and are saved as list
+        k = sorted(params.keys() if isinstance(params, dict) else params)
+        if not k:
+            continue
 
-        # params from default file is always kept at the start of the `params:`
         if path == DEFAULT_PARAMS_FILE:
             keys = k + keys
-            if key_vals:
-                key_vals.move_to_end(path, last=False)
         else:
-            # if it's not a default file, change the shape
-            # to: {path: k}
             keys.append({path: k})
-    return keys, key_vals
+    return keys
+
+
+def _serialize_params_values(params: List[ParamsDependency]):
+    """Returns output of following format, used for lockfile:
+        {'params.yaml': {'lr': '1', 'train': 2}, {'params2.yaml': {'lr': '1'}}
+
+    Default params file are always kept at the start, followed by others in
+    alphabetical order. The param values are sorted too(not recursively though)
+    """
+    key_vals = OrderedDict()
+    for param_dep in sort_by_path(params):
+        dump = param_dep.dumpd()
+        path, params = dump[PARAM_PATH], dump[PARAM_PARAMS]
+        if isinstance(params, dict):
+            kv = [(key, params[key]) for key in sorted(params.keys())]
+            key_vals[path] = OrderedDict(kv)
+            if path == DEFAULT_PARAMS_FILE:
+                key_vals.move_to_end(path, last=False)
+    return key_vals
 
 
 def to_pipeline_file(stage: "PipelineStage"):
     wdir = resolve_wdir(stage.wdir, stage.path)
     params, deps = split_params_deps(stage)
     deps = sorted([d.def_path for d in deps])
-    params, _ = _serialize_params(params)
+    params = _serialize_params_keys(params)
 
     outs, metrics, plots = _serialize_outs(stage.outs)
     res = [
@@ -143,10 +148,11 @@ def to_single_stage_lockfile(stage: "Stage") -> dict:
         ]
         for items in [deps, stage.outs]
     ]
+    params = _serialize_params_values(params)
     if deps:
         res[PARAM_DEPS] = deps
     if params:
-        _, res[PARAM_PARAMS] = _serialize_params(params)
+        res[PARAM_PARAMS] = params
     if outs:
         res[PARAM_OUTS] = outs
 
diff --git a/setup.cfg b/setup.cfg
@@ -14,4 +14,6 @@ include_trailing_comma=true
 known_first_party=dvc,tests
 known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,google,grandalf,mock,mockssh,moto,nanotime,networkx,packaging,paramiko,pathspec,pytest,requests,ruamel,setuptools,shortuuid,tqdm,voluptuous,yaml,zc
 line_length=79
+force_grid_wrap=0
+use_parentheses=True
 multi_line_output=3
diff --git a/tests/unit/stage/test_serialize_pipeline_lock.py b/tests/unit/stage/test_serialize_pipeline_lock.py
@@ -0,0 +1,208 @@
+from collections import OrderedDict
+
+import pytest
+from voluptuous import Schema as _Schema
+
+from dvc.dvcfile import PIPELINE_FILE
+from dvc.schema import LOCK_FILE_STAGE_SCHEMA, LOCKFILE_SCHEMA
+from dvc.stage import PipelineStage, create_stage
+from dvc.stage.serialize import DEFAULT_PARAMS_FILE, to_lockfile
+from dvc.stage.serialize import (
+    to_single_stage_lockfile as _to_single_stage_lockfile,
+)
+from dvc.stage.utils import split_params_deps
+
+kwargs = {"name": "something", "cmd": "command", "path": PIPELINE_FILE}
+Schema = _Schema(LOCK_FILE_STAGE_SCHEMA)
+
+
+def to_single_stage_lockfile(stage):
+    """Validate schema on each serialization."""
+    e = _to_single_stage_lockfile(stage)
+    assert Schema(e)
+    return e
+
+
+def test_lock(dvc):
+    stage = create_stage(PipelineStage, dvc, **kwargs)
+    assert to_single_stage_lockfile(stage) == {"cmd": "command"}
+
+
+def test_lock_deps(dvc):
+    stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
+    stage.deps[0].info = {"md5": "md-five"}
+    assert to_single_stage_lockfile(stage) == OrderedDict(
+        [
+            ("cmd", "command"),
+            ("deps", [OrderedDict([("path", "input"), ("md5", "md-five")])]),
+        ]
+    )
+
+
+def test_lock_deps_order(dvc):
+    stage = create_stage(
+        PipelineStage, dvc, deps=["input1", "input0"], **kwargs
+    )
+    stage.deps[0].info = {"md5": "md-one1"}
+    stage.deps[1].info = {"md5": "md-zer0"}
+    assert to_single_stage_lockfile(stage) == OrderedDict(
+        [
+            ("cmd", "command"),
+            (
+                "deps",
+                [
+                    OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
+                    OrderedDict([("path", "input1"), ("md5", "md-one1")]),
+                ],
+            ),
+        ]
+    )
+
+
+def test_lock_params(dvc):
+    stage = create_stage(
+        PipelineStage, dvc, params=["lorem.ipsum", "abc"], **kwargs
+    )
+    stage.deps[0].info = {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}
+    assert to_single_stage_lockfile(stage)["params"][
+        DEFAULT_PARAMS_FILE
+    ] == OrderedDict([("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})])
+
+
+def test_lock_params_file_sorted(dvc):
+    stage = create_stage(
+        PipelineStage,
+        dvc,
+        params=[
+            "lorem.ipsum",
+            "abc",
+            {"myparams.yaml": ["foo", "foobar"]},
+            {"a-params-file.yaml": ["bar", "barr"]},
+        ],
+        **kwargs
+    )
+    stage.deps[0].info = {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}
+    stage.deps[1].info = {"foo": ["f", "o", "o"], "foobar": "foobar"}
+    stage.deps[2].info = {"bar": ["b", "a", "r"], "barr": "barr"}
+    assert to_single_stage_lockfile(stage)["params"] == OrderedDict(
+        [
+            (
+                DEFAULT_PARAMS_FILE,
+                OrderedDict(
+                    [("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})]
+                ),
+            ),
+            (
+                "a-params-file.yaml",
+                OrderedDict([("bar", ["b", "a", "r"]), ("barr", "barr")]),
+            ),
+            (
+                "myparams.yaml",
+                OrderedDict([("foo", ["f", "o", "o"]), ("foobar", "foobar")]),
+            ),
+        ]
+    )
+
+
+def test_lock_params_no_values_filled(dvc):
+    stage = create_stage(
+        PipelineStage, dvc, params=["lorem.ipsum", "abc"], **kwargs
+    )
+    assert to_single_stage_lockfile(stage) == {"cmd": "command"}
+
+
+@pytest.mark.parametrize("typ", ["plots", "metrics", "outs"])
+def test_lock_outs(dvc, typ):
+    stage = create_stage(PipelineStage, dvc, **{typ: ["input"]}, **kwargs)
+    stage.outs[0].info = {"md5": "md-five"}
+    assert to_single_stage_lockfile(stage) == OrderedDict(
+        [
+            ("cmd", "command"),
+            ("outs", [OrderedDict([("path", "input"), ("md5", "md-five")])]),
+        ]
+    )
+
+
+@pytest.mark.parametrize("typ", ["plots", "metrics", "outs"])
+def test_lock_outs_order(dvc, typ):
+    stage = create_stage(
+        PipelineStage, dvc, **{typ: ["input1", "input0"]}, **kwargs
+    )
+    stage.outs[0].info = {"md5": "md-one1"}
+    stage.outs[1].info = {"md5": "md-zer0"}
+    assert to_single_stage_lockfile(stage) == OrderedDict(
+        [
+            ("cmd", "command"),
+            (
+                "outs",
+                [
+                    OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
+                    OrderedDict([("path", "input1"), ("md5", "md-one1")]),
+                ],
+            ),
+        ]
+    )
+
+
+def test_dump_appropriate_checksums(dvc):
+    stage = create_stage(
+        PipelineStage, dvc, deps=["s3://dvc-temp/file"], **kwargs
+    )
+    stage.deps[0].info = {"etag": "is-it-etag", "md5": "or-md5?"}
+    assert to_single_stage_lockfile(stage) == OrderedDict(
+        [
+            ("cmd", "command"),
+            (
+                "deps",
+                [
+                    OrderedDict(
+                        [
+                            ("path", "s3://dvc-temp/file"),
+                            ("etag", "is-it-etag"),
+                        ]
+                    )
+                ],
+            ),
+        ]
+    )
+
+
+def test_order(dvc):
+    stage = create_stage(
+        PipelineStage,
+        dvc,
+        deps=["input"],
+        outs=["output"],
+        params=["foo-param"],
+        **kwargs
+    )
+    params, deps = split_params_deps(stage)
+
+    deps[0].info = {"md5": "md-five"}
+    params[0].info = {"foo-param": "value"}
+    stage.outs[0].info = {"md5": "md5-output"}
+
+    assert to_single_stage_lockfile(stage) == OrderedDict(
+        [
+            ("cmd", "command"),
+            ("deps", [{"path": "input", "md5": "md-five"}]),
+            ("params", {"params.yaml": {"foo-param": "value"}}),
+            ("outs", [{"path": "output", "md5": "md5-output"}]),
+        ]
+    )
+
+
+def test_to_lockfile(dvc):
+    stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
+    stage.deps[0].info = {"md5": "md-five"}
+    entry = to_lockfile(stage)
+    assert len(entry) == 1
+    _Schema(LOCKFILE_SCHEMA)(entry)
+    assert entry == {
+        "something": OrderedDict(
+            [
+                ("cmd", "command"),
+                ("deps", [{"path": "input", "md5": "md-five"}]),
+            ]
+        )
+    }
diff --git a/tests/unit/stage/test_stage.py b/tests/unit/stage/test_stage.py
@@ -2,7 +2,6 @@
 import signal
 import subprocess
 import threading
-from unittest import TestCase
 
 import mock
 import pytest
@@ -50,12 +49,11 @@ def test_meta_ignored():
         assert stage.compute_md5() == "e9521a22111493406ea64a88cda63e0b"
 
 
-class TestPathConversion(TestCase):
-    def test(self):
-        stage = Stage(None, "path")
+def test_path_conversion(dvc):
+    stage = Stage(dvc, "path")
 
-        stage.wdir = os.path.join("..", "..")
-        self.assertEqual(stage.dumpd()["wdir"], "../..")
+    stage.wdir = os.path.join("..", "..")
+    assert stage.dumpd()["wdir"] == "../.."
 
 
 def test_stage_update(mocker):