Skip to content

Commit 8f79d64

Browse files
committed
dump: lockfile is dumped deterministically
The dump is no longer determined by (dependent on) the ordering in the pipeline file; instead it is sorted by file name within outs, deps and params. The params inside each file are also sorted by name. However, the values inside params are not themselves sorted, as I think that is too much to sort and is not easy (considering the types of objects they might hold, e.g. lists, objects, etc.). This will also provide ordered dumps for Python 3.5.
1 parent f9088e1 commit 8f79d64

File tree

3 files changed

+233
-30
lines changed

3 files changed

+233
-30
lines changed

dvc/serialize.py

Lines changed: 43 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from collections import OrderedDict
2+
from functools import partial
3+
from operator import attrgetter
14
from typing import TYPE_CHECKING
25

36
from funcy import rpartial, lsplit
@@ -15,9 +18,12 @@
1518
DEFAULT_PARAMS_FILE = ParamsDependency.DEFAULT_PARAMS_FILE
1619

1720

21+
sort_by_path = partial(sorted, key=attrgetter("def_path"))
22+
23+
1824
def _get_outs(stage: "PipelineStage"):
1925
outs_bucket = {}
20-
for o in stage.outs:
26+
for o in sort_by_path(stage.outs):
2127
bucket_key = ["metrics"] if o.metric else ["outs"]
2228

2329
if not o.metric and o.persist:
@@ -26,7 +32,7 @@ def _get_outs(stage: "PipelineStage"):
2632
bucket_key += ["no_cache"]
2733
key = "_".join(bucket_key)
2834
outs_bucket[key] = outs_bucket.get(key, []) + [o.def_path]
29-
return outs_bucket
35+
return [(key, outs_bucket[key]) for key in sorted(outs_bucket.keys())]
3036

3137

3238
def get_params_deps(stage: "PipelineStage"):
@@ -40,60 +46,67 @@ def _serialize_params(params: List[ParamsDependency]):
4046
4147
which is in the shape of:
4248
['lr', 'train', {'params2.yaml': ['lr']}]
49+
4350
`key_vals` - which is list of params with values, used in a lockfile
4451
which is in the shape of:
4552
{'params.yaml': {'lr': '1', 'train': 2}, 'params2.yaml': {'lr': '1'}}
4653
"""
4754
keys = []
48-
key_vals = {}
55+
key_vals = OrderedDict()
4956

50-
for param_dep in params:
57+
for param_dep in sort_by_path(params):
5158
dump = param_dep.dumpd()
5259
path, params = dump[PARAM_PATH], dump[PARAM_PARAMS]
5360
k = list(params.keys())
5461
if not k:
5562
continue
56-
# if it's not a default file, change the shape
57-
# to: {path: k}
58-
keys.extend(k if path == DEFAULT_PARAMS_FILE else [{path: k}])
59-
key_vals.update({path: params})
60-
63+
key_vals[path] = OrderedDict([(key, params[key]) for key in sorted(k)])
64+
# params from the default file are always kept at the start of `params:`
65+
if path == DEFAULT_PARAMS_FILE:
66+
keys = k + keys
67+
key_vals.move_to_end(path, last=False)
68+
else:
69+
# if it's not a default file, change the shape
70+
# to: {path: k}
71+
keys.append({path: k})
6172
return keys, key_vals
6273

6374

6475
def to_pipeline_file(stage: "PipelineStage"):
6576
params, deps = get_params_deps(stage)
6677
serialized_params, _ = _serialize_params(params)
6778

79+
res = [
80+
(stage.PARAM_CMD, stage.cmd),
81+
(stage.PARAM_WDIR, stage.resolve_wdir()),
82+
(stage.PARAM_DEPS, [d.def_path for d in deps]),
83+
(stage.PARAM_PARAMS, serialized_params),
84+
*_get_outs(stage),
85+
(stage.PARAM_LOCKED, stage.locked),
86+
(stage.PARAM_ALWAYS_CHANGED, stage.always_changed),
87+
]
6888
return {
69-
stage.name: {
70-
key: value
71-
for key, value in {
72-
stage.PARAM_CMD: stage.cmd,
73-
stage.PARAM_WDIR: stage.resolve_wdir(),
74-
stage.PARAM_DEPS: [d.def_path for d in deps],
75-
stage.PARAM_PARAMS: serialized_params,
76-
**_get_outs(stage),
77-
stage.PARAM_LOCKED: stage.locked,
78-
stage.PARAM_ALWAYS_CHANGED: stage.always_changed,
79-
}.items()
80-
if value
81-
}
89+
stage.name: OrderedDict([(key, value) for key, value in res if value])
8290
}
8391

8492

85-
def to_lockfile(stage: "PipelineStage") -> dict:
93+
def to_lockfile(stage: "PipelineStage"):
8694
assert stage.cmd
8795
assert stage.name
8896

89-
res = {"cmd": stage.cmd}
97+
res = OrderedDict([("cmd", stage.cmd)])
9098
params, deps = get_params_deps(stage)
91-
deps = [
92-
{"path": dep.def_path, dep.checksum_type: dep.checksum} for dep in deps
93-
]
94-
outs = [
95-
{"path": out.def_path, out.checksum_type: out.checksum}
96-
for out in stage.outs
99+
deps, outs = [
100+
[
101+
OrderedDict(
102+
[
103+
("path", item.def_path),
104+
(item.checksum_type, item.checksum),
105+
]
106+
)
107+
for item in sort_by_path(items)
108+
]
109+
for items in [deps, stage.outs]
97110
]
98111
if deps:
99112
res["deps"] = deps

dvc/utils/stage.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from collections import OrderedDict
2+
13
import yaml
24
from ruamel.yaml import YAML
35
from ruamel.yaml.error import YAMLError
@@ -42,4 +44,9 @@ def dump_stage_file(path, data):
4244
with open(path, "w", encoding="utf-8") as fd:
4345
yaml = YAML()
4446
yaml.default_flow_style = False
47+
# tell Dumper to represent OrderedDict as
48+
# normal dict
49+
yaml.Representer.add_representer(
50+
OrderedDict, yaml.Representer.represent_dict
51+
)
4552
yaml.dump(data, fd)

tests/func/test_lockfile.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import os
2+
from collections import OrderedDict
3+
from operator import itemgetter
4+
from textwrap import dedent
5+
6+
import pytest
7+
import yaml
8+
from dvc.dvcfile import PIPELINE_LOCK
9+
from dvc.serialize import get_params_deps
10+
from dvc.utils.fs import remove
11+
from dvc.utils.stage import parse_stage_for_update
12+
13+
from tests.func.test_run_multistage import supported_params
14+
15+
16+
FS_STRUCTURE = {
17+
"foo": "bar\nfoobar",
18+
"bar": "foo\nfoobar",
19+
"foobar": "foobar\nbar",
20+
"params.yaml": yaml.dump(supported_params),
21+
"params2.yaml": yaml.dump(supported_params),
22+
}
23+
24+
25+
@pytest.fixture
26+
def run_head(tmp_dir, dvc):
27+
"""Output first line of each file to different file with '-1' appended."""
28+
tmp_dir.gen(
29+
"head.py",
30+
dedent(
31+
"""
32+
import sys
33+
_, *files = sys.argv
34+
for file in files:
35+
with open(file) as f, open(file +"-1","w+") as w:
36+
w.write(f.readline())
37+
"""
38+
),
39+
)
40+
41+
def run(*args, **run_kwargs):
42+
return dvc.run(
43+
cmd="python head.py {}".format(" ".join(args)),
44+
outs=[dep + "-1" for dep in args],
45+
deps=args,
46+
**run_kwargs
47+
)
48+
49+
return run
50+
51+
52+
def read_lock_file(file=PIPELINE_LOCK):
53+
with open(file) as f:
54+
data = parse_stage_for_update(f.read(), file)
55+
assert isinstance(data, OrderedDict)
56+
return data
57+
58+
59+
def assert_eq_lockfile(previous, new):
60+
for content in (previous, new):
61+
assert isinstance(content, OrderedDict)
62+
63+
# if they both are OrderedDict, then `==` will also check for order
64+
assert previous == new
65+
66+
67+
def test_deps_outs_are_sorted_by_path(tmp_dir, dvc, run_head):
68+
tmp_dir.gen(FS_STRUCTURE)
69+
deps = ["foo", "bar", "foobar"]
70+
run_head(*deps, name="copy-first-line")
71+
72+
initial_content = read_lock_file()
73+
lock = initial_content["copy-first-line"]
74+
75+
# lock stage key order:
76+
assert list(lock.keys()) == ["cmd", "deps", "outs"]
77+
78+
# the `path` key appears first, followed by `md5`
79+
assert all(list(dep.keys()) == ["path", "md5"] for dep in lock["deps"])
80+
assert all(list(out.keys()) == ["path", "md5"] for out in lock["outs"])
81+
82+
# deps are always sorted by the file path naming
83+
assert list(map(itemgetter("path"), lock["deps"])) == sorted(deps)
84+
85+
# outs are too
86+
assert list(
87+
map(itemgetter("path"), initial_content["copy-first-line"]["outs"])
88+
) == [d + "-1" for d in sorted(deps)]
89+
90+
91+
def test_order_is_preserved_when_pipeline_order_changes(
92+
tmp_dir, dvc, run_head
93+
):
94+
tmp_dir.gen(FS_STRUCTURE)
95+
deps = ["foo", "bar", "foobar"]
96+
stage = run_head(*deps, name="copy-first-line")
97+
98+
initial_content = read_lock_file()
99+
# reverse order of stage.outs and dump to the pipeline file
100+
# then, again change stage.deps and dump to the pipeline file
101+
reversal = stage.outs.reverse, stage.deps.reverse
102+
for reverse_items in reversal:
103+
reverse_items()
104+
stage.dvcfile._dump_pipeline_file(stage)
105+
106+
# we only changed the order, should not reproduce
107+
assert not dvc.reproduce(stage.addressing)
108+
109+
new_lock_content = read_lock_file()
110+
assert_eq_lockfile(new_lock_content, initial_content)
111+
112+
(tmp_dir / PIPELINE_LOCK).unlink()
113+
assert dvc.reproduce(stage.addressing) == [stage]
114+
new_lock_content = read_lock_file()
115+
assert_eq_lockfile(new_lock_content, initial_content)
116+
117+
118+
def test_cmd_changes_other_orders_are_preserved(tmp_dir, dvc, run_head):
119+
tmp_dir.gen(FS_STRUCTURE)
120+
deps = ["foo", "bar", "foobar"]
121+
stage = run_head(*deps, name="copy-first-line")
122+
123+
initial_content = read_lock_file()
124+
# let's change cmd in pipeline file
125+
# it should only change "cmd"; otherwise the lockfile should be
126+
# structurally the same as before
127+
stage.cmd = " ".join(stage.cmd.split())
128+
stage.dvcfile._dump_pipeline_file(stage)
129+
130+
initial_content["copy-first-line"]["cmd"] = stage.cmd
131+
132+
assert dvc.reproduce(stage.addressing) == [stage]
133+
134+
new_lock_content = read_lock_file()
135+
assert_eq_lockfile(new_lock_content, initial_content)
136+
137+
138+
def test_params_dump(tmp_dir, dvc, run_head):
139+
tmp_dir.gen(FS_STRUCTURE)
140+
141+
stage = run_head(
142+
"foo",
143+
"bar",
144+
"foobar",
145+
name="copy-first-line",
146+
params=[
147+
"params2.yaml:answer,lists,name",
148+
"params.yaml:lists,floats,nested.nested1,nested.nested1.nested2",
149+
],
150+
)
151+
152+
initial_content = read_lock_file()
153+
lock = initial_content["copy-first-line"]
154+
155+
# lock stage key order:
156+
assert list(lock.keys()) == ["cmd", "deps", "params", "outs"]
157+
assert list(lock["params"].keys()) == ["params.yaml", "params2.yaml"]
158+
159+
# params keys are always sorted by name
160+
assert list(lock["params"]["params.yaml"].keys()) == [
161+
"floats",
162+
"lists",
163+
"nested.nested1",
164+
"nested.nested1.nested2",
165+
]
166+
assert list(lock["params"]["params2.yaml"]) == ["answer", "lists", "name"]
167+
168+
assert not dvc.reproduce(stage.addressing)
169+
170+
# let's change the order of params and dump them in pipeline file
171+
params, _ = get_params_deps(stage)
172+
for param in params:
173+
param.params.reverse()
174+
175+
stage.dvcfile._dump_pipeline_file(stage)
176+
assert not dvc.reproduce(stage.addressing)
177+
178+
(tmp_dir / PIPELINE_LOCK).unlink()
179+
# XXX: temporary workaround due to lack of params support in build cache
180+
remove(os.path.join(dvc.cache.local.cache_dir, "stages"))
181+
182+
assert dvc.reproduce(stage.addressing) == [stage]
183+
assert_eq_lockfile(initial_content, read_lock_file())

0 commit comments

Comments
 (0)