Skip to content

Commit ac24b5c

Browse files
authored
dvc: introduce merge-driver (#4298)
Related to #4162
1 parent 5b35249 commit ac24b5c

File tree

11 files changed

+558
-8
lines changed

11 files changed

+558
-8
lines changed

dvc/cache/base.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@
55
from shortuuid import uuid
66

77
import dvc.prompt as prompt
8-
from dvc.exceptions import CheckoutError, ConfirmRemoveError, DvcException
8+
from dvc.exceptions import (
9+
CheckoutError,
10+
ConfirmRemoveError,
11+
DvcException,
12+
MergeError,
13+
)
914
from dvc.path_info import WindowsPathInfo
1015
from dvc.progress import Tqdm
1116
from dvc.remote.slow_link_detection import slow_link_guard
@@ -552,3 +557,82 @@ def get_files_number(self, path_info, hash_, filter_info):
552557
filter_info.isin_or_eq(path_info / entry[self.tree.PARAM_CHECKSUM])
553558
for entry in self.get_dir_cache(hash_)
554559
)
560+
561+
def _to_dict(self, dir_info):
    """Turn a list of directory entries into a ``{relpath: checksum}`` dict.

    Each entry in ``dir_info`` is indexed with the tree's ``PARAM_RELPATH``
    and ``PARAM_CHECKSUM`` keys. If the same relpath appears more than
    once, the last entry wins.
    """
    relpath_key = self.tree.PARAM_RELPATH
    checksum_key = self.tree.PARAM_CHECKSUM

    mapping = {}
    for item in dir_info:
        mapping[item[relpath_key]] = item[checksum_key]
    return mapping
566+
567+
def _from_dict(self, dir_dict):
    """Turn a ``{relpath: checksum}`` dict back into a list of entries.

    Every resulting entry is a two-key dict using the tree's
    ``PARAM_RELPATH`` and ``PARAM_CHECKSUM`` names; entries follow the
    iteration order of ``dir_dict``.
    """
    relpath_key = self.tree.PARAM_RELPATH
    checksum_key = self.tree.PARAM_CHECKSUM

    entries = []
    for relpath, checksum in dir_dict.items():
        entries.append({relpath_key: relpath, checksum_key: checksum})
    return entries
575+
576+
@staticmethod
def _diff(ancestor, other, allow_removed=False):
    """Diff two ``{relpath: checksum}`` dicts, rejecting unsupported changes.

    Returns the ``dictdiffer`` diff from ``ancestor`` to ``other`` as a
    list. Only "add" operations are accepted, plus "remove" when
    ``allow_removed`` is true.

    Raises:
        MergeError: if the diff contains any other kind of operation.
    """
    from dictdiffer import diff

    permitted = ["add", "remove"] if allow_removed else ["add"]

    changes = list(diff(ancestor, other))
    for kind, _path, _values in changes:
        if kind in permitted:
            continue
        raise MergeError(
            "unable to auto-merge directories with diff that contains "
            f"'{kind}'ed files"
        )
    return changes
592+
593+
def _merge_dirs(self, ancestor_info, our_info, their_info):
    """Three-way merge of directory listings.

    Each argument is a list of directory entries. If only one side
    differs from the ancestor, that side's listing is returned as is.
    Otherwise both sides' additions are applied on top of the ancestor
    and the result is returned sorted by relative path.

    Raises:
        MergeError: propagated from ``_diff`` when a side did anything
            other than add files, or when the two sides disagree on the
            checksum of a file.
    """
    from operator import itemgetter

    from dictdiffer import patch

    base = self._to_dict(ancestor_info)
    ours = self._to_dict(our_info)
    theirs = self._to_dict(their_info)

    ours_changes = self._diff(base, ours)
    if not ours_changes:
        # Our side is untouched, so their listing is the merge result.
        return self._from_dict(theirs)

    theirs_changes = self._diff(base, theirs)
    if not theirs_changes:
        # Their side is untouched, so our listing is the merge result.
        return self._from_dict(ours)

    # make sure there are no conflicting files
    self._diff(ours, theirs, allow_removed=True)

    combined = patch(ours_changes + theirs_changes, base, in_place=True)

    # Sorting the list by path to ensure reproducibility
    entries = self._from_dict(combined)
    entries.sort(key=itemgetter(self.tree.PARAM_RELPATH))
    return entries
619+
620+
def merge(self, ancestor_info, our_info, their_info):
    """Merge two directory outputs and save the merged listing.

    Each ``*_info`` is a mapping holding a directory checksum under the
    tree's ``PARAM_CHECKSUM`` key. ``ancestor_info`` may be falsy, in
    which case an empty ancestor listing is used.

    Returns a single-item dict built from the ``(type, hash)`` pair that
    ``self.tree.save_dir_info`` returns for the merged listing.
    """
    assert our_info
    assert their_info

    checksum_key = self.tree.PARAM_CHECKSUM

    def load_entries(info):
        return self.get_dir_cache(info[checksum_key])

    ancestor_entries = load_entries(ancestor_info) if ancestor_info else []
    our_entries = load_entries(our_info)
    their_entries = load_entries(their_info)

    merged_entries = self._merge_dirs(
        ancestor_entries, our_entries, their_entries
    )
    hash_name, hash_value = self.tree.save_dir_info(merged_entries)
    return {hash_name: hash_value}

dvc/command/git_hook.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,26 @@ def _run(self):
6161
return main(["push"])
6262

6363

64+
class CmdMergeDriver(CmdHookBase):
    """Command behind ``dvc git-hook merge-driver``.

    Loads the ancestor, current and other versions of a conflicting
    DVC-file from the paths given on the command line and merges them
    into the current version.
    """

    def _run(self):
        from dvc.dvcfile import Dvcfile
        from dvc.repo import Repo

        repo = Repo()

        try:
            with repo.state:
                # verify=False: skip the DVC-file name check when loading
                # and dumping these files.
                ancestor_file = Dvcfile(
                    repo, self.args.ancestor, verify=False
                )
                our_file = Dvcfile(repo, self.args.our, verify=False)
                their_file = Dvcfile(repo, self.args.their, verify=False)

                our_file.merge(ancestor_file, their_file)

                return 0
        finally:
            repo.close()
82+
83+
6484
def add_parser(subparsers, parent_parser):
6585
GIT_HOOK_HELP = "Run GIT hook."
6686

@@ -113,3 +133,27 @@ def add_parser(subparsers, parent_parser):
113133
"args", nargs="*", help="Arguments passed by GIT or pre-commit tool.",
114134
)
115135
pre_push_parser.set_defaults(func=CmdPrePush)
136+
137+
MERGE_DRIVER_HELP = "Run GIT merge driver."
138+
merge_driver_parser = git_hook_subparsers.add_parser(
139+
"merge-driver",
140+
parents=[parent_parser],
141+
description=MERGE_DRIVER_HELP,
142+
help=MERGE_DRIVER_HELP,
143+
)
144+
merge_driver_parser.add_argument(
145+
"--ancestor",
146+
required=True,
147+
help="Ancestor's version of the conflicting file.",
148+
)
149+
merge_driver_parser.add_argument(
150+
"--our",
151+
required=True,
152+
help="Current version of the conflicting file.",
153+
)
154+
merge_driver_parser.add_argument(
155+
"--their",
156+
required=True,
157+
help="Other branch's version of the conflicting file.",
158+
)
159+
merge_driver_parser.set_defaults(func=CmdMergeDriver)

dvc/dvcfile.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,10 @@ def check_dvc_filename(path):
5656
class FileMixin:
5757
SCHEMA = None
5858

59-
def __init__(self, repo, path, **kwargs):
59+
def __init__(self, repo, path, verify=True, **kwargs):
6060
self.repo = repo
6161
self.path = path
62+
self.verify = verify
6263

6364
def __repr__(self):
6465
return "{}: {}".format(
@@ -90,7 +91,8 @@ def _load(self):
9091
# 3. path doesn't represent a regular file
9192
if not self.exists():
9293
raise StageFileDoesNotExistError(self.path)
93-
check_dvc_filename(self.path)
94+
if self.verify:
95+
check_dvc_filename(self.path)
9496
if not self.repo.tree.isfile(self.path):
9597
raise StageFileIsNotDvcFileError(self.path)
9698

@@ -115,6 +117,9 @@ def remove(self, force=False): # pylint: disable=unused-argument
115117
def dump(self, stage, **kwargs):
116118
raise NotImplementedError
117119

120+
def merge(self, ancestor, other):
    """Merge ``other`` into this file, using ``ancestor`` as the base.

    Not implemented at this level; always raises NotImplementedError.
    """
    raise NotImplementedError
122+
118123

119124
class SingleStageFile(FileMixin):
120125
from dvc.schema import COMPILED_SINGLE_STAGE_SCHEMA as SCHEMA
@@ -134,7 +139,8 @@ def dump(self, stage, **kwargs):
134139
from dvc.stage import PipelineStage
135140

136141
assert not isinstance(stage, PipelineStage)
137-
check_dvc_filename(self.path)
142+
if self.verify:
143+
check_dvc_filename(self.path)
138144
logger.debug(
139145
"Saving information to '{file}'.".format(file=relpath(self.path))
140146
)
@@ -144,6 +150,14 @@ def dump(self, stage, **kwargs):
144150
def remove_stage(self, stage): # pylint: disable=unused-argument
145151
self.remove()
146152

153+
def merge(self, ancestor, other):
    """Merge ``other`` into this single-stage file and write the result.

    Both ``ancestor`` and ``other`` must be ``SingleStageFile`` instances.
    This file's stage is merged with theirs and then dumped back to this
    file's path.
    """
    for dvcfile in (ancestor, other):
        assert isinstance(dvcfile, SingleStageFile)

    merged_stage = self.stage
    merged_stage.merge(ancestor.stage, other.stage)
    self.dump(merged_stage)
160+
147161

148162
class PipelineFile(FileMixin):
149163
"""Abstraction for pipelines file, .yaml + .lock combined."""
@@ -161,7 +175,8 @@ def dump(
161175
from dvc.stage import PipelineStage
162176

163177
assert isinstance(stage, PipelineStage)
164-
check_dvc_filename(self.path)
178+
if self.verify:
179+
check_dvc_filename(self.path)
165180

166181
if update_pipeline and not stage.is_data_source:
167182
self._dump_pipeline_file(stage)
@@ -239,6 +254,9 @@ def remove_stage(self, stage):
239254
else:
240255
super().remove()
241256

257+
def merge(self, ancestor, other):
    """Merging is not implemented for pipeline files.

    Always raises NotImplementedError.
    """
    raise NotImplementedError
259+
242260

243261
class Lockfile(FileMixin):
244262
from dvc.schema import COMPILED_LOCKFILE_SCHEMA as SCHEMA
@@ -295,6 +313,9 @@ def remove_stage(self, stage):
295313
else:
296314
self.remove()
297315

316+
def merge(self, ancestor, other):
    """Merging is not implemented for lockfiles.

    Always raises NotImplementedError.
    """
    raise NotImplementedError
318+
298319

299320
class Dvcfile:
300321
def __new__(cls, repo, path, **kwargs):

dvc/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,3 +351,7 @@ def __init__(self, target, file):
351351
f"'{target}' "
352352
f"does not exist as an output or a stage name in '{file}'"
353353
)
354+
355+
356+
class MergeError(DvcException):
    """Raised when something cannot be merged automatically."""

dvc/output/base.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from dvc.exceptions import (
1111
CollectCacheError,
1212
DvcException,
13+
MergeError,
1314
RemoteCacheRequiredError,
1415
)
1516

@@ -516,3 +517,37 @@ def _validate_output_path(cls, path, stage=None):
516517
check = stage.repo.tree.dvcignore.check_ignore(path)
517518
if check.match:
518519
raise cls.IsIgnoredError(check)
520+
521+
def _check_can_merge(self, out):
    """Validate that ``out`` can be auto-merged with this output.

    The two outputs must share the same scheme, must dump to identical
    options once the checksum field is ignored, and ``out`` must be a
    directory output.

    Raises:
        MergeError: if any of the conditions above does not hold.
    """
    if self.scheme != out.scheme:
        raise MergeError("unable to auto-merge outputs of different types")

    my = self.dumpd()
    other = out.dumpd()

    # The checksum is the one field allowed to differ between versions.
    # Drop it tolerantly: a dump without a checksum entry must not crash
    # here with KeyError, so that it is judged by the checks below instead.
    # NOTE(review): presumably `is_dir_checksum` is falsy for an output
    # without a checksum, turning that case into a MergeError; confirm.
    my.pop(self.tree.PARAM_CHECKSUM, None)
    other.pop(self.tree.PARAM_CHECKSUM, None)

    if my != other:
        raise MergeError(
            "unable to auto-merge outputs with different options"
        )

    if not out.is_dir_checksum:
        raise MergeError(
            "unable to auto-merge outputs that are not directories"
        )
540+
541+
def merge(self, ancestor, other):
    """Merge ``other`` into this output, using ``ancestor`` as the base.

    ``ancestor`` may be falsy, in which case ``None`` is passed to the
    cache as the ancestor info. Each participant is validated with
    ``_check_can_merge`` and ``self.info`` is then replaced with the
    result of ``self.cache.merge``.
    """
    assert other

    ancestor_info = None
    if ancestor:
        self._check_can_merge(ancestor)
        ancestor_info = ancestor.info

    for candidate in (self, other):
        self._check_can_merge(candidate)

    self.info = self.cache.merge(ancestor_info, self.info, other.info)

dvc/scm/git.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,21 @@ def _install_hook(self, name):
307307

308308
os.chmod(hook, 0o777)
309309

310+
def _install_merge_driver(self):
    """Register the ``dvc`` merge driver in the repository's git config.

    Sets ``merge.dvc.name`` and ``merge.dvc.driver``; the driver command
    passes git's ``%O``, ``%A`` and ``%B`` placeholders as the ancestor,
    our and their versions respectively.
    """
    set_config = self.repo.git.config

    driver_command = (
        "dvc git-hook merge-driver "
        "--ancestor %O "
        "--our %A "
        "--their %B "
    )

    set_config("merge.dvc.name", "DVC merge driver")
    set_config("merge.dvc.driver", driver_command)
321+
310322
def install(self, use_pre_commit_tool=False):
323+
self._install_merge_driver()
324+
311325
if not use_pre_commit_tool:
312326
self._verify_dvc_hooks()
313327
self._install_hook("post-checkout")

dvc/stage/__init__.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import dvc.dependency as dependency
99
import dvc.prompt as prompt
10-
from dvc.exceptions import CheckoutError, DvcException
10+
from dvc.exceptions import CheckoutError, DvcException, MergeError
1111
from dvc.utils import relpath
1212

1313
from . import params
@@ -538,6 +538,44 @@ def get_used_cache(self, *args, **kwargs):
538538

539539
return cache
540540

541+
@staticmethod
def _check_can_merge(stage, ancestor_out=None):
    """Validate that ``stage`` is eligible for auto-merging.

    Only non-pipeline data-source stages with no dependencies and at most
    one output are accepted. When ``ancestor_out`` is given, the stage
    must also still have an output.

    Raises:
        MergeError: if the stage does not satisfy these conditions.
    """
    if isinstance(stage, PipelineStage):
        raise MergeError("unable to auto-merge pipeline stages")

    added_by_dvc_add = (
        stage.is_data_source and not stage.deps and len(stage.outs) <= 1
    )
    if not added_by_dvc_add:
        raise MergeError(
            "unable to auto-merge DVC-files that weren't "
            "created by `dvc add`"
        )

    if ancestor_out and not stage.outs:
        raise MergeError(
            "unable to auto-merge DVC-files with deleted outputs"
        )
556+
557+
def merge(self, ancestor, other):
    """Merge the output of ``other`` into this stage's output.

    Nothing happens when ``other`` has no outputs. When this stage has no
    outputs, it simply adopts the outputs of ``other``. Otherwise all
    participating stages are validated with ``_check_can_merge`` and the
    first output of this stage is merged with the first output of
    ``other``, using the ancestor's first output (if any) as the base.
    """
    assert other

    if not other.outs:
        return

    if not self.outs:
        self.outs = other.outs
        return

    ancestor_out = None
    if ancestor:
        self._check_can_merge(ancestor)
        ancestor_outs = ancestor.outs
        if ancestor_outs:
            ancestor_out = ancestor_outs[0]

    for candidate in (self, other):
        self._check_can_merge(candidate, ancestor_out)

    self.outs[0].merge(ancestor_out, other.outs[0])
578+
541579

542580
class PipelineStage(Stage):
543581
def __init__(self, *args, name=None, **kwargs):
@@ -577,3 +615,6 @@ def changed_stage(self):
577615

578616
def _changed_stage_entry(self):
579617
return f"'cmd' of {self} has changed."
618+
619+
def merge(self, ancestor, other):
    """Merging is not implemented for pipeline stages.

    Always raises NotImplementedError.
    """
    raise NotImplementedError

0 commit comments

Comments
 (0)