Add "status" to replace "running" and "queued" in the output of `exp show`

karajan1001 · karajan1001 · commit 884182f6f2cb · 2022-10-06T17:20:03.000+08:00
fix: #7986 1. Add two new flags, `--hide-queued` and `--hide-failed`, to `exp show`. 2. Allow `exp show` to show failed experiments. 3. Add a unit test for showing failed experiments. 4. Add name support for failed experiments. 5. Add an error message to the `exp show` output.
diff --git a/dvc/commands/experiments/show.py b/dvc/commands/experiments/show.py
@@ -95,11 +95,8 @@ def _collect_rows(
 
         exp = results.get("data", {})
 
-        if exp.get("running"):
-            state = "Running"
-        elif exp.get("queued"):
-            state = "Queued"
-        else:
+        state = exp.get("status")
+        if state == "Success":
             state = fill_value
 
         is_baseline = rev == "baseline"
@@ -476,6 +473,8 @@ def run(self):
                 all_branches=self.args.all_branches,
                 all_tags=self.args.all_tags,
                 all_commits=self.args.all_commits,
+                hide_queued=self.args.hide_queued,
+                hide_failed=self.args.hide_failed,
                 revs=self.args.rev,
                 num=self.args.num,
                 sha_only=self.args.sha,
@@ -594,6 +593,18 @@ def add_parser(experiments_subparsers, parent_parser):
         default=False,
         help="Always show git commit SHAs instead of branch/tag names.",
     )
+    experiments_show_parser.add_argument(
+        "--hide-failed",
+        action="store_true",
+        default=False,
+        help="Hide failed experiments in the table.",
+    )
+    experiments_show_parser.add_argument(
+        "--hide-queued",
+        action="store_true",
+        default=False,
+        help="Hide queued experiments in the table.",
+    )
     experiments_show_parser.add_argument(
         "--json",
         "--show-json",
diff --git a/dvc/repo/experiments/__init__.py b/dvc/repo/experiments/__init__.py
@@ -427,6 +427,8 @@ def get_exact_name(self, rev: str):
                 pass
         if rev in self.stash_revs:
             return self.stash_revs[rev].name
+        if rev in self.celery_queue.failed_stash.stash_revs:
+            return self.celery_queue.failed_stash.stash_revs[rev].name
         return None
 
     def get_running_exps(self, fetch_refs: bool = True) -> Dict[str, Any]:
diff --git a/dvc/repo/experiments/show.py b/dvc/repo/experiments/show.py
@@ -1,9 +1,11 @@
 import logging
 from collections import OrderedDict, defaultdict
 from datetime import datetime
+from enum import Enum
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
+from dvc.repo.experiments.queue.base import QueueDoneResult
 from dvc.repo.metrics.show import _gather_metrics
 from dvc.repo.params.show import _gather_params
 from dvc.scm import iter_revs
@@ -17,11 +19,18 @@
 logger = logging.getLogger(__name__)
 
 
+class ExpStatus(Enum):
+    Success = 0
+    Queued = 1
+    Running = 2
+    Failed = 3
+
+
 @error_handler
 def _collect_experiment_commit(
-    repo,
-    exp_rev,
-    stash=False,
+    repo: "Repo",
+    exp_rev: str,
+    status: ExpStatus = ExpStatus.Success,
     sha_only=True,
     param_deps=False,
     running=None,
@@ -67,14 +76,19 @@ def _collect_experiment_commit(
             if not (out.is_metric or out.is_plot)
         }
 
-        res["queued"] = stash
-        if running is not None and exp_rev in running:
-            res["running"] = True
+        res["status"] = status.name
+        if status == ExpStatus.Running:
             res["executor"] = running[exp_rev].get("location")
         else:
-            res["running"] = False
             res["executor"] = None
-        if not stash:
+
+        if status == ExpStatus.Failed:
+            res["error"] = {
+                "msg": "Experiment run failed.",
+                "type": "",
+            }
+
+        if status not in {ExpStatus.Queued, ExpStatus.Failed}:
             vals = _gather_metrics(
                 repo, targets=None, rev=rev, recursive=False, onerror=onerror
             )
@@ -97,16 +111,28 @@ def _collect_experiment_commit(
 
 
 def _collect_experiment_branch(
-    res, repo, branch, baseline, onerror: Optional[Callable] = None, **kwargs
+    res,
+    repo,
+    branch,
+    baseline,
+    onerror: Optional[Callable] = None,
+    running=None,
+    **kwargs
 ):
     from dvc.scm import resolve_rev
 
     exp_rev = resolve_rev(repo.scm, branch)
     prev = None
     revs = list(repo.scm.branch_revs(exp_rev, baseline))
     for rev in revs:
+        status = ExpStatus.Running if rev in running else ExpStatus.Success
         collected_exp = _collect_experiment_commit(
-            repo, rev, onerror=onerror, **kwargs
+            repo,
+            rev,
+            onerror=onerror,
+            status=status,
+            running=running,
+            **kwargs
         )
         if len(revs) > 1:
             exp = {"checkpoint_tip": exp_rev}
@@ -135,6 +161,8 @@ def show(
     all_tags=False,
     revs: Union[List[str], str, None] = None,
     all_commits=False,
+    hide_queued=False,
+    hide_failed=False,
     sha_only=False,
     num=1,
     param_deps=False,
@@ -163,10 +191,12 @@ def show(
     running = repo.experiments.get_running_exps(fetch_refs=fetch_running)
 
     for rev in found_revs:
+        status = ExpStatus.Running if rev in running else ExpStatus.Success
         res[rev]["baseline"] = _collect_experiment_commit(
             repo,
             rev,
             sha_only=sha_only,
+            status=status,
             param_deps=param_deps,
             running=running,
             onerror=onerror,
@@ -202,7 +232,19 @@ def show(
             repo.experiments.tempdir_queue.iter_active(),
             repo.experiments.celery_queue.iter_active(),
             repo.experiments.celery_queue.iter_queued(),
+            repo.experiments.celery_queue.iter_failed(),
         ):
+            if isinstance(entry, QueueDoneResult):
+                entry = entry.entry
+                if hide_failed:
+                    continue
+                status = ExpStatus.Failed
+            elif entry.stash_rev in running:
+                status = ExpStatus.Running
+            else:
+                if hide_queued:
+                    continue
+                status = ExpStatus.Queued
             stash_rev = entry.stash_rev
             if entry.baseline_rev in found_revs:
                 if stash_rev not in running or not running[stash_rev].get(
@@ -212,7 +254,7 @@ def show(
                         repo,
                         stash_rev,
                         sha_only=sha_only,
-                        stash=stash_rev not in running,
+                        status=status,
                         param_deps=param_deps,
                         running=running,
                         onerror=onerror,
diff --git a/tests/func/experiments/test_show.py b/tests/func/experiments/test_show.py
@@ -63,8 +63,7 @@ def test_show_simple(tmp_dir, scm, dvc, exp_stage):
                 "metrics": {"metrics.yaml": {"data": {"foo": 1}}},
                 "outs": {},
                 "params": {"params.yaml": {"data": {"foo": 1}}},
-                "queued": False,
-                "running": False,
+                "status": "Success",
                 "executor": None,
                 "timestamp": None,
             }
@@ -97,8 +96,7 @@ def test_show_experiment(tmp_dir, scm, dvc, exp_stage, workspace):
             "metrics": {"metrics.yaml": {"data": {"foo": 1}}},
             "outs": {},
             "params": {"params.yaml": {"data": {"foo": 1}}},
-            "queued": False,
-            "running": False,
+            "status": "Success",
             "executor": None,
             "timestamp": timestamp,
             "name": "master",
@@ -130,7 +128,7 @@ def test_show_queued(tmp_dir, scm, dvc, exp_stage):
     assert len(results) == 2
     exp = results[exp_rev]["data"]
     assert exp["name"] == "test_name"
-    assert exp["queued"]
+    assert exp["status"] == "Queued"
     assert exp["params"]["params.yaml"] == {"data": {"foo": 2}}
 
     # test that only queued experiments for the current baseline are returned
@@ -145,10 +143,67 @@ def test_show_queued(tmp_dir, scm, dvc, exp_stage):
     results = dvc.experiments.show()[new_rev]
     assert len(results) == 2
     exp = results[exp_rev]["data"]
-    assert exp["queued"]
+    assert exp["status"] == "Queued"
     assert exp["params"]["params.yaml"] == {"data": {"foo": 3}}
 
 
+@pytest.mark.vscode
+def test_show_failed_experiment(tmp_dir, scm, dvc, failed_exp_stage):
+    baseline_rev = scm.get_rev()
+    timestamp = datetime.fromtimestamp(
+        scm.gitpython.repo.rev_parse(baseline_rev).committed_date
+    )
+
+    dvc.experiments.run(
+        failed_exp_stage.addressing, params=["foo=2"], queue=True
+    )
+    exp_rev = dvc.experiments.scm.resolve_rev(f"{CELERY_STASH}@{{0}}")
+    dvc.experiments.run(run_all=True)
+    experiments = dvc.experiments.show()[baseline_rev]
+
+    expected_baseline = {
+        "data": {
+            "deps": {
+                "copy.py": {
+                    "hash": ANY,
+                    "size": ANY,
+                    "nfiles": None,
+                }
+            },
+            "metrics": {},
+            "outs": {},
+            "params": {"params.yaml": {"data": {"foo": 1}}},
+            "status": "Success",
+            "executor": None,
+            "timestamp": timestamp,
+            "name": "master",
+        }
+    }
+
+    expected_failed = {
+        "data": {
+            "timestamp": ANY,
+            "params": {"params.yaml": {"data": {"foo": 2}}},
+            "deps": {"copy.py": {"hash": None, "size": None, "nfiles": None}},
+            "outs": {},
+            "status": "Failed",
+            "executor": None,
+            "error": {
+                "msg": "Experiment run failed.",
+                "type": "",
+            },
+        }
+    }
+
+    assert len(experiments) == 2
+    for rev, exp in experiments.items():
+        if rev == "baseline":
+            assert exp == expected_baseline
+        else:
+            assert rev == exp_rev
+            assert exp == expected_failed
+
+
 @pytest.mark.vscode
 @pytest.mark.parametrize("workspace", [True, False])
 def test_show_checkpoint(
@@ -339,12 +394,8 @@ def test_show_sort(tmp_dir, scm, dvc, exp_stage, caplog):
 
 
 @pytest.mark.vscode
-@pytest.mark.parametrize(
-    "status, running", [(TaskStatus.RUNNING, True), (TaskStatus.FAILED, False)]
-)
-def test_show_running_workspace(
-    tmp_dir, scm, dvc, exp_stage, capsys, status, running
-):
+@pytest.mark.parametrize("status", [TaskStatus.RUNNING, TaskStatus.FAILED])
+def test_show_running_workspace(tmp_dir, scm, dvc, exp_stage, capsys, status):
     pid_dir = os.path.join(dvc.tmp_dir, EXEC_TMP_DIR, EXEC_PID_DIR)
     info = make_executor_info(
         location=BaseExecutor.DEFAULT_LOCATION, status=status
@@ -357,7 +408,8 @@ def test_show_running_workspace(
     makedirs(os.path.dirname(pidfile), True)
     (tmp_dir / pidfile).dump_json(info.asdict())
 
-    print(dvc.experiments.show())
+    print(dvc.experiments.show().get("workspace"))
+
     assert dvc.experiments.show().get("workspace") == {
         "baseline": {
             "data": {
@@ -371,17 +423,20 @@ def test_show_running_workspace(
                 "metrics": {"metrics.yaml": {"data": {"foo": 1}}},
                 "params": {"params.yaml": {"data": {"foo": 1}}},
                 "outs": {},
-                "queued": False,
-                "running": True if running else False,
-                "executor": info.location if running else None,
+                "status": "Running"
+                if status == TaskStatus.RUNNING
+                else "Success",
+                "executor": info.location
+                if status == TaskStatus.RUNNING
+                else None,
                 "timestamp": None,
             }
         }
     }
     capsys.readouterr()
     assert main(["exp", "show", "--csv"]) == 0
     cap = capsys.readouterr()
-    if running:
+    if status == TaskStatus.RUNNING:
         assert "Running" in cap.out
         assert info.location in cap.out
 
@@ -428,10 +483,10 @@ def test_show_running_tempdir(tmp_dir, scm, dvc, exp_stage, mocker):
         [mocker.call(stash_rev, pidfile, True)],
     )
     exp_data = get_in(results, [baseline_rev, exp_rev, "data"])
-    assert exp_data["running"]
+    assert exp_data["status"] == "Running"
     assert exp_data["executor"] == info.location
 
-    assert not results["workspace"]["baseline"]["data"]["running"]
+    assert results["workspace"]["baseline"]["data"]["status"] == "Success"
 
 
 def test_show_running_celery(tmp_dir, scm, dvc, exp_stage, mocker):
@@ -453,11 +508,10 @@ def test_show_running_celery(tmp_dir, scm, dvc, exp_stage, mocker):
 
     results = dvc.experiments.show()
     exp_data = get_in(results, [baseline_rev, exp_rev, "data"])
-    assert not exp_data["queued"]
-    assert exp_data["running"]
+    assert exp_data["status"] == "Running"
     assert exp_data["executor"] == info.location
 
-    assert not results["workspace"]["baseline"]["data"]["running"]
+    assert results["workspace"]["baseline"]["data"]["status"] == "Success"
 
 
 def test_show_running_checkpoint(tmp_dir, scm, dvc, checkpoint_stage, mocker):
@@ -479,6 +533,11 @@ def test_show_running_checkpoint(tmp_dir, scm, dvc, checkpoint_stage, mocker):
         "iter_active",
         return_value=entries,
     )
+    mocker.patch.object(
+        dvc.experiments.celery_queue,
+        "iter_failed",
+        return_value=[],
+    )
     pidfile = queue.get_infofile_path(entries[0].stash_rev)
     info = make_executor_info(
         git_url="foo.git",
@@ -495,10 +554,10 @@ def test_show_running_checkpoint(tmp_dir, scm, dvc, checkpoint_stage, mocker):
     results = dvc.experiments.show()
 
     checkpoint_res = get_in(results, [baseline_rev, checkpoint_rev, "data"])
-    assert checkpoint_res["running"]
+    assert checkpoint_res["status"] == "Running"
     assert checkpoint_res["executor"] == info.location
 
-    assert not results["workspace"]["baseline"]["data"]["running"]
+    assert results["workspace"]["baseline"]["data"]["status"] == "Success"
 
 
 def test_show_with_broken_repo(tmp_dir, scm, dvc, exp_stage, caplog):
diff --git a/tests/unit/command/test_experiments.py b/tests/unit/command/test_experiments.py