Enable mypy analysis for all files. #593
Changes from all commits
First changed file:
@@ -7,7 +7,7 @@
 import enum
 import itertools
 from collections.abc import Mapping
-from typing import Dict, Iterable, List, Optional, cast
+from typing import Any, Dict, Iterable, List, Optional, cast

 import numpy as np
 from gym.spaces.utils import flatten
@@ -226,7 +226,7 @@ def set_demonstrations(self, demonstrations: base.AnyTransitions) -> None:
                None: np.concatenate(list(self.transitions.values()), axis=0),
            }

-    def train(self):
+    def train(self) -> None:
        """Fits the density model to demonstration data `self.transitions`."""
        # if requested, we'll scale demonstration transitions so that they have
        # zero mean and unit variance (i.e. all components are equally important)
@@ -343,7 +343,7 @@ def __call__(
        rew_array = np.asarray(rew_list, dtype="float32")
        return rew_array

-    def train_policy(self, n_timesteps: int = int(1e6), **kwargs):
+    def train_policy(self, n_timesteps: int = int(1e6), **kwargs: Any) -> None:
Review comment: We could try to be more specific than this, but ParamSpec is not (yet) a very well supported feature by most type checkers, and it might not be worth the effort of typing it.

Review comment: I've not used

Review comment: I meant for Any, not for None haha. The problem with this is that any subtype has to take any kwarg in the signature, but you only really want it to take the kwargs that
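As a hedged illustration of the ParamSpec idea mentioned above (not code from this PR; the helper name is made up), a wrapper can forward another callable's full signature so mypy checks keyword arguments against the wrapped function instead of accepting `Any`:

```python
from typing import Callable, TypeVar

from typing_extensions import ParamSpec  # typing.ParamSpec on Python >= 3.10

P = ParamSpec("P")
T = TypeVar("T")


def forwards_signature(inner: Callable[P, T]) -> Callable[P, T]:
    """Wraps ``inner`` while preserving its parameter types for mypy."""

    def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
        # Unknown or mistyped kwargs are rejected at type-check time,
        # instead of being swallowed by ``**kwargs: Any``.
        return inner(*args, **kwargs)

    return wrapper
```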
        """Train the imitation policy for a given number of timesteps.

        Args:
Second changed file:
@@ -7,7 +7,7 @@
 """
 import collections
 import warnings
-from typing import Any, Iterable, List, Mapping, Optional, Tuple, Type, Union
+from typing import Any, Iterable, List, Mapping, NoReturn, Optional, Tuple, Type, Union

 import gym
 import numpy as np
@@ -43,9 +43,14 @@ def mce_partition_fh(
        (V, Q, \pi) corresponding to the soft values, Q-values and MCE policy.
        V is a 2d array, indexed V[t,s]. Q is a 3d array, indexed Q[t,s,a].
        \pi is a 3d array, indexed \pi[t,s,a].
+
+    Raises:
+        ValueError: if ``env.horizon`` is None (infinite horizon).
    """
    # shorthand
    horizon = env.horizon
+    if horizon is None:
+        raise ValueError("Only finite-horizon environments are supported.")
    n_states = env.state_dim
    n_actions = env.action_dim
    T = env.transition_matrix
@@ -99,9 +104,14 @@ def mce_occupancy_measures(
        ``(env.horizon, env.n_states)`` and records the probability of being in a
        given state at a given timestep. ``Dcum`` is of shape ``(env.n_states,)``
        and records the expected discounted number of times each state is visited.
+
+    Raises:
+        ValueError: if ``env.horizon`` is None (infinite horizon).
    """
    # shorthand
    horizon = env.horizon
+    if horizon is None:
+        raise ValueError("Only finite-horizon environments are supported.")
    n_states = env.state_dim
    n_actions = env.action_dim
    T = env.transition_matrix
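These guards are not only runtime validation: assuming `env.horizon` is annotated as `Optional[int]`, raising inside the `if horizon is None` branch also narrows the variable to `int`, so mypy accepts the arithmetic that follows. A minimal standalone sketch (the function name is made up, not imitation's API):

```python
from typing import Optional

import numpy as np


def allocate_values(horizon: Optional[int], n_states: int) -> np.ndarray:
    if horizon is None:
        raise ValueError("Only finite-horizon environments are supported.")
    # After the guard mypy treats ``horizon`` as ``int``, so using it as an
    # array dimension type-checks.
    return np.zeros((horizon, n_states))
```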
@@ -150,7 +160,7 @@ def __init__(
        action_space: gym.Space,
        pi: np.ndarray,
        rng: np.random.Generator,
-    ):
+    ) -> None:
        """Builds TabularPolicy.

        Args:
@@ -182,7 +192,7 @@ def forward(
        self,
        observation: th.Tensor,
        deterministic: bool = False,
-    ):
+    ) -> NoReturn:
Review comment: Interesting. Is this type not incompatible with the superclass? How does that work?

Review comment: I think it works because

Review comment: Type theory is quite cool. I remember attending a talk in Cambridge recently on some people working on re-constructing the foundations of mathematics using a flavor of type theory.
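For context, a hedged sketch of why this override type-checks (the class names below are invented for illustration): return types are covariant in overrides, and `NoReturn` is the bottom type, i.e. a subtype of every type, so mypy accepts it regardless of what the base class's `forward` is declared to return.

```python
from typing import NoReturn

import torch as th


class BasePolicy:
    def forward(self, observation: th.Tensor) -> th.Tensor:
        return observation


class NonDifferentiablePolicy(BasePolicy):
    def forward(self, observation: th.Tensor) -> NoReturn:
        # NoReturn is a subtype of th.Tensor (and of every other type), so
        # this override is compatible with the base signature even though the
        # method never returns.
        raise NotImplementedError("Should never be called.")
```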
        raise NotImplementedError("Should never be called.")  # pragma: no cover

    def predict(
@@ -269,7 +279,7 @@ def __init__(
        log_interval: Optional[int] = 100,
        *,
        custom_logger: Optional[imit_logger.HierarchicalLogger] = None,
-    ):
+    ) -> None:
        r"""Creates MCE IRL.

        Args:
@@ -297,6 +307,9 @@ def __init__(
            log_interval: how often to log current loss stats (using `logging`).
                None to disable.
            custom_logger: Where to log to; if None (default), creates a new logger.
+
+        Raises:
+            ValueError: if the env horizon is not finite (or an integer).
        """
        self.discount = discount
        self.env = env
@@ -318,6 +331,8 @@ def __init__(
        # Initialize policy to be uniform random. We don't use this for MCE IRL
        # training, but it gives us something to return at all times with `policy`
        # property, similar to other algorithms.
+        if self.env.horizon is None:
+            raise ValueError("Only finite-horizon environments are supported.")
        ones = np.ones((self.env.horizon, self.env.state_dim, self.env.action_dim))
        uniform_pi = ones / self.env.action_dim
        self._policy = TabularPolicy(
@@ -369,6 +384,7 @@ def _set_demo_from_obs(
        )

        # Normalize occupancy measure estimates
+        assert self.env.horizon is not None
        self.demo_state_om *= (self.env.horizon + 1) / self.demo_state_om.sum()

    def set_demonstrations(self, demonstrations: MCEDemonstrations) -> None:
@@ -381,9 +397,9 @@ def set_demonstrations(self, demonstrations: MCEDemonstrations) -> None:
        # Demonstrations are either trajectories or transitions;
        # we must compute occupancy measure from this.
        if isinstance(demonstrations, Iterable):
-            first_item, demonstrations = util.get_first_iter_element(demonstrations)
+            first_item, demonstrations_it = util.get_first_iter_element(demonstrations)

            if isinstance(first_item, types.Trajectory):
-                self._set_demo_from_trajectories(demonstrations)
+                self._set_demo_from_trajectories(demonstrations_it)
                return

        # Demonstrations are from some kind of transitions-like object. This does

(Rocamonde marked this conversation as resolved.)
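A hedged guess at the motivation for the new name, using simplified stand-in types rather than imitation's: under mypy, re-binding a parameter to a value of a different type is an "incompatible types in assignment" error, so the iterator returned by the helper gets its own variable.

```python
from typing import Iterator, Sequence, Tuple


def first_and_rest(items: Sequence[int]) -> Tuple[int, Iterator[int]]:
    it = iter(items)
    return next(it), it


def consume(demonstrations: Sequence[int]) -> None:
    # Writing ``first_item, demonstrations = first_and_rest(demonstrations)``
    # would re-bind ``demonstrations`` (declared Sequence[int]) to an
    # Iterator[int], which mypy rejects; a fresh name sidesteps that.
    first_item, demonstrations_it = first_and_rest(demonstrations)
    print(first_item, list(demonstrations_it))
```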
@@ -427,7 +443,7 @@ def set_demonstrations(self, demonstrations: MCEDemonstrations) -> None:
                f"Unsupported demonstration type {type(demonstrations)}",
            )

-    def _train_step(self, obs_mat: th.Tensor):
+    def _train_step(self, obs_mat: th.Tensor) -> Tuple[np.ndarray, np.ndarray]:
        self.optimizer.zero_grad()

        # get reward predicted for each state by current model, & compute
@@ -487,9 +503,11 @@ def train(self, max_iter: int = 1000) -> np.ndarray:
            predicted_r_np, visitations = self._train_step(torch_obs_mat)

            # these are just for termination conditions & debug logging
-            grad_norm = util.tensor_iter_norm(
-                p.grad for p in self.reward_net.parameters()
-            ).item()
+            grads = []
+            for p in self.reward_net.parameters():
+                assert p.grad is not None  # for type checker
+                grads.append(p.grad)
+            grad_norm = util.tensor_iter_norm(grads).item()
            linf_delta = np.max(np.abs(self.demo_state_om - visitations))

            if self.log_interval is not None and 0 == (t % self.log_interval):

(Rocamonde marked this conversation as resolved.)
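The assert here serves the same narrowing purpose as the horizon guards: `Parameter.grad` is typed `Optional[Tensor]`, so it has to be narrowed before the gradients can be collected as plain tensors. A hedged standalone sketch (`collect_grads` is a made-up helper, not part of imitation):

```python
from typing import List

import torch as th


def collect_grads(module: th.nn.Module) -> List[th.Tensor]:
    grads: List[th.Tensor] = []
    for p in module.parameters():
        # ``p.grad`` is Optional[th.Tensor]; the assert narrows it to
        # th.Tensor so appending to a List[th.Tensor] type-checks.
        assert p.grad is not None
        grads.append(p.grad)
    return grads
```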