Skip to content

Commit d713d93

Browse files
ianyfan and dan-pandori committed
Add extra checks for horizon type
Co-authored-by: Daniel Pandori <[email protected]>
1 parent 0e950b8 commit d713d93

File tree

3 files changed

+27
-6
lines changed

3 files changed

+27
-6
lines changed

src/imitation/algorithms/mce_irl.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,14 @@ def mce_partition_fh(
5454
(V, Q, \pi) corresponding to the soft values, Q-values and MCE policy.
5555
V is a 2d array, indexed V[t,s]. Q is a 3d array, indexed Q[t,s,a].
5656
\pi is a 3d array, indexed \pi[t,s,a].
57+
58+
Raises:
59+
ValueError: if the horizon is not finite (or an integer).
5760
"""
5861
# shorthand
59-
horizon = int(env.horizon)
62+
if not isinstance(env.horizon, int):
63+
raise ValueError("Only finite (integer) horizons are supported.")
64+
horizon = env.horizon
6065
n_states = env.state_dim
6166
n_actions = env.action_dim
6267
T = env.transition_matrix
@@ -110,9 +115,14 @@ def mce_occupancy_measures(
110115
``(env.horizon, env.n_states)`` and records the probability of being in a
111116
given state at a given timestep. ``Dcum`` is of shape ``(env.n_states,)``
112117
and records the expected discounted number of times each state is visited.
118+
119+
Raises:
120+
ValueError: if the horizon is not finite (or an integer).
113121
"""
114122
# shorthand
115-
horizon = int(env.horizon)
123+
if not isinstance(env.horizon, int):
124+
raise ValueError("Only finite (integer) horizons are supported.")
125+
horizon = env.horizon
116126
n_states = env.state_dim
117127
n_actions = env.action_dim
118128
T = env.transition_matrix
@@ -308,6 +318,9 @@ def __init__(
308318
log_interval: how often to log current loss stats (using `logging`).
309319
None to disable.
310320
custom_logger: Where to log to; if None (default), creates a new logger.
321+
322+
Raises:
323+
ValueError: if the env horizon is not finite (or an integer).
311324
"""
312325
self.discount = discount
313326
self.env = env
@@ -329,7 +342,9 @@ def __init__(
329342
# Initialize policy to be uniform random. We don't use this for MCE IRL
330343
# training, but it gives us something to return at all times with `policy`
331344
# property, similar to other algorithms.
332-
ones = np.ones((int(self.env.horizon), self.env.state_dim, self.env.action_dim))
345+
if not isinstance(self.env.horizon, int):
346+
raise ValueError("Only finite (integer) horizons are supported.")
347+
ones = np.ones((self.env.horizon, self.env.state_dim, self.env.action_dim))
333348
uniform_pi = ones / self.env.action_dim
334349
self._policy = TabularPolicy(
335350
state_space=self.env.state_space,
@@ -380,6 +395,7 @@ def _set_demo_from_obs(
380395
)
381396

382397
# Normalize occupancy measure estimates
398+
assert isinstance(self.env.horizon, int)
383399
self.demo_state_om *= (self.env.horizon + 1) / self.demo_state_om.sum()
384400

385401
def set_demonstrations(self, demonstrations: MCEDemonstrations) -> None:

src/imitation/algorithms/preference_comparisons.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -641,8 +641,10 @@ def __call__(
641641

642642
# we need two fragments for each comparison
643643
for _ in range(2 * num_pairs):
644-
p = np.array(weights) / sum(weights)
645-
traj = self.rng.choice(trajectories, p=p) # type: ignore[arg-type]
644+
traj = self.rng.choice(
645+
trajectories, # type: ignore[arg-type]
646+
p=np.array(weights) / sum(weights),
647+
)
646648
n = len(traj)
647649
start = self.rng.integers(0, n - fragment_length, endpoint=True)
648650
end = start + fragment_length

tests/algorithms/test_mce_irl.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,4 +417,7 @@ def test_mce_irl_reasonable_mdp(
417417
stats = rollout.rollout_stats(trajs)
418418
if discount > 0.0: # skip check when discount==0.0 (random policy)
419419
eps = 1e-6 # avoid test failing due to rounding error
420-
assert stats["return_mean"] >= (mdp.horizon - 1) * 2 * 0.8 - eps
420+
assert (
421+
isinstance(mdp.horizon, int)
422+
and stats["return_mean"] >= (mdp.horizon - 1) * 2 * 0.8 - eps
423+
)

0 commit comments

Comments (0)