
Use LR schedule for beta and epsilon #3940


Merged (5 commits) on May 8, 2020
1 change: 1 addition & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
- `beta` and `epsilon` in `PPO` are no longer decayed by default but follow the same schedule as learning rate. (#3940)
### Minor Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
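
To illustrate the entry above (a sketch, not part of the diff): the existing `learning_rate_schedule` setting now governs `beta` and `epsilon` as well. Assuming a PPO trainer configuration with the usual `learning_rate`, `beta`, `epsilon`, and `max_steps` keys, passed to the optimizer as a Python dict (values below are placeholders, not defaults from this PR):

```python
# Hypothetical PPO trainer_params excerpt -- values are illustrative only.
trainer_params = {
    "learning_rate": 3.0e-4,
    "learning_rate_schedule": "constant",  # after this PR, also applied to beta and epsilon
    "beta": 5.0e-3,   # entropy regularization strength
    "epsilon": 0.2,   # PPO clipping range
    "max_steps": 5.0e5,
}
# Previously, beta and epsilon were linearly decayed regardless of this setting;
# with "constant" they now stay at 5.0e-3 and 0.2 for the whole run.
```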
25 changes: 12 additions & 13 deletions ml-agents/mlagents/trainers/models.py
@@ -21,7 +21,7 @@ class EncoderType(Enum):
RESNET = "resnet"


class LearningRateSchedule(Enum):
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"

@@ -55,11 +55,12 @@ def create_global_steps():
return global_step, increment_step, steps_to_increment

@staticmethod
def create_learning_rate(
lr_schedule: LearningRateSchedule,
lr: float,
def create_schedule(
schedule: ScheduleType,
parameter: float,
global_step: tf.Tensor,
max_step: int,
min_value: float,
) -> tf.Tensor:
"""
Create a learning rate tensor.
@@ -69,17 +70,15 @@ def create_learning_rate(
:param max_step: The maximum number of steps in the training run.
:return: A Tensor containing the learning rate.
"""
if lr_schedule == LearningRateSchedule.CONSTANT:
learning_rate = tf.Variable(lr)
elif lr_schedule == LearningRateSchedule.LINEAR:
learning_rate = tf.train.polynomial_decay(
lr, global_step, max_step, 1e-10, power=1.0
if schedule == ScheduleType.CONSTANT:
parameter_rate = tf.Variable(parameter, trainable=False)
(Review comment from the PR author) @ervteng added trainable=False

elif schedule == ScheduleType.LINEAR:
parameter_rate = tf.train.polynomial_decay(
parameter, global_step, max_step, min_value, power=1.0
)
else:
raise UnityTrainerException(
"The learning rate schedule {} is invalid.".format(lr_schedule)
)
return learning_rate
raise UnityTrainerException("The schedule {} is invalid.".format(schedule))
return parameter_rate

@staticmethod
def scaled_init(scale):
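
For reference, a minimal pure-Python sketch of what the `LINEAR` branch computes: `tf.train.polynomial_decay` with `power=1.0` anneals the value linearly from `parameter` down to `min_value` over `max_step` steps. The helper name `linear_schedule` below is illustrative, not part of this PR:

```python
def linear_schedule(initial: float, min_value: float, step: int, max_step: int) -> float:
    """Linear anneal from `initial` to `min_value` over `max_step` steps
    (the curve tf.train.polynomial_decay produces with power=1.0)."""
    step = min(step, max_step)
    return (initial - min_value) * (1.0 - step / max_step) + min_value

# e.g. a learning rate of 3e-4 annealed over 500k steps:
#   linear_schedule(3e-4, 1e-10, 0, 500_000)       -> 3.0e-4
#   linear_schedule(3e-4, 1e-10, 250_000, 500_000) -> ~1.5e-4
#   linear_schedule(3e-4, 1e-10, 500_000, 500_000) -> 1.0e-10
```

The `CONSTANT` branch simply wraps the value in a non-trainable `tf.Variable`, so it stays at its initial value for the whole run.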
34 changes: 22 additions & 12 deletions ml-agents/mlagents/trainers/ppo/optimizer.py
@@ -2,7 +2,7 @@
import numpy as np
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
@@ -24,7 +24,7 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
super().__init__(policy, trainer_params)

lr = float(trainer_params["learning_rate"])
lr_schedule = LearningRateSchedule(
self._schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "linear")
)
h_size = int(trainer_params["hidden_units"])
@@ -47,6 +47,8 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
if self.policy.use_recurrent:
self.m_size = self.policy.m_size
@@ -63,8 +65,12 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
else:
self._create_dc_critic(h_size, num_layers, vis_encode_type)

self.learning_rate = ModelUtils.create_learning_rate(
lr_schedule, lr, self.policy.global_step, int(max_step)
self.learning_rate = ModelUtils.create_schedule(
self._schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy.total_log_probs,
@@ -84,6 +90,8 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
"policy_loss": self.abs_policy_loss,
"update_batch": self.update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)

@@ -232,19 +240,19 @@ def _create_losses(
)
advantage = tf.expand_dims(self.advantage, -1)

decay_epsilon = tf.train.polynomial_decay(
epsilon, self.policy.global_step, max_step, 0.1, power=1.0
self.decay_epsilon = ModelUtils.create_schedule(
self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
)
decay_beta = tf.train.polynomial_decay(
beta, self.policy.global_step, max_step, 1e-5, power=1.0
self.decay_beta = ModelUtils.create_schedule(
self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
)

value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-decay_epsilon,
decay_epsilon,
-self.decay_epsilon,
self.decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
@@ -263,7 +271,9 @@ def _create_losses(
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
tf.clip_by_value(
r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
)
* advantage
)
self.policy_loss = -tf.reduce_mean(
@@ -275,7 +285,7 @@ def _create_losses(
self.loss = (
self.policy_loss
+ 0.5 * self.value_loss
- decay_beta
- self.decay_beta
* tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
)

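
To make the change concrete, a small standalone sketch (not from the PR) of how `decay_epsilon` and `decay_beta` evaluate halfway through training under each schedule, using the floors from the hunk above (0.1 for epsilon, 1e-5 for beta):

```python
def scheduled_value(schedule: str, initial: float, min_value: float,
                    step: int, max_step: int) -> float:
    """Single-step mirror of ModelUtils.create_schedule (illustrative only)."""
    if schedule == "constant":
        return initial
    if schedule == "linear":
        step = min(step, max_step)
        return (initial - min_value) * (1.0 - step / max_step) + min_value
    raise ValueError("The schedule {} is invalid.".format(schedule))

max_step = 500_000
for schedule in ("constant", "linear"):
    eps = scheduled_value(schedule, 0.2, 0.1, max_step // 2, max_step)
    beta = scheduled_value(schedule, 5.0e-3, 1e-5, max_step // 2, max_step)
    print(schedule, eps, beta)
# Expected (approximately):
#   constant -> epsilon 0.2,  beta 0.005    (no decay)
#   linear   -> epsilon 0.15, beta ~0.0025  (halfway to the floors)
```

Because both tensors are now in `update_dict` and the stats mapping, the current epsilon and beta values should also be reported alongside `Policy/Learning Rate` as `Policy/Epsilon` and `Policy/Beta`.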
12 changes: 8 additions & 4 deletions ml-agents/mlagents/trainers/sac/optimizer.py
@@ -5,7 +5,7 @@

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
@@ -45,7 +45,7 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
with tf.variable_scope(""):
super().__init__(policy, trainer_params)
lr = float(trainer_params["learning_rate"])
lr_schedule = LearningRateSchedule(
lr_schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "constant")
)
self.policy = policy
@@ -111,8 +111,12 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
# The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
self.m_size = 3 * self.policy.m_size
self._create_inputs_and_outputs()
self.learning_rate = ModelUtils.create_learning_rate(
lr_schedule, lr, self.policy.global_step, int(max_step)
self.learning_rate = ModelUtils.create_schedule(
lr_schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy_network.q1_heads,
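
One detail visible in the two optimizer hunks (existing behavior, not changed here): PPO falls back to a "linear" schedule while SAC falls back to "constant", so SAC keeps a fixed learning rate unless the config says otherwise. A minimal sketch with a hypothetical config dict:

```python
from mlagents.trainers.models import ScheduleType

trainer_params = {"learning_rate": 3.0e-4}  # no explicit learning_rate_schedule

ppo_schedule = ScheduleType(trainer_params.get("learning_rate_schedule", "linear"))
sac_schedule = ScheduleType(trainer_params.get("learning_rate_schedule", "constant"))
print(ppo_schedule, sac_schedule)  # ScheduleType.LINEAR ScheduleType.CONSTANT
```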
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -475,9 +475,9 @@ def test_gail_visual_ppo(simple_record, use_discrete):
step_size=0.2,
)
override_vals = {
"max_steps": 750,
"max_steps": 1000,
"learning_rate": 3.0e-4,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1500},
"reward_signals": {
"gail": {
"strength": 1.0,