diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index 02942bcd29..76e8438586 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to
 #### com.unity.ml-agents (C#)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
 - `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
+- `beta` and `epsilon` in `PPO` are no longer decayed by default but follow the same schedule as learning rate. (#3940)
 ### Minor Changes
 #### com.unity.ml-agents (C#)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
diff --git a/ml-agents/mlagents/trainers/models.py b/ml-agents/mlagents/trainers/models.py
index b286919308..ccb2032aed 100644
--- a/ml-agents/mlagents/trainers/models.py
+++ b/ml-agents/mlagents/trainers/models.py
@@ -21,7 +21,7 @@ class EncoderType(Enum):
     RESNET = "resnet"


-class LearningRateSchedule(Enum):
+class ScheduleType(Enum):
     CONSTANT = "constant"
     LINEAR = "linear"

@@ -55,11 +55,12 @@ def create_global_steps():
         return global_step, increment_step, steps_to_increment

     @staticmethod
-    def create_learning_rate(
-        lr_schedule: LearningRateSchedule,
-        lr: float,
+    def create_schedule(
+        schedule: ScheduleType,
+        parameter: float,
         global_step: tf.Tensor,
         max_step: int,
+        min_value: float,
     ) -> tf.Tensor:
         """
         Create a learning rate tensor.
@@ -69,17 +70,15 @@ def create_learning_rate(
         :param max_step: The maximum number of steps in the training run.
         :return: A Tensor containing the learning rate.
         """
-        if lr_schedule == LearningRateSchedule.CONSTANT:
-            learning_rate = tf.Variable(lr)
-        elif lr_schedule == LearningRateSchedule.LINEAR:
-            learning_rate = tf.train.polynomial_decay(
-                lr, global_step, max_step, 1e-10, power=1.0
+        if schedule == ScheduleType.CONSTANT:
+            parameter_rate = tf.Variable(parameter, trainable=False)
+        elif schedule == ScheduleType.LINEAR:
+            parameter_rate = tf.train.polynomial_decay(
+                parameter, global_step, max_step, min_value, power=1.0
             )
         else:
-            raise UnityTrainerException(
-                "The learning rate schedule {} is invalid.".format(lr_schedule)
-            )
-        return learning_rate
+            raise UnityTrainerException("The schedule {} is invalid.".format(schedule))
+        return parameter_rate

     @staticmethod
     def scaled_init(scale):
diff --git a/ml-agents/mlagents/trainers/ppo/optimizer.py b/ml-agents/mlagents/trainers/ppo/optimizer.py
index 2151cf5707..969d612c77 100644
--- a/ml-agents/mlagents/trainers/ppo/optimizer.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer.py
@@ -2,7 +2,7 @@ import numpy as np
 from mlagents.tf_utils import tf

 from mlagents_envs.timers import timed
-from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
+from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
 from mlagents.trainers.buffer import AgentBuffer
@@ -24,7 +24,7 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
                 super().__init__(policy, trainer_params)

                 lr = float(trainer_params["learning_rate"])
-                lr_schedule = LearningRateSchedule(
+                self._schedule = ScheduleType(
                     trainer_params.get("learning_rate_schedule", "linear")
                 )
                 h_size = int(trainer_params["hidden_units"])
@@ -47,6 +47,8 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
                     "Losses/Value Loss": "value_loss",
                     "Losses/Policy Loss": "policy_loss",
                     "Policy/Learning Rate": "learning_rate",
+                    "Policy/Epsilon": "decay_epsilon",
+                    "Policy/Beta": "decay_beta",
                 }
                 if self.policy.use_recurrent:
                     self.m_size = self.policy.m_size
@@ -63,8 +65,12 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
                 else:
                     self._create_dc_critic(h_size, num_layers, vis_encode_type)

-                self.learning_rate = ModelUtils.create_learning_rate(
-                    lr_schedule, lr, self.policy.global_step, int(max_step)
+                self.learning_rate = ModelUtils.create_schedule(
+                    self._schedule,
+                    lr,
+                    self.policy.global_step,
+                    int(max_step),
+                    min_value=1e-10,
                 )
                 self._create_losses(
                     self.policy.total_log_probs,
@@ -84,6 +90,8 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
                         "policy_loss": self.abs_policy_loss,
                         "update_batch": self.update_batch,
                         "learning_rate": self.learning_rate,
+                        "decay_epsilon": self.decay_epsilon,
+                        "decay_beta": self.decay_beta,
                     }
                 )

@@ -232,19 +240,19 @@ def _create_losses(
         )
         advantage = tf.expand_dims(self.advantage, -1)

-        decay_epsilon = tf.train.polynomial_decay(
-            epsilon, self.policy.global_step, max_step, 0.1, power=1.0
+        self.decay_epsilon = ModelUtils.create_schedule(
+            self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
         )
-        decay_beta = tf.train.polynomial_decay(
-            beta, self.policy.global_step, max_step, 1e-5, power=1.0
+        self.decay_beta = ModelUtils.create_schedule(
+            self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
         )

         value_losses = []
         for name, head in value_heads.items():
             clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
                 tf.reduce_sum(head, axis=1) - self.old_values[name],
-                -decay_epsilon,
-                decay_epsilon,
+                -self.decay_epsilon,
+                self.decay_epsilon,
             )
             v_opt_a = tf.squared_difference(
                 self.returns_holders[name], tf.reduce_sum(head, axis=1)
@@ -263,7 +271,9 @@ def _create_losses(
         r_theta = tf.exp(probs - old_probs)
         p_opt_a = r_theta * advantage
         p_opt_b = (
-            tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
+            tf.clip_by_value(
+                r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
+            )
             * advantage
         )
         self.policy_loss = -tf.reduce_mean(
@@ -275,7 +285,7 @@ def _create_losses(
         self.loss = (
             self.policy_loss
             + 0.5 * self.value_loss
-            - decay_beta
+            - self.decay_beta
             * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
         )

diff --git a/ml-agents/mlagents/trainers/sac/optimizer.py b/ml-agents/mlagents/trainers/sac/optimizer.py
index 72494f4fe9..408e220cbf 100644
--- a/ml-agents/mlagents/trainers/sac/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac/optimizer.py
@@ -5,7 +5,7 @@
 from mlagents_envs.logging_util import get_logger

 from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
-from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
+from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils
 from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.buffer import AgentBuffer
@@ -45,7 +45,7 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
             with tf.variable_scope(""):
                 super().__init__(policy, trainer_params)
                 lr = float(trainer_params["learning_rate"])
-                lr_schedule = LearningRateSchedule(
+                lr_schedule = ScheduleType(
                     trainer_params.get("learning_rate_schedule", "constant")
                 )
                 self.policy = policy
@@ -111,8 +111,12 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
                     # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
                     self.m_size = 3 * self.policy.m_size
                 self._create_inputs_and_outputs()
-                self.learning_rate = ModelUtils.create_learning_rate(
-                    lr_schedule, lr, self.policy.global_step, int(max_step)
+                self.learning_rate = ModelUtils.create_schedule(
+                    lr_schedule,
+                    lr,
+                    self.policy.global_step,
+                    int(max_step),
+                    min_value=1e-10,
                 )
                 self._create_losses(
                     self.policy_network.q1_heads,
diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
index f3d9703b4e..2a5a8b0dc3 100644
--- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py
+++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -475,9 +475,9 @@ def test_gail_visual_ppo(simple_record, use_discrete):
         step_size=0.2,
     )
     override_vals = {
-        "max_steps": 750,
+        "max_steps": 1000,
         "learning_rate": 3.0e-4,
-        "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
+        "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1500},
         "reward_signals": {
             "gail": {
                 "strength": 1.0,
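For context, a minimal plain-Python sketch of the behavior this patch unifies (illustrative only; the helper name `linear_schedule` and the example hyperparameter values are assumptions, not part of the patch): `ModelUtils.create_schedule` builds either a fixed, non-trainable `tf.Variable` for the `constant` schedule or `tf.train.polynomial_decay(..., power=1.0)` for the `linear` schedule, so PPO's `epsilon`, `beta`, and the learning rate now all anneal toward their respective `min_value` according to the configured `learning_rate_schedule`.

def linear_schedule(initial: float, min_value: float, step: int, max_step: int) -> float:
    # Plain-Python mirror of tf.train.polynomial_decay with power=1.0 (linear anneal).
    progress = min(step, max_step) / max_step
    return (initial - min_value) * (1.0 - progress) + min_value

# Halfway through a 500k-step run, with typical PPO config values
# (epsilon=0.2, beta=5.0e-3, learning_rate=3.0e-4):
print(linear_schedule(0.2, 0.1, step=250_000, max_step=500_000))       # epsilon -> 0.15
print(linear_schedule(5.0e-3, 1e-5, step=250_000, max_step=500_000))   # beta    -> ~2.5e-3
print(linear_schedule(3.0e-4, 1e-10, step=250_000, max_step=500_000))  # lr      -> ~1.5e-4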