
Use LR schedule for beta and epsilon #3940


Merged (5 commits) on May 8, 2020
1 change: 1 addition & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
- `beta` and `epsilon` in `PPO` are no longer decayed by default but follow the same schedule as learning rate. (#3940)
### Minor Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
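
To illustrate the entry above (a sketch, not part of the diff): the existing `learning_rate_schedule` setting now governs `beta` and `epsilon` as well. Assuming a PPO trainer configuration with the usual `learning_rate`, `beta`, `epsilon`, and `max_steps` keys, passed to the optimizer as a Python dict (values below are placeholders, not defaults from this PR):

```python
# Hypothetical PPO trainer_params excerpt -- values are illustrative only.
trainer_params = {
    "learning_rate": 3.0e-4,
    "learning_rate_schedule": "constant",  # after this PR, also applied to beta and epsilon
    "beta": 5.0e-3,   # entropy regularization strength
    "epsilon": 0.2,   # PPO clipping range
    "max_steps": 5.0e5,
}
# Previously, beta and epsilon were linearly decayed regardless of this setting;
# with "constant" they now stay at 5.0e-3 and 0.2 for the whole run.
```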
25 changes: 12 additions & 13 deletions ml-agents/mlagents/trainers/models.py
@@ -21,7 +21,7 @@ class EncoderType(Enum):
RESNET = "resnet"


class LearningRateSchedule(Enum):
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"

@@ -55,11 +55,12 @@ def create_global_steps():
return global_step, increment_step, steps_to_increment

@staticmethod
def create_learning_rate(
lr_schedule: LearningRateSchedule,
lr: float,
def create_schedule(
schedule: ScheduleType,
parameter: float,
global_step: tf.Tensor,
max_step: int,
min_value: float,
) -> tf.Tensor:
"""
Create a learning rate tensor.
@@ -69,17 +70,15 @@ def create_learning_rate(
:param max_step: The maximum number of steps in the training run.
:return: A Tensor containing the learning rate.
"""
if lr_schedule == LearningRateSchedule.CONSTANT:
learning_rate = tf.Variable(lr)
elif lr_schedule == LearningRateSchedule.LINEAR:
learning_rate = tf.train.polynomial_decay(
lr, global_step, max_step, 1e-10, power=1.0
if schedule == ScheduleType.CONSTANT:
parameter_rate = tf.Variable(parameter, trainable=False)
(Review comment from the PR author) @ervteng added trainable=False

elif schedule == ScheduleType.LINEAR:
parameter_rate = tf.train.polynomial_decay(
parameter, global_step, max_step, min_value, power=1.0
)
else:
raise UnityTrainerException(
"The learning rate schedule {} is invalid.".format(lr_schedule)
)
return learning_rate
raise UnityTrainerException("The schedule {} is invalid.".format(schedule))
return parameter_rate

@staticmethod
def scaled_init(scale):
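
For reference, a minimal pure-Python sketch of what the `LINEAR` branch computes: `tf.train.polynomial_decay` with `power=1.0` anneals the value linearly from `parameter` down to `min_value` over `max_step` steps. The helper name `linear_schedule` below is illustrative, not part of this PR:

```python
def linear_schedule(initial: float, min_value: float, step: int, max_step: int) -> float:
    """Linear anneal from `initial` to `min_value` over `max_step` steps
    (the curve tf.train.polynomial_decay produces with power=1.0)."""
    step = min(step, max_step)
    return (initial - min_value) * (1.0 - step / max_step) + min_value

# e.g. a learning rate of 3e-4 annealed over 500k steps:
#   linear_schedule(3e-4, 1e-10, 0, 500_000)       -> 3.0e-4
#   linear_schedule(3e-4, 1e-10, 250_000, 500_000) -> ~1.5e-4
#   linear_schedule(3e-4, 1e-10, 500_000, 500_000) -> 1.0e-10
```

The `CONSTANT` branch simply wraps the value in a non-trainable `tf.Variable`, so it stays at its initial value for the whole run.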
34 changes: 22 additions & 12 deletions ml-agents/mlagents/trainers/ppo/optimizer.py
@@ -2,7 +2,7 @@
import numpy as np
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
@@ -24,7 +24,7 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
super().__init__(policy, trainer_params)

lr = float(trainer_params["learning_rate"])
lr_schedule = LearningRateSchedule(
self._schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "linear")
)
h_size = int(trainer_params["hidden_units"])
@@ -47,6 +47,8 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
if self.policy.use_recurrent:
self.m_size = self.policy.m_size
@@ -63,8 +65,12 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
else:
self._create_dc_critic(h_size, num_layers, vis_encode_type)

self.learning_rate = ModelUtils.create_learning_rate(
lr_schedule, lr, self.policy.global_step, int(max_step)
self.learning_rate = ModelUtils.create_schedule(
self._schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy.total_log_probs,
@@ -84,6 +90,8 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
"policy_loss": self.abs_policy_loss,
"update_batch": self.update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)

@@ -232,19 +240,19 @@ def _create_losses(
)
advantage = tf.expand_dims(self.advantage, -1)

decay_epsilon = tf.train.polynomial_decay(
epsilon, self.policy.global_step, max_step, 0.1, power=1.0
self.decay_epsilon = ModelUtils.create_schedule(
self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
)
decay_beta = tf.train.polynomial_decay(
beta, self.policy.global_step, max_step, 1e-5, power=1.0
self.decay_beta = ModelUtils.create_schedule(
self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
)

value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-decay_epsilon,
decay_epsilon,
-self.decay_epsilon,
self.decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
@@ -263,7 +271,9 @@ def _create_losses(
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
tf.clip_by_value(
r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
)
* advantage
)
self.policy_loss = -tf.reduce_mean(
@@ -275,7 +285,7 @@ def _create_losses(
self.loss = (
self.policy_loss
+ 0.5 * self.value_loss
- decay_beta
- self.decay_beta
* tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
)

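
To make the change concrete, a small standalone sketch (not from the PR) of how `decay_epsilon` and `decay_beta` evaluate halfway through training under each schedule, using the floors from the hunk above (0.1 for epsilon, 1e-5 for beta):

```python
def scheduled_value(schedule: str, initial: float, min_value: float,
                    step: int, max_step: int) -> float:
    """Single-step mirror of ModelUtils.create_schedule (illustrative only)."""
    if schedule == "constant":
        return initial
    if schedule == "linear":
        step = min(step, max_step)
        return (initial - min_value) * (1.0 - step / max_step) + min_value
    raise ValueError("The schedule {} is invalid.".format(schedule))

max_step = 500_000
for schedule in ("constant", "linear"):
    eps = scheduled_value(schedule, 0.2, 0.1, max_step // 2, max_step)
    beta = scheduled_value(schedule, 5.0e-3, 1e-5, max_step // 2, max_step)
    print(schedule, eps, beta)
# Expected (approximately):
#   constant -> epsilon 0.2,  beta 0.005    (no decay)
#   linear   -> epsilon 0.15, beta ~0.0025  (halfway to the floors)
```

Because both tensors are now in `update_dict` and the stats mapping, the current epsilon and beta values should also be reported alongside `Policy/Learning Rate` as `Policy/Epsilon` and `Policy/Beta`.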
12 changes: 8 additions & 4 deletions ml-agents/mlagents/trainers/sac/optimizer.py
@@ -5,7 +5,7 @@

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
@@ -45,7 +45,7 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
with tf.variable_scope(""):
super().__init__(policy, trainer_params)
lr = float(trainer_params["learning_rate"])
lr_schedule = LearningRateSchedule(
lr_schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "constant")
)
self.policy = policy
@@ -111,8 +111,12 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
# The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
self.m_size = 3 * self.policy.m_size
self._create_inputs_and_outputs()
self.learning_rate = ModelUtils.create_learning_rate(
lr_schedule, lr, self.policy.global_step, int(max_step)
self.learning_rate = ModelUtils.create_schedule(
lr_schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy_network.q1_heads,
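
One detail visible in the two optimizer hunks (existing behavior, not changed here): PPO falls back to a "linear" schedule while SAC falls back to "constant", so SAC keeps a fixed learning rate unless the config says otherwise. A minimal sketch with a hypothetical config dict:

```python
from mlagents.trainers.models import ScheduleType

trainer_params = {"learning_rate": 3.0e-4}  # no explicit learning_rate_schedule

ppo_schedule = ScheduleType(trainer_params.get("learning_rate_schedule", "linear"))
sac_schedule = ScheduleType(trainer_params.get("learning_rate_schedule", "constant"))
print(ppo_schedule, sac_schedule)  # ScheduleType.LINEAR ScheduleType.CONSTANT
```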
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -475,9 +475,9 @@ def test_gail_visual_ppo(simple_record, use_discrete):
step_size=0.2,
)
override_vals = {
"max_steps": 750,
"max_steps": 1000,
"learning_rate": 3.0e-4,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1500},
"reward_signals": {
"gail": {
"strength": 1.0,