From ea4718318dbd352d48d01f0ffc7fc04afcffe17b Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 2 Jul 2020 18:52:31 -0700 Subject: [PATCH 01/16] Non-working commit of SAC --- ml-agents/mlagents/trainers/models_torch.py | 68 +++- .../mlagents/trainers/policy/torch_policy.py | 2 + ml-agents/mlagents/trainers/ppo/trainer.py | 8 - .../mlagents/trainers/sac/optimizer_torch.py | 370 ++++++++++++++++++ ml-agents/mlagents/trainers/sac/trainer.py | 62 ++- .../mlagents/trainers/trainer/rl_trainer.py | 12 + 6 files changed, 498 insertions(+), 24 deletions(-) create mode 100644 ml-agents/mlagents/trainers/sac/optimizer_torch.py diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py index 2832da06a9..6375cbd626 100644 --- a/ml-agents/mlagents/trainers/models_torch.py +++ b/ml-agents/mlagents/trainers/models_torch.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Callable, NamedTuple +from typing import Callable, NamedTuple, List import torch from torch import nn @@ -10,6 +10,8 @@ ) from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.models import EncoderType +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.brain import CameraResolution ActivationFunction = Callable[[torch.Tensor], torch.Tensor] EncoderFunction = Callable[ @@ -131,6 +133,65 @@ def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1): return embedding, memories +class QNetwork(nn.Module): + def __init__( + self, + stream_names: List[str], + vector_sizes: List[int], + visual_sizes: List[CameraResolution], + network_settings: NetworkSettings, + act_type: ActionType, + act_size: List[int], + ): + super(QNetwork, self).__init__() + self.network_body = NetworkBody( + vector_sizes, + visual_sizes, + network_settings.hidden_units, + network_settings.normalize, + network_settings.num_layers, + network_settings.memory.memory_size + if network_settings.memory is not None + else 0, + network_settings.vis_encode_type, + network_settings.memory is not None, + ) + self.stream_names = stream_names + + +class ContinuousQNetwork(QNetwork): + def __init__( + self, + stream_names: List[str], + vector_sizes: List[int], + visual_sizes: List[CameraResolution], + network_settings: NetworkSettings, + act_type: ActionType, + act_size: List[int], + ): + super(ContinuousQNetwork, self).__init__( + stream_names, + vector_sizes, + visual_sizes, + network_settings, + act_type, + act_size, + ) + self.q_heads = ValueHeads( + self.stream_names, network_settings.hidden_units + sum(act_size) + ) + + def forward( + self, + vec_inputs: List[torch.Tensor], + vis_inputs: List[torch.Tensor], + actions: torch.Tensor, + ): + embedding, _ = self.network_body(vec_inputs, vis_inputs) + concat_embed = torch.cat([embedding, actions], axis=-1) + return self.q_heads(concat_embed) + + class ActorCritic(nn.Module): def __init__( self, @@ -146,6 +207,7 @@ def __init__( use_lstm, stream_names, separate_critic, + conditional_sigma=False, ): super(ActorCritic, self).__init__() self.act_type = ActionType.from_str(act_type) @@ -170,7 +232,9 @@ def __init__( else: embedding_size = h_size if self.act_type == ActionType.CONTINUOUS: - self.distribution = GaussianDistribution(embedding_size, act_size[0]) + self.distribution = GaussianDistribution( + embedding_size, act_size[0], conditional_sigma=conditional_sigma + ) else: self.distribution = MultiCategoricalDistribution(embedding_size, act_size) if separate_critic: diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index d3fb2db022..70c272758f 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -54,6 +54,7 @@ def __init__( self.global_step = 0 self.m_size = 0 self.model_path = model_path + self.network_settings = trainer_settings.network_settings self.act_size = brain.vector_action_space_size self.act_type = brain.vector_action_space_type @@ -115,6 +116,7 @@ def __init__( vis_encode_type=trainer_settings.network_settings.vis_encode_type, stream_names=reward_signal_names, separate_critic=self.use_continuous_act, + conditional_sigma=self.condition_sigma_on_obs, ) def split_decision_step(self, decision_requests): diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 19519aaeb8..3e60bd9011 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -194,14 +194,6 @@ def _update_policy(self): self._clear_update_buffer() return True - def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters - ) -> Policy: - if self.framework == "torch": - return self.create_torch_policy(parsed_behavior_id, brain_parameters) - else: - return self.create_tf_policy(parsed_behavior_id, brain_parameters) - def create_tf_policy( self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters ) -> NNPolicy: diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py new file mode 100644 index 0000000000..dfb3d6c508 --- /dev/null +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -0,0 +1,370 @@ +import numpy as np +from typing import Dict, List, Mapping, cast, Tuple +import torch +from torch import nn + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.brain import CameraResolution +from mlagents.trainers.models_torch import Critic, ContinuousQNetwork, ActionType +from mlagents.trainers.buffer import AgentBuffer +from mlagents_envs.timers import timed +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import TrainerSettings, SACSettings + +EPSILON = 1e-6 # Small value to avoid divide by zero + +logger = get_logger(__name__) + +POLICY_SCOPE = "" +TARGET_SCOPE = "target_network" + + +class TorchSACOptimizer(TorchOptimizer): + class PolicyValueNetwork(nn.Module): + def __init__( + self, + stream_names: List[str], + vector_sizes: List[int], + visual_sizes: List[CameraResolution], + network_settings: NetworkSettings, + act_type: ActionType, + act_size: List[int], + ): + super().__init__() + if act_type == ActionType.CONTINUOUS: + self.q1_network = ContinuousQNetwork( + stream_names, + vector_sizes, + visual_sizes, + network_settings, + act_type, + act_size, + ) + self.q2_network = ContinuousQNetwork( + stream_names, + vector_sizes, + visual_sizes, + network_settings, + act_type, + act_size, + ) + else: + raise UnityTrainerException("Not supported yet") + + def forward( + self, + vec_inputs: List[torch.Tensor], + vis_inputs: List[torch.Tensor], + actions: torch.Tensor = None, + ) -> Tuple[ + Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor] + ]: + if actions is not None: + assert isinstance(self.q1_network, ContinuousQNetwork) + q1_out, _ = self.q1_network(vec_inputs, vis_inputs, actions) + q2_out, _ = self.q2_network(vec_inputs, vis_inputs, actions) + return q1_out, q2_out + + def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): + super().__init__(policy, trainer_params) + hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters) + lr = hyperparameters.learning_rate + # lr_schedule = hyperparameters.learning_rate_schedule + # max_step = trainer_params.max_steps + self.tau = hyperparameters.tau + self.init_entcoef = hyperparameters.init_entcoef + + self.policy = policy + self.act_size = policy.act_size + policy_network_settings = policy.network_settings + # h_size = policy_network_settings.hidden_units + # num_layers = policy_network_settings.num_layers + # vis_encode_type = policy_network_settings.vis_encode_type + + self.tau = hyperparameters.tau + self.burn_in_ratio = 0.0 + + # Non-exposed SAC parameters + self.discrete_target_entropy_scale = 0.2 # Roughly equal to e-greedy 0.05 + self.continuous_target_entropy_scale = 1.0 + + self.stream_names = list(self.reward_signals.keys()) + # Use to reduce "survivor bonus" when using Curiosity or GAIL. + self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()] + self.use_dones_in_backup = { + name: int(self.reward_signals[name].use_terminal_states) + for name in self.stream_names + } + # self.disable_use_dones = { + # name: self.use_dones_in_backup[name].assign(0.0) + # for name in stream_names + # } + + brain = policy.brain + self.value_network = TorchSACOptimizer.PolicyValueNetwork( + self.stream_names, + [brain.vector_observation_space_size], + brain.camera_resolutions, + policy_network_settings, + ActionType.from_str(policy.act_type), + self.act_size, + ) + self.target_network = Critic( + self.stream_names, + policy_network_settings.hidden_units, + [brain.vector_observation_space_size], + brain.camera_resolutions, + policy_network_settings.normalize, + policy_network_settings.num_layers, + policy_network_settings.memory.memory_sze + if policy_network_settings.memory is not None + else 0, + policy_network_settings.vis_encode_type, + ) + self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0) + + self._log_ent_coef = torch.tensor( + np.log([self.init_entcoef] * len(self.act_size)).astype(np.float32), + requires_grad=True, + ) + self.target_entropy = torch.as_tensor( + -1 + * self.continuous_target_entropy_scale + * np.prod(self.act_size[0]).astype(np.float32) + ) + + policy_params = list(self.policy.actor_critic.network_body.parameters()) + list( + self.policy.actor_critic.distribution.parameters() + ) + print(self.policy.actor_critic.network_body.parameters()) + value_params = list(self.value_network.parameters()) + list( + self.policy.actor_critic.critic.parameters() + ) + + self.policy_optimizer = torch.optim.Adam(policy_params, lr=lr) + self.value_optimizer = torch.optim.Adam(value_params, lr=lr) + self.entropy_optimizer = torch.optim.Adam([self._log_ent_coef], lr=lr) + + def sac_q_loss( + self, + q1_out: Dict[str, torch.Tensor], + q2_out: Dict[str, torch.Tensor], + target_values: Dict[str, torch.Tensor], + dones: torch.Tensor, + rewards: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + discrete: bool = False, + ) -> None: + """ + Creates training-specific Tensorflow ops for SAC models. + :param q1_streams: Q1 streams from policy network + :param q1_streams: Q2 streams from policy network + :param lr: Learning rate + :param max_step: Total number of training steps. + :param stream_names: List of reward stream names. + :param discrete: Whether or not to use discrete action losses. + """ + q1_losses = [] + q2_losses = [] + # Multiple q losses per stream + for i, name in enumerate(q1_out.keys()): + q1_stream = q1_out[name] + q2_stream = q2_out[name] + with torch.no_grad(): + q_backup = rewards[name] + ( + 1.0 + - self.use_dones_in_backup[name] + * dones + * self.gammas[i] + * target_values[name] + ) + + _q1_loss = 0.5 * torch.mean( + loss_masks * torch.pow((q_backup - q1_stream), 2) + ) + _q2_loss = 0.5 * torch.mean( + loss_masks * torch.pow((q_backup - q2_stream), 2) + ) + + q1_losses.append(_q1_loss) + q2_losses.append(_q2_loss) + q1_loss = torch.mean(torch.stack(q1_losses)) + q2_loss = torch.mean(torch.stack(q2_losses)) + print(q1_loss) + + return q1_loss + q2_loss + + def soft_update(self, source: nn.Module, target: nn.Module, tau: float): + for source_param, target_param in zip(source.parameters(), target.parameters()): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def sac_value_loss( + self, + log_probs: torch.Tensor, + values: Dict[str, torch.Tensor], + q1p_out: Dict[str, torch.Tensor], + q2p_out: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + discrete: bool, + ): + min_policy_qs = {} + + for name in values.keys(): + if not discrete: + min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) + _ent_coef = torch.exp(self._log_ent_coef) + + if not discrete: + value_losses = [] + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.sum( + _ent_coef * log_probs, dim=1 + ) + value_loss = 0.5 * torch.mean( + loss_masks * torch.pow((values[name] - v_backup), 2) + ) + value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + return value_loss + + def sac_policy_loss( + self, + log_probs: torch.Tensor, + q1p_outs: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + discrete: bool, + ): + _ent_coef = torch.exp(self._log_ent_coef) + if not discrete: + mean_q1 = torch.mean(torch.stack(list(q1p_outs.values()))) + batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1) + policy_loss = torch.mean(loss_masks * batch_policy_loss) + else: + policy_loss = 0 + return policy_loss + + def sac_entropy_loss( + self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool + ): + if not discrete: + with torch.no_grad(): + inner_term = torch.sum(log_probs + self.target_entropy, dim=1) + entropy_loss = -torch.mean(self._log_ent_coef * loss_masks * inner_term) + else: + entropy_loss = 0 + return entropy_loss + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Updates model using buffer. + :param num_sequences: Number of trajectories in batch. + :param batch: Experience mini-batch. + :param update_target: Whether or not to update target value network + :param reward_signal_batches: Minibatches to use for updating the reward signals, + indexed by name. If none, don't update the reward signals. + :return: Output from update process. + """ + rewards = {} + for name in self.reward_signals: + rewards[name] = torch.as_tensor(batch["{}_rewards".format(name)]) + + vec_obs = [torch.as_tensor(batch["vector_obs"])] + next_vec_obs = [torch.as_tensor(batch["next_vector_in"])] + act_masks = torch.as_tensor(batch["action_mask"]) + if self.policy.use_continuous_act: + actions = torch.as_tensor(batch["actions"]).unsqueeze(-1) + else: + actions = torch.as_tensor(batch["actions"], dtype=torch.long) + + memories = [ + torch.as_tensor(batch["memory"][i]) + for i in range(0, len(batch["memory"]), self.policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + + next_vis_obs = [] + if self.policy.use_vis_obs: + vis_obs = [] + next_vis_obs = [] + for idx, _ in enumerate( + self.policy.actor_critic.network_body.visual_encoders + ): + vis_ob = torch.as_tensor(batch["visual_obs%d" % idx]) + vis_obs.append(vis_ob) + next_vis_ob = torch.as_tensor(batch["next_visual_obs%d" % idx]) + next_vis_obs.append(next_vis_ob) + else: + vis_obs = [] + sampled_actions, log_probs, entropies, sampled_values, _ = self.policy.sample_actions( + vec_obs, + vis_obs, + masks=act_masks, + memories=memories, + seq_len=self.policy.sequence_length, + ) + q1p_out, q2p_out = self.value_network(vec_obs, vis_obs, sampled_actions) + q1_out, q2_out = self.value_network(vec_obs, vis_obs, actions.squeeze(-1)) + + target_values, _ = self.target_network(next_vec_obs, next_vis_obs) + q_loss = self.sac_q_loss( + q1_out, + q2_out, + target_values, + torch.as_tensor(batch["done"]), + rewards, + torch.as_tensor(batch["masks"], dtype=torch.int32), + False, + ) + value_loss = self.sac_value_loss( + log_probs, + sampled_values, + q1p_out, + q2p_out, + torch.as_tensor(batch["masks"], dtype=torch.int32), + False, + ) + policy_loss = self.sac_policy_loss( + log_probs, + q1p_out, + torch.as_tensor(batch["masks"], dtype=torch.int32), + False, + ) + entropy_loss = self.sac_entropy_loss( + log_probs, torch.as_tensor(batch["masks"], dtype=torch.int32), False + ) + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + total_value_loss = q_loss + value_loss + self.value_optimizer.zero_grad() + total_value_loss.backward() + self.value_optimizer.step() + + self.entropy_optimizer.zero_grad() + entropy_loss.backward + self.entropy_optimizer.step() + + # Update Q network + self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau) + + update_stats = { + "Losses/Policy Loss": abs(policy_loss.detach().numpy()), + "Losses/Value Loss": value_loss.detach().numpy(), + "Losses/Q Loss": q_loss.detach().numpy(), + "Policy/Entropy Coeff": torch.exp(self._log_ent_coef).detach().numpy(), + } + return update_stats + + def update_reward_signals( + self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int + ) -> Dict[str, float]: + return {} diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index df6f97dcf6..9dab42fb10 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -18,6 +18,8 @@ from mlagents.trainers.trajectory import Trajectory, SplitObservations from mlagents.trainers.brain import BrainParameters from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer from mlagents.trainers.settings import TrainerSettings, SACSettings @@ -187,17 +189,7 @@ def _update_policy(self) -> bool: def create_policy( self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters ) -> Policy: - policy = NNPolicy( - self.seed, - brain_parameters, - self.trainer_settings, - self.is_training, - self.artifact_path, - self.load, - tanh_squash=True, - reparameterize=True, - create_tf_graph=False, - ) + policy = super().create_policy(parsed_behavior_id, brain_parameters) # Load the replay buffer if load if self.load and self.checkpoint_replay_buffer: try: @@ -214,6 +206,41 @@ def create_policy( return policy + def create_tf_policy( + self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters + ) -> NNPolicy: + policy = NNPolicy( + self.seed, + brain_parameters, + self.trainer_settings, + self.is_training, + self.artifact_path, + self.load, + tanh_squash=True, + reparameterize=True, + create_tf_graph=False, + ) + return policy + + def create_torch_policy( + self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters + ) -> TorchPolicy: + """ + Creates a PPO policy to trainers list of policies. + :param parsed_behavior_id: + :param brain_parameters: specifications for policy construction + :return policy + """ + policy = TorchPolicy( + self.seed, + brain_parameters, + self.trainer_settings, + self.artifact_path, + self.load, + condition_sigma_on_obs=True, # Faster training for PPO + ) + return policy + def _update_sac_policy(self) -> bool: """ Uses update_buffer to update the policy. We sample the update_buffer and update @@ -317,10 +344,17 @@ def add_policy( self.__class__.__name__ ) ) - if not isinstance(policy, NNPolicy): - raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()") self.policy = policy - self.optimizer = SACOptimizer(self.policy, self.trainer_settings) + if self.framework == "torch": + self.optimizer = TorchSACOptimizer( # type: ignore + self.policy, self.trainer_settings # type: ignore + ) # type: ignore + else: + if not isinstance(policy, NNPolicy): + raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()") + self.optimizer = SACOptimizer( # type: ignore + self.policy, self.trainer_settings # type: ignore + ) # type: ignore for _reward_signal in self.optimizer.reward_signals.keys(): self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) # Needed to resume loads properly diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index 8ad5b10be8..f02d799b93 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -10,6 +10,9 @@ from mlagents.trainers.trainer import Trainer from mlagents.trainers.components.reward_signals import RewardSignalResult from mlagents_envs.timers import hierarchical_timer +from mlagents.trainers.brain import BrainParameters +from mlagents.trainers.policy.policy import Policy +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers from mlagents.trainers.agent_processor import AgentManagerQueue from mlagents.trainers.trajectory import Trajectory from mlagents.trainers.stats import StatsPropertyType @@ -37,6 +40,7 @@ def __init__(self, *args, **kwargs): self._stats_reporter.add_property( StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict() ) + self.framework = "torch" self._next_save_step = 0 self._next_summary_step = 0 @@ -80,6 +84,14 @@ def _is_ready_update(self): """ return False + def create_policy( + self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters + ) -> Policy: + if self.framework == "torch": + return self.create_torch_policy(parsed_behavior_id, brain_parameters) + else: + return self.create_tf_policy(parsed_behavior_id, brain_parameters) + @abc.abstractmethod def _update_policy(self) -> bool: """ From 61bd08c8983c6fc90946892731e120e5908b0a38 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 9 Jul 2020 17:06:34 -0700 Subject: [PATCH 02/16] working SAC --- .../mlagents/trainers/distributions_torch.py | 35 ++++- ml-agents/mlagents/trainers/models_torch.py | 139 +++++++++++++----- .../mlagents/trainers/policy/torch_policy.py | 1 + .../mlagents/trainers/sac/optimizer_torch.py | 45 ++++-- ml-agents/mlagents/trainers/sac/trainer.py | 3 +- 5 files changed, 164 insertions(+), 59 deletions(-) diff --git a/ml-agents/mlagents/trainers/distributions_torch.py b/ml-agents/mlagents/trainers/distributions_torch.py index 29c683df63..eb29123d69 100644 --- a/ml-agents/mlagents/trainers/distributions_torch.py +++ b/ml-agents/mlagents/trainers/distributions_torch.py @@ -13,7 +13,8 @@ def __init__(self, mean, std): self.std = std def sample(self): - return self.mean + torch.randn_like(self.mean) * self.std + sample = self.mean + torch.randn_like(self.mean) * self.std + return sample def log_prob(self, value): var = self.std ** 2 @@ -32,6 +33,21 @@ def entropy(self): return torch.log(2 * math.pi * math.e * self.std) +class TanhGaussianDistInstance(GaussianDistInstance): + def sample(self): + unsquashed_sample = super().sample() + squashed = torch.tanh(unsquashed_sample) + return squashed + + def _inverse_tanh(self, value): + return 0.5 * torch.log((1 + value) / (1 - value) + EPSILON) + + def log_prob(self, value): + return super().log_prob(self._inverse_tanh(value)) - torch.log( + 1 - value ** 2 + EPSILON + ) + + class CategoricalDistInstance(nn.Module): def __init__(self, logits): super(CategoricalDistInstance, self).__init__() @@ -52,10 +68,18 @@ def entropy(self): class GaussianDistribution(nn.Module): - def __init__(self, hidden_size, num_outputs, conditional_sigma=False, **kwargs): + def __init__( + self, + hidden_size, + num_outputs, + conditional_sigma=False, + tanh_squash=False, + **kwargs + ): super(GaussianDistribution, self).__init__(**kwargs) self.conditional_sigma = conditional_sigma self.mu = nn.Linear(hidden_size, num_outputs) + self.tanh_squash = tanh_squash nn.init.xavier_uniform_(self.mu.weight, gain=0.01) if conditional_sigma: self.log_sigma = nn.Linear(hidden_size, num_outputs) @@ -68,10 +92,13 @@ def __init__(self, hidden_size, num_outputs, conditional_sigma=False, **kwargs): def forward(self, inputs): mu = self.mu(inputs) if self.conditional_sigma: - log_sigma = self.log_sigma(inputs) + log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2) else: log_sigma = self.log_sigma - return [GaussianDistInstance(mu, torch.exp(log_sigma))] + if self.tanh_squash: + return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))] + else: + return [GaussianDistInstance(mu, torch.exp(log_sigma))] class MultiCategoricalDistribution(nn.Module): diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py index 6375cbd626..9009114ef2 100644 --- a/ml-agents/mlagents/trainers/models_torch.py +++ b/ml-agents/mlagents/trainers/models_torch.py @@ -94,6 +94,13 @@ def update_normalization(self, vec_inputs): for idx, vec_input in enumerate(vec_inputs): self.vector_normalizers[idx].update(vec_input) + def copy_normalization(self, other_network: "NetworkBody"): + if self.normalize: + for n1, n2 in zip( + self.vector_normalizers, other_network.vector_normalizers + ): + n1.copy_from(n2) + def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1): vec_embeds = [] for idx, encoder in enumerate(self.vector_encoders): @@ -133,7 +140,7 @@ def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1): return embedding, memories -class QNetwork(nn.Module): +class ContinuousQNetwork(NetworkBody): def __init__( self, stream_names: List[str], @@ -143,53 +150,98 @@ def __init__( act_type: ActionType, act_size: List[int], ): - super(QNetwork, self).__init__() - self.network_body = NetworkBody( - vector_sizes, - visual_sizes, - network_settings.hidden_units, - network_settings.normalize, - network_settings.num_layers, + # This is not a typo, we want to call __init__ of nn.Module + nn.Module.__init__(self) + self.normalize = network_settings.normalize + self.visual_encoders = [] + self.vector_encoders = [] + self.vector_normalizers = [] + self.use_lstm = network_settings.memory is not None + self.h_size = network_settings.hidden_units + self.m_size = ( network_settings.memory.memory_size if network_settings.memory is not None else 0, - network_settings.vis_encode_type, - network_settings.memory is not None, ) - self.stream_names = stream_names - -class ContinuousQNetwork(QNetwork): - def __init__( - self, - stream_names: List[str], - vector_sizes: List[int], - visual_sizes: List[CameraResolution], - network_settings: NetworkSettings, - act_type: ActionType, - act_size: List[int], - ): - super(ContinuousQNetwork, self).__init__( - stream_names, - vector_sizes, - visual_sizes, - network_settings, - act_type, - act_size, - ) - self.q_heads = ValueHeads( - self.stream_names, network_settings.hidden_units + sum(act_size) + visual_encoder = ModelUtils.get_encoder_for_type( + network_settings.vis_encode_type ) + for vector_size in vector_sizes: + if vector_size != 0: + self.vector_normalizers.append(Normalizer(vector_size)) + self.vector_encoders.append( + VectorEncoder( + vector_size + sum(act_size), + self.h_size, + network_settings.num_layers, + ) + ) + for visual_size in visual_sizes: + self.visual_encoders.append( + visual_encoder( + visual_size.height, + visual_size.width, + visual_size.num_channels, + self.h_size, + ) + ) + + self.vector_encoders = nn.ModuleList(self.vector_encoders) + self.visual_encoders = nn.ModuleList(self.visual_encoders) + if self.use_lstm: + self.lstm = nn.LSTM(self.h_size, self.m_size // 2, 1) + self.q_heads = ValueHeads(stream_names, network_settings.hidden_units) def forward( self, - vec_inputs: List[torch.Tensor], - vis_inputs: List[torch.Tensor], - actions: torch.Tensor, + vec_inputs, + vis_inputs, + memories=None, + sequence_length=1, + actions: torch.Tensor = None, ): - embedding, _ = self.network_body(vec_inputs, vis_inputs) - concat_embed = torch.cat([embedding, actions], axis=-1) - return self.q_heads(concat_embed) + vec_embeds = [] + for idx, encoder in enumerate(self.vector_encoders): + vec_input = vec_inputs[idx] + if self.normalize: + vec_input = self.vector_normalizers[idx](vec_input) + if actions is not None: + hidden = encoder(torch.cat([vec_input, actions], axis=-1)) + else: + hidden = encoder(vec_input) + vec_embeds.append(hidden) + + vis_embeds = [] + for idx, encoder in enumerate(self.visual_encoders): + vis_input = vis_inputs[idx] + vis_input = vis_input.permute([0, 3, 1, 2]) + hidden = encoder(vis_input) + vis_embeds.append(hidden) + + # embedding = vec_embeds[0] + if len(vec_embeds) > 0: + vec_embeds = torch.stack(vec_embeds, dim=-1).sum(dim=-1) + if len(vis_embeds) > 0: + vis_embeds = torch.stack(vis_embeds, dim=-1).sum(dim=-1) + if len(vec_embeds) > 0 and len(vis_embeds) > 0: + embedding = torch.stack([vec_embeds, vis_embeds], dim=-1).sum(dim=-1) + elif len(vec_embeds) > 0: + embedding = vec_embeds + elif len(vis_embeds) > 0: + embedding = vis_embeds + else: + raise Exception("No valid inputs to network.") + + if self.use_lstm: + embedding = embedding.view([sequence_length, -1, self.h_size]) + memories = torch.split(memories, self.m_size // 2, dim=-1) + embedding, memories = self.lstm(embedding, memories) + embedding = embedding.view([-1, self.m_size // 2]) + memories = torch.cat(memories, dim=-1) + + output, _ = self.q_heads(embedding) + return output, memories class ActorCritic(nn.Module): @@ -208,6 +260,7 @@ def __init__( stream_names, separate_critic, conditional_sigma=False, + tanh_squash=False, ): super(ActorCritic, self).__init__() self.act_type = ActionType.from_str(act_type) @@ -233,7 +286,10 @@ def __init__( embedding_size = h_size if self.act_type == ActionType.CONTINUOUS: self.distribution = GaussianDistribution( - embedding_size, act_size[0], conditional_sigma=conditional_sigma + embedding_size, + act_size[0], + conditional_sigma=conditional_sigma, + tanh_squash=tanh_squash, ) else: self.distribution = MultiCategoricalDistribution(embedding_size, act_size) @@ -385,6 +441,11 @@ def update(self, vector_input): self.running_variance = new_variance self.normalization_steps = total_new_steps + def copy_from(self, other_normalizer: "Normalizer"): + self.normalization_steps.data.copy_(other_normalizer.normalization_steps.data) + self.running_mean.data.copy_(other_normalizer.running_mean.data) + self.running_variance.copy_(other_normalizer.running_variance.data) + class ValueHeads(nn.Module): def __init__(self, stream_names, input_size): diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 70c272758f..2c61e69df3 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -117,6 +117,7 @@ def __init__( stream_names=reward_signal_names, separate_critic=self.use_continuous_act, conditional_sigma=self.condition_sigma_on_obs, + tanh_squash=tanh_squash, ) def split_decision_step(self, decision_requests): diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index dfb3d6c508..20379e05ac 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -126,8 +126,8 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): ) self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0) - self._log_ent_coef = torch.tensor( - np.log([self.init_entcoef] * len(self.act_size)).astype(np.float32), + self._log_ent_coef = torch.nn.Parameter( + torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))), requires_grad=True, ) self.target_entropy = torch.as_tensor( @@ -139,11 +139,17 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): policy_params = list(self.policy.actor_critic.network_body.parameters()) + list( self.policy.actor_critic.distribution.parameters() ) - print(self.policy.actor_critic.network_body.parameters()) value_params = list(self.value_network.parameters()) + list( self.policy.actor_critic.critic.parameters() ) + logger.debug("value_vars") + for param in value_params: + logger.debug(param.shape) + logger.debug("policy_vars") + for param in policy_params: + logger.debug(param.shape) + self.policy_optimizer = torch.optim.Adam(policy_params, lr=lr) self.value_optimizer = torch.optim.Adam(value_params, lr=lr) self.entropy_optimizer = torch.optim.Adam([self._log_ent_coef], lr=lr) @@ -175,13 +181,10 @@ def sac_q_loss( q2_stream = q2_out[name] with torch.no_grad(): q_backup = rewards[name] + ( - 1.0 - - self.use_dones_in_backup[name] - * dones + (1.0 - self.use_dones_in_backup[name] * dones) * self.gammas[i] * target_values[name] ) - _q1_loss = 0.5 * torch.mean( loss_masks * torch.pow((q_backup - q1_stream), 2) ) @@ -193,9 +196,7 @@ def sac_q_loss( q2_losses.append(_q2_loss) q1_loss = torch.mean(torch.stack(q1_losses)) q2_loss = torch.mean(torch.stack(q2_losses)) - print(q1_loss) - - return q1_loss + q2_loss + return q1_loss, q2_loss def soft_update(self, source: nn.Module, target: nn.Module, tau: float): for source_param, target_param in zip(source.parameters(), target.parameters()): @@ -242,7 +243,8 @@ def sac_policy_loss( ): _ent_coef = torch.exp(self._log_ent_coef) if not discrete: - mean_q1 = torch.mean(torch.stack(list(q1p_outs.values()))) + mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) + mean_q1.unsqueeze_(1) batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1) policy_loss = torch.mean(loss_masks * batch_policy_loss) else: @@ -303,6 +305,18 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: next_vis_obs.append(next_vis_ob) else: vis_obs = [] + + # Copy normalizers from policy + self.value_network.q1_network.copy_normalization( + self.policy.actor_critic.network_body + ) + self.value_network.q2_network.copy_normalization( + self.policy.actor_critic.network_body + ) + self.target_network.network_body.copy_normalization( + self.policy.actor_critic.network_body + ) + sampled_actions, log_probs, entropies, sampled_values, _ = self.policy.sample_actions( vec_obs, vis_obs, @@ -314,7 +328,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: q1_out, q2_out = self.value_network(vec_obs, vis_obs, actions.squeeze(-1)) target_values, _ = self.target_network(next_vec_obs, next_vis_obs) - q_loss = self.sac_q_loss( + q1_loss, q2_loss = self.sac_q_loss( q1_out, q2_out, target_values, @@ -344,13 +358,13 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: policy_loss.backward() self.policy_optimizer.step() - total_value_loss = q_loss + value_loss + total_value_loss = q1_loss + q2_loss + value_loss self.value_optimizer.zero_grad() total_value_loss.backward() self.value_optimizer.step() self.entropy_optimizer.zero_grad() - entropy_loss.backward + entropy_loss.backward() self.entropy_optimizer.step() # Update Q network @@ -359,7 +373,8 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: update_stats = { "Losses/Policy Loss": abs(policy_loss.detach().numpy()), "Losses/Value Loss": value_loss.detach().numpy(), - "Losses/Q Loss": q_loss.detach().numpy(), + "Losses/Q1 Loss": q1_loss.detach().numpy(), + "Losses/Q2 Loss": q2_loss.detach().numpy(), "Policy/Entropy Coeff": torch.exp(self._log_ent_coef).detach().numpy(), } return update_stats diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index 9dab42fb10..a09b9b0b7b 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -237,7 +237,8 @@ def create_torch_policy( self.trainer_settings, self.artifact_path, self.load, - condition_sigma_on_obs=True, # Faster training for PPO + condition_sigma_on_obs=True, + tanh_squash=True, ) return policy From c9c5ecf59694eb03e2f09368564bacd0f8db5587 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 9 Jul 2020 18:10:24 -0700 Subject: [PATCH 03/16] Fix bad merge --- .../mlagents/trainers/sac/optimizer_torch.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 20379e05ac..b3d154c195 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -59,13 +59,11 @@ def forward( vec_inputs: List[torch.Tensor], vis_inputs: List[torch.Tensor], actions: torch.Tensor = None, - ) -> Tuple[ - Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor] - ]: + ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: if actions is not None: assert isinstance(self.q1_network, ContinuousQNetwork) - q1_out, _ = self.q1_network(vec_inputs, vis_inputs, actions) - q2_out, _ = self.q2_network(vec_inputs, vis_inputs, actions) + q1_out, _ = self.q1_network(vec_inputs, vis_inputs, actions=actions) + q2_out, _ = self.q2_network(vec_inputs, vis_inputs, actions=actions) return q1_out, q2_out def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): @@ -119,7 +117,7 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): brain.camera_resolutions, policy_network_settings.normalize, policy_network_settings.num_layers, - policy_network_settings.memory.memory_sze + policy_network_settings.memory.memory_size if policy_network_settings.memory is not None else 0, policy_network_settings.vis_encode_type, @@ -163,7 +161,7 @@ def sac_q_loss( rewards: Dict[str, torch.Tensor], loss_masks: torch.Tensor, discrete: bool = False, - ) -> None: + ) -> Tuple[torch.Tensor, torch.Tensor]: """ Creates training-specific Tensorflow ops for SAC models. :param q1_streams: Q1 streams from policy network @@ -198,7 +196,7 @@ def sac_q_loss( q2_loss = torch.mean(torch.stack(q2_losses)) return q1_loss, q2_loss - def soft_update(self, source: nn.Module, target: nn.Module, tau: float): + def soft_update(self, source: nn.Module, target: nn.Module, tau: float) -> None: for source_param, target_param in zip(source.parameters(), target.parameters()): target_param.data.copy_( target_param.data * (1.0 - tau) + source_param.data * tau @@ -212,7 +210,7 @@ def sac_value_loss( q2p_out: Dict[str, torch.Tensor], loss_masks: torch.Tensor, discrete: bool, - ): + ) -> torch.Tensor: min_policy_qs = {} for name in values.keys(): @@ -240,7 +238,7 @@ def sac_policy_loss( q1p_outs: Dict[str, torch.Tensor], loss_masks: torch.Tensor, discrete: bool, - ): + ) -> torch.Tensor: _ent_coef = torch.exp(self._log_ent_coef) if not discrete: mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) @@ -253,7 +251,7 @@ def sac_policy_loss( def sac_entropy_loss( self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool - ): + ) -> torch.Tensor: if not discrete: with torch.no_grad(): inner_term = torch.sum(log_probs + self.target_entropy, dim=1) @@ -292,10 +290,10 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: if len(memories) > 0: memories = torch.stack(memories).unsqueeze(0) - next_vis_obs = [] + vis_obs: List[torch.Tensor] = [] + next_vis_obs: List[torch.Tensor] = [] if self.policy.use_vis_obs: vis_obs = [] - next_vis_obs = [] for idx, _ in enumerate( self.policy.actor_critic.network_body.visual_encoders ): @@ -303,8 +301,6 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: vis_obs.append(vis_ob) next_vis_ob = torch.as_tensor(batch["next_visual_obs%d" % idx]) next_vis_obs.append(next_vis_ob) - else: - vis_obs = [] # Copy normalizers from policy self.value_network.q1_network.copy_normalization( From 1ee90d5215fd7ba38d3b3959876f4ff0d9df73e9 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 9 Jul 2020 18:11:14 -0700 Subject: [PATCH 04/16] Convert to list_to_tensor --- .../mlagents/trainers/sac/optimizer_torch.py | 49 ++++++++----------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index b3d154c195..2c05667059 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -8,7 +8,12 @@ from mlagents.trainers.policy.torch_policy import TorchPolicy from mlagents.trainers.settings import NetworkSettings from mlagents.trainers.brain import CameraResolution -from mlagents.trainers.models_torch import Critic, ContinuousQNetwork, ActionType +from mlagents.trainers.models_torch import ( + Critic, + ContinuousQNetwork, + ActionType, + list_to_tensor, +) from mlagents.trainers.buffer import AgentBuffer from mlagents_envs.timers import timed from mlagents.trainers.exception import UnityTrainerException @@ -273,18 +278,18 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: """ rewards = {} for name in self.reward_signals: - rewards[name] = torch.as_tensor(batch["{}_rewards".format(name)]) + rewards[name] = list_to_tensor(batch["{}_rewards".format(name)]) - vec_obs = [torch.as_tensor(batch["vector_obs"])] - next_vec_obs = [torch.as_tensor(batch["next_vector_in"])] - act_masks = torch.as_tensor(batch["action_mask"]) + vec_obs = [list_to_tensor(batch["vector_obs"])] + next_vec_obs = [list_to_tensor(batch["next_vector_in"])] + act_masks = list_to_tensor(batch["action_mask"]) if self.policy.use_continuous_act: - actions = torch.as_tensor(batch["actions"]).unsqueeze(-1) + actions = list_to_tensor(batch["actions"]).unsqueeze(-1) else: - actions = torch.as_tensor(batch["actions"], dtype=torch.long) + actions = list_to_tensor(batch["actions"], dtype=torch.long) memories = [ - torch.as_tensor(batch["memory"][i]) + list_to_tensor(batch["memory"][i]) for i in range(0, len(batch["memory"]), self.policy.sequence_length) ] if len(memories) > 0: @@ -297,9 +302,9 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: for idx, _ in enumerate( self.policy.actor_critic.network_body.visual_encoders ): - vis_ob = torch.as_tensor(batch["visual_obs%d" % idx]) + vis_ob = list_to_tensor(batch["visual_obs%d" % idx]) vis_obs.append(vis_ob) - next_vis_ob = torch.as_tensor(batch["next_visual_obs%d" % idx]) + next_vis_ob = list_to_tensor(batch["next_visual_obs%d" % idx]) next_vis_obs.append(next_vis_ob) # Copy normalizers from policy @@ -324,29 +329,15 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: q1_out, q2_out = self.value_network(vec_obs, vis_obs, actions.squeeze(-1)) target_values, _ = self.target_network(next_vec_obs, next_vis_obs) + masks = list_to_tensor(batch["masks"], dtype=torch.int32) + dones = list_to_tensor(batch["done"]) q1_loss, q2_loss = self.sac_q_loss( - q1_out, - q2_out, - target_values, - torch.as_tensor(batch["done"]), - rewards, - torch.as_tensor(batch["masks"], dtype=torch.int32), - False, + q1_out, q2_out, target_values, dones, rewards, masks, False ) value_loss = self.sac_value_loss( - log_probs, - sampled_values, - q1p_out, - q2p_out, - torch.as_tensor(batch["masks"], dtype=torch.int32), - False, - ) - policy_loss = self.sac_policy_loss( - log_probs, - q1p_out, - torch.as_tensor(batch["masks"], dtype=torch.int32), - False, + log_probs, sampled_values, q1p_out, q2p_out, masks, False ) + policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, False) entropy_loss = self.sac_entropy_loss( log_probs, torch.as_tensor(batch["masks"], dtype=torch.int32), False ) From b196b095d275413fd5fcfa459cce5a8ee2984ed6 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 10 Jul 2020 11:02:01 -0700 Subject: [PATCH 05/16] Fix issue with inverse tanh causing NaNs --- ml-agents/mlagents/trainers/distributions_torch.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ml-agents/mlagents/trainers/distributions_torch.py b/ml-agents/mlagents/trainers/distributions_torch.py index eb29123d69..f01f78a0ba 100644 --- a/ml-agents/mlagents/trainers/distributions_torch.py +++ b/ml-agents/mlagents/trainers/distributions_torch.py @@ -18,9 +18,9 @@ def sample(self): def log_prob(self, value): var = self.std ** 2 - log_scale = self.std.log() + log_scale = torch.log(self.std + EPSILON) return ( - -((value - self.mean) ** 2) / (2 * var) + -((value - self.mean) ** 2) / (2 * var + EPSILON) - log_scale - math.log(math.sqrt(2 * math.pi)) ) @@ -30,7 +30,7 @@ def pdf(self, value): return torch.exp(log_prob) def entropy(self): - return torch.log(2 * math.pi * math.e * self.std) + return torch.log(2 * math.pi * math.e * self.std + EPSILON) class TanhGaussianDistInstance(GaussianDistInstance): @@ -40,7 +40,8 @@ def sample(self): return squashed def _inverse_tanh(self, value): - return 0.5 * torch.log((1 + value) / (1 - value) + EPSILON) + capped_value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON) + return 0.5 * torch.log((1 + capped_value) / (1 - capped_value) + EPSILON) def log_prob(self, value): return super().log_prob(self._inverse_tanh(value)) - torch.log( From fda491dca1d8db40367a51a81e2ff4397d2407e5 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 10 Jul 2020 16:38:15 -0700 Subject: [PATCH 06/16] Allow caching to fix numerical instability --- ml-agents/mlagents/trainers/distributions_torch.py | 11 ++++++++--- ml-agents/mlagents/trainers/models_torch.py | 9 ++++----- ml-agents/mlagents/trainers/policy/torch_policy.py | 14 ++++++++++---- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/ml-agents/mlagents/trainers/distributions_torch.py b/ml-agents/mlagents/trainers/distributions_torch.py index f01f78a0ba..d87049d9eb 100644 --- a/ml-agents/mlagents/trainers/distributions_torch.py +++ b/ml-agents/mlagents/trainers/distributions_torch.py @@ -34,9 +34,13 @@ def entropy(self): class TanhGaussianDistInstance(GaussianDistInstance): + def __init__(self, mean, std): + super().__init__(mean, std) + self.transform = torch.distributions.transforms.TanhTransform(cache_size=1) + def sample(self): unsquashed_sample = super().sample() - squashed = torch.tanh(unsquashed_sample) + squashed = self.transform(unsquashed_sample) return squashed def _inverse_tanh(self, value): @@ -44,8 +48,9 @@ def _inverse_tanh(self, value): return 0.5 * torch.log((1 + capped_value) / (1 - capped_value) + EPSILON) def log_prob(self, value): - return super().log_prob(self._inverse_tanh(value)) - torch.log( - 1 - value ** 2 + EPSILON + unsquashed = self.transform.inv(value) + return super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian( + unsquashed, value ) diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py index 15b826691a..811a11abda 100644 --- a/ml-agents/mlagents/trainers/models_torch.py +++ b/ml-agents/mlagents/trainers/models_torch.py @@ -336,14 +336,12 @@ def sample_action(self, dists): for action_dist in dists: action = action_dist.sample() actions.append(action) - actions = torch.stack(actions, dim=-1) return actions - def get_probs_and_entropy(self, actions, dists): + def get_probs_and_entropy(self, action_list, dists): log_probs = [] entropies = [] - for idx, action_dist in enumerate(dists): - action = actions[..., idx] + for action, action_dist in zip(action_list, dists): log_prob = action_dist.log_prob(action) log_probs.append(log_prob) entropies.append(action_dist.entropy()) @@ -379,7 +377,8 @@ def forward( dists, value_outputs, memories = self.get_dist_and_value( vec_inputs, vis_inputs, masks, memories, sequence_length ) - sampled_actions = self.sample_action(dists) + action_list = self.sample_action(dists) + sampled_actions = torch.stack(action_list, dim=-1) return ( sampled_actions, dists[0].pdf(sampled_actions), diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 2c61e69df3..40621c5825 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -1,6 +1,7 @@ from typing import Any, Dict, List import numpy as np import torch + import os from torch import onnx from mlagents.trainers.action_info import ActionInfo @@ -151,8 +152,11 @@ def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1) vec_obs, vis_obs, masks, memories, seq_len ) - actions = self.actor_critic.sample_action(dists) - log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists) + action_list = self.actor_critic.sample_action(dists) + log_probs, entropies = self.actor_critic.get_probs_and_entropy( + action_list, dists + ) + actions = torch.stack(action_list, dim=-1) if self.use_continuous_act: actions = actions[:, :, 0] else: @@ -166,8 +170,10 @@ def evaluate_actions( dists, (value_heads, mean_value), _ = self.actor_critic.get_dist_and_value( vec_obs, vis_obs, masks, memories, seq_len ) - - log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists) + action_list = [actions[..., i] for i in range(actions.shape[2])] + log_probs, entropies = self.actor_critic.get_probs_and_entropy( + action_list, dists + ) return log_probs, entropies, value_heads From 657febf92b7103e160d323a2889180990f823616 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 13 Jul 2020 17:05:44 -0700 Subject: [PATCH 07/16] Working discrete --- .../mlagents/trainers/distributions_torch.py | 3 + ml-agents/mlagents/trainers/models_torch.py | 59 ++++- .../mlagents/trainers/policy/torch_policy.py | 32 ++- .../mlagents/trainers/sac/optimizer_torch.py | 212 +++++++++++++----- ml-agents/mlagents/trainers/sac/trainer.py | 2 +- 5 files changed, 233 insertions(+), 75 deletions(-) diff --git a/ml-agents/mlagents/trainers/distributions_torch.py b/ml-agents/mlagents/trainers/distributions_torch.py index d87049d9eb..f48914e3e5 100644 --- a/ml-agents/mlagents/trainers/distributions_torch.py +++ b/ml-agents/mlagents/trainers/distributions_torch.py @@ -69,6 +69,9 @@ def pdf(self, value): def log_prob(self, value): return torch.log(self.pdf(value)) + def all_log_prob(self): + return torch.log(self.probs) + def entropy(self): return torch.sum(self.probs * torch.log(self.probs), dim=-1) diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py index 811a11abda..26d22591fe 100644 --- a/ml-agents/mlagents/trainers/models_torch.py +++ b/ml-agents/mlagents/trainers/models_torch.py @@ -57,6 +57,34 @@ class NormalizerTensors(NamedTuple): running_variance: torch.Tensor +def break_into_branches( + concatenated_logits: torch.Tensor, action_size: List[int] +) -> List[torch.Tensor]: + """ + Takes a concatenated set of logits that represent multiple discrete action branches + and breaks it up into one Tensor per branch. + :param concatenated_logits: Tensor that represents the concatenated action branches + :param action_size: List of ints containing the number of possible actions for each branch. + :return: A List of Tensors containing one tensor per branch. + """ + action_idx = [0] + list(np.cumsum(action_size)) + branched_logits = [ + concatenated_logits[:, action_idx[i] : action_idx[i + 1]] + for i in range(len(action_size)) + ] + return branched_logits + + +def actions_to_onehot( + discrete_actions: torch.Tensor, action_size: List[int] +) -> List[torch.Tensor]: + onehot_branches = [ + torch.nn.functional.one_hot(_act.T, action_size[i]) + for i, _act in enumerate(discrete_actions.T) + ] + return onehot_branches + + class NetworkBody(nn.Module): def __init__( self, @@ -151,7 +179,7 @@ def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1): return embedding, memories -class ContinuousQNetwork(NetworkBody): +class QNetwork(NetworkBody): def __init__( # pylint: disable=W0231 self, stream_names: List[str], @@ -181,12 +209,13 @@ def __init__( # pylint: disable=W0231 for vector_size in vector_sizes: if vector_size != 0: self.vector_normalizers.append(Normalizer(vector_size)) + input_size = ( + vector_size + sum(act_size) + if not act_type == ActionType.DISCRETE + else vector_size + ) self.vector_encoders.append( - VectorEncoder( - vector_size + sum(act_size), - self.h_size, - network_settings.num_layers, - ) + VectorEncoder(input_size, self.h_size, network_settings.num_layers) ) for visual_size in visual_sizes: self.visual_encoders.append( @@ -202,7 +231,12 @@ def __init__( # pylint: disable=W0231 self.visual_encoders = nn.ModuleList(self.visual_encoders) if self.use_lstm: self.lstm = nn.LSTM(self.h_size, self.m_size // 2, 1) - self.q_heads = ValueHeads(stream_names, network_settings.hidden_units) + if act_type == ActionType.DISCRETE: + self.q_heads = ValueHeads( + stream_names, network_settings.hidden_units, sum(act_size) + ) + else: + self.q_heads = ValueHeads(stream_names, network_settings.hidden_units) def forward( # pylint: disable=W0221 self, @@ -340,17 +374,22 @@ def sample_action(self, dists): def get_probs_and_entropy(self, action_list, dists): log_probs = [] + all_probs = [] entropies = [] for action, action_dist in zip(action_list, dists): log_prob = action_dist.log_prob(action) log_probs.append(log_prob) entropies.append(action_dist.entropy()) + if self.act_type == ActionType.DISCRETE: + all_probs.append(action_dist.all_log_prob()) log_probs = torch.stack(log_probs, dim=-1) entropies = torch.stack(entropies, dim=-1) + all_probs = torch.cat(all_probs, dim=-1) if self.act_type == ActionType.CONTINUOUS: log_probs = log_probs.squeeze(-1) entropies = entropies.squeeze(-1) - return log_probs, entropies + all_probs = None + return log_probs, entropies, all_probs def get_dist_and_value( self, vec_inputs, vis_inputs, masks=None, memories=None, sequence_length=1 @@ -458,13 +497,13 @@ def copy_from(self, other_normalizer: "Normalizer") -> None: class ValueHeads(nn.Module): - def __init__(self, stream_names, input_size): + def __init__(self, stream_names, input_size, output_size=1): super(ValueHeads, self).__init__() self.stream_names = stream_names self.value_heads = {} for name in stream_names: - value = nn.Linear(input_size, 1) + value = nn.Linear(input_size, output_size) self.value_heads[name] = value self.value_heads = nn.ModuleDict(self.value_heads) diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 40621c5825..b2b19a19c5 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import numpy as np import torch @@ -32,6 +32,7 @@ def __init__( tanh_squash: bool = False, reparameterize: bool = False, condition_sigma_on_obs: bool = True, + separate_critic: Optional[bool] = None, ): """ Policy that uses a multilayer perceptron to map the observations to actions. Could @@ -116,7 +117,9 @@ def __init__( visual_sizes=brain.camera_resolutions, vis_encode_type=trainer_settings.network_settings.vis_encode_type, stream_names=reward_signal_names, - separate_critic=self.use_continuous_act, + separate_critic=separate_critic + if separate_critic is not None + else self.use_continuous_act, conditional_sigma=self.condition_sigma_on_obs, tanh_squash=tanh_squash, ) @@ -144,7 +147,18 @@ def update_normalization(self, vector_obs: np.ndarray) -> None: self.actor_critic.update_normalization(vector_obs) @timed - def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1): + def sample_actions( + self, + vec_obs, + vis_obs, + masks=None, + memories=None, + seq_len=1, + all_log_probs=False, + ): + """ + :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action. + """ dists, ( value_heads, mean_value, @@ -153,7 +167,7 @@ def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1) ) action_list = self.actor_critic.sample_action(dists) - log_probs, entropies = self.actor_critic.get_probs_and_entropy( + log_probs, entropies, all_logs = self.actor_critic.get_probs_and_entropy( action_list, dists ) actions = torch.stack(action_list, dim=-1) @@ -162,7 +176,13 @@ def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1) else: actions = actions[:, 0, :] - return actions, log_probs, entropies, value_heads, memories + return ( + actions, + all_logs if all_log_probs else log_probs, + entropies, + value_heads, + memories, + ) def evaluate_actions( self, vec_obs, vis_obs, actions, masks=None, memories=None, seq_len=1 @@ -171,7 +191,7 @@ def evaluate_actions( vec_obs, vis_obs, masks, memories, seq_len ) action_list = [actions[..., i] for i in range(actions.shape[2])] - log_probs, entropies = self.actor_critic.get_probs_and_entropy( + log_probs, entropies, _ = self.actor_critic.get_probs_and_entropy( action_list, dists ) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 2c05667059..30c9ef911f 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -10,9 +10,11 @@ from mlagents.trainers.brain import CameraResolution from mlagents.trainers.models_torch import ( Critic, - ContinuousQNetwork, + QNetwork, ActionType, list_to_tensor, + break_into_branches, + actions_to_onehot, ) from mlagents.trainers.buffer import AgentBuffer from mlagents_envs.timers import timed @@ -23,9 +25,6 @@ logger = get_logger(__name__) -POLICY_SCOPE = "" -TARGET_SCOPE = "target_network" - class TorchSACOptimizer(TorchOptimizer): class PolicyValueNetwork(nn.Module): @@ -39,25 +38,22 @@ def __init__( act_size: List[int], ): super().__init__() - if act_type == ActionType.CONTINUOUS: - self.q1_network = ContinuousQNetwork( - stream_names, - vector_sizes, - visual_sizes, - network_settings, - act_type, - act_size, - ) - self.q2_network = ContinuousQNetwork( - stream_names, - vector_sizes, - visual_sizes, - network_settings, - act_type, - act_size, - ) - else: - raise UnityTrainerException("Not supported yet") + self.q1_network = QNetwork( + stream_names, + vector_sizes, + visual_sizes, + network_settings, + act_type, + act_size, + ) + self.q2_network = QNetwork( + stream_names, + vector_sizes, + visual_sizes, + network_settings, + act_type, + act_size, + ) def forward( self, @@ -65,10 +61,8 @@ def forward( vis_inputs: List[torch.Tensor], actions: torch.Tensor = None, ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - if actions is not None: - assert isinstance(self.q1_network, ContinuousQNetwork) - q1_out, _ = self.q1_network(vec_inputs, vis_inputs, actions=actions) - q2_out, _ = self.q2_network(vec_inputs, vis_inputs, actions=actions) + q1_out, _ = self.q1_network(vec_inputs, vis_inputs, actions=actions) + q2_out, _ = self.q2_network(vec_inputs, vis_inputs, actions=actions) return q1_out, q2_out def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): @@ -133,11 +127,17 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))), requires_grad=True, ) - self.target_entropy = torch.as_tensor( - -1 - * self.continuous_target_entropy_scale - * np.prod(self.act_size[0]).astype(np.float32) - ) + if self.policy.use_continuous_act: + self.target_entropy = torch.as_tensor( + -1 + * self.continuous_target_entropy_scale + * np.prod(self.act_size[0]).astype(np.float32) + ) + else: + self.target_entropy = [ + self.discrete_target_entropy_scale * np.log(i).astype(np.float32) + for i in self.act_size + ] policy_params = list(self.policy.actor_critic.network_body.parameters()) + list( self.policy.actor_critic.distribution.parameters() @@ -165,23 +165,14 @@ def sac_q_loss( dones: torch.Tensor, rewards: Dict[str, torch.Tensor], loss_masks: torch.Tensor, - discrete: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Creates training-specific Tensorflow ops for SAC models. - :param q1_streams: Q1 streams from policy network - :param q1_streams: Q2 streams from policy network - :param lr: Learning rate - :param max_step: Total number of training steps. - :param stream_names: List of reward stream names. - :param discrete: Whether or not to use discrete action losses. - """ q1_losses = [] q2_losses = [] # Multiple q losses per stream for i, name in enumerate(q1_out.keys()): q1_stream = q1_out[name] q2_stream = q2_out[name] + with torch.no_grad(): q_backup = rewards[name] + ( (1.0 - self.use_dones_in_backup[name] * dones) @@ -217,24 +208,67 @@ def sac_value_loss( discrete: bool, ) -> torch.Tensor: min_policy_qs = {} - + _ent_coef = torch.exp(self._log_ent_coef) for name in values.keys(): if not discrete: min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) - _ent_coef = torch.exp(self._log_ent_coef) + else: + action_probs = log_probs.exp() + _branched_q1p = break_into_branches( + q1p_out[name] * action_probs, self.act_size + ) + _branched_q2p = break_into_branches( + q2p_out[name] * action_probs, self.act_size + ) + _q1p_mean = torch.mean( + torch.stack( + [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p] + ) + ) + _q2p_mean = torch.mean( + torch.stack( + [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p] + ) + ) + + min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean) + value_losses = [] if not discrete: - value_losses = [] for name in values.keys(): with torch.no_grad(): v_backup = min_policy_qs[name] - torch.sum( _ent_coef * log_probs, dim=1 ) + # print(log_probs, v_backup, _ent_coef, loss_masks) + value_loss = 0.5 * torch.mean( + loss_masks * torch.pow((values[name] - v_backup), 2) + ) + value_losses.append(value_loss) + else: + branched_per_action_ent = break_into_branches( + log_probs * log_probs.exp(), self.act_size + ) + # We have to do entropy bonus per action branch + branched_ent_bonus = torch.stack( + [ + torch.sum(_ent_coef[i] * _lp, dim=1, keepdim=True) + for i, _lp in enumerate(branched_per_action_ent) + ] + ) + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.mean( + branched_ent_bonus, axis=0 + ) value_loss = 0.5 * torch.mean( loss_masks * torch.pow((values[name] - v_backup), 2) ) value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + if torch.isinf(value_loss).any() or torch.isnan(value_loss).any(): + raise UnityTrainerException("Inf found") return value_loss def sac_policy_loss( @@ -245,13 +279,27 @@ def sac_policy_loss( discrete: bool, ) -> torch.Tensor: _ent_coef = torch.exp(self._log_ent_coef) + mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) if not discrete: - mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) mean_q1.unsqueeze_(1) batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1) policy_loss = torch.mean(loss_masks * batch_policy_loss) else: - policy_loss = 0 + action_probs = log_probs.exp() + branched_per_action_ent = break_into_branches( + log_probs * action_probs, self.act_size + ) + branched_q_term = break_into_branches(mean_q1 * action_probs, self.act_size) + branched_policy_loss = torch.stack( + [ + torch.sum(_ent_coef[i] * _lp - _qt, dim=1, keepdim=True) + for i, (_lp, _qt) in enumerate( + zip(branched_per_action_ent, branched_q_term) + ) + ] + ) + batch_policy_loss = torch.squeeze(branched_policy_loss) + policy_loss = torch.mean(loss_masks * batch_policy_loss) return policy_loss def sac_entropy_loss( @@ -259,12 +307,50 @@ def sac_entropy_loss( ) -> torch.Tensor: if not discrete: with torch.no_grad(): - inner_term = torch.sum(log_probs + self.target_entropy, dim=1) - entropy_loss = -torch.mean(self._log_ent_coef * loss_masks * inner_term) + target_current_diff = torch.sum(log_probs + self.target_entropy, dim=1) + entropy_loss = -torch.mean( + self._log_ent_coef * loss_masks * target_current_diff + ) else: - entropy_loss = 0 + with torch.no_grad(): + branched_per_action_ent = break_into_branches( + log_probs * log_probs.exp(), self.act_size + ) + target_current_diff_branched = torch.stack( + [ + torch.sum(_lp, axis=1, keepdim=True) + _te + for _lp, _te in zip( + branched_per_action_ent, self.target_entropy + ) + ], + axis=1, + ) + target_current_diff = torch.squeeze( + target_current_diff_branched, axis=2 + ) + entropy_loss = -torch.mean( + loss_masks + * torch.mean(self._log_ent_coef * target_current_diff, axis=1) + ) + return entropy_loss + def _condense_q_streams( + self, q_output: Dict[str, torch.Tensor], discrete_actions: torch.Tensor + ) -> Dict[str, torch.Tensor]: + condensed_q_output = {} + onehot_actions = actions_to_onehot(discrete_actions, self.act_size) + for key, item in q_output.items(): + branched_q = break_into_branches(item, self.act_size) + only_action_qs = torch.stack( + [ + torch.sum(_act * _q, axis=1, keepdim=True) + for _act, _q in zip(onehot_actions, branched_q) + ] + ) + condensed_q_output[key] = torch.mean(only_action_qs, axis=0) + return condensed_q_output + @timed def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: """ @@ -324,27 +410,38 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: masks=act_masks, memories=memories, seq_len=self.policy.sequence_length, + all_log_probs=True, ) - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs, sampled_actions) - q1_out, q2_out = self.value_network(vec_obs, vis_obs, actions.squeeze(-1)) + squeezed_actions = actions.squeeze(-1) + + if self.policy.use_continuous_act: + q1p_out, q2p_out = self.value_network(vec_obs, vis_obs, sampled_actions) + q1_out, q2_out = self.value_network(vec_obs, vis_obs, squeezed_actions) + q1_stream, q2_stream = q1_out, q2_out + else: + q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) + q1_out, q2_out = self.value_network(vec_obs, vis_obs) + q1_stream = self._condense_q_streams(q1_out, squeezed_actions) + q2_stream = self._condense_q_streams(q2_out, squeezed_actions) target_values, _ = self.target_network(next_vec_obs, next_vis_obs) masks = list_to_tensor(batch["masks"], dtype=torch.int32) + + use_discrete = not self.policy.use_continuous_act dones = list_to_tensor(batch["done"]) q1_loss, q2_loss = self.sac_q_loss( - q1_out, q2_out, target_values, dones, rewards, masks, False + q1_stream, q2_stream, target_values, dones, rewards, masks ) value_loss = self.sac_value_loss( - log_probs, sampled_values, q1p_out, q2p_out, masks, False + log_probs, sampled_values, q1p_out, q2p_out, masks, use_discrete ) - policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, False) + policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete) entropy_loss = self.sac_entropy_loss( - log_probs, torch.as_tensor(batch["masks"], dtype=torch.int32), False + log_probs, torch.as_tensor(batch["masks"], dtype=torch.int32), use_discrete ) self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() - total_value_loss = q1_loss + q2_loss + value_loss self.value_optimizer.zero_grad() total_value_loss.backward() @@ -356,7 +453,6 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: # Update Q network self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau) - update_stats = { "Losses/Policy Loss": abs(policy_loss.detach().numpy()), "Losses/Value Loss": value_loss.detach().numpy(), diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index a09b9b0b7b..bf93fc7931 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -213,7 +213,6 @@ def create_tf_policy( self.seed, brain_parameters, self.trainer_settings, - self.is_training, self.artifact_path, self.load, tanh_squash=True, @@ -239,6 +238,7 @@ def create_torch_policy( self.load, condition_sigma_on_obs=True, tanh_squash=True, + separate_critic=True, ) return policy From 8f3e78ccb3ca7cfe07b56c339cd2bca4f144af57 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 13 Jul 2020 17:19:10 -0700 Subject: [PATCH 08/16] Fix continuous SAC and PPO --- ml-agents/mlagents/trainers/models_torch.py | 3 ++- ml-agents/mlagents/trainers/policy/torch_policy.py | 2 ++ ml-agents/mlagents/trainers/sac/optimizer_torch.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py index 26d22591fe..eea2f5c8b9 100644 --- a/ml-agents/mlagents/trainers/models_torch.py +++ b/ml-agents/mlagents/trainers/models_torch.py @@ -384,11 +384,12 @@ def get_probs_and_entropy(self, action_list, dists): all_probs.append(action_dist.all_log_prob()) log_probs = torch.stack(log_probs, dim=-1) entropies = torch.stack(entropies, dim=-1) - all_probs = torch.cat(all_probs, dim=-1) if self.act_type == ActionType.CONTINUOUS: log_probs = log_probs.squeeze(-1) entropies = entropies.squeeze(-1) all_probs = None + else: + all_probs = torch.cat(all_probs, dim=-1) return log_probs, entropies, all_probs def get_dist_and_value( diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index b2b19a19c5..711fff6bbd 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -190,6 +190,8 @@ def evaluate_actions( dists, (value_heads, mean_value), _ = self.actor_critic.get_dist_and_value( vec_obs, vis_obs, masks, memories, seq_len ) + if len(actions.shape) <= 2: + actions.unsqueeze_(-1) action_list = [actions[..., i] for i in range(actions.shape[2])] log_probs, entropies, _ = self.actor_critic.get_probs_and_entropy( action_list, dists diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 30c9ef911f..f0ed02cae4 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -410,7 +410,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: masks=act_masks, memories=memories, seq_len=self.policy.sequence_length, - all_log_probs=True, + all_log_probs=not self.policy.use_continuous_act, ) squeezed_actions = actions.squeeze(-1) From 1dcc1e9b78218175198eb7afea229712946af4e6 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 14 Jul 2020 17:27:53 -0700 Subject: [PATCH 09/16] Fix crash with single-branch --- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index f0ed02cae4..773d44ca9c 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -223,12 +223,14 @@ def sac_value_loss( _q1p_mean = torch.mean( torch.stack( [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p] - ) + ), + dim=0, ) _q2p_mean = torch.mean( torch.stack( [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p] - ) + ), + dim=0, ) min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean) @@ -412,17 +414,17 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: seq_len=self.policy.sequence_length, all_log_probs=not self.policy.use_continuous_act, ) - squeezed_actions = actions.squeeze(-1) if self.policy.use_continuous_act: + squeezed_actions = actions.squeeze(-1) q1p_out, q2p_out = self.value_network(vec_obs, vis_obs, sampled_actions) q1_out, q2_out = self.value_network(vec_obs, vis_obs, squeezed_actions) q1_stream, q2_stream = q1_out, q2_out else: q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) q1_out, q2_out = self.value_network(vec_obs, vis_obs) - q1_stream = self._condense_q_streams(q1_out, squeezed_actions) - q2_stream = self._condense_q_streams(q2_out, squeezed_actions) + q1_stream = self._condense_q_streams(q1_out, actions) + q2_stream = self._condense_q_streams(q2_out, actions) target_values, _ = self.target_network(next_vec_obs, next_vis_obs) masks = list_to_tensor(batch["masks"], dtype=torch.int32) From c766e216ddc1193b66020ad452d46f43ae4c386c Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 14 Jul 2020 20:03:57 -0700 Subject: [PATCH 10/16] Tweaks that don't actually fix anything --- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 773d44ca9c..4fd13489f1 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -172,7 +172,6 @@ def sac_q_loss( for i, name in enumerate(q1_out.keys()): q1_stream = q1_out[name] q2_stream = q2_out[name] - with torch.no_grad(): q_backup = rewards[name] + ( (1.0 - self.use_dones_in_backup[name] * dones) @@ -267,7 +266,6 @@ def sac_value_loss( loss_masks * torch.pow((values[name] - v_backup), 2) ) value_losses.append(value_loss) - value_loss = torch.mean(torch.stack(value_losses)) if torch.isinf(value_loss).any() or torch.isnan(value_loss).any(): raise UnityTrainerException("Inf found") @@ -346,11 +344,11 @@ def _condense_q_streams( branched_q = break_into_branches(item, self.act_size) only_action_qs = torch.stack( [ - torch.sum(_act * _q, axis=1, keepdim=True) + torch.sum(_act * _q, dim=1, keepdim=True) for _act, _q in zip(onehot_actions, branched_q) ] ) - condensed_q_output[key] = torch.mean(only_action_qs, axis=0) + condensed_q_output[key] = torch.mean(only_action_qs, dim=0) return condensed_q_output @timed @@ -431,6 +429,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: use_discrete = not self.policy.use_continuous_act dones = list_to_tensor(batch["done"]) + q1_loss, q2_loss = self.sac_q_loss( q1_stream, q2_stream, target_values, dones, rewards, masks ) @@ -438,22 +437,21 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: log_probs, sampled_values, q1p_out, q2p_out, masks, use_discrete ) policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete) - entropy_loss = self.sac_entropy_loss( - log_probs, torch.as_tensor(batch["masks"], dtype=torch.int32), use_discrete - ) + entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete) self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() total_value_loss = q1_loss + q2_loss + value_loss self.value_optimizer.zero_grad() total_value_loss.backward() + self.value_optimizer.step() self.entropy_optimizer.zero_grad() entropy_loss.backward() self.entropy_optimizer.step() - # Update Q network + # Update target network self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau) update_stats = { "Losses/Policy Loss": abs(policy_loss.detach().numpy()), From 4f9660aab19b28e3114a2922900bb37a053488c7 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 15 Jul 2020 17:27:20 -0700 Subject: [PATCH 11/16] It trains??? --- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 4fd13489f1..81e4c3be4d 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -344,10 +344,11 @@ def _condense_q_streams( branched_q = break_into_branches(item, self.act_size) only_action_qs = torch.stack( [ - torch.sum(_act * _q, dim=1, keepdim=True) + torch.sum(_act * _q * 0.0, dim=1, keepdim=True) for _act, _q in zip(onehot_actions, branched_q) ] ) + condensed_q_output[key] = torch.mean(only_action_qs, dim=0) return condensed_q_output @@ -419,7 +420,8 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: q1_out, q2_out = self.value_network(vec_obs, vis_obs, squeezed_actions) q1_stream, q2_stream = q1_out, q2_out else: - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) + with torch.no_grad(): + q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) q1_out, q2_out = self.value_network(vec_obs, vis_obs) q1_stream = self._condense_q_streams(q1_out, actions) q2_stream = self._condense_q_streams(q2_out, actions) From 3aad4dbded2dc6194d313405cb79653def42fa7c Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 15 Jul 2020 17:31:00 -0700 Subject: [PATCH 12/16] It does not --- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 81e4c3be4d..d90bdbe320 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -344,7 +344,7 @@ def _condense_q_streams( branched_q = break_into_branches(item, self.act_size) only_action_qs = torch.stack( [ - torch.sum(_act * _q * 0.0, dim=1, keepdim=True) + torch.sum(_act * _q, dim=1, keepdim=True) for _act, _q in zip(onehot_actions, branched_q) ] ) @@ -420,8 +420,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: q1_out, q2_out = self.value_network(vec_obs, vis_obs, squeezed_actions) q1_stream, q2_stream = q1_out, q2_out else: - with torch.no_grad(): - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) + q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) q1_out, q2_out = self.value_network(vec_obs, vis_obs) q1_stream = self._condense_q_streams(q1_out, actions) q2_stream = self._condense_q_streams(q2_out, actions) From c65fe1ddd610a915b6ed3b53a0f3039951e9f259 Mon Sep 17 00:00:00 2001 From: Vincent-Pierre BERGES Date: Thu, 16 Jul 2020 09:25:26 -0700 Subject: [PATCH 13/16] Develop add fire sac exp (#4234) * Updating experiment_torch.py with SAC * _ * _ --- experiment_torch.py | 43 +++++++++++++++---- .../mlagents/trainers/sac/optimizer_torch.py | 10 ++--- .../mlagents/trainers/trainer/rl_trainer.py | 6 ++- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/experiment_torch.py b/experiment_torch.py index 289ddc866d..9088ee91d9 100644 --- a/experiment_torch.py +++ b/experiment_torch.py @@ -14,6 +14,7 @@ def run_experiment( name: str, steps: int, use_torch: bool, + algo: str, num_torch_threads: int, use_gpu: bool, num_envs: int = 1, @@ -32,6 +33,7 @@ def run_experiment( name, str(steps), str(use_torch), + algo, str(num_torch_threads), str(num_envs), str(use_gpu), @@ -46,7 +48,7 @@ def run_experiment( if config_name is None: config_name = name run_options = parse_command_line( - [f"config/ppo/{config_name}.yaml", "--num-envs", f"{num_envs}"] + [f"config/{algo}/{config_name}.yaml", "--num-envs", f"{num_envs}"] ) run_options.checkpoint_settings.run_id = ( f"{name}_test_" + str(steps) + "_" + ("torch" if use_torch else "tf") @@ -87,20 +89,29 @@ def run_experiment( tc_advance_total = tc_advance["total"] tc_advance_count = tc_advance["count"] if use_torch: - update_total = update["TorchPPOOptimizer.update"]["total"] + if algo == "ppo": + update_total = update["TorchPPOOptimizer.update"]["total"] + update_count = update["TorchPPOOptimizer.update"]["count"] + else: + update_total = update["SACTrainer._update_policy"]["total"] + update_count = update["SACTrainer._update_policy"]["count"] evaluate_total = evaluate["TorchPolicy.evaluate"]["total"] - update_count = update["TorchPPOOptimizer.update"]["count"] evaluate_count = evaluate["TorchPolicy.evaluate"]["count"] else: - update_total = update["TFPPOOptimizer.update"]["total"] + if algo == "ppo": + update_total = update["TFPPOOptimizer.update"]["total"] + update_count = update["TFPPOOptimizer.update"]["count"] + else: + update_total = update["SACTrainer._update_policy"]["total"] + update_count = update["SACTrainer._update_policy"]["count"] evaluate_total = evaluate["NNPolicy.evaluate"]["total"] - update_count = update["TFPPOOptimizer.update"]["count"] evaluate_count = evaluate["NNPolicy.evaluate"]["count"] # todo: do total / count return ( name, str(steps), str(use_torch), + algo, str(num_torch_threads), str(num_envs), str(use_gpu), @@ -133,6 +144,12 @@ def main(): action="store_true", help="If true, will only do 3dball", ) + parser.add_argument( + "--sac", + default=False, + action="store_true", + help="If true, will run sac instead of ppo", + ) args = parser.parse_args() if args.gpu: @@ -140,21 +157,28 @@ def main(): else: os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + algo = "ppo" + if args.sac: + algo = "sac" + envs_config_tuples = [ ("3DBall", "3DBall"), ("GridWorld", "GridWorld"), ("PushBlock", "PushBlock"), - ("Hallway", "Hallway"), ("CrawlerStaticTarget", "CrawlerStatic"), - ("VisualHallway", "VisualHallway"), ] + if algo == "ppo": + envs_config_tuples += [("Hallway", "Hallway"), + ("VisualHallway", "VisualHallway")] if args.ball: envs_config_tuples = [("3DBall", "3DBall")] + labels = ( "name", "steps", "use_torch", + "algorithm", "num_torch_threads", "num_envs", "use_gpu", @@ -170,7 +194,7 @@ def main(): results = [] results.append(labels) f = open( - f"result_data_steps_{args.steps}_envs_{args.num_envs}_gpu_{args.gpu}_thread_{args.threads}.txt", + f"result_data_steps_{args.steps}_algo_{algo}_envs_{args.num_envs}_gpu_{args.gpu}_thread_{args.threads}.txt", "w", ) f.write(" ".join(labels) + "\n") @@ -180,6 +204,7 @@ def main(): name=env_config[0], steps=args.steps, use_torch=True, + algo=algo, num_torch_threads=1, use_gpu=args.gpu, num_envs=args.num_envs, @@ -193,6 +218,7 @@ def main(): name=env_config[0], steps=args.steps, use_torch=True, + algo=algo, num_torch_threads=8, use_gpu=args.gpu, num_envs=args.num_envs, @@ -205,6 +231,7 @@ def main(): name=env_config[0], steps=args.steps, use_torch=False, + algo=algo, num_torch_threads=1, use_gpu=args.gpu, num_envs=args.num_envs, diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index d90bdbe320..a0a66f0814 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -455,11 +455,11 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: # Update target network self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau) update_stats = { - "Losses/Policy Loss": abs(policy_loss.detach().numpy()), - "Losses/Value Loss": value_loss.detach().numpy(), - "Losses/Q1 Loss": q1_loss.detach().numpy(), - "Losses/Q2 Loss": q2_loss.detach().numpy(), - "Policy/Entropy Coeff": torch.exp(self._log_ent_coef).detach().numpy(), + "Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()), + "Losses/Value Loss": value_loss.detach().cpu().numpy(), + "Losses/Q1 Loss": q1_loss.detach().cpu().numpy(), + "Losses/Q2 Loss": q2_loss.detach().cpu().numpy(), + "Policy/Entropy Coeff": torch.exp(self._log_ent_coef).detach().cpu().numpy(), } return update_stats diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index f02d799b93..ae1af2e1a8 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -17,6 +17,8 @@ from mlagents.trainers.trajectory import Trajectory from mlagents.trainers.stats import StatsPropertyType +from mlagents.trainers.ppo.trainer import TestingConfiguration + RewardSignalResults = Dict[str, RewardSignalResult] logger = get_logger(__name__) @@ -40,7 +42,9 @@ def __init__(self, *args, **kwargs): self._stats_reporter.add_property( StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict() ) - self.framework = "torch" + self.framework = "torch" if TestingConfiguration.use_torch else "tf" + if TestingConfiguration.max_steps > 0: + self.trainer_settings.max_steps = TestingConfiguration.max_steps self._next_save_step = 0 self._next_summary_step = 0 From 6930147e5f11ca318b59144c6ca05147b0e73412 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 16 Jul 2020 18:47:22 -0700 Subject: [PATCH 14/16] Minor perf improvements --- ml-agents/mlagents/trainers/models_torch.py | 80 ++++++++++--------- .../mlagents/trainers/policy/torch_policy.py | 12 ++- ml-agents/mlagents/trainers/ppo/trainer.py | 6 +- .../mlagents/trainers/sac/optimizer_torch.py | 15 +++- 4 files changed, 62 insertions(+), 51 deletions(-) diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py index 55a200714e..7546f2686a 100644 --- a/ml-agents/mlagents/trainers/models_torch.py +++ b/ml-agents/mlagents/trainers/models_torch.py @@ -122,7 +122,7 @@ def __init__( h_size, ) ) - + self.vector_normalizers = nn.ModuleList(self.vector_normalizers) self.vector_encoders = nn.ModuleList(self.vector_encoders) self.visual_encoders = nn.ModuleList(self.visual_encoders) if use_lstm: @@ -157,23 +157,26 @@ def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1): vis_embeds.append(hidden) # embedding = vec_embeds[0] - if len(vec_embeds) > 0: - vec_embeds = torch.stack(vec_embeds, dim=-1).sum(dim=-1) - if len(vis_embeds) > 0: - vis_embeds = torch.stack(vis_embeds, dim=-1).sum(dim=-1) if len(vec_embeds) > 0 and len(vis_embeds) > 0: - embedding = torch.stack([vec_embeds, vis_embeds], dim=-1).sum(dim=-1) + vec_embeds_tensor = torch.stack(vec_embeds, dim=-1).sum(dim=-1) + vis_embeds_tensor = torch.stack(vis_embeds, dim=-1).sum(dim=-1) + embedding = torch.stack([vec_embeds_tensor, vis_embeds_tensor], dim=-1).sum( + dim=-1 + ) elif len(vec_embeds) > 0: - embedding = vec_embeds + embedding = torch.stack(vec_embeds, dim=-1).sum(dim=-1) elif len(vis_embeds) > 0: - embedding = vis_embeds + embedding = torch.stack(vis_embeds, dim=-1).sum(dim=-1) else: raise Exception("No valid inputs to network.") if self.use_lstm: embedding = embedding.view([sequence_length, -1, self.h_size]) memories = torch.split(memories, self.m_size // 2, dim=-1) - embedding, memories = self.lstm(embedding.contiguous(), (memories[0].contiguous(), memories[1].contiguous())) + embedding, memories = self.lstm( + embedding.contiguous(), + (memories[0].contiguous(), memories[1].contiguous()), + ) embedding = embedding.view([-1, self.m_size // 2]) memories = torch.cat(memories, dim=-1) return embedding, memories @@ -226,11 +229,13 @@ def __init__( # pylint: disable=W0231 self.h_size, ) ) - + self.vector_normalizers = nn.ModuleList(self.vector_normalizers) self.vector_encoders = nn.ModuleList(self.vector_encoders) self.visual_encoders = nn.ModuleList(self.visual_encoders) if self.use_lstm: self.lstm = nn.LSTM(self.h_size, self.m_size // 2, 1) + else: + self.lstm = None if act_type == ActionType.DISCRETE: self.q_heads = ValueHeads( stream_names, network_settings.hidden_units, sum(act_size) @@ -247,14 +252,16 @@ def forward( # pylint: disable=W0221 actions: torch.Tensor = None, ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: vec_embeds = [] - for idx, encoder in enumerate(self.vector_encoders): - vec_input = vec_inputs[idx] + for i, (enc, norm) in enumerate( + zip(self.vector_encoders, self.vector_normalizers) + ): + vec_input = vec_inputs[i] if self.normalize: - vec_input = self.vector_normalizers[idx](vec_input) + vec_input = norm(vec_input) if actions is not None: - hidden = encoder(torch.cat([vec_input, actions], axis=-1)) + hidden = enc(torch.cat([vec_input, actions], dim=-1)) else: - hidden = encoder(vec_input) + hidden = enc(vec_input) vec_embeds.append(hidden) vis_embeds = [] @@ -265,25 +272,25 @@ def forward( # pylint: disable=W0221 vis_embeds.append(hidden) # embedding = vec_embeds[0] - if len(vec_embeds) > 0: - vec_embeds = torch.stack(vec_embeds, dim=-1).sum(dim=-1) - if len(vis_embeds) > 0: - vis_embeds = torch.stack(vis_embeds, dim=-1).sum(dim=-1) if len(vec_embeds) > 0 and len(vis_embeds) > 0: - embedding = torch.stack([vec_embeds, vis_embeds], dim=-1).sum(dim=-1) + vec_embeds_tensor = torch.stack(vec_embeds, dim=-1).sum(dim=-1) + vis_embeds_tensor = torch.stack(vis_embeds, dim=-1).sum(dim=-1) + embedding = torch.stack([vec_embeds_tensor, vis_embeds_tensor], dim=-1).sum( + dim=-1 + ) elif len(vec_embeds) > 0: - embedding = vec_embeds + embedding = torch.stack(vec_embeds, dim=-1).sum(dim=-1) elif len(vis_embeds) > 0: - embedding = vis_embeds + embedding = torch.stack(vis_embeds, dim=-1).sum(dim=-1) else: raise Exception("No valid inputs to network.") - if self.use_lstm: + if self.lstm is not None: embedding = embedding.view([sequence_length, -1, self.h_size]) - memories = torch.split(memories, self.m_size // 2, dim=-1) - embedding, memories = self.lstm(embedding, memories) + memories_tensor = torch.split(memories, self.m_size // 2, dim=-1) + embedding, memories = self.lstm(embedding, memories_tensor) embedding = embedding.view([-1, self.m_size // 2]) - memories = torch.cat(memories, dim=-1) + memories = torch.cat(memories_tensor, dim=-1) output, _ = self.q_heads(embedding) return output, memories @@ -501,19 +508,17 @@ class ValueHeads(nn.Module): def __init__(self, stream_names, input_size, output_size=1): super(ValueHeads, self).__init__() self.stream_names = stream_names - self.value_heads = {} + _value_heads = {} for name in stream_names: value = nn.Linear(input_size, output_size) - self.value_heads[name] = value - self.value_heads = nn.ModuleDict(self.value_heads) + _value_heads[name] = value + self.value_heads = nn.ModuleDict(_value_heads) def forward(self, hidden): value_outputs = {} - for stream_name, _ in self.value_heads.items(): - value_outputs[stream_name] = self.value_heads[stream_name](hidden).squeeze( - -1 - ) + for stream_name, head in self.value_heads.items(): + value_outputs[stream_name] = head(hidden).squeeze(-1) return ( value_outputs, torch.mean(torch.stack(list(value_outputs.values())), dim=0), @@ -527,13 +532,10 @@ def __init__(self, input_size, hidden_size, num_layers, **kwargs): for _ in range(num_layers - 1): self.layers.append(nn.Linear(hidden_size, hidden_size)) self.layers.append(nn.ReLU()) - self.layers = nn.ModuleList(self.layers) + self.seq_layers = nn.Sequential(*self.layers) def forward(self, inputs): - x = inputs - for layer in self.layers: - x = layer(x) - return x + return self.seq_layers(inputs) def conv_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1): @@ -572,7 +574,7 @@ def forward(self, visual_obs): conv_1 = torch.relu(self.conv1(visual_obs)) conv_2 = torch.relu(self.conv2(conv_1)) # hidden = torch.relu(self.dense(conv_2.view([-1, self.final_flat]))) - hidden = torch.relu(self.dense(torch.reshape(conv_2,(-1, self.final_flat)))) + hidden = torch.relu(self.dense(torch.reshape(conv_2, (-1, self.final_flat)))) return hidden diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 0e0d43dc92..6ccb897382 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -100,7 +100,6 @@ def __init__( torch.set_default_tensor_type(torch.cuda.FloatTensor) else: torch.set_default_tensor_type(torch.FloatTensor) - self.inference_dict: Dict[str, tf.Tensor] = {} self.update_dict: Dict[str, tf.Tensor] = {} @@ -243,7 +242,6 @@ def evaluate( run_out["learning_rate"] = 0.0 if self.use_recurrent: run_out["memories"] = memories.detach().cpu().numpy() - self.actor_critic.update_normalization(vec_obs) return run_out def get_action( @@ -291,14 +289,20 @@ def load_model(self, step=0): def export_model(self, step=0): try: - fake_vec_obs = [torch.zeros([1] + [self.brain.vector_observation_space_size])] + fake_vec_obs = [ + torch.zeros([1] + [self.brain.vector_observation_space_size]) + ] fake_vis_obs = [torch.zeros([1] + [84, 84, 3])] fake_masks = torch.ones([1] + self.actor_critic.act_size) # fake_memories = torch.zeros([1] + [self.m_size]) export_path = "./model-" + str(step) + ".onnx" output_names = ["action", "action_probs"] input_names = ["vector_observation", "action_mask"] - dynamic_axes = {"vector_observation": [0], "action": [0], "action_probs": [0]} + dynamic_axes = { + "vector_observation": [0], + "action": [0], + "action_probs": [0], + } onnx.export( self.actor_critic, (fake_vec_obs, fake_vis_obs, fake_masks), diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 852f941d86..06beb0134d 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -2,14 +2,14 @@ # ## ML-Agent Learning (PPO) # Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 + class TestingConfiguration: - use_torch = False + use_torch = True max_steps = 0 env_name = "" device = "cpu" - from collections import defaultdict from typing import cast @@ -30,8 +30,6 @@ class TestingConfiguration: logger = get_logger(__name__) - - class PPOTrainer(RLTrainer): """The PPOTrainer is an implementation of the PPO algorithm.""" diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index a0a66f0814..099fc1b724 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -207,7 +207,8 @@ def sac_value_loss( discrete: bool, ) -> torch.Tensor: min_policy_qs = {} - _ent_coef = torch.exp(self._log_ent_coef) + with torch.no_grad(): + _ent_coef = torch.exp(self._log_ent_coef) for name in values.keys(): if not discrete: min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) @@ -420,12 +421,14 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: q1_out, q2_out = self.value_network(vec_obs, vis_obs, squeezed_actions) q1_stream, q2_stream = q1_out, q2_out else: - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) + with torch.no_grad(): + q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) q1_out, q2_out = self.value_network(vec_obs, vis_obs) q1_stream = self._condense_q_streams(q1_out, actions) q2_stream = self._condense_q_streams(q2_out, actions) - target_values, _ = self.target_network(next_vec_obs, next_vis_obs) + with torch.no_grad(): + target_values, _ = self.target_network(next_vec_obs, next_vis_obs) masks = list_to_tensor(batch["masks"], dtype=torch.int32) use_discrete = not self.policy.use_continuous_act @@ -439,6 +442,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: ) policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete) entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete) + self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() @@ -459,7 +463,10 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: "Losses/Value Loss": value_loss.detach().cpu().numpy(), "Losses/Q1 Loss": q1_loss.detach().cpu().numpy(), "Losses/Q2 Loss": q2_loss.detach().cpu().numpy(), - "Policy/Entropy Coeff": torch.exp(self._log_ent_coef).detach().cpu().numpy(), + "Policy/Entropy Coeff": torch.exp(self._log_ent_coef) + .detach() + .cpu() + .numpy(), } return update_stats From 4017e94408cc4c8f081b9c36808ad3d2f5a6d4c3 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 17 Jul 2020 16:05:19 -0700 Subject: [PATCH 15/16] Remove inplace squeeze --- ml-agents/mlagents/trainers/policy/torch_policy.py | 2 +- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 6ccb897382..626384a252 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -200,7 +200,7 @@ def evaluate_actions( vec_obs, vis_obs, masks, memories, seq_len ) if len(actions.shape) <= 2: - actions.unsqueeze_(-1) + actions = actions.unsqueeze(-1) action_list = [actions[..., i] for i in range(actions.shape[2])] log_probs, entropies, _ = self.actor_critic.get_probs_and_entropy( action_list, dists diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 099fc1b724..351fb837a1 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -282,7 +282,7 @@ def sac_policy_loss( _ent_coef = torch.exp(self._log_ent_coef) mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) if not discrete: - mean_q1.unsqueeze_(1) + mean_q1 = mean_q1.unsqueeze(1) batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1) policy_loss = torch.mean(loss_masks * batch_policy_loss) else: @@ -382,7 +382,6 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: ] if len(memories) > 0: memories = torch.stack(memories).unsqueeze(0) - vis_obs: List[torch.Tensor] = [] next_vis_obs: List[torch.Tensor] = [] if self.policy.use_vis_obs: @@ -405,7 +404,6 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: self.target_network.network_body.copy_normalization( self.policy.actor_critic.network_body ) - sampled_actions, log_probs, entropies, sampled_values, _ = self.policy.sample_actions( vec_obs, vis_obs, @@ -430,7 +428,6 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: with torch.no_grad(): target_values, _ = self.target_network(next_vec_obs, next_vis_obs) masks = list_to_tensor(batch["masks"], dtype=torch.int32) - use_discrete = not self.policy.use_continuous_act dones = list_to_tensor(batch["done"]) @@ -443,13 +440,14 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete) entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete) + total_value_loss = q1_loss + q2_loss + value_loss + self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() - total_value_loss = q1_loss + q2_loss + value_loss + self.value_optimizer.zero_grad() total_value_loss.backward() - self.value_optimizer.step() self.entropy_optimizer.zero_grad() @@ -468,6 +466,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: .cpu() .numpy(), } + return update_stats def update_reward_signals( From 9f70f4838f174988c767fa17bad736f78149bde8 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 20 Jul 2020 23:58:37 -0700 Subject: [PATCH 16/16] Fix dimension issue (discrete trains) --- .../mlagents/trainers/sac/optimizer_torch.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 351fb837a1..84b9cfc46e 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -170,8 +170,8 @@ def sac_q_loss( q2_losses = [] # Multiple q losses per stream for i, name in enumerate(q1_out.keys()): - q1_stream = q1_out[name] - q2_stream = q2_out[name] + q1_stream = q1_out[name].squeeze() + q2_stream = q2_out[name].squeeze() with torch.no_grad(): q_backup = rewards[name] + ( (1.0 - self.use_dones_in_backup[name] * dones) @@ -179,10 +179,10 @@ def sac_q_loss( * target_values[name] ) _q1_loss = 0.5 * torch.mean( - loss_masks * torch.pow((q_backup - q1_stream), 2) + loss_masks * torch.nn.functional.mse_loss(q_backup, q1_stream) ) _q2_loss = 0.5 * torch.mean( - loss_masks * torch.pow((q_backup - q2_stream), 2) + loss_masks * torch.nn.functional.mse_loss(q_backup, q2_stream) ) q1_losses.append(_q1_loss) @@ -244,7 +244,7 @@ def sac_value_loss( ) # print(log_probs, v_backup, _ent_coef, loss_masks) value_loss = 0.5 * torch.mean( - loss_masks * torch.pow((values[name] - v_backup), 2) + loss_masks * torch.nn.functional.mse_loss(values[name], v_backup) ) value_losses.append(value_loss) else: @@ -264,7 +264,8 @@ def sac_value_loss( branched_ent_bonus, axis=0 ) value_loss = 0.5 * torch.mean( - loss_masks * torch.pow((values[name] - v_backup), 2) + loss_masks + * torch.nn.functional.mse_loss(values[name], v_backup.squeeze()) ) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) @@ -404,7 +405,13 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: self.target_network.network_body.copy_normalization( self.policy.actor_critic.network_body ) - sampled_actions, log_probs, entropies, sampled_values, _ = self.policy.sample_actions( + ( + sampled_actions, + log_probs, + entropies, + sampled_values, + _, + ) = self.policy.sample_actions( vec_obs, vis_obs, masks=act_masks, @@ -412,7 +419,6 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: seq_len=self.policy.sequence_length, all_log_probs=not self.policy.use_continuous_act, ) - if self.policy.use_continuous_act: squeezed_actions = actions.squeeze(-1) q1p_out, q2p_out = self.value_network(vec_obs, vis_obs, sampled_actions)