diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index 44f4b202ea..d55cefbac5 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -29,6 +29,10 @@ and this project adheres to - The interaction between EnvManager and TrainerController was changed; EnvManager.advance() was split into to stages, and TrainerController now uses the results from the first stage to handle new behavior names. This change speeds up Python training by approximately 5-10%. (#4259) +- Experimental PyTorch support has been added. Use `--torch` when running `mlagents-learn`, or add +`framework: pytorch` to your trainer configuration (under the behavior name) to enable it. +Note that PyTorch 1.6.0 or greater should be installed to use this feature; see +[the PyTorch website](https://pytorch.org/) for installation instructions. (#4335) ### Minor Changes #### com.unity.ml-agents (C#) diff --git a/docs/Learning-Environment-Examples.md b/docs/Learning-Environment-Examples.md index 7f020eb533..87ea8760e9 100644 --- a/docs/Learning-Environment-Examples.md +++ b/docs/Learning-Environment-Examples.md @@ -460,7 +460,7 @@ you would like to contribute environments, please see our head, thighs, shins, feet, arms, forearms and hands. - Goal: The agents must move its body toward the goal direction without falling. - `WalkerDynamic`- Goal direction is randomized. - - `WalkerDynamicVariableSpeed`- Goal direction and walking speed are randomized. + - `WalkerDynamicVariableSpeed`- Goal direction and walking speed are randomized. - `WalkerStatic` - Goal direction is always forward. - `WalkerStaticVariableSpeed` - Goal direction is always forward. Walking speed is randomized diff --git a/ml-agents/mlagents/trainers/buffer.py b/ml-agents/mlagents/trainers/buffer.py index 87fd160d8f..9b0cf48aaa 100644 --- a/ml-agents/mlagents/trainers/buffer.py +++ b/ml-agents/mlagents/trainers/buffer.py @@ -48,7 +48,7 @@ def extend(self, data: np.ndarray) -> None: Adds a list of np.arrays to the end of the list of np.arrays. :param data: The np.array list to append. """ - self += list(np.array(data)) + self += list(np.array(data, dtype=np.float32)) def set(self, data): """ diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index 41f2c3d87a..8f82751d3f 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -168,6 +168,13 @@ def _create_parser() -> argparse.ArgumentParser: action=DetectDefaultStoreTrue, help="Forces training using CPU only", ) + argparser.add_argument( + "--torch", + default=False, + action=DetectDefaultStoreTrue, + help="(Experimental) Use the PyTorch framework instead of TensorFlow. Install PyTorch " + "before using this option", + ) eng_conf = argparser.add_argument_group(title="Engine Configuration") eng_conf.add_argument( diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index 849deeae5d..50d9aad74c 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -304,7 +304,10 @@ def save_model(self) -> None: self.trainer.save_model() def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: """ Creates policy with the wrapped trainer's create_policy function @@ -313,10 +316,10 @@ def create_policy( team are grouped. 
All policies associated with this team are added to the wrapped trainer to be trained. """ - policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec) - policy.create_tf_graph() + policy = self.trainer.create_policy( + parsed_behavior_id, behavior_spec, create_graph=True + ) self.trainer.saver.initialize_or_load(policy) - policy.init_load_weights() team_id = parsed_behavior_id.team_id self.controller.subscribe_team_id(team_id, self) @@ -326,7 +329,6 @@ def create_policy( parsed_behavior_id, behavior_spec ) self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy) - internal_trainer_policy.init_load_weights() self.current_policy_snapshot[ parsed_behavior_id.brain_name ] = internal_trainer_policy.get_weights() diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py new file mode 100644 index 0000000000..4cba4c9a2b --- /dev/null +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -0,0 +1,94 @@ +from typing import Dict, Optional, Tuple, List +import torch +import numpy as np + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.trajectory import SplitObservations +from mlagents.trainers.torch.components.bc.module import BCModule +from mlagents.trainers.torch.components.reward_providers import create_reward_provider + +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer import Optimizer +from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.torch.utils import ModelUtils + + +class TorchOptimizer(Optimizer): # pylint: disable=W0223 + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + super().__init__() + self.policy = policy + self.trainer_settings = trainer_settings + self.update_dict: Dict[str, torch.Tensor] = {} + self.value_heads: Dict[str, torch.Tensor] = {} + self.memory_in: torch.Tensor = None + self.memory_out: torch.Tensor = None + self.m_size: int = 0 + self.global_step = torch.tensor(0) + self.bc_module: Optional[BCModule] = None + self.create_reward_signals(trainer_settings.reward_signals) + if trainer_settings.behavioral_cloning is not None: + self.bc_module = BCModule( + self.policy, + trainer_settings.behavioral_cloning, + policy_learning_rate=trainer_settings.hyperparameters.learning_rate, + default_batch_size=trainer_settings.hyperparameters.batch_size, + default_num_epoch=3, + ) + + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + pass + + def create_reward_signals(self, reward_signal_configs): + """ + Create reward signals + :param reward_signal_configs: Reward signal config. 
+ """ + for reward_signal, settings in reward_signal_configs.items(): + # Name reward signals by string in case we have duplicates later + self.reward_signals[reward_signal.value] = create_reward_provider( + reward_signal, self.policy.behavior_spec, settings + ) + + def get_trajectory_value_estimates( + self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool + ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]: + vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])] + if self.policy.use_vis_obs: + visual_obs = [] + for idx, _ in enumerate( + self.policy.actor_critic.network_body.visual_encoders + ): + visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx]) + visual_obs.append(visual_ob) + else: + visual_obs = [] + + memory = torch.zeros([1, 1, self.policy.m_size]) + + vec_vis_obs = SplitObservations.from_observations(next_obs) + next_vec_obs = [ + ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0) + ] + next_vis_obs = [ + ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0) + for _vis_ob in vec_vis_obs.visual_observations + ] + + value_estimates, next_memory = self.policy.actor_critic.critic_pass( + vector_obs, visual_obs, memory, sequence_length=batch.num_experiences + ) + + next_value_estimate, _ = self.policy.actor_critic.critic_pass( + next_vec_obs, next_vis_obs, next_memory, sequence_length=1 + ) + + for name, estimate in value_estimates.items(): + value_estimates[name] = estimate.detach().cpu().numpy() + next_value_estimate[name] = next_value_estimate[name].detach().cpu().numpy() + + if done: + for k in next_value_estimate: + if not self.reward_signals[k].ignore_done: + next_value_estimate[k] = 0.0 + + return value_estimates, next_value_estimate diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index 707023ab3b..47789d1e92 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -152,6 +152,8 @@ def create_tf_graph(self) -> None: # We do an initialize to make the Policy usable out of the box. 
If an optimizer is needed, # it will re-load the full graph self.initialize() + # Create assignment ops for Ghost Trainer + self.init_load_weights() def _create_encoder( self, diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py new file mode 100644 index 0000000000..8c61bc37bd --- /dev/null +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -0,0 +1,281 @@ +from typing import Any, Dict, List, Tuple, Optional +import numpy as np +import torch +import copy + +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.behavior_id_utils import get_global_agent_id +from mlagents.trainers.policy import Policy +from mlagents_envs.base_env import DecisionSteps, BehaviorSpec +from mlagents_envs.timers import timed + +from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.trajectory import SplitObservations +from mlagents.trainers.torch.networks import ( + SharedActorCritic, + SeparateActorCritic, + GlobalSteps, +) +from mlagents.trainers.torch.utils import ModelUtils + +EPSILON = 1e-7 # Small value to avoid divide by zero + + +class TorchPolicy(Policy): + def __init__( + self, + seed: int, + behavior_spec: BehaviorSpec, + trainer_settings: TrainerSettings, + tanh_squash: bool = False, + reparameterize: bool = False, + separate_critic: bool = True, + condition_sigma_on_obs: bool = True, + ): + """ + Policy that uses a multilayer perceptron to map the observations to actions. Could + also use a CNN to encode visual input prior to the MLP. Supports discrete and + continuous action spaces, as well as recurrent networks. + :param seed: Random seed. + :param brain: Assigned BrainParameters object. + :param trainer_settings: Defined training parameters. + :param load: Whether a pre-trained model will be loaded or a new one created. + :param tanh_squash: Whether to use a tanh function on the continuous output, + or a clipped output. + :param reparameterize: Whether we are using the resampling trick to update the policy + in continuous output. + """ + super().__init__( + seed, + behavior_spec, + trainer_settings, + tanh_squash, + reparameterize, + condition_sigma_on_obs, + ) + self.global_step = ( + GlobalSteps() + ) # could be much simpler if TorchPolicy is nn.Module + self.grads = None + + torch.set_default_tensor_type(torch.FloatTensor) + + reward_signal_configs = trainer_settings.reward_signals + reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] + + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + if separate_critic: + ac_class = SeparateActorCritic + else: + ac_class = SharedActorCritic + self.actor_critic = ac_class( + observation_shapes=self.behavior_spec.observation_shapes, + network_settings=trainer_settings.network_settings, + act_type=behavior_spec.action_type, + act_size=self.act_size, + stream_names=reward_signal_names, + conditional_sigma=self.condition_sigma_on_obs, + tanh_squash=tanh_squash, + ) + # Save the m_size needed for export + self._export_m_size = self.m_size + # m_size needed for training is determined by network, not trainer settings + self.m_size = self.actor_critic.memory_size + + self.actor_critic.to("cpu") + + @property + def export_memory_size(self) -> int: + """ + Returns the memory size of the exported ONNX policy. This only includes the memory + of the Actor and not any auxillary networks. 
+ """ + return self._export_m_size + + def _split_decision_step( + self, decision_requests: DecisionSteps + ) -> Tuple[SplitObservations, np.ndarray]: + vec_vis_obs = SplitObservations.from_observations(decision_requests.obs) + mask = None + if not self.use_continuous_act: + mask = torch.ones([len(decision_requests), np.sum(self.act_size)]) + if decision_requests.action_mask is not None: + mask = torch.as_tensor( + 1 - np.concatenate(decision_requests.action_mask, axis=1) + ) + return vec_vis_obs, mask + + def update_normalization(self, vector_obs: np.ndarray) -> None: + """ + If this policy normalizes vector observations, this will update the norm values in the graph. + :param vector_obs: The vector observations to add to the running estimate of the distribution. + """ + vector_obs = [torch.as_tensor(vector_obs)] + if self.use_vec_obs and self.normalize: + self.actor_critic.update_normalization(vector_obs) + + @timed + def sample_actions( + self, + vec_obs: List[torch.Tensor], + vis_obs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + seq_len: int = 1, + all_log_probs: bool = False, + ) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor + ]: + """ + :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action. + """ + dists, value_heads, memories = self.actor_critic.get_dist_and_value( + vec_obs, vis_obs, masks, memories, seq_len + ) + action_list = self.actor_critic.sample_action(dists) + log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy( + action_list, dists + ) + actions = torch.stack(action_list, dim=-1) + if self.use_continuous_act: + actions = actions[:, :, 0] + else: + actions = actions[:, 0, :] + + return ( + actions, + all_logs if all_log_probs else log_probs, + entropies, + value_heads, + memories, + ) + + def evaluate_actions( + self, + vec_obs: torch.Tensor, + vis_obs: torch.Tensor, + actions: torch.Tensor, + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + seq_len: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]: + dists, value_heads, _ = self.actor_critic.get_dist_and_value( + vec_obs, vis_obs, masks, memories, seq_len + ) + action_list = [actions[..., i] for i in range(actions.shape[-1])] + log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists) + + return log_probs, entropies, value_heads + + @timed + def evaluate( + self, decision_requests: DecisionSteps, global_agent_ids: List[str] + ) -> Dict[str, Any]: + """ + Evaluates policy for the agent experiences provided. + :param global_agent_ids: + :param decision_requests: DecisionStep object containing inputs. + :return: Outputs from network as defined by self.inference_dict. 
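+        For the PyTorch policy, these are the entries of the run_out dictionary (action, log_probs, entropy, value_heads, ...).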
+ """ + vec_vis_obs, masks = self._split_decision_step(decision_requests) + vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)] + vis_obs = [ + torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations + ] + memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze( + 0 + ) + + run_out = {} + with torch.no_grad(): + action, log_probs, entropy, value_heads, memories = self.sample_actions( + vec_obs, vis_obs, masks=masks, memories=memories + ) + run_out["action"] = action.detach().cpu().numpy() + run_out["pre_action"] = action.detach().cpu().numpy() + # Todo - make pre_action difference + run_out["log_probs"] = log_probs.detach().cpu().numpy() + run_out["entropy"] = entropy.detach().cpu().numpy() + run_out["value_heads"] = { + name: t.detach().cpu().numpy() for name, t in value_heads.items() + } + run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0) + run_out["learning_rate"] = 0.0 + if self.use_recurrent: + run_out["memory_out"] = memories.detach().cpu().numpy().squeeze(0) + return run_out + + def get_action( + self, decision_requests: DecisionSteps, worker_id: int = 0 + ) -> ActionInfo: + """ + Decides actions given observations information, and takes them in environment. + :param worker_id: + :param decision_requests: A dictionary of brain names and BrainInfo from environment. + :return: an ActionInfo containing action, memories, values and an object + to be passed to add experiences + """ + if len(decision_requests) == 0: + return ActionInfo.empty() + + global_agent_ids = [ + get_global_agent_id(worker_id, int(agent_id)) + for agent_id in decision_requests.agent_id + ] # For 1-D array, the iterator order is correct. + + run_out = self.evaluate( + decision_requests, global_agent_ids + ) # pylint: disable=assignment-from-no-return + self.save_memories(global_agent_ids, run_out.get("memory_out")) + return ActionInfo( + action=run_out.get("action"), + value=run_out.get("value"), + outputs=run_out, + agent_ids=list(decision_requests.agent_id), + ) + + @property + def use_vis_obs(self): + return self.vis_obs_size > 0 + + @property + def use_vec_obs(self): + return self.vec_obs_size > 0 + + def get_current_step(self): + """ + Gets current model step. + :return: current model step. + """ + return self.global_step.current_step + + def set_step(self, step: int) -> int: + """ + Sets current model step to step without creating additional ops. + :param step: Step to set the current model step to. + :return: The step the model was set to. + """ + self.global_step.current_step = step + return step + + def increment_step(self, n_steps): + """ + Increments model step. 
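+        :param n_steps: Number of steps to increment the model step by.
+        :return: The updated model step.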
+ """ + self.global_step.increment(n_steps) + return self.get_current_step() + + def load_weights(self, values: List[np.ndarray]) -> None: + self.actor_critic.load_state_dict(values) + + def init_load_weights(self) -> None: + pass + + def get_weights(self) -> List[np.ndarray]: + return copy.deepcopy(self.actor_critic.state_dict()) + + def get_modules(self): + return {"Policy": self.actor_critic, "global_step": self.global_step} diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py new file mode 100644 index 0000000000..7fc27cb44f --- /dev/null +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -0,0 +1,203 @@ +from typing import Dict, cast +import torch + +from mlagents.trainers.buffer import AgentBuffer + +from mlagents_envs.timers import timed +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.settings import TrainerSettings, PPOSettings +from mlagents.trainers.torch.utils import ModelUtils + + +class TorchPPOOptimizer(TorchOptimizer): + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + """ + Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy. + The PPO optimizer has a value estimator and a loss function. + :param policy: A TFPolicy object that will be updated by this PPO Optimizer. + :param trainer_params: Trainer parameters dictionary that specifies the + properties of the trainer. + """ + # Create the graph here to give more granular control of the TF graph to the Optimizer. + + super().__init__(policy, trainer_settings) + params = list(self.policy.actor_critic.parameters()) + self.hyperparameters: PPOSettings = cast( + PPOSettings, trainer_settings.hyperparameters + ) + self.decay_learning_rate = ModelUtils.DecayedValue( + self.hyperparameters.learning_rate_schedule, + self.hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.decay_epsilon = ModelUtils.DecayedValue( + self.hyperparameters.learning_rate_schedule, + self.hyperparameters.epsilon, + 0.1, + self.trainer_settings.max_steps, + ) + self.decay_beta = ModelUtils.DecayedValue( + self.hyperparameters.learning_rate_schedule, + self.hyperparameters.beta, + 1e-5, + self.trainer_settings.max_steps, + ) + + self.optimizer = torch.optim.Adam( + params, lr=self.trainer_settings.hyperparameters.learning_rate + ) + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + + self.stream_names = list(self.reward_signals.keys()) + + def ppo_value_loss( + self, + values: Dict[str, torch.Tensor], + old_values: Dict[str, torch.Tensor], + returns: Dict[str, torch.Tensor], + epsilon: float, + loss_masks: torch.Tensor, + ) -> torch.Tensor: + """ + Evaluates value loss for PPO. + :param values: Value output of the current network. + :param old_values: Value stored with experiences in buffer. + :param returns: Computed returns. + :param epsilon: Clipping value for value estimate. + :param loss_mask: Mask for losses. Used with LSTM to ignore 0'ed out experiences. 
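+        :return: The mean of the clipped value losses across all reward-signal value streams.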
+ """ + value_losses = [] + for name, head in values.items(): + old_val_tensor = old_values[name] + returns_tensor = returns[name] + clipped_value_estimate = old_val_tensor + torch.clamp( + head - old_val_tensor, -1 * epsilon, epsilon + ) + v_opt_a = (returns_tensor - head) ** 2 + v_opt_b = (returns_tensor - clipped_value_estimate) ** 2 + value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks) + value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + return value_loss + + def ppo_policy_loss( + self, + advantages: torch.Tensor, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + loss_masks: torch.Tensor, + ) -> torch.Tensor: + """ + Evaluate PPO policy loss. + :param advantages: Computed advantages. + :param log_probs: Current policy probabilities + :param old_log_probs: Past policy probabilities + :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences. + """ + advantage = advantages.unsqueeze(-1) + + decay_epsilon = self.hyperparameters.epsilon + + r_theta = torch.exp(log_probs - old_log_probs) + p_opt_a = r_theta * advantage + p_opt_b = ( + torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage + ) + policy_loss = -1 * ModelUtils.masked_mean( + torch.min(p_opt_a, p_opt_b), loss_masks + ) + return policy_loss + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Performs update on model. + :param batch: Batch of experiences. + :param num_sequences: Number of sequences to process. + :return: Results of update. + """ + # Get decayed parameters + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step()) + decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) + returns = {} + old_values = {} + for name in self.reward_signals: + old_values[name] = ModelUtils.list_to_tensor( + batch[f"{name}_value_estimates"] + ) + returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"]) + + vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])] + act_masks = ModelUtils.list_to_tensor(batch["action_mask"]) + if self.policy.use_continuous_act: + actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) + else: + actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) + + memories = [ + ModelUtils.list_to_tensor(batch["memory"][i]) + for i in range(0, len(batch["memory"]), self.policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + + if self.policy.use_vis_obs: + vis_obs = [] + for idx, _ in enumerate( + self.policy.actor_critic.network_body.visual_encoders + ): + vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx]) + vis_obs.append(vis_ob) + else: + vis_obs = [] + log_probs, entropy, values = self.policy.evaluate_actions( + vec_obs, + vis_obs, + masks=act_masks, + actions=actions, + memories=memories, + seq_len=self.policy.sequence_length, + ) + loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) + value_loss = self.ppo_value_loss( + values, old_values, returns, decay_eps, loss_masks + ) + policy_loss = self.ppo_policy_loss( + ModelUtils.list_to_tensor(batch["advantages"]), + log_probs, + ModelUtils.list_to_tensor(batch["action_probs"]), + loss_masks, + ) + loss = ( + policy_loss + + 0.5 * value_loss + - decay_bet * ModelUtils.masked_mean(entropy, loss_masks) + ) + + # Set optimizer learning rate + 
ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.optimizer.zero_grad() + loss.backward() + + self.optimizer.step() + update_stats = { + "Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()), + "Losses/Value Loss": value_loss.detach().cpu().numpy(), + "Policy/Learning Rate": decay_lr, + "Policy/Epsilon": decay_eps, + "Policy/Beta": decay_bet, + } + + for reward_provider in self.reward_signals.values(): + update_stats.update(reward_provider.update(batch)) + + return update_stats + + def get_modules(self): + return {"Optimizer": self.optimizer} diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 0a46f45b8e..c2af0d2058 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -15,7 +15,15 @@ from mlagents.trainers.ppo.optimizer import PPOOptimizer from mlagents.trainers.trajectory import Trajectory from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers -from mlagents.trainers.settings import TrainerSettings, PPOSettings +from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType +from mlagents.trainers.components.reward_signals import RewardSignal + +try: + from mlagents.trainers.policy.torch_policy import TorchPolicy + from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer +except ModuleNotFoundError: + TorchPolicy = None # type: ignore + TorchPPOOptimizer = None # type: ignore logger = get_logger(__name__) @@ -73,20 +81,33 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: trajectory.next_obs, trajectory.done_reached and not trajectory.interrupted, ) + for name, v in value_estimates.items(): agent_buffer_trajectory[f"{name}_value_estimates"].extend(v) - self._stats_reporter.add_stat( - self.optimizer.reward_signals[name].value_name, np.mean(v) - ) + if isinstance(self.optimizer.reward_signals[name], RewardSignal): + self._stats_reporter.add_stat( + self.optimizer.reward_signals[name].value_name, np.mean(v) + ) + else: + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate", + np.mean(v), + ) # Evaluate all reward functions self.collected_rewards["environment"][agent_id] += np.sum( agent_buffer_trajectory["environment_rewards"] ) for name, reward_signal in self.optimizer.reward_signals.items(): - evaluate_result = reward_signal.evaluate_batch( - agent_buffer_trajectory - ).scaled_reward + if isinstance(reward_signal, RewardSignal): + evaluate_result = reward_signal.evaluate_batch( + agent_buffer_trajectory + ).scaled_reward + else: + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) + * reward_signal.strength + ) agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result) # Report the reward signals self.collected_rewards[name][agent_id] += np.sum(evaluate_result) @@ -101,6 +122,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: local_value_estimates = agent_buffer_trajectory[ f"{name}_value_estimates" ].get_batch() + local_advantage = get_gae( rewards=local_rewards, value_estimates=local_value_estimates, @@ -187,12 +209,17 @@ def _update_policy(self): self._clear_update_buffer() return True - def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + def create_tf_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> TFPolicy: """ - Creates a PPO policy to trainers list of policies. 
+ Creates a policy with a Tensorflow backend and PPO hyperparameters + :param parsed_behavior_id: :param behavior_spec: specifications for policy construction + :param create_graph: whether to create the Tensorflow graph on construction :return policy """ policy = TFPolicy( @@ -200,13 +227,37 @@ def create_policy( behavior_spec, self.trainer_settings, condition_sigma_on_obs=False, # Faster training for PPO - create_tf_graph=False, # We will create the TF graph in the Optimizer + create_tf_graph=create_graph, ) + return policy + def create_torch_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and PPO hyperparameters + :param parsed_behavior_id: + :param behavior_spec: specifications for policy construction + :return policy + """ + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings, + condition_sigma_on_obs=False, # Faster training for PPO + separate_critic=behavior_spec.is_action_continuous(), + ) return policy def create_ppo_optimizer(self) -> PPOOptimizer: - return PPOOptimizer(cast(TFPolicy, self.policy), self.trainer_settings) + if self.framework == FrameworkType.PYTORCH: + return TorchPPOOptimizer( # type: ignore + cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore + else: + return PPOOptimizer( # type: ignore + cast(TFPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore def add_policy( self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy @@ -225,6 +276,7 @@ def add_policy( ) self.policy = policy self.policies[parsed_behavior_id.behavior_id] = policy + self.optimizer = self.create_ppo_optimizer() for _reward_signal in self.optimizer.reward_signals.keys(): self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py new file mode 100644 index 0000000000..9ca71be3bf --- /dev/null +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -0,0 +1,561 @@ +import numpy as np +from typing import Dict, List, Mapping, cast, Tuple, Optional +import torch +from torch import nn +import attr + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import ActionType +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.torch.networks import ValueNetwork +from mlagents.trainers.torch.utils import ModelUtils +from mlagents.trainers.buffer import AgentBuffer +from mlagents_envs.timers import timed +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import TrainerSettings, SACSettings + +EPSILON = 1e-6 # Small value to avoid divide by zero + +logger = get_logger(__name__) + + +class TorchSACOptimizer(TorchOptimizer): + class PolicyValueNetwork(nn.Module): + def __init__( + self, + stream_names: List[str], + observation_shapes: List[Tuple[int, ...]], + network_settings: NetworkSettings, + act_type: ActionType, + act_size: List[int], + ): + super().__init__() + if act_type == ActionType.CONTINUOUS: + num_value_outs = 1 + num_action_ins = sum(act_size) + else: + num_value_outs = sum(act_size) + num_action_ins = 0 + self.q1_network = ValueNetwork( + stream_names, + observation_shapes, + network_settings, + num_action_ins, + num_value_outs, + ) + self.q2_network = 
ValueNetwork( + stream_names, + observation_shapes, + network_settings, + num_action_ins, + num_value_outs, + ) + + def forward( + self, + vec_inputs: List[torch.Tensor], + vis_inputs: List[torch.Tensor], + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + q1_out, _ = self.q1_network( + vec_inputs, + vis_inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + q2_out, _ = self.q2_network( + vec_inputs, + vis_inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + return q1_out, q2_out + + def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): + super().__init__(policy, trainer_params) + hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters) + self.tau = hyperparameters.tau + self.init_entcoef = hyperparameters.init_entcoef + + self.policy = policy + self.act_size = policy.act_size + policy_network_settings = policy.network_settings + + self.tau = hyperparameters.tau + self.burn_in_ratio = 0.0 + + # Non-exposed SAC parameters + self.discrete_target_entropy_scale = 0.2 # Roughly equal to e-greedy 0.05 + self.continuous_target_entropy_scale = 1.0 + + self.stream_names = list(self.reward_signals.keys()) + # Use to reduce "survivor bonus" when using Curiosity or GAIL. + self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()] + self.use_dones_in_backup = { + name: int(not self.reward_signals[name].ignore_done) + for name in self.stream_names + } + + # Critics should have 1/2 of the memory of the policy + critic_memory = policy_network_settings.memory + if critic_memory is not None: + critic_memory = attr.evolve( + critic_memory, memory_size=critic_memory.memory_size // 2 + ) + value_network_settings = attr.evolve( + policy_network_settings, memory=critic_memory + ) + + self.value_network = TorchSACOptimizer.PolicyValueNetwork( + self.stream_names, + self.policy.behavior_spec.observation_shapes, + value_network_settings, + self.policy.behavior_spec.action_type, + self.act_size, + ) + + self.target_network = ValueNetwork( + self.stream_names, + self.policy.behavior_spec.observation_shapes, + value_network_settings, + ) + self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0) + + self._log_ent_coef = torch.nn.Parameter( + torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))), + requires_grad=True, + ) + if self.policy.use_continuous_act: + self.target_entropy = torch.as_tensor( + -1 + * self.continuous_target_entropy_scale + * np.prod(self.act_size[0]).astype(np.float32) + ) + else: + self.target_entropy = [ + self.discrete_target_entropy_scale * np.log(i).astype(np.float32) + for i in self.act_size + ] + + policy_params = list(self.policy.actor_critic.network_body.parameters()) + list( + self.policy.actor_critic.distribution.parameters() + ) + value_params = list(self.value_network.parameters()) + list( + self.policy.actor_critic.critic.parameters() + ) + + logger.debug("value_vars") + for param in value_params: + logger.debug(param.shape) + logger.debug("policy_vars") + for param in policy_params: + logger.debug(param.shape) + + self.decay_learning_rate = ModelUtils.DecayedValue( + hyperparameters.learning_rate_schedule, + hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.policy_optimizer = torch.optim.Adam( + policy_params, lr=hyperparameters.learning_rate + ) + 
self.value_optimizer = torch.optim.Adam( + value_params, lr=hyperparameters.learning_rate + ) + self.entropy_optimizer = torch.optim.Adam( + [self._log_ent_coef], lr=hyperparameters.learning_rate + ) + + def sac_q_loss( + self, + q1_out: Dict[str, torch.Tensor], + q2_out: Dict[str, torch.Tensor], + target_values: Dict[str, torch.Tensor], + dones: torch.Tensor, + rewards: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + q1_losses = [] + q2_losses = [] + # Multiple q losses per stream + for i, name in enumerate(q1_out.keys()): + q1_stream = q1_out[name].squeeze() + q2_stream = q2_out[name].squeeze() + with torch.no_grad(): + q_backup = rewards[name] + ( + (1.0 - self.use_dones_in_backup[name] * dones) + * self.gammas[i] + * target_values[name] + ) + _q1_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q1_stream), loss_masks + ) + _q2_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q2_stream), loss_masks + ) + + q1_losses.append(_q1_loss) + q2_losses.append(_q2_loss) + q1_loss = torch.mean(torch.stack(q1_losses)) + q2_loss = torch.mean(torch.stack(q2_losses)) + return q1_loss, q2_loss + + def soft_update(self, source: nn.Module, target: nn.Module, tau: float) -> None: + for source_param, target_param in zip(source.parameters(), target.parameters()): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def sac_value_loss( + self, + log_probs: torch.Tensor, + values: Dict[str, torch.Tensor], + q1p_out: Dict[str, torch.Tensor], + q2p_out: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + discrete: bool, + ) -> torch.Tensor: + min_policy_qs = {} + with torch.no_grad(): + _ent_coef = torch.exp(self._log_ent_coef) + for name in values.keys(): + if not discrete: + min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) + else: + action_probs = log_probs.exp() + _branched_q1p = ModelUtils.break_into_branches( + q1p_out[name] * action_probs, self.act_size + ) + _branched_q2p = ModelUtils.break_into_branches( + q2p_out[name] * action_probs, self.act_size + ) + _q1p_mean = torch.mean( + torch.stack( + [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p] + ), + dim=0, + ) + _q2p_mean = torch.mean( + torch.stack( + [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p] + ), + dim=0, + ) + + min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean) + + value_losses = [] + if not discrete: + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.sum( + _ent_coef * log_probs, dim=1 + ) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup), loss_masks + ) + value_losses.append(value_loss) + else: + branched_per_action_ent = ModelUtils.break_into_branches( + log_probs * log_probs.exp(), self.act_size + ) + # We have to do entropy bonus per action branch + branched_ent_bonus = torch.stack( + [ + torch.sum(_ent_coef[i] * _lp, dim=1, keepdim=True) + for i, _lp in enumerate(branched_per_action_ent) + ] + ) + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.mean( + branched_ent_bonus, axis=0 + ) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup.squeeze()), + loss_masks, + ) + value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + if torch.isinf(value_loss).any() or torch.isnan(value_loss).any(): + raise UnityTrainerException("Inf found") + 
return value_loss + + def sac_policy_loss( + self, + log_probs: torch.Tensor, + q1p_outs: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + discrete: bool, + ) -> torch.Tensor: + _ent_coef = torch.exp(self._log_ent_coef) + mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) + if not discrete: + mean_q1 = mean_q1.unsqueeze(1) + batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1) + policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks) + else: + action_probs = log_probs.exp() + branched_per_action_ent = ModelUtils.break_into_branches( + log_probs * action_probs, self.act_size + ) + branched_q_term = ModelUtils.break_into_branches( + mean_q1 * action_probs, self.act_size + ) + branched_policy_loss = torch.stack( + [ + torch.sum(_ent_coef[i] * _lp - _qt, dim=1, keepdim=True) + for i, (_lp, _qt) in enumerate( + zip(branched_per_action_ent, branched_q_term) + ) + ] + ) + batch_policy_loss = torch.squeeze(branched_policy_loss) + policy_loss = torch.mean(loss_masks * batch_policy_loss) + return policy_loss + + def sac_entropy_loss( + self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool + ) -> torch.Tensor: + if not discrete: + with torch.no_grad(): + target_current_diff = torch.sum(log_probs + self.target_entropy, dim=1) + entropy_loss = -torch.mean( + self._log_ent_coef * loss_masks * target_current_diff + ) + else: + with torch.no_grad(): + branched_per_action_ent = ModelUtils.break_into_branches( + log_probs * log_probs.exp(), self.act_size + ) + target_current_diff_branched = torch.stack( + [ + torch.sum(_lp, axis=1, keepdim=True) + _te + for _lp, _te in zip( + branched_per_action_ent, self.target_entropy + ) + ], + axis=1, + ) + target_current_diff = torch.squeeze( + target_current_diff_branched, axis=2 + ) + entropy_loss = -1 * ModelUtils.masked_mean( + torch.mean(self._log_ent_coef * target_current_diff, axis=1), loss_masks + ) + + return entropy_loss + + def _condense_q_streams( + self, q_output: Dict[str, torch.Tensor], discrete_actions: torch.Tensor + ) -> Dict[str, torch.Tensor]: + condensed_q_output = {} + onehot_actions = ModelUtils.actions_to_onehot(discrete_actions, self.act_size) + for key, item in q_output.items(): + branched_q = ModelUtils.break_into_branches(item, self.act_size) + only_action_qs = torch.stack( + [ + torch.sum(_act * _q, dim=1, keepdim=True) + for _act, _q in zip(onehot_actions, branched_q) + ] + ) + + condensed_q_output[key] = torch.mean(only_action_qs, dim=0) + return condensed_q_output + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Updates model using buffer. + :param num_sequences: Number of trajectories in batch. + :param batch: Experience mini-batch. + :param update_target: Whether or not to update target value network + :param reward_signal_batches: Minibatches to use for updating the reward signals, + indexed by name. If none, don't update the reward signals. + :return: Output from update process. 
+ """ + rewards = {} + for name in self.reward_signals: + rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"]) + + vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])] + next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])] + act_masks = ModelUtils.list_to_tensor(batch["action_mask"]) + if self.policy.use_continuous_act: + actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) + else: + actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) + + memories_list = [ + ModelUtils.list_to_tensor(batch["memory"][i]) + for i in range(0, len(batch["memory"]), self.policy.sequence_length) + ] + # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true. + offset = 1 if self.policy.sequence_length > 1 else 0 + next_memories_list = [ + ModelUtils.list_to_tensor( + batch["memory"][i][self.policy.m_size // 2 :] + ) # only pass value part of memory to target network + for i in range(offset, len(batch["memory"]), self.policy.sequence_length) + ] + + if len(memories_list) > 0: + memories = torch.stack(memories_list).unsqueeze(0) + next_memories = torch.stack(next_memories_list).unsqueeze(0) + else: + memories = None + next_memories = None + # Q network memories are 0'ed out, since we don't have them during inference. + q_memories = ( + torch.zeros_like(next_memories) if next_memories is not None else None + ) + + vis_obs: List[torch.Tensor] = [] + next_vis_obs: List[torch.Tensor] = [] + if self.policy.use_vis_obs: + vis_obs = [] + for idx, _ in enumerate( + self.policy.actor_critic.network_body.visual_encoders + ): + vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx]) + vis_obs.append(vis_ob) + next_vis_ob = ModelUtils.list_to_tensor( + batch["next_visual_obs%d" % idx] + ) + next_vis_obs.append(next_vis_ob) + + # Copy normalizers from policy + self.value_network.q1_network.network_body.copy_normalization( + self.policy.actor_critic.network_body + ) + self.value_network.q2_network.network_body.copy_normalization( + self.policy.actor_critic.network_body + ) + self.target_network.network_body.copy_normalization( + self.policy.actor_critic.network_body + ) + ( + sampled_actions, + log_probs, + entropies, + sampled_values, + _, + ) = self.policy.sample_actions( + vec_obs, + vis_obs, + masks=act_masks, + memories=memories, + seq_len=self.policy.sequence_length, + all_log_probs=not self.policy.use_continuous_act, + ) + if self.policy.use_continuous_act: + squeezed_actions = actions.squeeze(-1) + q1p_out, q2p_out = self.value_network( + vec_obs, + vis_obs, + sampled_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_out, q2_out = self.value_network( + vec_obs, + vis_obs, + squeezed_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_stream, q2_stream = q1_out, q2_out + else: + with torch.no_grad(): + q1p_out, q2p_out = self.value_network( + vec_obs, + vis_obs, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_out, q2_out = self.value_network( + vec_obs, + vis_obs, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_stream = self._condense_q_streams(q1_out, actions) + q2_stream = self._condense_q_streams(q2_out, actions) + + with torch.no_grad(): + target_values, _ = self.target_network( + next_vec_obs, + next_vis_obs, + memories=next_memories, + sequence_length=self.policy.sequence_length, + ) + masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) + use_discrete = 
not self.policy.use_continuous_act + dones = ModelUtils.list_to_tensor(batch["done"]) + + q1_loss, q2_loss = self.sac_q_loss( + q1_stream, q2_stream, target_values, dones, rewards, masks + ) + value_loss = self.sac_value_loss( + log_probs, sampled_values, q1p_out, q2p_out, masks, use_discrete + ) + policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete) + entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete) + + total_value_loss = q1_loss + q2_loss + value_loss + + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr) + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + ModelUtils.update_learning_rate(self.value_optimizer, decay_lr) + self.value_optimizer.zero_grad() + total_value_loss.backward() + self.value_optimizer.step() + + ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr) + self.entropy_optimizer.zero_grad() + entropy_loss.backward() + self.entropy_optimizer.step() + + # Update target network + self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau) + update_stats = { + "Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()), + "Losses/Value Loss": value_loss.detach().cpu().numpy(), + "Losses/Q1 Loss": q1_loss.detach().cpu().numpy(), + "Losses/Q2 Loss": q2_loss.detach().cpu().numpy(), + "Policy/Entropy Coeff": torch.exp(self._log_ent_coef) + .detach() + .cpu() + .numpy(), + "Policy/Learning Rate": decay_lr, + } + + for signal in self.reward_signals.values(): + signal.update(batch) + + return update_stats + + def update_reward_signals( + self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int + ) -> Dict[str, float]: + return {} + + def get_modules(self): + return { + "Optimizer:value_network": self.value_network, + "Optimizer:target_network": self.target_network, + "Optimizer:policy_optimizer": self.policy_optimizer, + "Optimizer:value_optimizer": self.value_optimizer, + "Optimizer:entropy_optimizer": self.entropy_optimizer, + } diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index 6cf0ffbd7f..6be7eb9524 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -18,8 +18,15 @@ from mlagents.trainers.trainer.rl_trainer import RLTrainer from mlagents.trainers.trajectory import Trajectory, SplitObservations from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers -from mlagents.trainers.settings import TrainerSettings, SACSettings +from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType +from mlagents.trainers.components.reward_signals import RewardSignal +try: + from mlagents.trainers.policy.torch_policy import TorchPolicy + from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer +except ModuleNotFoundError: + TorchPolicy = None # type: ignore + TorchSACOptimizer = None # type: ignore logger = get_logger(__name__) @@ -136,9 +143,15 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: agent_buffer_trajectory["environment_rewards"] ) for name, reward_signal in self.optimizer.reward_signals.items(): - evaluate_result = reward_signal.evaluate_batch( - agent_buffer_trajectory - ).scaled_reward + if isinstance(reward_signal, RewardSignal): + evaluate_result = reward_signal.evaluate_batch( + agent_buffer_trajectory + ).scaled_reward + else: + evaluate_result = ( + 
reward_signal.evaluate(agent_buffer_trajectory) + * reward_signal.strength + ) # Report the reward signals self.collected_rewards[name][agent_id] += np.sum(evaluate_result) @@ -147,9 +160,15 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached ) for name, v in value_estimates.items(): - self._stats_reporter.add_stat( - self.optimizer.reward_signals[name].value_name, np.mean(v) - ) + if isinstance(self.optimizer.reward_signals[name], RewardSignal): + self._stats_reporter.add_stat( + self.optimizer.reward_signals[name].value_name, np.mean(v) + ) + else: + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value", + np.mean(v), + ) # Bootstrap using the last step rather than the bootstrap step if max step is reached. # Set last element to duplicate obs and remove dones. @@ -193,17 +212,7 @@ def _update_policy(self) -> bool: self._update_reward_signals() return policy_was_updated - def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec - ) -> TFPolicy: - policy = TFPolicy( - self.seed, - behavior_spec, - self.trainer_settings, - tanh_squash=True, - reparameterize=True, - create_tf_graph=False, - ) + def maybe_load_replay_buffer(self): # Load the replay buffer if load if self.load and self.checkpoint_replay_buffer: try: @@ -218,6 +227,48 @@ def create_policy( ) ) + def create_tf_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, + ) -> TFPolicy: + """ + Creates a policy with a Tensorflow backend and SAC hyperparameters + :param parsed_behavior_id: + :param behavior_spec: specifications for policy construction + :param create_graph: whether to create the Tensorflow graph on construction + :return policy + """ + policy = TFPolicy( + self.seed, + behavior_spec, + self.trainer_settings, + tanh_squash=True, + reparameterize=True, + create_tf_graph=create_graph, + ) + self.maybe_load_replay_buffer() + return policy + + def create_torch_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and SAC hyperparameters + :param parsed_behavior_id: + :param behavior_spec: specifications for policy construction + :return policy + """ + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings, + condition_sigma_on_obs=True, + tanh_squash=True, + separate_critic=True, + ) + self.maybe_load_replay_buffer() return policy def _update_sac_policy(self) -> bool: @@ -244,9 +295,14 @@ def _update_sac_policy(self) -> bool: ) # Get rewards for each reward for name, signal in self.optimizer.reward_signals.items(): - sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch( - sampled_minibatch - ).scaled_reward + if isinstance(signal, RewardSignal): + sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch( + sampled_minibatch + ).scaled_reward + else: + sampled_minibatch[f"{name}_rewards"] = ( + signal.evaluate(sampled_minibatch) * signal.strength + ) update_stats = self.optimizer.update(sampled_minibatch, n_sequences) for stat_name, value in update_stats.items(): @@ -293,12 +349,13 @@ def _update_reward_signals(self) -> None: reward_signal_minibatches = {} for name, signal in self.optimizer.reward_signals.items(): logger.debug(f"Updating {name} at step {self.step}") - # Some signals don't need a minibatch to be sampled - so we don't! 
- if signal.update_dict: - reward_signal_minibatches[name] = buffer.sample_mini_batch( - self.hyperparameters.batch_size, - sequence_length=self.policy.sequence_length, - ) + if isinstance(signal, RewardSignal): + # Some signals don't need a minibatch to be sampled - so we don't! + if signal.update_dict: + reward_signal_minibatches[name] = buffer.sample_mini_batch( + self.hyperparameters.batch_size, + sequence_length=self.policy.sequence_length, + ) update_stats = self.optimizer.update_reward_signals( reward_signal_minibatches, n_sequences ) @@ -310,7 +367,14 @@ def _update_reward_signals(self) -> None: self._stats_reporter.add_stat(stat, np.mean(stat_list)) def create_sac_optimizer(self) -> SACOptimizer: - return SACOptimizer(cast(TFPolicy, self.policy), self.trainer_settings) + if self.framework == FrameworkType.PYTORCH: + return TorchSACOptimizer( # type: ignore + cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore + else: + return SACOptimizer( # type: ignore + cast(TFPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore def add_policy( self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy diff --git a/ml-agents/mlagents/trainers/saver/torch_saver.py b/ml-agents/mlagents/trainers/saver/torch_saver.py new file mode 100644 index 0000000000..ce54cdc136 --- /dev/null +++ b/ml-agents/mlagents/trainers/saver/torch_saver.py @@ -0,0 +1,118 @@ +import os +import shutil +import torch +from typing import Dict, Union, Optional, cast +from mlagents_envs.exception import UnityPolicyException +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.saver.saver import BaseSaver +from mlagents.trainers.settings import TrainerSettings, SerializationSettings +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.torch.model_serialization import ModelSerializer + + +logger = get_logger(__name__) + + +class TorchSaver(BaseSaver): + """ + Saver class for PyTorch + """ + + def __init__( + self, trainer_settings: TrainerSettings, model_path: str, load: bool = False + ): + super().__init__() + self.model_path = model_path + self.initialize_path = trainer_settings.init_path + self._keep_checkpoints = trainer_settings.keep_checkpoints + self.load = load + + self.policy: Optional[TorchPolicy] = None + self.exporter: Optional[ModelSerializer] = None + self.modules: Dict[str, torch.nn.Modules] = {} + + def register(self, module: Union[TorchPolicy, TorchOptimizer]) -> None: + if isinstance(module, TorchPolicy) or isinstance(module, TorchOptimizer): + self.modules.update(module.get_modules()) # type: ignore + else: + raise UnityPolicyException( + "Registering Object of unsupported type {} to Saver ".format( + type(module) + ) + ) + if self.policy is None and isinstance(module, TorchPolicy): + self.policy = module + self.exporter = ModelSerializer(self.policy) + + def save_checkpoint(self, brain_name: str, step: int) -> str: + if not os.path.exists(self.model_path): + os.makedirs(self.model_path) + checkpoint_path = os.path.join(self.model_path, f"{brain_name}-{step}") + state_dict = { + name: module.state_dict() for name, module in self.modules.items() + } + torch.save(state_dict, f"{checkpoint_path}.pt") + torch.save(state_dict, os.path.join(self.model_path, "checkpoint.pt")) + self.export(checkpoint_path, brain_name) + return checkpoint_path + + def export(self, output_filepath: str, brain_name: str) -> None: + if self.exporter 
is not None: + self.exporter.export_policy_model(output_filepath) + + def initialize_or_load(self, policy: Optional[TorchPolicy] = None) -> None: + # Initialize/Load registered self.policy by default. + # If given input argument policy, use the input policy instead. + # This argument is mainly for initialization of the ghost trainer's fixed policy. + reset_steps = not self.load + if self.initialize_path is not None: + self._load_model( + self.initialize_path, policy, reset_global_steps=reset_steps + ) + elif self.load: + self._load_model(self.model_path, policy, reset_global_steps=reset_steps) + + def _load_model( + self, + load_path: str, + policy: Optional[TorchPolicy] = None, + reset_global_steps: bool = False, + ) -> None: + model_path = os.path.join(load_path, "checkpoint.pt") + saved_state_dict = torch.load(model_path) + if policy is None: + modules = self.modules + policy = self.policy + else: + modules = policy.get_modules() + policy = cast(TorchPolicy, policy) + + for name, mod in modules.items(): + mod.load_state_dict(saved_state_dict[name]) + + if reset_global_steps: + policy.set_step(0) + logger.info( + "Starting training from step 0 and saving to {}.".format( + self.model_path + ) + ) + else: + logger.info(f"Resuming training from step {policy.get_current_step()}.") + + def copy_final_model(self, source_nn_path: str) -> None: + """ + Copy the .nn file at the given source to the destination. + Also copies the corresponding .onnx file if it exists. + """ + final_model_name = os.path.splitext(source_nn_path)[0] + + if SerializationSettings.convert_to_onnx: + try: + source_path = f"{final_model_name}.onnx" + destination_path = f"{self.model_path}.onnx" + shutil.copyfile(source_path, destination_path) + logger.info(f"Copied {source_path} to {destination_path}.") + except OSError: + pass diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 62ded20ed1..7054ce9be4 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -531,6 +531,11 @@ def to_settings(self) -> type: return _mapping[self] +class FrameworkType(Enum): + TENSORFLOW: str = "tensorflow" + PYTORCH: str = "pytorch" + + @attr.s(auto_attribs=True) class TrainerSettings(ExportableSettings): trainer_type: TrainerType = TrainerType.PPO @@ -553,6 +558,7 @@ def _set_default_hyperparameters(self): threaded: bool = True self_play: Optional[SelfPlaySettings] = None behavioral_cloning: Optional[BehavioralCloningSettings] = None + framework: FrameworkType = FrameworkType.TENSORFLOW cattr.register_structure_hook( Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure @@ -720,7 +726,13 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": configured_dict["engine_settings"][key] = val else: # Base options configured_dict[key] = val - return RunOptions.from_dict(configured_dict) + + # Apply --torch retroactively + final_runoptions = RunOptions.from_dict(configured_dict) + if "torch" in DetectDefault.non_default_args: + for trainer_set in final_runoptions.behaviors.values(): + trainer_set.framework = FrameworkType.PYTORCH + return final_runoptions @staticmethod def from_dict(options_dict: Dict[str, Any]) -> "RunOptions": diff --git a/ml-agents/mlagents/trainers/tests/test_ghost.py b/ml-agents/mlagents/trainers/tests/test_ghost.py index e72f573f36..acc9711830 100644 --- a/ml-agents/mlagents/trainers/tests/test_ghost.py +++ b/ml-agents/mlagents/trainers/tests/test_ghost.py @@ -38,12 +38,9 @@ def 
test_load_and_set(dummy_config, use_discrete): trainer_params = dummy_config trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") trainer.seed = 1 - policy = trainer.create_policy("test", mock_specs) - policy.create_tf_graph() + policy = trainer.create_policy("test", mock_specs, create_graph=True) trainer.seed = 20 # otherwise graphs are the same - to_load_policy = trainer.create_policy("test", mock_specs) - to_load_policy.create_tf_graph() - to_load_policy.init_load_weights() + to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True) weights = policy.get_weights() load_weights = to_load_policy.get_weights() diff --git a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py index f22e37e8af..e406e33611 100644 --- a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py +++ b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py @@ -33,7 +33,10 @@ def checkpoint_path(brain_name, step): mock_saver.save_checkpoint.side_effect = checkpoint_path self.saver = mock_saver - def create_policy(self): + def create_tf_policy(self, parsed_behavior_id, behavior_spec): + return mock.Mock() + + def create_torch_policy(self, parsed_behavior_id, behavior_spec): return mock.Mock() def _process_trajectory(self, trajectory): diff --git a/ml-agents/mlagents/trainers/tests/test_sac.py b/ml-agents/mlagents/trainers/tests/test_sac.py index 67d9f13407..b22d3bf484 100644 --- a/ml-agents/mlagents/trainers/tests/test_sac.py +++ b/ml-agents/mlagents/trainers/tests/test_sac.py @@ -51,7 +51,7 @@ def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual): 0, mock_brain, trainer_settings, "test", False, create_tf_graph=False ) optimizer = SACOptimizer(policy, trainer_settings) - policy.initialize() + optimizer.policy.initialize() return optimizer @@ -228,6 +228,7 @@ def test_advance(dummy_config): trainer.add_policy(behavior_id, policy) trainer.saver.initialize_or_load(policy) trainer.optimizer.update = mock.Mock() + trainer.saver.initialize_or_load(policy) trainer.optimizer.update_reward_signals = mock.Mock() trainer.optimizer.update_reward_signals.return_value = {} trainer.optimizer.update.return_value = {} diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index 380309577e..b62a1b3175 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -27,6 +27,7 @@ RewardSignalType, EncoderType, ScheduleType, + FrameworkType, ) from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager from mlagents_envs.side_channel.environment_parameters_channel import ( @@ -53,6 +54,7 @@ summary_freq=500, max_steps=3000, threaded=False, + framework=FrameworkType.TENSORFLOW, ) SAC_CONFIG = TrainerSettings( diff --git a/ml-agents/mlagents/trainers/tests/torch/test.demo b/ml-agents/mlagents/trainers/tests/torch/test.demo new file mode 100644 index 0000000000..8f188ddd58 Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/torch/test.demo differ diff --git a/ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py new file mode 100644 index 0000000000..201e2d2295 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py @@ -0,0 +1,144 @@ +from unittest.mock import MagicMock +import pytest +import mlagents.trainers.tests.mock_brain as mb + +import numpy as np +import os + +from 
mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.torch.components.bc.module import BCModule +from mlagents.trainers.settings import ( + TrainerSettings, + BehavioralCloningSettings, + NetworkSettings, +) + + +def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample): + # model_path = env.external_brain_names[0] + trainer_config = TrainerSettings() + trainer_config.network_settings.memory = ( + NetworkSettings.MemorySettings() if use_rnn else None + ) + policy = TorchPolicy( + 0, mock_behavior_specs, trainer_config, tanhresample, tanhresample + ) + bc_module = BCModule( + policy, + settings=bc_settings, + policy_learning_rate=trainer_config.hyperparameters.learning_rate, + default_batch_size=trainer_config.hyperparameters.batch_size, + default_num_epoch=3, + ) + return bc_module + + +# Test default values +def test_bcmodule_defaults(): + # See if default values match + mock_specs = mb.create_mock_3dball_behavior_specs() + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo" + ) + bc_module = create_bc_module(mock_specs, bc_settings, False, False) + assert bc_module.num_epoch == 3 + assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size + # Assign strange values and see if it overrides properly + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo", + num_epoch=100, + batch_size=10000, + ) + bc_module = create_bc_module(mock_specs, bc_settings, False, False) + assert bc_module.num_epoch == 100 + assert bc_module.batch_size == 10000 + + +# Test with continuous control env and vector actions +@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) +def test_bcmodule_update(is_sac): + mock_specs = mb.create_mock_3dball_behavior_specs() + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo" + ) + bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac) + stats = bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + + +# Test with constant pretraining learning rate +@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) +def test_bcmodule_constant_lr_update(is_sac): + mock_specs = mb.create_mock_3dball_behavior_specs() + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo", + steps=0, + ) + bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac) + stats = bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + old_learning_rate = bc_module.current_lr + + _ = bc_module.update() + assert old_learning_rate == bc_module.current_lr + + +# Test with constant pretraining learning rate +@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) +def test_bcmodule_linear_lr_update(is_sac): + mock_specs = mb.create_mock_3dball_behavior_specs() + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo", + steps=100, + ) + bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac) + # Should decay by 10/100 * 0.0003 = 0.00003 + bc_module.policy.get_current_step = MagicMock(return_value=10) + old_learning_rate = bc_module.current_lr + _ = bc_module.update() + assert old_learning_rate - 0.00003 == pytest.approx(bc_module.current_lr, abs=0.01) + + +# Test with RNN 
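A quick aside before the RNN and visual variants below: the linear-schedule check in `test_bcmodule_linear_lr_update` above relies on the BC module annealing its learning rate linearly over `settings.steps`. A minimal sketch of that arithmetic, for orientation only — the helper name `expected_bc_lr` is hypothetical and not part of this PR; it assumes the default policy learning rate of 3.0e-4 with strength 1.0 and ignores the 1e-10 floor the schedule clamps to:

def expected_bc_lr(initial_lr: float, current_step: int, anneal_steps: int) -> float:
    # Linear anneal: the rate loses current_step / anneal_steps of its initial value.
    return initial_lr * (1.0 - current_step / anneal_steps)

# expected_bc_lr(3.0e-4, 10, 100) == 2.7e-4, i.e. a decrease of 0.00003 from the
# starting rate, which is what the pytest.approx assertion above verifies.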
+@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) +def test_bcmodule_rnn_update(is_sac): + mock_specs = mb.create_mock_3dball_behavior_specs() + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo" + ) + bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac) + stats = bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + + +# Test with discrete control and visual observations +@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) +def test_bcmodule_dc_visual_update(is_sac): + mock_specs = mb.create_mock_banana_behavior_specs() + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo" + ) + bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac) + stats = bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + + +# Test with discrete control, visual observations and RNN +@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) +def test_bcmodule_rnn_dc_update(is_sac): + mock_specs = mb.create_mock_banana_behavior_specs() + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo" + ) + bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac) + stats = bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + + +if __name__ == "__main__": + pytest.main() diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py new file mode 100644 index 0000000000..06f0666cc8 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py @@ -0,0 +1,177 @@ +import pytest + +import numpy as np + +from mlagents.trainers.ghost.trainer import GhostTrainer +from mlagents.trainers.ghost.controller import GhostController +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.ppo.trainer import PPOTrainer +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.tests import mock_brain as mb +from mlagents.trainers.tests.test_trajectory import make_fake_trajectory +from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings, FrameworkType + + +@pytest.fixture +def dummy_config(): + return TrainerSettings( + self_play=SelfPlaySettings(), framework=FrameworkType.PYTORCH + ) + + +VECTOR_ACTION_SPACE = 1 +VECTOR_OBS_SPACE = 8 +DISCRETE_ACTION_SPACE = [3, 3, 3, 2] +BUFFER_INIT_SAMPLES = 513 +NUM_AGENTS = 12 + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_load_and_set(dummy_config, use_discrete): + mock_specs = mb.setup_test_behavior_specs( + use_discrete, + False, + vector_action_space=DISCRETE_ACTION_SPACE + if use_discrete + else VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + trainer_params = dummy_config + trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") + trainer.seed = 1 + policy = trainer.create_policy("test", mock_specs) + trainer.seed = 20 # otherwise graphs are the same + to_load_policy = trainer.create_policy("test", mock_specs) + + weights = policy.get_weights() + load_weights = to_load_policy.get_weights() + try: + for w, lw in zip(weights, load_weights): + np.testing.assert_array_equal(w, lw) + except AssertionError: + pass + + to_load_policy.load_weights(weights) + load_weights = to_load_policy.get_weights() + + for 
w, lw in zip(weights, load_weights): + np.testing.assert_array_equal(w, lw) + + +def test_process_trajectory(dummy_config): + mock_specs = mb.setup_test_behavior_specs( + True, False, vector_action_space=[2], vector_obs_space=1 + ) + behavior_id_team0 = "test_brain?team=0" + behavior_id_team1 = "test_brain?team=1" + brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name + + ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") + controller = GhostController(100) + trainer = GhostTrainer( + ppo_trainer, brain_name, controller, 0, dummy_config, True, "0" + ) + + # first policy encountered becomes policy trained by wrapped PPO + parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0) + policy = trainer.create_policy(parsed_behavior_id0, mock_specs) + trainer.add_policy(parsed_behavior_id0, policy) + trajectory_queue0 = AgentManagerQueue(behavior_id_team0) + trainer.subscribe_trajectory_queue(trajectory_queue0) + + # Ghost trainer should ignore this queue because off policy + parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1) + policy = trainer.create_policy(parsed_behavior_id1, mock_specs) + trainer.add_policy(parsed_behavior_id1, policy) + trajectory_queue1 = AgentManagerQueue(behavior_id_team1) + trainer.subscribe_trajectory_queue(trajectory_queue1) + + time_horizon = 15 + trajectory = make_fake_trajectory( + length=time_horizon, + max_step_complete=True, + observation_shapes=[(1,)], + action_space=[2], + ) + trajectory_queue0.put(trajectory) + trainer.advance() + + # Check that trainer put trajectory in update buffer + assert trainer.trainer.update_buffer.num_experiences == 15 + + trajectory_queue1.put(trajectory) + trainer.advance() + + # Check that ghost trainer ignored off policy queue + assert trainer.trainer.update_buffer.num_experiences == 15 + # Check that it emptied the queue + assert trajectory_queue1.empty() + + +def test_publish_queue(dummy_config): + mock_specs = mb.setup_test_behavior_specs( + True, False, vector_action_space=[1], vector_obs_space=8 + ) + + behavior_id_team0 = "test_brain?team=0" + behavior_id_team1 = "test_brain?team=1" + + parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0) + + brain_name = parsed_behavior_id0.brain_name + + ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") + controller = GhostController(100) + trainer = GhostTrainer( + ppo_trainer, brain_name, controller, 0, dummy_config, True, "0" + ) + + # First policy encountered becomes policy trained by wrapped PPO + # This queue should remain empty after swap snapshot + policy = trainer.create_policy(parsed_behavior_id0, mock_specs) + trainer.add_policy(parsed_behavior_id0, policy) + policy_queue0 = AgentManagerQueue(behavior_id_team0) + trainer.publish_policy_queue(policy_queue0) + + # Ghost trainer should use this queue for ghost policy swap + parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1) + policy = trainer.create_policy(parsed_behavior_id1, mock_specs) + trainer.add_policy(parsed_behavior_id1, policy) + policy_queue1 = AgentManagerQueue(behavior_id_team1) + trainer.publish_policy_queue(policy_queue1) + + # check ghost trainer swap pushes to ghost queue and not trainer + assert policy_queue0.empty() and policy_queue1.empty() + trainer._swap_snapshots() + assert policy_queue0.empty() and not policy_queue1.empty() + # clear + policy_queue1.get_nowait() + + mock_specs = mb.setup_test_behavior_specs( + 
False, + False, + vector_action_space=VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs) + # Mock out reward signal eval + buffer["extrinsic_rewards"] = buffer["environment_rewards"] + buffer["extrinsic_returns"] = buffer["environment_rewards"] + buffer["extrinsic_value_estimates"] = buffer["environment_rewards"] + buffer["curiosity_rewards"] = buffer["environment_rewards"] + buffer["curiosity_returns"] = buffer["environment_rewards"] + buffer["curiosity_value_estimates"] = buffer["environment_rewards"] + buffer["advantages"] = buffer["environment_rewards"] + trainer.trainer.update_buffer = buffer + + # when ghost trainer advance and wrapped trainer buffers full + # the wrapped trainer pushes updated policy to correct queue + assert policy_queue0.empty() and policy_queue1.empty() + trainer.advance() + assert not policy_queue0.empty() and policy_queue1.empty() + + +if __name__ == "__main__": + pytest.main() diff --git a/ml-agents/mlagents/trainers/tests/torch/test_layers.py b/ml-agents/mlagents/trainers/tests/torch/test_layers.py index 6d1132aa2e..2086d6dd13 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_layers.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_layers.py @@ -5,6 +5,7 @@ linear_layer, lstm_layer, Initialization, + LSTM, ) @@ -38,3 +39,21 @@ def test_lstm_layer(): assert torch.all( torch.eq(param.data[4:8], torch.ones_like(param.data[4:8])) ) + + +def test_lstm_class(): + torch.manual_seed(0) + input_size = 12 + memory_size = 64 + batch_size = 8 + seq_len = 16 + lstm = LSTM(input_size, memory_size) + + assert lstm.memory_size == memory_size + + sample_input = torch.ones((batch_size, seq_len, input_size)) + sample_memories = torch.ones((1, batch_size, memory_size)) + out, mem = lstm(sample_input, sample_memories) + # Hidden size should be half of memory_size + assert out.shape == (batch_size, seq_len, memory_size // 2) + assert mem.shape == (1, batch_size, memory_size) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_networks.py b/ml-agents/mlagents/trainers/tests/torch/test_networks.py index 06f8b1ab25..343b007904 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_networks.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_networks.py @@ -150,19 +150,16 @@ def test_simple_actor(action_type): assert act.shape == (1, 1) # Test forward - actions, probs, ver_num, mem_size, is_cont, act_size_vec = actor.forward( + actions, ver_num, mem_size, is_cont, act_size_vec = actor.forward( [sample_obs], [], masks=masks ) for act in actions: + # This is different from above for ONNX export if action_type == ActionType.CONTINUOUS: - assert act.shape == ( - act_size[0], - 1, - ) # This is different from above for ONNX export + assert act.shape == (act_size[0], 1) else: - assert act.shape == (1, 1) + assert act.shape == tuple(act_size) - # TODO: Once export works properly. fix the shapes here. 
assert mem_size == 0 assert is_cont == int(action_type == ActionType.CONTINUOUS) assert act_size_vec == torch.tensor(act_size) @@ -184,11 +181,7 @@ def test_actor_critic(ac_type, lstm): if lstm: sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size)) memories = torch.ones( - ( - 1, - network_settings.memory.sequence_length, - network_settings.memory.memory_size, - ) + (1, network_settings.memory.sequence_length, actor.memory_size) ) else: sample_obs = torch.ones((1, obs_size)) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_policy.py b/ml-agents/mlagents/trainers/tests/torch/test_policy.py new file mode 100644 index 0000000000..208a208b77 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_policy.py @@ -0,0 +1,150 @@ +import pytest + +import torch +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.tests import mock_brain as mb +from mlagents.trainers.settings import TrainerSettings, NetworkSettings +from mlagents.trainers.torch.utils import ModelUtils + +VECTOR_ACTION_SPACE = 2 +VECTOR_OBS_SPACE = 8 +DISCRETE_ACTION_SPACE = [3, 3, 3, 2] +BUFFER_INIT_SAMPLES = 32 +NUM_AGENTS = 12 +EPSILON = 1e-7 + + +def create_policy_mock( + dummy_config: TrainerSettings, + use_rnn: bool = False, + use_discrete: bool = True, + use_visual: bool = False, + seed: int = 0, +) -> TorchPolicy: + mock_spec = mb.setup_test_behavior_specs( + use_discrete, + use_visual, + vector_action_space=DISCRETE_ACTION_SPACE + if use_discrete + else VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + trainer_settings = dummy_config + trainer_settings.keep_checkpoints = 3 + trainer_settings.network_settings.memory = ( + NetworkSettings.MemorySettings() if use_rnn else None + ) + policy = TorchPolicy(seed, mock_spec, trainer_settings) + return policy + + +@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) +@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"]) +@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) +def test_policy_evaluate(rnn, visual, discrete): + # Test evaluate + policy = create_policy_mock( + TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual + ) + decision_step, terminal_step = mb.create_steps_from_behavior_spec( + policy.behavior_spec, num_agents=NUM_AGENTS + ) + + run_out = policy.evaluate(decision_step, list(decision_step.agent_id)) + if discrete: + run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE)) + else: + assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE) + + +@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) +@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"]) +@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) +def test_evaluate_actions(rnn, visual, discrete): + policy = create_policy_mock( + TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual + ) + buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size) + vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])] + act_masks = ModelUtils.list_to_tensor(buffer["action_mask"]) + if policy.use_continuous_act: + actions = ModelUtils.list_to_tensor(buffer["actions"]).unsqueeze(-1) + else: + actions = ModelUtils.list_to_tensor(buffer["actions"], dtype=torch.long) + vis_obs = [] + for idx, _ in enumerate(policy.actor_critic.network_body.visual_encoders): + vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % 
idx]) + vis_obs.append(vis_ob) + + memories = [ + ModelUtils.list_to_tensor(buffer["memory"][i]) + for i in range(0, len(buffer["memory"]), policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + + log_probs, entropy, values = policy.evaluate_actions( + vec_obs, + vis_obs, + masks=act_masks, + actions=actions, + memories=memories, + seq_len=policy.sequence_length, + ) + assert log_probs.shape == (64, policy.behavior_spec.action_size) + assert entropy.shape == (64, policy.behavior_spec.action_size) + for val in values.values(): + assert val.shape == (64,) + + +@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) +@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"]) +@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) +def test_sample_actions(rnn, visual, discrete): + policy = create_policy_mock( + TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual + ) + buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size) + vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])] + act_masks = ModelUtils.list_to_tensor(buffer["action_mask"]) + + vis_obs = [] + for idx, _ in enumerate(policy.actor_critic.network_body.visual_encoders): + vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx]) + vis_obs.append(vis_ob) + + memories = [ + ModelUtils.list_to_tensor(buffer["memory"][i]) + for i in range(0, len(buffer["memory"]), policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + + ( + sampled_actions, + log_probs, + entropies, + sampled_values, + memories, + ) = policy.sample_actions( + vec_obs, + vis_obs, + masks=act_masks, + memories=memories, + seq_len=policy.sequence_length, + all_log_probs=not policy.use_continuous_act, + ) + if discrete: + assert log_probs.shape == ( + 64, + sum(policy.behavior_spec.discrete_action_branches), + ) + else: + assert log_probs.shape == (64, policy.behavior_spec.action_shape) + assert entropies.shape == (64, policy.behavior_spec.action_size) + for val in sampled_values.values(): + assert val.shape == (64,) + + if rnn: + assert memories.shape == (1, 1, policy.m_size) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py new file mode 100644 index 0000000000..f22728fbf2 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest +import torch +from mlagents.trainers.torch.components.reward_providers import ( + CuriosityRewardProvider, + create_reward_provider, +) +from mlagents_envs.base_env import BehaviorSpec, ActionType +from mlagents.trainers.settings import CuriositySettings, RewardSignalType +from mlagents.trainers.tests.torch.test_reward_providers.utils import ( + create_agent_buffer, +) + +SEED = [42] + + +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_construction(behavior_spec: BehaviorSpec) -> None: + curiosity_settings = CuriositySettings(32, 0.01) + curiosity_settings.strength = 0.1 + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + assert curiosity_rp.strength == 0.1 + assert curiosity_rp.name == "Curiosity" + + +@pytest.mark.parametrize( + "behavior_spec", + [ + 
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,), (64, 66, 1)], ActionType.DISCRETE, (2, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), + ], +) +def test_factory(behavior_spec: BehaviorSpec) -> None: + curiosity_settings = CuriositySettings(32, 0.01) + curiosity_rp = create_reward_provider( + RewardSignalType.CURIOSITY, behavior_spec, curiosity_settings + ) + assert curiosity_rp.name == "Curiosity" + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), + ], +) +def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + curiosity_settings = CuriositySettings(32, 0.01) + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + buffer = create_agent_buffer(behavior_spec, 5) + curiosity_rp.update(buffer) + reward_old = curiosity_rp.evaluate(buffer)[0] + for _ in range(10): + curiosity_rp.update(buffer) + reward_new = curiosity_rp.evaluate(buffer)[0] + assert reward_new < reward_old + reward_old = reward_new + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", [BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5)] +) +def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + curiosity_settings = CuriositySettings(32, 0.1) + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + buffer = create_agent_buffer(behavior_spec, 5) + for _ in range(200): + curiosity_rp.update(buffer) + prediction = curiosity_rp._network.predict_action(buffer)[0].detach() + target = buffer["actions"][0] + error = float(torch.mean((prediction - target) ** 2)) + assert error < 0.001 + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,), (64, 66, 3)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), + ], +) +def test_next_state_prediction(behavior_spec: BehaviorSpec, seed: int) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + curiosity_settings = CuriositySettings(32, 0.1) + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + buffer = create_agent_buffer(behavior_spec, 5) + for _ in range(100): + curiosity_rp.update(buffer) + prediction = curiosity_rp._network.predict_next_state(buffer)[0] + target = curiosity_rp._network.get_next_state(buffer)[0] + error = float(torch.mean((prediction - target) ** 2).detach()) + assert error < 0.001 diff --git a/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py new file mode 100644 index 0000000000..5d8548dd74 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py @@ -0,0 +1,56 @@ +import pytest +from mlagents.trainers.torch.components.reward_providers import ( + ExtrinsicRewardProvider, + create_reward_provider, +) +from mlagents_envs.base_env import BehaviorSpec, ActionType +from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType +from mlagents.trainers.tests.torch.test_reward_providers.utils 
import ( + create_agent_buffer, +) + + +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_construction(behavior_spec: BehaviorSpec) -> None: + settings = RewardSignalSettings() + settings.gamma = 0.2 + extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings) + assert extrinsic_rp.gamma == 0.2 + assert extrinsic_rp.name == "Extrinsic" + + +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_factory(behavior_spec: BehaviorSpec) -> None: + settings = RewardSignalSettings() + extrinsic_rp = create_reward_provider( + RewardSignalType.EXTRINSIC, behavior_spec, settings + ) + assert extrinsic_rp.name == "Extrinsic" + + +@pytest.mark.parametrize("reward", [2.0, 3.0, 4.0]) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None: + buffer = create_agent_buffer(behavior_spec, 1000, reward) + settings = RewardSignalSettings() + extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings) + generated_rewards = extrinsic_rp.evaluate(buffer) + assert (generated_rewards == reward).all() diff --git a/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py new file mode 100644 index 0000000000..91b5c4bc4d --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py @@ -0,0 +1,138 @@ +from typing import Any +import numpy as np +import pytest +from unittest.mock import patch +import torch +import os +from mlagents.trainers.torch.components.reward_providers import ( + GAILRewardProvider, + create_reward_provider, +) +from mlagents_envs.base_env import BehaviorSpec, ActionType +from mlagents.trainers.settings import GAILSettings, RewardSignalType +from mlagents.trainers.tests.torch.test_reward_providers.utils import ( + create_agent_buffer, +) +from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import ( + DiscriminatorNetwork, +) + +CONTINUOUS_PATH = ( + os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir) + + "/test.demo" +) +DISCRETE_PATH = ( + os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir) + + "/testdcvis.demo" +) +SEED = [42] + + +@pytest.mark.parametrize( + "behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)] +) +def test_construction(behavior_spec: BehaviorSpec) -> None: + gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH) + gail_rp = GAILRewardProvider(behavior_spec, gail_settings) + assert gail_rp.name == "GAIL" + + +@pytest.mark.parametrize( + "behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)] +) +def test_factory(behavior_spec: BehaviorSpec) -> None: + gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH) + gail_rp = create_reward_provider( + RewardSignalType.GAIL, behavior_spec, gail_settings + ) + assert gail_rp.name == "GAIL" + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(8,), (24, 26, 1)], ActionType.CONTINUOUS, 2), + BehaviorSpec([(50,)], ActionType.DISCRETE, (2, 3, 3, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)), + ], +) 
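# `use_actions` toggles whether the discriminator sees actions as well as observations;
# in both cases the expert reward should pull ahead of the policy reward as the
# discriminator is updated in the test below.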
+@pytest.mark.parametrize("use_actions", [False, True]) +@patch( + "mlagents.trainers.torch.components.reward_providers.gail_reward_provider.demo_to_buffer" +) +def test_reward_decreases( + demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int +) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + buffer_expert = create_agent_buffer(behavior_spec, 1000) + buffer_policy = create_agent_buffer(behavior_spec, 1000) + demo_to_buffer.return_value = None, buffer_expert + gail_settings = GAILSettings( + demo_path="", learning_rate=0.05, use_vail=False, use_actions=use_actions + ) + gail_rp = create_reward_provider( + RewardSignalType.GAIL, behavior_spec, gail_settings + ) + + init_reward_expert = gail_rp.evaluate(buffer_expert)[0] + init_reward_policy = gail_rp.evaluate(buffer_policy)[0] + + for _ in range(10): + gail_rp.update(buffer_policy) + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert >= 0 # GAIL / VAIL reward always positive + assert reward_policy >= 0 + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert > reward_policy # Expert reward greater than non-expert reward + assert ( + reward_expert > init_reward_expert + ) # Expert reward getting better as network trains + assert ( + reward_policy < init_reward_policy + ) # Non-expert reward getting worse as network trains + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3, 3, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)), + ], +) +@pytest.mark.parametrize("use_actions", [False, True]) +@patch( + "mlagents.trainers.torch.components.reward_providers.gail_reward_provider.demo_to_buffer" +) +def test_reward_decreases_vail( + demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int +) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + buffer_expert = create_agent_buffer(behavior_spec, 1000) + buffer_policy = create_agent_buffer(behavior_spec, 1000) + demo_to_buffer.return_value = None, buffer_expert + gail_settings = GAILSettings( + demo_path="", learning_rate=0.005, use_vail=True, use_actions=use_actions + ) + DiscriminatorNetwork.initial_beta = 0.0 + # we must set the initial value of beta to 0 for testing + # If we do not, the kl-loss will dominate early and will block the estimator + gail_rp = create_reward_provider( + RewardSignalType.GAIL, behavior_spec, gail_settings + ) + + for _ in range(100): + gail_rp.update(buffer_policy) + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert >= 0 # GAIL / VAIL reward always positive + assert reward_policy >= 0 + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert > reward_policy # Expert reward greater than non-expert reward diff --git a/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py new file mode 100644 index 0000000000..1f50f06a11 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py @@ -0,0 +1,32 @@ +import numpy as np +from mlagents.trainers.buffer import AgentBuffer +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.trajectory 
import SplitObservations + + +def create_agent_buffer( + behavior_spec: BehaviorSpec, number: int, reward: float = 0.0 +) -> AgentBuffer: + buffer = AgentBuffer() + curr_observations = [ + np.random.normal(size=shape) for shape in behavior_spec.observation_shapes + ] + next_observations = [ + np.random.normal(size=shape) for shape in behavior_spec.observation_shapes + ] + action = behavior_spec.create_random_action(1)[0, :] + for _ in range(number): + curr_split_obs = SplitObservations.from_observations(curr_observations) + next_split_obs = SplitObservations.from_observations(next_observations) + for i, _ in enumerate(curr_split_obs.visual_observations): + buffer["visual_obs%d" % i].append(curr_split_obs.visual_observations[i]) + buffer["next_visual_obs%d" % i].append( + next_split_obs.visual_observations[i] + ) + buffer["vector_obs"].append(curr_split_obs.vector_observations) + buffer["next_vector_in"].append(next_split_obs.vector_observations) + buffer["actions"].append(action) + buffer["done"].append(np.zeros(1, dtype=np.float32)) + buffer["reward"].append(np.ones(1, dtype=np.float32) * reward) + buffer["masks"].append(np.ones(1, dtype=np.float32)) + return buffer diff --git a/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py new file mode 100644 index 0000000000..e7b9e8c939 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py @@ -0,0 +1,505 @@ +import math +import tempfile +import pytest +import numpy as np +import attr +from typing import Dict + +from mlagents.trainers.tests.simple_test_envs import ( + SimpleEnvironment, + MemoryEnvironment, + RecordEnvironment, +) +from mlagents.trainers.trainer_controller import TrainerController +from mlagents.trainers.trainer_util import TrainerFactory +from mlagents.trainers.simple_env_manager import SimpleEnvManager +from mlagents.trainers.demo_loader import write_demo +from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary +from mlagents.trainers.settings import ( + TrainerSettings, + PPOSettings, + SACSettings, + NetworkSettings, + SelfPlaySettings, + BehavioralCloningSettings, + GAILSettings, + TrainerType, + RewardSignalType, + EncoderType, + ScheduleType, + FrameworkType, +) +from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) +from mlagents_envs.communicator_objects.demonstration_meta_pb2 import ( + DemonstrationMetaProto, +) +from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto +from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous + +BRAIN_NAME = "1D" + + +PPO_CONFIG = TrainerSettings( + trainer_type=TrainerType.PPO, + hyperparameters=PPOSettings( + learning_rate=5.0e-3, + learning_rate_schedule=ScheduleType.CONSTANT, + batch_size=16, + buffer_size=64, + ), + network_settings=NetworkSettings(num_layers=1, hidden_units=32), + summary_freq=500, + max_steps=3000, + threaded=False, + framework=FrameworkType.PYTORCH, +) + +SAC_CONFIG = TrainerSettings( + trainer_type=TrainerType.SAC, + hyperparameters=SACSettings( + learning_rate=5.0e-3, + learning_rate_schedule=ScheduleType.CONSTANT, + batch_size=8, + buffer_init_steps=100, + buffer_size=5000, + tau=0.01, + init_entcoef=0.01, + ), + network_settings=NetworkSettings(num_layers=1, hidden_units=16), + summary_freq=100, + max_steps=1000, + threaded=False, +) + + +# The reward 
processor is passed as an argument to _check_environment_trains. +# It is applied to the list of all final rewards for each brain individually. +# This is so that we can process all final rewards in different ways for different algorithms. +# Custom reward processors should be built within the test function and passed to _check_environment_trains +# Default is average over the last 5 final rewards +def default_reward_processor(rewards, last_n_rewards=5): + rewards_to_use = rewards[-last_n_rewards:] + # For debugging tests + print(f"Last {last_n_rewards} rewards:", rewards_to_use) + return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean() + + +class DebugWriter(StatsWriter): + """ + Print to stdout so stats can be viewed in pytest + """ + + def __init__(self): + self._last_reward_summary: Dict[str, float] = {} + + def get_last_rewards(self): + return self._last_reward_summary + + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + for val, stats_summary in values.items(): + if val == "Environment/Cumulative Reward": + print(step, val, stats_summary.mean) + self._last_reward_summary[category] = stats_summary.mean + + +def _check_environment_trains( + env, + trainer_config, + reward_processor=default_reward_processor, + env_parameter_manager=None, + success_threshold=0.9, + env_manager=None, +): + if env_parameter_manager is None: + env_parameter_manager = EnvironmentParameterManager() + # Create controller and begin training. + with tempfile.TemporaryDirectory() as dir: + run_id = "id" + seed = 1337 + StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file + debug_writer = DebugWriter() + StatsReporter.add_writer(debug_writer) + if env_manager is None: + env_manager = SimpleEnvManager(env, EnvironmentParametersChannel()) + trainer_factory = TrainerFactory( + trainer_config=trainer_config, + output_path=dir, + train_model=True, + load_model=False, + seed=seed, + param_manager=env_parameter_manager, + multi_gpu=False, + ) + + tc = TrainerController( + trainer_factory=trainer_factory, + output_path=dir, + run_id=run_id, + param_manager=env_parameter_manager, + train=True, + training_seed=seed, + ) + + # Begin training + tc.start_learning(env_manager) + if ( + success_threshold is not None + ): # For tests where we are just checking setup and not reward + processed_rewards = [ + reward_processor(rewards) for rewards in env.final_rewards.values() + ] + assert all(not math.isnan(reward) for reward in processed_rewards) + assert all(reward > success_threshold for reward in processed_rewards) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_simple_ppo(use_discrete): + env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete) + config = attr.evolve(PPO_CONFIG) + _check_environment_trains(env, {BRAIN_NAME: config}) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_2d_ppo(use_discrete): + env = SimpleEnvironment( + [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8 + ) + new_hyperparams = attr.evolve( + PPO_CONFIG.hyperparameters, batch_size=64, buffer_size=640 + ) + config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams, max_steps=10000) + _check_environment_trains(env, {BRAIN_NAME: config}) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +@pytest.mark.parametrize("num_visual", [1, 2]) +def test_visual_ppo(num_visual, use_discrete): + env = SimpleEnvironment( + [BRAIN_NAME], + use_discrete=use_discrete, + num_visual=num_visual, + 
num_vector=0, + step_size=0.2, + ) + new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4) + config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams) + _check_environment_trains(env, {BRAIN_NAME: config}) + + +@pytest.mark.parametrize("num_visual", [1, 2]) +@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"]) +def test_visual_advanced_ppo(vis_encode_type, num_visual): + env = SimpleEnvironment( + [BRAIN_NAME], + use_discrete=True, + num_visual=num_visual, + num_vector=0, + step_size=0.5, + vis_obs_size=(36, 36, 3), + ) + new_networksettings = attr.evolve( + SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type) + ) + new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4) + config = attr.evolve( + PPO_CONFIG, + hyperparameters=new_hyperparams, + network_settings=new_networksettings, + max_steps=500, + summary_freq=100, + ) + # The number of steps is pretty small for these encoders + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_recurrent_ppo(use_discrete): + env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete) + new_network_settings = attr.evolve( + PPO_CONFIG.network_settings, + memory=NetworkSettings.MemorySettings(memory_size=16), + ) + new_hyperparams = attr.evolve( + PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128 + ) + config = attr.evolve( + PPO_CONFIG, + hyperparameters=new_hyperparams, + network_settings=new_network_settings, + max_steps=5000, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_simple_sac(use_discrete): + env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete) + config = attr.evolve(SAC_CONFIG) + _check_environment_trains(env, {BRAIN_NAME: config}) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_2d_sac(use_discrete): + env = SimpleEnvironment( + [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8 + ) + new_hyperparams = attr.evolve(SAC_CONFIG.hyperparameters, buffer_init_steps=2000) + config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams, max_steps=10000) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +@pytest.mark.parametrize("num_visual", [1, 2]) +def test_visual_sac(num_visual, use_discrete): + env = SimpleEnvironment( + [BRAIN_NAME], + use_discrete=use_discrete, + num_visual=num_visual, + num_vector=0, + step_size=0.2, + ) + new_hyperparams = attr.evolve( + SAC_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4 + ) + config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams) + _check_environment_trains(env, {BRAIN_NAME: config}) + + +@pytest.mark.parametrize("num_visual", [1, 2]) +@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"]) +def test_visual_advanced_sac(vis_encode_type, num_visual): + env = SimpleEnvironment( + [BRAIN_NAME], + use_discrete=True, + num_visual=num_visual, + num_vector=0, + step_size=0.5, + vis_obs_size=(36, 36, 3), + ) + new_networksettings = attr.evolve( + SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type) + ) + new_hyperparams = attr.evolve( + SAC_CONFIG.hyperparameters, + batch_size=16, + learning_rate=3e-4, + buffer_init_steps=0, + ) + config = attr.evolve( + SAC_CONFIG, + 
hyperparameters=new_hyperparams,
+        network_settings=new_networksettings,
+        max_steps=100,
+    )
+    # The number of steps is pretty small for these encoders
+    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_recurrent_sac(use_discrete):
+    step_size = 0.5 if use_discrete else 0.2
+    env = MemoryEnvironment(
+        [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
+    )
+    new_networksettings = attr.evolve(
+        SAC_CONFIG.network_settings,
+        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
+    )
+    new_hyperparams = attr.evolve(
+        SAC_CONFIG.hyperparameters,
+        batch_size=128,
+        learning_rate=1e-3,
+        buffer_init_steps=1000,
+        steps_per_update=2,
+    )
+    config = attr.evolve(
+        SAC_CONFIG,
+        hyperparameters=new_hyperparams,
+        network_settings=new_networksettings,
+        max_steps=5000,
+    )
+    _check_environment_trains(env, {BRAIN_NAME: config})
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_ghost(use_discrete):
+    env = SimpleEnvironment(
+        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
+    )
+    self_play_settings = SelfPlaySettings(
+        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
+    )
+    config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
+    _check_environment_trains(env, {BRAIN_NAME: config})
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_ghost_fails(use_discrete):
+    env = SimpleEnvironment(
+        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
+    )
+    # This config should fail because the ghosted policy is never swapped with a competent policy.
+    # Swap occurs after max step is reached.
+    self_play_settings = SelfPlaySettings(
+        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
+    )
+    config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
+    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
+    processed_rewards = [
+        default_reward_processor(rewards) for rewards in env.final_rewards.values()
+    ]
+    success_threshold = 0.9
+    assert any(reward > success_threshold for reward in processed_rewards) and any(
+        reward < success_threshold for reward in processed_rewards
+    )
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_asymm_ghost(use_discrete):
+    # Make opponent for asymmetric case
+    brain_name_opp = BRAIN_NAME + "Opp"
+    env = SimpleEnvironment(
+        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
+    )
+    self_play_settings = SelfPlaySettings(
+        play_against_latest_model_ratio=1.0,
+        save_steps=10000,
+        swap_steps=10000,
+        team_change=400,
+    )
+    config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=4000)
+    _check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_asymm_ghost_fails(use_discrete):
+    # Make opponent for asymmetric case
+    brain_name_opp = BRAIN_NAME + "Opp"
+    env = SimpleEnvironment(
+        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
+    )
+    # This config should fail because the team that is not learning when both have reached
+    # max step should be executing the initial, untrained policy.
+ self_play_settings = SelfPlaySettings( + play_against_latest_model_ratio=0.0, + save_steps=5000, + swap_steps=5000, + team_change=2000, + ) + config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=3000) + _check_environment_trains( + env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None + ) + processed_rewards = [ + default_reward_processor(rewards) for rewards in env.final_rewards.values() + ] + success_threshold = 0.9 + assert any(reward > success_threshold for reward in processed_rewards) and any( + reward < success_threshold for reward in processed_rewards + ) + + +@pytest.fixture(scope="session") +def simple_record(tmpdir_factory): + def record_demo(use_discrete, num_visual=0, num_vector=1): + env = RecordEnvironment( + [BRAIN_NAME], + use_discrete=use_discrete, + num_visual=num_visual, + num_vector=num_vector, + n_demos=100, + ) + # If we want to use true demos, we can solve the env in the usual way + # Otherwise, we can just call solve to execute the optimal policy + env.solve() + agent_info_protos = env.demonstration_protos[BRAIN_NAME] + meta_data_proto = DemonstrationMetaProto() + brain_param_proto = BrainParametersProto( + vector_action_size=[2] if use_discrete else [1], + vector_action_descriptions=[""], + vector_action_space_type=discrete if use_discrete else continuous, + brain_name=BRAIN_NAME, + is_training=True, + ) + action_type = "Discrete" if use_discrete else "Continuous" + demo_path_name = "1DTest" + action_type + ".demo" + demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name)) + write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos) + return demo_path + + return record_demo + + +@pytest.mark.parametrize("use_discrete", [True, False]) +@pytest.mark.parametrize("trainer_config", [PPO_CONFIG, SAC_CONFIG]) +def test_gail(simple_record, use_discrete, trainer_config): + demo_path = simple_record(use_discrete) + env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2) + bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) + reward_signals = { + RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path) + } + config = attr.evolve( + trainer_config, + reward_signals=reward_signals, + behavioral_cloning=bc_settings, + max_steps=500, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_gail_visual_ppo(simple_record, use_discrete): + demo_path = simple_record(use_discrete, num_visual=1, num_vector=0) + env = SimpleEnvironment( + [BRAIN_NAME], + num_visual=1, + num_vector=0, + use_discrete=use_discrete, + step_size=0.2, + ) + bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500) + reward_signals = { + RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path) + } + hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4) + config = attr.evolve( + PPO_CONFIG, + reward_signals=reward_signals, + hyperparameters=hyperparams, + behavioral_cloning=bc_settings, + max_steps=1000, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_gail_visual_sac(simple_record, use_discrete): + demo_path = simple_record(use_discrete, num_visual=1, num_vector=0) + env = SimpleEnvironment( + [BRAIN_NAME], + num_visual=1, + num_vector=0, + use_discrete=use_discrete, + step_size=0.2, + ) + bc_settings = 
BehavioralCloningSettings(demo_path=demo_path, steps=1000) + reward_signals = { + RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path) + } + hyperparams = attr.evolve( + SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16 + ) + config = attr.evolve( + SAC_CONFIG, + reward_signals=reward_signals, + hyperparameters=hyperparams, + behavioral_cloning=bc_settings, + max_steps=500, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_utils.py b/ml-agents/mlagents/trainers/tests/torch/test_utils.py index 0275581d08..ae52456f3e 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_utils.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_utils.py @@ -214,3 +214,9 @@ def test_masked_mean(): masks = torch.tensor([False, False, False, False, False]) mean = ModelUtils.masked_mean(test_input, masks=masks) assert mean == 0.0 + + # Make sure it works with 2d arrays of shape (mask_length, N) + test_input = torch.tensor([1, 2, 3, 4, 5]).repeat(2, 1).T + masks = torch.tensor([False, False, True, True, True]) + mean = ModelUtils.masked_mean(test_input, masks=masks) + assert mean == 4.0 diff --git a/ml-agents/mlagents/trainers/tests/torch/testdcvis.demo b/ml-agents/mlagents/trainers/tests/torch/testdcvis.demo new file mode 100644 index 0000000000..bb9c48dfca Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/torch/testdcvis.demo differ diff --git a/ml-agents/mlagents/trainers/tf/model_serialization.py b/ml-agents/mlagents/trainers/tf/model_serialization.py index c51a5ad3e3..c11f579215 100644 --- a/ml-agents/mlagents/trainers/tf/model_serialization.py +++ b/ml-agents/mlagents/trainers/tf/model_serialization.py @@ -61,7 +61,7 @@ def export_policy_model( model_path: str, output_filepath: str, - brain_name: str, + behavior_name: str, graph: tf.Graph, sess: tf.Session, ) -> None: @@ -69,11 +69,11 @@ def export_policy_model( Exports a TF graph for a Policy to .nn and/or .onnx format for Unity embedding. 
:param output_filepath: file path to output the model (without file suffix) - :param brain_name: brain name of the trained model + :param behavior_name: behavior name of the trained model :param graph: Tensorflow Graph for the policy :param sess: Tensorflow session for the policy """ - frozen_graph_def = _make_frozen_graph(brain_name, graph, sess) + frozen_graph_def = _make_frozen_graph(behavior_name, graph, sess) if not os.path.exists(output_filepath): os.makedirs(output_filepath) # Save frozen graph @@ -90,7 +90,7 @@ def export_policy_model( if ONNX_EXPORT_ENABLED: if SerializationSettings.convert_to_onnx: try: - onnx_graph = convert_frozen_to_onnx(brain_name, frozen_graph_def) + onnx_graph = convert_frozen_to_onnx(behavior_name, frozen_graph_def) onnx_output_path = f"{output_filepath}.onnx" with open(onnx_output_path, "wb") as f: f.write(onnx_graph.SerializeToString()) @@ -113,10 +113,10 @@ def export_policy_model( def _make_frozen_graph( - brain_name: str, graph: tf.Graph, sess: tf.Session + behavior_name: str, graph: tf.Graph, sess: tf.Session ) -> tf.GraphDef: with graph.as_default(): - target_nodes = ",".join(_process_graph(brain_name, graph)) + target_nodes = ",".join(_process_graph(behavior_name, graph)) graph_def = graph.as_graph_def() output_graph_def = graph_util.convert_variables_to_constants( sess, graph_def, target_nodes.replace(" ", "").split(",") @@ -124,7 +124,7 @@ def _make_frozen_graph( return output_graph_def -def convert_frozen_to_onnx(brain_name: str, frozen_graph_def: tf.GraphDef) -> Any: +def convert_frozen_to_onnx(behavior_name: str, frozen_graph_def: tf.GraphDef) -> Any: # This is basically https://github.com/onnx/tensorflow-onnx/blob/master/tf2onnx/convert.py inputs = _get_input_node_names(frozen_graph_def) @@ -146,7 +146,7 @@ def convert_frozen_to_onnx(brain_name: str, frozen_graph_def: tf.GraphDef) -> An ) onnx_graph = optimizer.optimize_graph(g) - model_proto = onnx_graph.make_model(brain_name) + model_proto = onnx_graph.make_model(behavior_name) return model_proto @@ -195,14 +195,14 @@ def _get_frozen_graph_node_names(frozen_graph_def: Any) -> Set[str]: return names -def _process_graph(brain_name: str, graph: tf.Graph) -> List[str]: +def _process_graph(behavior_name: str, graph: tf.Graph) -> List[str]: """ Gets the list of the output nodes present in the graph for inference :return: list of node names """ all_nodes = [x.name for x in graph.as_graph_def().node] nodes = [x for x in all_nodes if x in POSSIBLE_OUTPUT_NODES | MODEL_CONSTANTS] - logger.info("List of nodes to export for brain :" + brain_name) + logger.info("List of nodes to export for behavior :" + behavior_name) for n in nodes: logger.info("\t" + n) return nodes diff --git a/ml-agents/mlagents/trainers/torch/components/__init__.py b/ml-agents/mlagents/trainers/torch/components/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ml-agents/mlagents/trainers/torch/components/bc/__init__.py b/ml-agents/mlagents/trainers/torch/components/bc/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ml-agents/mlagents/trainers/torch/components/bc/module.py b/ml-agents/mlagents/trainers/torch/components/bc/module.py new file mode 100644 index 0000000000..61f7f03758 --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/components/bc/module.py @@ -0,0 +1,183 @@ +from typing import Dict +import numpy as np +import torch + +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.demo_loader import demo_to_buffer +from 
mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType +from mlagents.trainers.torch.utils import ModelUtils + + +class BCModule: + def __init__( + self, + policy: TorchPolicy, + settings: BehavioralCloningSettings, + policy_learning_rate: float, + default_batch_size: int, + default_num_epoch: int, + ): + """ + A BC trainer that can be used inline with RL. + :param policy: The policy of the learning model + :param settings: The settings for BehavioralCloning including LR strength, batch_size, + num_epochs, samples_per_update and LR annealing steps. + :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate + for the pretrainer. + """ + self.policy = policy + self._anneal_steps = settings.steps + self.current_lr = policy_learning_rate * settings.strength + + learning_rate_schedule: ScheduleType = ScheduleType.LINEAR if self._anneal_steps > 0 else ScheduleType.CONSTANT + self.decay_learning_rate = ModelUtils.DecayedValue( + learning_rate_schedule, self.current_lr, 1e-10, self._anneal_steps + ) + params = self.policy.actor_critic.parameters() + self.optimizer = torch.optim.Adam(params, lr=self.current_lr) + _, self.demonstration_buffer = demo_to_buffer( + settings.demo_path, policy.sequence_length, policy.behavior_spec + ) + + self.batch_size = ( + settings.batch_size if settings.batch_size else default_batch_size + ) + self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch + self.n_sequences = max( + min(self.batch_size, self.demonstration_buffer.num_experiences) + // policy.sequence_length, + 1, + ) + + self.has_updated = False + self.use_recurrent = self.policy.use_recurrent + self.samples_per_update = settings.samples_per_update + + def update(self) -> Dict[str, np.ndarray]: + """ + Updates model using buffer. + :param max_batches: The maximum number of batches to use per update. + :return: The loss of the update. + """ + # Don't continue training if the learning rate has reached 0, to reduce training time. 
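As a rough illustration of the linear annealing behavior configured above (a standalone sketch, not the `ModelUtils.DecayedValue` implementation; the numbers are made up), a linearly decayed rate can be computed like this:

```python
# Standalone sketch of a linearly annealed learning rate: decay from an initial
# value toward a small floor over a fixed number of steps, then stay there.
def linear_decay(initial_value: float, min_value: float, max_step: int, current_step: int) -> float:
    if max_step <= 0:
        return initial_value  # behaves like a constant schedule when no anneal steps are set
    fraction = min(float(current_step) / max_step, 1.0)
    return max(initial_value * (1.0 - fraction), min_value)

if __name__ == "__main__":
    for step in (0, 5000, 10000, 20000):
        print(step, linear_decay(3e-4, 1e-10, 10000, step))
```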
+ + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + if self.current_lr <= 0: + return {"Losses/Pretraining Loss": 0} + + batch_losses = [] + possible_demo_batches = ( + self.demonstration_buffer.num_experiences // self.n_sequences + ) + possible_batches = possible_demo_batches + + max_batches = self.samples_per_update // self.n_sequences + + n_epoch = self.num_epoch + for _ in range(n_epoch): + self.demonstration_buffer.shuffle( + sequence_length=self.policy.sequence_length + ) + if max_batches == 0: + num_batches = possible_batches + else: + num_batches = min(possible_batches, max_batches) + for i in range(num_batches // self.policy.sequence_length): + demo_update_buffer = self.demonstration_buffer + start = i * self.n_sequences * self.policy.sequence_length + end = (i + 1) * self.n_sequences * self.policy.sequence_length + mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) + run_out = self._update_batch(mini_batch_demo, self.n_sequences) + loss = run_out["loss"] + batch_losses.append(loss) + + ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.current_lr = decay_lr + + self.has_updated = True + update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)} + return update_stats + + def _behavioral_cloning_loss(self, selected_actions, log_probs, expert_actions): + if self.policy.use_continuous_act: + bc_loss = torch.nn.functional.mse_loss(selected_actions, expert_actions) + else: + log_prob_branches = ModelUtils.break_into_branches( + log_probs, self.policy.act_size + ) + bc_loss = torch.mean( + torch.stack( + [ + torch.sum( + -torch.nn.functional.log_softmax(log_prob_branch, dim=1) + * expert_actions_branch, + dim=1, + ) + for log_prob_branch, expert_actions_branch in zip( + log_prob_branches, expert_actions + ) + ] + ) + ) + return bc_loss + + def _update_batch( + self, mini_batch_demo: Dict[str, np.ndarray], n_sequences: int + ) -> Dict[str, float]: + """ + Helper function for update_batch. 
+ """ + vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])] + act_masks = None + if self.policy.use_continuous_act: + expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"]) + else: + raw_expert_actions = ModelUtils.list_to_tensor( + mini_batch_demo["actions"], dtype=torch.long + ) + expert_actions = ModelUtils.actions_to_onehot( + raw_expert_actions, self.policy.act_size + ) + act_masks = ModelUtils.list_to_tensor( + np.ones( + ( + self.n_sequences * self.policy.sequence_length, + sum(self.policy.behavior_spec.discrete_action_branches), + ), + dtype=np.float32, + ) + ) + + memories = [] + if self.policy.use_recurrent: + memories = torch.zeros(1, self.n_sequences, self.policy.m_size) + + if self.policy.use_vis_obs: + vis_obs = [] + for idx, _ in enumerate( + self.policy.actor_critic.network_body.visual_encoders + ): + vis_ob = ModelUtils.list_to_tensor( + mini_batch_demo["visual_obs%d" % idx] + ) + vis_obs.append(vis_ob) + else: + vis_obs = [] + + selected_actions, all_log_probs, _, _, _ = self.policy.sample_actions( + vec_obs, + vis_obs, + masks=act_masks, + memories=memories, + seq_len=self.policy.sequence_length, + all_log_probs=True, + ) + bc_loss = self._behavioral_cloning_loss( + selected_actions, all_log_probs, expert_actions + ) + self.optimizer.zero_grad() + bc_loss.backward() + + self.optimizer.step() + run_out = {"loss": bc_loss.detach().cpu().numpy()} + return run_out diff --git a/ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py b/ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py new file mode 100644 index 0000000000..d6097a4dab --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py @@ -0,0 +1,15 @@ +from mlagents.trainers.torch.components.reward_providers.base_reward_provider import ( # noqa F401 + BaseRewardProvider, +) +from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import ( # noqa F401 + ExtrinsicRewardProvider, +) +from mlagents.trainers.torch.components.reward_providers.curiosity_reward_provider import ( # noqa F401 + CuriosityRewardProvider, +) +from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import ( # noqa F401 + GAILRewardProvider, +) +from mlagents.trainers.torch.components.reward_providers.reward_provider_factory import ( # noqa F401 + create_reward_provider, +) diff --git a/ml-agents/mlagents/trainers/torch/components/reward_providers/base_reward_provider.py b/ml-agents/mlagents/trainers/torch/components/reward_providers/base_reward_provider.py new file mode 100644 index 0000000000..77b7fa7a30 --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/components/reward_providers/base_reward_provider.py @@ -0,0 +1,72 @@ +import numpy as np +from abc import ABC, abstractmethod +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import RewardSignalSettings +from mlagents_envs.base_env import BehaviorSpec + + +class BaseRewardProvider(ABC): + def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: + self._policy_specs = specs + self._gamma = settings.gamma + self._strength = settings.strength + self._ignore_done = False + + @property + def gamma(self) -> float: + """ + The discount factor for the reward signal + """ + return self._gamma + + @property + def strength(self) -> float: + """ + The strength multiplier of the reward provider + """ + return self._strength + + @property + def name(self) -> str: + """ + 
The name of the reward provider. Is used for reporting and identification + """ + class_name = self.__class__.__name__ + return class_name.replace("RewardProvider", "") + + @property + def ignore_done(self) -> bool: + """ + If true, when the agent is done, the rewards of the next episode must be + used to calculate the return of the current episode. + Is used to mitigate the positive bias in rewards with no natural end. + """ + return self._ignore_done + + @abstractmethod + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + """ + Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: a np.ndarray of rewards generated by the reward provider + """ + raise NotImplementedError( + "The reward provider's evaluate method has not been implemented " + ) + + @abstractmethod + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + """ + Update the reward for the data present in the Dict mini_batch. Use this when updating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: A dictionary from string to stats values + """ + raise NotImplementedError( + "The reward provider's update method has not been implemented " + ) diff --git a/ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py new file mode 100644 index 0000000000..842b039510 --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py @@ -0,0 +1,225 @@ +import numpy as np +from typing import Dict +import torch + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.torch.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import CuriositySettings + +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.torch.utils import ModelUtils +from mlagents.trainers.torch.networks import NetworkBody +from mlagents.trainers.torch.layers import linear_layer, Swish +from mlagents.trainers.settings import NetworkSettings, EncoderType + + +class CuriosityRewardProvider(BaseRewardProvider): + beta = 0.2 # Forward vs Inverse loss weight + loss_multiplier = 10.0 # Loss multiplier + + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__(specs, settings) + self._ignore_done = True + self._network = CuriosityNetwork(specs, settings) + self.optimizer = torch.optim.Adam( + self._network.parameters(), lr=settings.learning_rate + ) + self._has_updated_once = False + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + rewards = self._network.compute_reward(mini_batch).detach().cpu().numpy() + rewards = np.minimum(rewards, 1.0 / self.strength) + return rewards * self._has_updated_once + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + self._has_updated_once = True + forward_loss = self._network.compute_forward_loss(mini_batch) + inverse_loss = self._network.compute_inverse_loss(mini_batch) + + loss = self.loss_multiplier * ( + self.beta * forward_loss + (1.0 - self.beta) * inverse_loss + ) + self.optimizer.zero_grad() + loss.backward() + 
self.optimizer.step() + return { + "Losses/Curiosity Forward Loss": forward_loss.detach().cpu().numpy(), + "Losses/Curiosity Inverse Loss": inverse_loss.detach().cpu().numpy(), + } + + +class CuriosityNetwork(torch.nn.Module): + EPSILON = 1e-10 + + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__() + self._policy_specs = specs + state_encoder_settings = NetworkSettings( + normalize=False, + hidden_units=settings.encoding_size, + num_layers=2, + vis_encode_type=EncoderType.SIMPLE, + memory=None, + ) + self._state_encoder = NetworkBody( + specs.observation_shapes, state_encoder_settings + ) + + self._action_flattener = ModelUtils.ActionFlattener(specs) + + self.inverse_model_action_predition = torch.nn.Sequential( + linear_layer(2 * settings.encoding_size, 256), + Swish(), + linear_layer(256, self._action_flattener.flattened_size), + ) + + self.forward_model_next_state_prediction = torch.nn.Sequential( + linear_layer( + settings.encoding_size + self._action_flattener.flattened_size, 256 + ), + Swish(), + linear_layer(256, settings.encoding_size), + ) + + def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the current state embedding from a mini_batch. + """ + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[ + ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float) + ], + vis_inputs=[ + ModelUtils.list_to_tensor( + mini_batch["visual_obs%d" % i], dtype=torch.float + ) + for i in range(n_vis) + ], + ) + return hidden + + def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the next state embedding from a mini_batch. + """ + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[ + ModelUtils.list_to_tensor( + mini_batch["next_vector_in"], dtype=torch.float + ) + ], + vis_inputs=[ + ModelUtils.list_to_tensor( + mini_batch["next_visual_obs%d" % i], dtype=torch.float + ) + for i in range(n_vis) + ], + ) + return hidden + + def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + In the continuous case, returns the predicted action. + In the discrete case, returns the logits. + """ + inverse_model_input = torch.cat( + (self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1 + ) + hidden = self.inverse_model_action_predition(inverse_model_input) + if self._policy_specs.is_action_continuous(): + return hidden + else: + branches = ModelUtils.break_into_branches( + hidden, self._policy_specs.discrete_action_branches + ) + branches = [torch.softmax(b, dim=1) for b in branches] + return torch.cat(branches, dim=1) + + def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Uses the current state embedding and the action of the mini_batch to predict + the next state embedding. + """ + if self._policy_specs.is_action_continuous(): + action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float) + else: + action = torch.cat( + ModelUtils.actions_to_onehot( + ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long), + self._policy_specs.discrete_action_branches, + ), + dim=1, + ) + forward_model_input = torch.cat( + (self.get_current_state(mini_batch), action), dim=1 + ) + + return self.forward_model_next_state_prediction(forward_model_input) + + def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the inverse loss for a mini_batch. 
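For context, the curiosity machinery above follows the usual ICM recipe; the toy sketch below (illustrative sizes and plain linear layers, not the `CuriosityNetwork` itself; loss multiplier omitted) shows how the inverse loss, forward error, and intrinsic reward relate:

```python
import torch

# Inverse model: predict the action from (state, next_state) embeddings.
# Forward model: predict the next embedding from (state, action).
# The forward prediction error doubles as the curiosity reward.
emb_size, act_size, batch = 8, 2, 4
inverse_model = torch.nn.Linear(2 * emb_size, act_size)
forward_model = torch.nn.Linear(emb_size + act_size, emb_size)

state = torch.randn(batch, emb_size)
next_state = torch.randn(batch, emb_size)
action = torch.randn(batch, act_size)

predicted_action = inverse_model(torch.cat([state, next_state], dim=1))
predicted_next = forward_model(torch.cat([state, action], dim=1))

inverse_loss = torch.mean((predicted_action - action) ** 2)
forward_error = 0.5 * torch.sum((next_state - predicted_next) ** 2, dim=1)
curiosity_reward = forward_error.detach()                  # per-sample intrinsic reward
loss = 0.2 * forward_error.mean() + 0.8 * inverse_loss     # beta-weighted combination
```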
Corresponds to the error on the + action prediction (given the current and next state). + """ + predicted_action = self.predict_action(mini_batch) + if self._policy_specs.is_action_continuous(): + sq_difference = ( + ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float) + - predicted_action + ) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + return torch.mean( + ModelUtils.dynamic_partition( + sq_difference, + ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), + 2, + )[1] + ) + else: + true_action = torch.cat( + ModelUtils.actions_to_onehot( + ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long), + self._policy_specs.discrete_action_branches, + ), + dim=1, + ) + cross_entropy = torch.sum( + -torch.log(predicted_action + self.EPSILON) * true_action, dim=1 + ) + return torch.mean( + ModelUtils.dynamic_partition( + cross_entropy, + ModelUtils.list_to_tensor( + mini_batch["masks"], dtype=torch.float + ), # use masks not action_masks + 2, + )[1] + ) + + def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Calculates the curiosity reward for the mini_batch. Corresponds to the error + between the predicted and actual next state. + """ + predicted_next_state = self.predict_next_state(mini_batch) + target = self.get_next_state(mini_batch) + sq_difference = 0.5 * (target - predicted_next_state) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + return sq_difference + + def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the loss for the next state prediction + """ + return torch.mean( + ModelUtils.dynamic_partition( + self.compute_reward(mini_batch), + ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), + 2, + )[1] + ) diff --git a/ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py b/ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py new file mode 100644 index 0000000000..33a083f6d4 --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py @@ -0,0 +1,15 @@ +import numpy as np +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.torch.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) + + +class ExtrinsicRewardProvider(BaseRewardProvider): + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + return np.array(mini_batch["environment_rewards"], dtype=np.float32) + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + return {} diff --git a/ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py new file mode 100644 index 0000000000..b59ef0c494 --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py @@ -0,0 +1,256 @@ +from typing import Optional, Dict +import numpy as np +import torch + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.torch.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import GAILSettings +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.torch.utils import ModelUtils +from mlagents.trainers.torch.networks import NetworkBody +from mlagents.trainers.torch.layers import linear_layer, Swish, Initialization +from mlagents.trainers.settings 
import NetworkSettings, EncoderType +from mlagents.trainers.demo_loader import demo_to_buffer + + +class GAILRewardProvider(BaseRewardProvider): + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: + super().__init__(specs, settings) + self._ignore_done = True + self._discriminator_network = DiscriminatorNetwork(specs, settings) + _, self._demo_buffer = demo_to_buffer( + settings.demo_path, 1, specs + ) # This is supposed to be the sequence length but we do not have access here + params = list(self._discriminator_network.parameters()) + self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate) + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + estimates, _ = self._discriminator_network.compute_estimate( + mini_batch, use_vail_noise=False + ) + return ( + -torch.log( + 1.0 + - estimates.squeeze(dim=1) + * (1.0 - self._discriminator_network.EPSILON) + ) + .detach() + .cpu() + .numpy() + ) + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + expert_batch = self._demo_buffer.sample_mini_batch( + mini_batch.num_experiences, 1 + ) + loss, stats_dict = self._discriminator_network.compute_loss( + mini_batch, expert_batch + ) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return stats_dict + + +class DiscriminatorNetwork(torch.nn.Module): + gradient_penalty_weight = 10.0 + z_size = 128 + alpha = 0.0005 + mutual_information = 0.5 + EPSILON = 1e-7 + initial_beta = 0.0 + + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: + super().__init__() + self._policy_specs = specs + self._use_vail = settings.use_vail + self._settings = settings + + state_encoder_settings = NetworkSettings( + normalize=False, + hidden_units=settings.encoding_size, + num_layers=2, + vis_encode_type=EncoderType.SIMPLE, + memory=None, + ) + self._state_encoder = NetworkBody( + specs.observation_shapes, state_encoder_settings + ) + + self._action_flattener = ModelUtils.ActionFlattener(specs) + + encoder_input_size = settings.encoding_size + if settings.use_actions: + encoder_input_size += ( + self._action_flattener.flattened_size + 1 + ) # + 1 is for done + + self.encoder = torch.nn.Sequential( + linear_layer(encoder_input_size, settings.encoding_size), + Swish(), + linear_layer(settings.encoding_size, settings.encoding_size), + Swish(), + ) + + estimator_input_size = settings.encoding_size + if settings.use_vail: + estimator_input_size = self.z_size + self._z_sigma = torch.nn.Parameter( + torch.ones((self.z_size), dtype=torch.float), requires_grad=True + ) + self._z_mu_layer = linear_layer( + settings.encoding_size, + self.z_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.1, + ) + self._beta = torch.nn.Parameter( + torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False + ) + + self._estimator = torch.nn.Sequential( + linear_layer(estimator_input_size, 1), torch.nn.Sigmoid() + ) + + def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Creates the action Tensor. In continuous case, corresponds to the action. In + the discrete case, corresponds to the concatenation of one hot action Tensors. + """ + return self._action_flattener.forward( + torch.as_tensor(mini_batch["actions"], dtype=torch.float) + ) + + def get_state_encoding(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Creates the observation input. 
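The reward returned by `evaluate` above is simply the negative log of one minus the (slightly shrunk) discriminator estimate; a minimal numeric sketch, with made-up estimates:

```python
import torch

# `estimate` stands in for the sigmoid output D(s, a) in (0, 1);
# the epsilon shrink keeps the log finite when the estimate approaches 1.
EPSILON = 1e-7
estimate = torch.tensor([0.1, 0.5, 0.9])
reward = -torch.log(1.0 - estimate * (1.0 - EPSILON))
print(reward)  # higher (more "expert-like") estimates yield larger rewards
```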
+ """ + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float)], + vis_inputs=[ + torch.as_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float) + for i in range(n_vis) + ], + ) + return hidden + + def compute_estimate( + self, mini_batch: AgentBuffer, use_vail_noise: bool = False + ) -> torch.Tensor: + """ + Given a mini_batch, computes the estimate (How much the discriminator believes + the data was sampled from the demonstration data). + :param mini_batch: The AgentBuffer of data + :param use_vail_noise: Only when using VAIL : If true, will sample the code, if + false, will return the mean of the code. + """ + encoder_input = self.get_state_encoding(mini_batch) + if self._settings.use_actions: + actions = self.get_action_input(mini_batch) + dones = torch.as_tensor(mini_batch["done"], dtype=torch.float) + encoder_input = torch.cat([encoder_input, actions, dones], dim=1) + hidden = self.encoder(encoder_input) + z_mu: Optional[torch.Tensor] = None + if self._settings.use_vail: + z_mu = self._z_mu_layer(hidden) + hidden = torch.normal(z_mu, self._z_sigma * use_vail_noise) + estimate = self._estimator(hidden) + return estimate, z_mu + + def compute_loss( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Given a policy mini_batch and an expert mini_batch, computes the loss of the discriminator. + """ + total_loss = torch.zeros(1) + stats_dict: Dict[str, np.ndarray] = {} + policy_estimate, policy_mu = self.compute_estimate( + policy_batch, use_vail_noise=True + ) + expert_estimate, expert_mu = self.compute_estimate( + expert_batch, use_vail_noise=True + ) + stats_dict["Policy/GAIL Policy Estimate"] = ( + policy_estimate.mean().detach().cpu().numpy() + ) + stats_dict["Policy/GAIL Expert Estimate"] = ( + expert_estimate.mean().detach().cpu().numpy() + ) + discriminator_loss = -( + torch.log(expert_estimate + self.EPSILON) + + torch.log(1.0 - policy_estimate + self.EPSILON) + ).mean() + stats_dict["Losses/GAIL Loss"] = discriminator_loss.detach().cpu().numpy() + total_loss += discriminator_loss + if self._settings.use_vail: + # KL divergence loss (encourage latent representation to be normal) + kl_loss = torch.mean( + -torch.sum( + 1 + + (self._z_sigma ** 2).log() + - 0.5 * expert_mu ** 2 + - 0.5 * policy_mu ** 2 + - (self._z_sigma ** 2), + dim=1, + ) + ) + vail_loss = self._beta * (kl_loss - self.mutual_information) + with torch.no_grad(): + self._beta.data = torch.max( + self._beta + self.alpha * (kl_loss - self.mutual_information), + torch.tensor(0.0), + ) + total_loss += vail_loss + stats_dict["Policy/GAIL Beta"] = self._beta.detach().cpu().numpy() + stats_dict["Losses/GAIL KL Loss"] = kl_loss.detach().cpu().numpy() + if self.gradient_penalty_weight > 0.0: + total_loss += ( + self.gradient_penalty_weight + * self.compute_gradient_magnitude(policy_batch, expert_batch) + ) + return total_loss, stats_dict + + def compute_gradient_magnitude( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp. + for off-policy. Compute gradients w.r.t randomly interpolated input. 
+ """ + policy_obs = self.get_state_encoding(policy_batch) + expert_obs = self.get_state_encoding(expert_batch) + obs_epsilon = torch.rand(policy_obs.shape) + encoder_input = obs_epsilon * policy_obs + (1 - obs_epsilon) * expert_obs + if self._settings.use_actions: + policy_action = self.get_action_input(policy_batch) + expert_action = self.get_action_input(policy_batch) + action_epsilon = torch.rand(policy_action.shape) + policy_dones = torch.as_tensor(policy_batch["done"], dtype=torch.float) + expert_dones = torch.as_tensor(expert_batch["done"], dtype=torch.float) + dones_epsilon = torch.rand(policy_dones.shape) + encoder_input = torch.cat( + [ + encoder_input, + action_epsilon * policy_action + + (1 - action_epsilon) * expert_action, + dones_epsilon * policy_dones + (1 - dones_epsilon) * expert_dones, + ], + dim=1, + ) + hidden = self.encoder(encoder_input) + if self._settings.use_vail: + use_vail_noise = True + z_mu = self._z_mu_layer(hidden) + hidden = torch.normal(z_mu, self._z_sigma * use_vail_noise) + hidden = self._estimator(hidden) + estimate = torch.mean(torch.sum(hidden, dim=1)) + gradient = torch.autograd.grad(estimate, encoder_input)[0] + # Norm's gradient could be NaN at 0. Use our own safe_norm + safe_norm = (torch.sum(gradient ** 2, dim=1) + self.EPSILON).sqrt() + gradient_mag = torch.mean((safe_norm - 1) ** 2) + return gradient_mag diff --git a/ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py b/ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py new file mode 100644 index 0000000000..2501868bc3 --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py @@ -0,0 +1,43 @@ +from typing import Dict, Type +from mlagents.trainers.exception import UnityTrainerException + +from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType + +from mlagents.trainers.torch.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import ( + ExtrinsicRewardProvider, +) +from mlagents.trainers.torch.components.reward_providers.curiosity_reward_provider import ( + CuriosityRewardProvider, +) +from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import ( + GAILRewardProvider, +) + +from mlagents_envs.base_env import BehaviorSpec + +NAME_TO_CLASS: Dict[RewardSignalType, Type[BaseRewardProvider]] = { + RewardSignalType.EXTRINSIC: ExtrinsicRewardProvider, + RewardSignalType.CURIOSITY: CuriosityRewardProvider, + RewardSignalType.GAIL: GAILRewardProvider, +} + + +def create_reward_provider( + name: RewardSignalType, specs: BehaviorSpec, settings: RewardSignalSettings +) -> BaseRewardProvider: + """ + Creates a reward provider class based on the name and config entry provided as a dict. 
+ :param name: The name of the reward signal + :param specs: The BehaviorSpecs of the policy + :param settings: The RewardSignalSettings for that reward signal + :return: The reward signal class instantiated + """ + rcls = NAME_TO_CLASS.get(name) + if not rcls: + raise UnityTrainerException(f"Unknown reward signal type {name}") + + class_inst = rcls(specs, settings) + return class_inst diff --git a/ml-agents/mlagents/trainers/torch/encoders.py b/ml-agents/mlagents/trainers/torch/encoders.py index b598498e1c..f5c0f93953 100644 --- a/ml-agents/mlagents/trainers/torch/encoders.py +++ b/ml-agents/mlagents/trainers/torch/encoders.py @@ -274,29 +274,26 @@ def __init__(self, height, width, initial_channels, final_hidden): super().__init__() n_channels = [16, 32, 32] # channel for each stack n_blocks = 2 # number of residual blocks - self.layers = [] + layers = [] last_channel = initial_channels for _, channel in enumerate(n_channels): - self.layers.append( - nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1) - ) - self.layers.append(nn.MaxPool2d([3, 3], [2, 2])) + layers.append(nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1)) + layers.append(nn.MaxPool2d([3, 3], [2, 2])) height, width = pool_out_shape((height, width), 3) for _ in range(n_blocks): - self.layers.append(ResNetBlock(channel)) + layers.append(ResNetBlock(channel)) last_channel = channel - self.layers.append(Swish()) + layers.append(Swish()) self.dense = linear_layer( n_channels[-1] * height * width, final_hidden, kernel_init=Initialization.KaimingHeNormal, kernel_gain=1.0, ) + self.sequential = nn.Sequential(*layers) def forward(self, visual_obs): batch_size = visual_obs.shape[0] - hidden = visual_obs - for layer in self.layers: - hidden = layer(hidden) + hidden = self.sequential(visual_obs) before_out = hidden.view(batch_size, -1) return torch.relu(self.dense(before_out)) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index 707d4748a5..0177d52535 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -1,4 +1,6 @@ import torch +import abc +from typing import Tuple from enum import Enum @@ -82,3 +84,68 @@ def lstm_layer( forget_bias ) return lstm + + +class MemoryModule(torch.nn.Module): + @abc.abstractproperty + def memory_size(self) -> int: + """ + Size of memory that is required at the start of a sequence. + """ + pass + + @abc.abstractmethod + def forward( + self, input_tensor: torch.Tensor, memories: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Pass a sequence to the memory module. + :input_tensor: Tensor of shape (batch_size, seq_length, size) that represents the input. + :memories: Tensor of initial memories. + :return: Tuple of output, final memories. + """ + pass + + +class LSTM(MemoryModule): + """ + Memory module that implements LSTM. + """ + + def __init__( + self, + input_size: int, + memory_size: int, + num_layers: int = 1, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, + ): + super().__init__() + # We set hidden size to half of memory_size since the initial memory + # will be divided between the hidden state and initial cell state. 
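As a tiny illustration of the memory layout described in the comment above (independent of the wrapper class itself): the flat memory tensor packs the hidden state into its first half and the cell state into its second half, which is why the memory size must be even.

```python
import torch

memory_size = 8
hidden_size = memory_size // 2
memories = torch.arange(memory_size, dtype=torch.float).reshape(1, 1, memory_size)

h0 = memories[:, :, :hidden_size]   # first half  -> LSTM hidden state
c0 = memories[:, :, hidden_size:]   # second half -> LSTM cell state
repacked = torch.cat([h0, c0], dim=-1)
assert torch.equal(repacked, memories)
```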
+ self.hidden_size = memory_size // 2 + self.lstm = lstm_layer( + input_size, + self.hidden_size, + num_layers, + True, + forget_bias, + kernel_init, + bias_init, + ) + + @property + def memory_size(self) -> int: + return 2 * self.hidden_size + + def forward( + self, input_tensor: torch.Tensor, memories: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # We don't use torch.split here since it is not supported by Barracuda + h0 = memories[:, :, : self.hidden_size] + c0 = memories[:, :, self.hidden_size :] + hidden = (h0, c0) + lstm_out, hidden_out = self.lstm(input_tensor, hidden) + output_mem = torch.cat(hidden_out, dim=-1) + return lstm_out, output_mem diff --git a/ml-agents/mlagents/trainers/torch/model_serialization.py b/ml-agents/mlagents/trainers/torch/model_serialization.py new file mode 100644 index 0000000000..311af8d7fc --- /dev/null +++ b/ml-agents/mlagents/trainers/torch/model_serialization.py @@ -0,0 +1,74 @@ +import os +import torch + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.settings import SerializationSettings + + +logger = get_logger(__name__) + + +class ModelSerializer: + def __init__(self, policy): + # ONNX only support input in NCHW (channel first) format. + # Barracuda also expect to get data in NCHW. + # Any multi-dimentional input should follow that otherwise will + # cause problem to barracuda import. + self.policy = policy + batch_dim = [1] + seq_len_dim = [1] + dummy_vec_obs = [torch.zeros(batch_dim + [self.policy.vec_obs_size])] + # create input shape of NCHW + # (It's NHWC in self.policy.behavior_spec.observation_shapes) + dummy_vis_obs = [ + torch.zeros(batch_dim + [shape[2], shape[0], shape[1]]) + for shape in self.policy.behavior_spec.observation_shapes + if len(shape) == 3 + ] + dummy_masks = torch.ones(batch_dim + [sum(self.policy.actor_critic.act_size)]) + dummy_memories = torch.zeros( + batch_dim + seq_len_dim + [self.policy.export_memory_size] + ) + + self.dummy_input = (dummy_vec_obs, dummy_vis_obs, dummy_masks, dummy_memories) + + self.input_names = ( + ["vector_observation"] + + [f"visual_observation_{i}" for i in range(self.policy.vis_obs_size)] + + ["action_masks", "memories"] + ) + + self.output_names = [ + "action", + "version_number", + "memory_size", + "is_continuous_control", + "action_output_shape", + ] + + self.dynamic_axes = {name: {0: "batch"} for name in self.input_names} + self.dynamic_axes.update({"action": {0: "batch"}}) + + def export_policy_model(self, output_filepath: str) -> None: + """ + Exports a Torch model for a Policy to .onnx format for Unity embedding. 
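For reference, the ONNX export below boils down to a single `torch.onnx.export` call; a minimal, generic example (tiny stand-in model, placeholder file name, batch axis marked dynamic) might look like this:

```python
import torch

# A dummy input fixes the traced graph shapes, while dynamic_axes keeps the
# batch dimension variable in the exported model. The input/output names echo
# the ones used by the serializer; the model and file name are placeholders.
model = torch.nn.Sequential(torch.nn.Linear(6, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
dummy_input = torch.zeros(1, 6)

torch.onnx.export(
    model,
    dummy_input,
    "tiny_model.onnx",
    opset_version=9,
    input_names=["vector_observation"],
    output_names=["action"],
    dynamic_axes={"vector_observation": {0: "batch"}, "action": {0: "batch"}},
)
```

Marking the batch axis as dynamic is what allows the exported graph to be run on arbitrary batch sizes at inference time.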
+ + :param output_filepath: file path to output the model (without file suffix) + :param brain_name: Brain name of brain to be trained + """ + if not os.path.exists(output_filepath): + os.makedirs(output_filepath) + + onnx_output_path = f"{output_filepath}.onnx" + logger.info(f"Converting to {onnx_output_path}") + + torch.onnx.export( + self.policy.actor_critic, + self.dummy_input, + onnx_output_path, + opset_version=SerializationSettings.onnx_opset, + input_names=self.input_names, + output_names=self.output_names, + dynamic_axes=self.dynamic_axes, + ) + logger.info(f"Exported {onnx_output_path}") diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index bfe9c0ade3..526407b2e2 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -1,5 +1,4 @@ from typing import Callable, List, Dict, Tuple, Optional -import attr import abc import torch @@ -14,7 +13,7 @@ from mlagents.trainers.settings import NetworkSettings from mlagents.trainers.torch.utils import ModelUtils from mlagents.trainers.torch.decoders import ValueHeads -from mlagents.trainers.torch.layers import lstm_layer +from mlagents.trainers.torch.layers import LSTM ActivationFunction = Callable[[torch.Tensor], torch.Tensor] EncoderFunction = Callable[ @@ -51,9 +50,9 @@ def __init__( ) if self.use_lstm: - self.lstm = lstm_layer(self.h_size, self.m_size // 2, batch_first=True) + self.lstm = LSTM(self.h_size, self.m_size) else: - self.lstm = None + self.lstm = None # type: ignore def update_normalization(self, vec_inputs: List[torch.Tensor]) -> None: for vec_input, vec_enc in zip(vec_inputs, self.vector_encoders): @@ -64,6 +63,10 @@ def copy_normalization(self, other_network: "NetworkBody") -> None: for n1, n2 in zip(self.vector_encoders, other_network.vector_encoders): n1.copy_normalization(n2) + @property + def memory_size(self) -> int: + return self.lstm.memory_size if self.use_lstm else 0 + def forward( self, vec_inputs: List[torch.Tensor], @@ -72,42 +75,36 @@ def forward( memories: Optional[torch.Tensor] = None, sequence_length: int = 1, ) -> Tuple[torch.Tensor, torch.Tensor]: - vec_encodes = [] + encodes = [] for idx, encoder in enumerate(self.vector_encoders): vec_input = vec_inputs[idx] if actions is not None: hidden = encoder(vec_input, actions) else: hidden = encoder(vec_input) - vec_encodes.append(hidden) + encodes.append(hidden) - vis_encodes = [] for idx, encoder in enumerate(self.visual_encoders): vis_input = vis_inputs[idx] - vis_input = vis_input.permute([0, 3, 1, 2]) + if not torch.onnx.is_in_onnx_export(): + vis_input = vis_input.permute([0, 3, 1, 2]) hidden = encoder(vis_input) - vis_encodes.append(hidden) - - if len(vec_encodes) > 0 and len(vis_encodes) > 0: - vec_encodes_tensor = torch.stack(vec_encodes, dim=-1).sum(dim=-1) - vis_encodes_tensor = torch.stack(vis_encodes, dim=-1).sum(dim=-1) - encoding = torch.stack( - [vec_encodes_tensor, vis_encodes_tensor], dim=-1 - ).sum(dim=-1) - elif len(vec_encodes) > 0: - encoding = torch.stack(vec_encodes, dim=-1).sum(dim=-1) - elif len(vis_encodes) > 0: - encoding = torch.stack(vis_encodes, dim=-1).sum(dim=-1) - else: + encodes.append(hidden) + + if len(encodes) == 0: raise Exception("No valid inputs to network.") + # Constants don't work in Barracuda + encoding = encodes[0] + if len(encodes) > 1: + for _enc in encodes[1:]: + encoding += _enc + if self.use_lstm: # Resize to (batch, sequence length, encoding size) encoding = encoding.reshape([-1, sequence_length, 
self.h_size]) - memories = torch.split(memories, self.m_size // 2, dim=-1) encoding, memories = self.lstm(encoding, memories) encoding = encoding.reshape([-1, self.m_size // 2]) - memories = torch.cat(memories, dim=-1) return encoding, memories @@ -132,6 +129,10 @@ def __init__( encoding_size = network_settings.hidden_units self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream) + @property + def memory_size(self) -> int: + return self.network_body.memory_size + def forward( self, vec_inputs: List[torch.Tensor], @@ -192,8 +193,7 @@ def forward( vis_inputs: List[torch.Tensor], masks: Optional[torch.Tensor] = None, memories: Optional[torch.Tensor] = None, - sequence_length: int = 1, - ) -> Tuple[torch.Tensor, torch.Tensor, int, int, int, int]: + ) -> Tuple[torch.Tensor, int, int, int, int]: """ Forward pass of the Actor for inference. This is required for export to ONNX, and the inputs and outputs of this method should not be changed without a respective change @@ -242,6 +242,14 @@ def get_dist_and_value( """ pass + @abc.abstractproperty + def memory_size(self): + """ + Returns the size of the memory (same size used as input and output in the other + methods) used by this Actor. + """ + pass + class SimpleActor(nn.Module, Actor): def __init__( @@ -257,7 +265,6 @@ def __init__( self.act_type = act_type self.act_size = act_size self.version_number = torch.nn.Parameter(torch.Tensor([2.0])) - self.memory_size = torch.nn.Parameter(torch.Tensor([0])) self.is_continuous_int = torch.nn.Parameter( torch.Tensor([int(act_type == ActionType.CONTINUOUS)]) ) @@ -267,6 +274,7 @@ def __init__( self.encoding_size = network_settings.memory.memory_size // 2 else: self.encoding_size = network_settings.hidden_units + if self.act_type == ActionType.CONTINUOUS: self.distribution = GaussianDistribution( self.encoding_size, @@ -279,6 +287,10 @@ def __init__( self.encoding_size, act_size ) + @property + def memory_size(self) -> int: + return self.network_body.memory_size + def update_normalization(self, vector_obs: List[torch.Tensor]) -> None: self.network_body.update_normalization(vector_obs) @@ -313,21 +325,21 @@ def forward( vis_inputs: List[torch.Tensor], masks: Optional[torch.Tensor] = None, memories: Optional[torch.Tensor] = None, - sequence_length: int = 1, - ) -> Tuple[torch.Tensor, torch.Tensor, int, int, int, int]: + ) -> Tuple[torch.Tensor, int, int, int, int]: """ Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs. """ - dists, _ = self.get_dists( - vec_inputs, vis_inputs, masks, memories, sequence_length - ) + dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1) action_list = self.sample_action(dists) sampled_actions = torch.stack(action_list, dim=-1) + if self.act_type == ActionType.CONTINUOUS: + action_out = sampled_actions + else: + action_out = dists[0].all_log_prob() return ( - sampled_actions, - dists[0].pdf(sampled_actions), + action_out, self.version_number, - self.memory_size, + torch.Tensor([self.network_body.memory_size]), self.is_continuous_int, self.act_size_vector, ) @@ -401,29 +413,20 @@ def __init__( # Give the Actor only half the memories. Note we previously validate # that memory_size must be a multiple of 4. 
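The memory bookkeeping hinted at in the comment above can be pictured with a small sketch (sizes are illustrative): the combined memory vector is split into an actor half and a critic half, and each half is itself an LSTM (hidden, cell) pair, hence the multiple-of-4 requirement.

```python
import torch

total_memory_size = 16                      # e.g. 8 for the actor, 8 for the critic
memories = torch.randn(1, 1, total_memory_size)
actor_mem, critic_mem = torch.split(memories, total_memory_size // 2, dim=-1)
assert actor_mem.shape[-1] == critic_mem.shape[-1] == total_memory_size // 2
```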
self.use_lstm = network_settings.memory is not None - if network_settings.memory is not None: - self.half_mem_size = network_settings.memory.memory_size // 2 - new_memory_settings = attr.evolve( - network_settings.memory, memory_size=self.half_mem_size - ) - use_network_settings = attr.evolve( - network_settings, memory=new_memory_settings - ) - else: - use_network_settings = network_settings - self.half_mem_size = 0 super().__init__( observation_shapes, - use_network_settings, + network_settings, act_type, act_size, conditional_sigma, tanh_squash, ) self.stream_names = stream_names - self.critic = ValueNetwork( - stream_names, observation_shapes, use_network_settings - ) + self.critic = ValueNetwork(stream_names, observation_shapes, network_settings) + + @property + def memory_size(self) -> int: + return self.network_body.memory_size + self.critic.memory_size def critic_pass( self, @@ -435,7 +438,7 @@ def critic_pass( actor_mem, critic_mem = None, None if self.use_lstm: # Use only the back half of memories for critic - actor_mem, critic_mem = torch.split(memories, self.half_mem_size, -1) + actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1) value_outputs, critic_mem_out = self.critic( vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length ) @@ -456,7 +459,7 @@ def get_dist_and_value( ) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]: if self.use_lstm: # Use only the back half of memories for critic and actor - actor_mem, critic_mem = torch.split(memories, self.half_mem_size, dim=-1) + actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1) else: critic_mem = None actor_mem = None @@ -480,10 +483,18 @@ def get_dist_and_value( class GlobalSteps(nn.Module): def __init__(self): super().__init__() - self.global_step = torch.Tensor([0]) + self.__global_step = nn.Parameter(torch.Tensor([0]), requires_grad=False) + + @property + def current_step(self): + return int(self.__global_step.item()) + + @current_step.setter + def current_step(self, value): + self.__global_step[:] = value def increment(self, value): - self.global_step += value + self.__global_step += value class LearningRate(nn.Module): diff --git a/ml-agents/mlagents/trainers/torch/utils.py b/ml-agents/mlagents/trainers/torch/utils.py index 0e855ea79b..570fa7b7bf 100644 --- a/ml-agents/mlagents/trainers/torch/utils.py +++ b/ml-agents/mlagents/trainers/torch/utils.py @@ -293,4 +293,6 @@ def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: :param tensor: Tensor which needs mean computation. :param masks: Boolean tensor of masks with same dimension as tensor. 
""" - return (tensor * masks).sum() / torch.clamp(masks.float().sum(), min=1.0) + return (tensor.T * masks).sum() / torch.clamp( + (torch.ones_like(tensor.T) * masks).float().sum(), min=1.0 + ) diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index 26af50f391..f40fe32959 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -13,14 +13,25 @@ from mlagents.trainers.optimizer import Optimizer from mlagents.trainers.buffer import AgentBuffer from mlagents.trainers.trainer import Trainer -from mlagents.trainers.components.reward_signals import RewardSignalResult +from mlagents.trainers.components.reward_signals import RewardSignalResult, RewardSignal from mlagents_envs.timers import hierarchical_timer +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.policy.policy import Policy +from mlagents.trainers.policy.tf_policy import TFPolicy +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers from mlagents.trainers.agent_processor import AgentManagerQueue from mlagents.trainers.trajectory import Trajectory -from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.settings import TrainerSettings, FrameworkType from mlagents.trainers.stats import StatsPropertyType from mlagents.trainers.saver.saver import BaseSaver from mlagents.trainers.saver.tf_saver import TFSaver +from mlagents.trainers.exception import UnityTrainerException + +try: + from mlagents.trainers.policy.torch_policy import TorchPolicy + from mlagents.trainers.saver.torch_saver import TorchSaver +except ModuleNotFoundError: + TorchPolicy = None # type: ignore RewardSignalResults = Dict[str, RewardSignalResult] @@ -45,10 +56,13 @@ def __init__(self, *args, **kwargs): self._stats_reporter.add_property( StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict() ) + self.framework = self.trainer_settings.framework + logger.debug(f"Using framework {self.framework.value}") + self._next_save_step = 0 self._next_summary_step = 0 self.saver = self.create_saver( - self.trainer_settings, self.artifact_path, self.load + self.framework, self.trainer_settings, self.artifact_path, self.load ) def end_episode(self) -> None: @@ -60,13 +74,6 @@ def end_episode(self) -> None: for agent_id in rewards: rewards[agent_id] = 0 - @staticmethod - def create_saver( - trainer_settings: TrainerSettings, model_path: str, load: bool - ) -> BaseSaver: - saver = TFSaver(trainer_settings, model_path, load) - return saver - def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None: for name, rewards in self.collected_rewards.items(): if name == "environment": @@ -79,9 +86,16 @@ def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None self.reward_buffer.appendleft(rewards.get(agent_id, 0)) rewards[agent_id] = 0 else: - self.stats_reporter.add_stat( - optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0) - ) + if isinstance(optimizer.reward_signals[name], RewardSignal): + self.stats_reporter.add_stat( + optimizer.reward_signals[name].stat_name, + rewards.get(agent_id, 0), + ) + else: + self.stats_reporter.add_stat( + f"Policy/{optimizer.reward_signals[name].name.capitalize()} Reward", + rewards.get(agent_id, 0), + ) rewards[agent_id] = 0 def _clear_update_buffer(self) -> None: @@ -98,6 +112,58 @@ def _is_ready_update(self): """ return False + def create_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + 
behavior_spec: BehaviorSpec, + create_graph: bool = False, + ) -> Policy: + if self.framework == FrameworkType.PYTORCH and TorchPolicy is None: + raise UnityTrainerException( + "To use the experimental PyTorch backend, install the PyTorch Python package first." + ) + elif self.framework == FrameworkType.PYTORCH: + return self.create_torch_policy(parsed_behavior_id, behavior_spec) + else: + return self.create_tf_policy( + parsed_behavior_id, behavior_spec, create_graph=create_graph + ) + + @abc.abstractmethod + def create_torch_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Create a Policy object that uses the PyTorch backend. + """ + pass + + @abc.abstractmethod + def create_tf_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, + ) -> TFPolicy: + """ + Create a Policy object that uses the TensorFlow backend. + """ + pass + + @staticmethod + def create_saver( + framework: str, trainer_settings: TrainerSettings, model_path: str, load: bool + ) -> BaseSaver: + if framework == FrameworkType.PYTORCH: + saver = TorchSaver( # type: ignore + trainer_settings, model_path, load + ) + else: + saver = TFSaver( # type: ignore + trainer_settings, model_path, load + ) + return saver + def _policy_mean_reward(self) -> Optional[float]: """ Returns the mean episode reward for the current policy. """ rewards = self.cumulative_returns_since_policy_update @@ -137,11 +203,12 @@ def save_model(self) -> None: logger.warning( "Trainer has multiple policies, but default behavior only saves the first." ) - model_checkpoint = self._checkpoint() + elif n_policies == 0: + logger.warning("Trainer has no policies, not saving anything.") + return - # Copy the checkpointed model files to the final output location + model_checkpoint = self._checkpoint() self.saver.copy_final_model(model_checkpoint.file_path) - final_checkpoint = attr.evolve( model_checkpoint, file_path=f"{self.saver.model_path}.nn" ) diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py index a08b2dd6ad..55ac5a9ef1 100644 --- a/ml-agents/mlagents/trainers/trainer/trainer.py +++ b/ml-agents/mlagents/trainers/trainer/trainer.py @@ -125,7 +125,10 @@ def end_episode(self): @abc.abstractmethod def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: """ Creates policy diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py index 870ce7a813..550a514a1d 100644 --- a/ml-agents/mlagents/trainers/trainer_controller.py +++ b/ml-agents/mlagents/trainers/trainer_controller.py @@ -30,6 +30,11 @@ from mlagents.trainers.agent_processor import AgentManager from mlagents.tf_utils.globals import get_rank +try: + import torch +except ModuleNotFoundError: + torch = None # type: ignore + class TrainerController: def __init__( @@ -66,6 +71,8 @@ def __init__( self.kill_trainers = False np.random.seed(training_seed) tf.set_random_seed(training_seed) + if torch is not None: + torch.manual_seed(training_seed) self.rank = get_rank() @timed
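Finally, the optional-PyTorch pattern used throughout these changes (guarded import, seeding every RNG, and a clear error when the PyTorch backend is requested but unavailable) can be condensed into a short sketch; the names and structure below are simplified illustrations, not the actual trainer code:

```python
import numpy as np

# Import torch only if it is installed; everything else degrades gracefully.
try:
    import torch
except ModuleNotFoundError:
    torch = None  # type: ignore


def seed_everything(seed: int) -> None:
    # Seed every RNG the training run depends on.
    np.random.seed(seed)
    if torch is not None:
        torch.manual_seed(seed)


def select_backend(framework: str) -> str:
    # Fail early with an actionable message if the PyTorch backend was requested.
    if framework == "pytorch" and torch is None:
        raise RuntimeError(
            "To use the experimental PyTorch backend, install the PyTorch Python package first."
        )
    return "torch" if framework == "pytorch" else "tensorflow"


seed_everything(1337)
print(select_backend("tensorflow"))
```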