From 4cde65dc98ad362e85b26cda63d9fd05bc1e7071 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 1 Jul 2020 16:25:29 -0700
Subject: [PATCH] Fix for discrete actions

Change to FloatTensor vs. DoubleTensor
---
 ml-agents/mlagents/trainers/distributions_torch.py | 2 +-
 ml-agents/mlagents/trainers/policy/policy.py       | 2 +-
 ml-agents/mlagents/trainers/policy/torch_policy.py | 7 +++----
 ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/ml-agents/mlagents/trainers/distributions_torch.py b/ml-agents/mlagents/trainers/distributions_torch.py
index a8616f65d6..29c683df63 100644
--- a/ml-agents/mlagents/trainers/distributions_torch.py
+++ b/ml-agents/mlagents/trainers/distributions_torch.py
@@ -42,7 +42,7 @@ def sample(self):
         return torch.multinomial(self.probs, 1)

     def pdf(self, value):
-        return torch.diag(self.probs.T[value.flatten()])
+        return torch.diag(self.probs.T[value.flatten().long()])

     def log_prob(self, value):
         return torch.log(self.pdf(value))
diff --git a/ml-agents/mlagents/trainers/policy/policy.py b/ml-agents/mlagents/trainers/policy/policy.py
index b1a9d460b2..e3830d5472 100644
--- a/ml-agents/mlagents/trainers/policy/policy.py
+++ b/ml-agents/mlagents/trainers/policy/policy.py
@@ -32,7 +32,7 @@ def __init__(
         self.num_branches = len(self.brain.vector_action_space_size)
         self.previous_action_dict: Dict[str, np.array] = {}
         self.memory_dict: Dict[str, np.ndarray] = {}
-        self.normalize = trainer_settings
+        self.normalize = trainer_settings.network_settings.normalize
         self.use_recurrent = trainer_settings.network_settings.memory is not None
         self.model_path = trainer_settings.init_path
diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py
index d140a6e1d5..99c7949362 100644
--- a/ml-agents/mlagents/trainers/policy/torch_policy.py
+++ b/ml-agents/mlagents/trainers/policy/torch_policy.py
@@ -94,7 +94,7 @@ def __init__(
         self.inference_dict: Dict[str, tf.Tensor] = {}
         self.update_dict: Dict[str, tf.Tensor] = {}
         # TF defaults to 32-bit, so we use the same here.
-        torch.set_default_tensor_type(torch.DoubleTensor)
+        torch.set_default_tensor_type(torch.FloatTensor)

         reward_signal_configs = trainer_settings.reward_signals
         reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
@@ -152,8 +152,7 @@ def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1)
         actions = self.actor_critic.sample_action(dists)
         log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)
-        if self.act_type == "continuous":
-            actions.squeeze_(-1)
+        actions = torch.squeeze(actions)

         return actions, log_probs, entropies, value_heads, memories
@@ -252,7 +251,7 @@ def export_model(self, step=0):
         fake_vec_obs = [torch.zeros([1] + [self.brain.vector_observation_space_size])]
         fake_vis_obs = [torch.zeros([1] + [84, 84, 3])]
         fake_masks = torch.ones([1] + self.actor_critic.act_size)
-        fake_memories = torch.zeros([1] + [self.m_size])
+        # fake_memories = torch.zeros([1] + [self.m_size])
         export_path = "./model-" + str(step) + ".onnx"
         output_names = ["action", "action_probs"]
         input_names = ["vector_observation", "action_mask"]
diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
index 171facc52a..28fc2e8598 100644
--- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
@@ -99,7 +99,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         if self.policy.use_continuous_act:
             actions = torch.as_tensor(batch["actions"]).unsqueeze(-1)
         else:
-            actions = torch.as_tensor(batch["actions"])
+            actions = torch.as_tensor(batch["actions"], dtype=torch.long)

         memories = [
             torch.as_tensor(batch["memory"][i])
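
Note (outside the patch): a minimal sketch of why the discrete branch needs the explicit casts to long. Actions read back from the agent buffer arrive as float32, and PyTorch refuses float tensors as indices, which is what the .long() in pdf and the dtype=torch.long in the PPO update address. The shapes and values below are made up for illustration only; only the indexing pattern mirrors the pdf change above.

    import torch

    # 3 agents, 4 discrete actions; probs stands in for self.probs in pdf
    probs = torch.softmax(torch.randn(3, 4), dim=1)
    # actions as they come back from the buffer: float32, one index per agent
    actions = torch.as_tensor([[1.0], [3.0], [0.0]])

    # probs.T[actions.flatten()] raises an IndexError (indices must be
    # long/byte/bool tensors). Casting first makes it a valid gather:
    per_agent_prob = torch.diag(probs.T[actions.flatten().long()])
    print(per_agent_prob)  # probability each agent assigned to its chosen action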
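
Note (outside the patch): a minimal sketch of the dtype mismatch the FloatTensor default avoids. With torch.DoubleTensor as the default, module parameters are created as float64, while observations converted from the trainer's float32 numpy buffers via torch.as_tensor stay float32, so the forward pass fails with a Double-vs-Float RuntimeError. The layer sizes below are arbitrary and only for illustration.

    import numpy as np
    import torch

    obs = torch.as_tensor(np.zeros((1, 8), dtype=np.float32))  # stays float32

    torch.set_default_tensor_type(torch.DoubleTensor)
    layer = torch.nn.Linear(8, 2)   # parameters created as float64
    # layer(obs) -> RuntimeError: dtype mismatch (Double weights vs. Float input)

    torch.set_default_tensor_type(torch.FloatTensor)
    layer = torch.nn.Linear(8, 2)   # parameters created as float32, matching TF's 32-bit default
    out = layer(obs)                # runs as expected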