From 4cde65dc98ad362e85b26cda63d9fd05bc1e7071 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 1 Jul 2020 16:25:29 -0700
Subject: [PATCH] Fix for discrete actions

Change to FloatTensor vs. DoubleTensor
---
 ml-agents/mlagents/trainers/distributions_torch.py | 2 +-
 ml-agents/mlagents/trainers/policy/policy.py       | 2 +-
 ml-agents/mlagents/trainers/policy/torch_policy.py | 7 +++----
 ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/ml-agents/mlagents/trainers/distributions_torch.py b/ml-agents/mlagents/trainers/distributions_torch.py
index a8616f65d6..29c683df63 100644
--- a/ml-agents/mlagents/trainers/distributions_torch.py
+++ b/ml-agents/mlagents/trainers/distributions_torch.py
@@ -42,7 +42,7 @@ def sample(self):
         return torch.multinomial(self.probs, 1)

     def pdf(self, value):
-        return torch.diag(self.probs.T[value.flatten()])
+        return torch.diag(self.probs.T[value.flatten().long()])

     def log_prob(self, value):
         return torch.log(self.pdf(value))
diff --git a/ml-agents/mlagents/trainers/policy/policy.py b/ml-agents/mlagents/trainers/policy/policy.py
index b1a9d460b2..e3830d5472 100644
--- a/ml-agents/mlagents/trainers/policy/policy.py
+++ b/ml-agents/mlagents/trainers/policy/policy.py
@@ -32,7 +32,7 @@ def __init__(
         self.num_branches = len(self.brain.vector_action_space_size)
         self.previous_action_dict: Dict[str, np.array] = {}
         self.memory_dict: Dict[str, np.ndarray] = {}
-        self.normalize = trainer_settings
+        self.normalize = trainer_settings.network_settings.normalize
         self.use_recurrent = trainer_settings.network_settings.memory is not None
         self.model_path = trainer_settings.init_path
diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py
index d140a6e1d5..99c7949362 100644
--- a/ml-agents/mlagents/trainers/policy/torch_policy.py
+++ b/ml-agents/mlagents/trainers/policy/torch_policy.py
@@ -94,7 +94,7 @@ def __init__(
         self.inference_dict: Dict[str, tf.Tensor] = {}
         self.update_dict: Dict[str, tf.Tensor] = {}
         # TF defaults to 32-bit, so we use the same here.
-        torch.set_default_tensor_type(torch.DoubleTensor)
+        torch.set_default_tensor_type(torch.FloatTensor)

         reward_signal_configs = trainer_settings.reward_signals
         reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
@@ -152,8 +152,7 @@ def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1)
         actions = self.actor_critic.sample_action(dists)
         log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)
-        if self.act_type == "continuous":
-            actions.squeeze_(-1)
+        actions = torch.squeeze(actions)

         return actions, log_probs, entropies, value_heads, memories
@@ -252,7 +251,7 @@ def export_model(self, step=0):
         fake_vec_obs = [torch.zeros([1] + [self.brain.vector_observation_space_size])]
         fake_vis_obs = [torch.zeros([1] + [84, 84, 3])]
         fake_masks = torch.ones([1] + self.actor_critic.act_size)
-        fake_memories = torch.zeros([1] + [self.m_size])
+        # fake_memories = torch.zeros([1] + [self.m_size])
         export_path = "./model-" + str(step) + ".onnx"
         output_names = ["action", "action_probs"]
         input_names = ["vector_observation", "action_mask"]
diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
index 171facc52a..28fc2e8598 100644
--- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
@@ -99,7 +99,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         if self.policy.use_continuous_act:
             actions = torch.as_tensor(batch["actions"]).unsqueeze(-1)
         else:
-            actions = torch.as_tensor(batch["actions"])
+            actions = torch.as_tensor(batch["actions"], dtype=torch.long)

         memories = [
             torch.as_tensor(batch["memory"][i])
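
Note (outside the patch): a minimal sketch of why the discrete branch needs the explicit casts to long. Actions read back from the agent buffer arrive as float32, and PyTorch refuses float tensors as indices, which is what the .long() in pdf and the dtype=torch.long in the PPO update address. The shapes and values below are made up for illustration only; only the indexing pattern mirrors the pdf change above.

    import torch

    # 3 agents, 4 discrete actions; probs stands in for self.probs in pdf
    probs = torch.softmax(torch.randn(3, 4), dim=1)
    # actions as they come back from the buffer: float32, one index per agent
    actions = torch.as_tensor([[1.0], [3.0], [0.0]])

    # probs.T[actions.flatten()] raises an IndexError (indices must be
    # long/byte/bool tensors). Casting first makes it a valid gather:
    per_agent_prob = torch.diag(probs.T[actions.flatten().long()])
    print(per_agent_prob)  # probability each agent assigned to its chosen action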
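
Note (outside the patch): a minimal sketch of the dtype mismatch the FloatTensor default avoids. With torch.DoubleTensor as the default, module parameters are created as float64, while observations converted from the trainer's float32 numpy buffers via torch.as_tensor stay float32, so the forward pass fails with a Double-vs-Float RuntimeError. The layer sizes below are arbitrary and only for illustration.

    import numpy as np
    import torch

    obs = torch.as_tensor(np.zeros((1, 8), dtype=np.float32))  # stays float32

    torch.set_default_tensor_type(torch.DoubleTensor)
    layer = torch.nn.Linear(8, 2)   # parameters created as float64
    # layer(obs) -> RuntimeError: dtype mismatch (Double weights vs. Float input)

    torch.set_default_tensor_type(torch.FloatTensor)
    layer = torch.nn.Linear(8, 2)   # parameters created as float32, matching TF's 32-bit default
    out = layer(obs)                # runs as expected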