From 88b9e40343f85d8687ef10f04b5ff3507ca587e1 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 10:00:51 -0700 Subject: [PATCH 1/9] merge add fire --- config/ppo/Tennis.yaml | 1 + ml-agents/mlagents/trainers/policy/torch_policy.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/config/ppo/Tennis.yaml b/config/ppo/Tennis.yaml index ed73a6d83a..e6b1cafe84 100644 --- a/config/ppo/Tennis.yaml +++ b/config/ppo/Tennis.yaml @@ -1,5 +1,6 @@ behaviors: Tennis: + framework: pytorch trainer_type: ppo hyperparameters: batch_size: 2048 diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 5fa135f6a2..49f057d0ff 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -249,7 +249,7 @@ def increment_step(self, n_steps): return self.get_current_step() def load_weights(self, values: List[np.ndarray]) -> None: - pass + self.actor_critic.load_state_dict(values) def init_load_weights(self) -> None: pass From c180bed0f10995f0ce7a8aec99f17e9caed990f8 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:08:57 -0700 Subject: [PATCH 2/9] ghost trainer tests --- ml-agents/mlagents/trainers/ghost/trainer.py | 12 +- .../mlagents/trainers/policy/tf_policy.py | 2 + .../mlagents/trainers/policy/torch_policy.py | 2 +- ml-agents/mlagents/trainers/ppo/trainer.py | 6 +- .../trainers/tests/torch/test_ghost.py | 177 ++++++++++++++++++ .../mlagents/trainers/trainer/rl_trainer.py | 14 +- .../mlagents/trainers/trainer/trainer.py | 5 +- 7 files changed, 207 insertions(+), 11 deletions(-) create mode 100644 ml-agents/mlagents/trainers/tests/torch/test_ghost.py diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index 849deeae5d..50d9aad74c 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -304,7 +304,10 @@ def save_model(self) -> None: self.trainer.save_model() def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: """ Creates policy with the wrapped trainer's create_policy function @@ -313,10 +316,10 @@ def create_policy( team are grouped. All policies associated with this team are added to the wrapped trainer to be trained. 
""" - policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec) - policy.create_tf_graph() + policy = self.trainer.create_policy( + parsed_behavior_id, behavior_spec, create_graph=True + ) self.trainer.saver.initialize_or_load(policy) - policy.init_load_weights() team_id = parsed_behavior_id.team_id self.controller.subscribe_team_id(team_id, self) @@ -326,7 +329,6 @@ def create_policy( parsed_behavior_id, behavior_spec ) self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy) - internal_trainer_policy.init_load_weights() self.current_policy_snapshot[ parsed_behavior_id.brain_name ] = internal_trainer_policy.get_weights() diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index 707023ab3b..ec8e448ac0 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -138,6 +138,8 @@ def create_tf_graph(self) -> None: self.trainable_variables += tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm" ) # LSTMs need to be root scope for Barracuda export + # Create assignment ops for Ghost Trainer + self.init_load_weights() self.inference_dict = { "action": self.output, diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 49f057d0ff..582d154725 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -255,7 +255,7 @@ def init_load_weights(self) -> None: pass def get_weights(self) -> List[np.ndarray]: - return [] + return self.actor_critic.state_dict() def get_modules(self): return {"Policy": self.actor_critic, "global_step": self.global_step} diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index c16bc3439d..e49a83ccd3 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -217,7 +217,10 @@ def _update_policy(self): return True def create_tf_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> TFPolicy: """ Creates a PPO policy to trainers list of policies. 
@@ -229,6 +232,7 @@ def create_tf_policy( behavior_spec, self.trainer_settings, condition_sigma_on_obs=False, # Faster training for PPO + create_tf_graph=create_graph, ) return policy diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py new file mode 100644 index 0000000000..de720a1e1e --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py @@ -0,0 +1,177 @@ +import pytest + +import numpy as np + +from mlagents.trainers.ghost.trainer import GhostTrainer +from mlagents.trainers.ghost.controller import GhostController +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.ppo.trainer import PPOTrainer +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.tests import mock_brain as mb +from mlagents.trainers.tests.test_trajectory import make_fake_trajectory +from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings, FrameworkType + + +@pytest.fixture +def dummy_config(): + return TrainerSettings( + self_play=SelfPlaySettings(), framework=FrameworkType.PYTORCH + ) + + +VECTOR_ACTION_SPACE = 1 +VECTOR_OBS_SPACE = 8 +DISCRETE_ACTION_SPACE = [3, 3, 3, 2] +BUFFER_INIT_SAMPLES = 513 +NUM_AGENTS = 12 + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_load_and_set(dummy_config, use_discrete): + mock_specs = mb.setup_test_behavior_specs( + use_discrete, + False, + vector_action_space=DISCRETE_ACTION_SPACE + if use_discrete + else VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + trainer_params = dummy_config + trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") + trainer.seed = 1 + policy = trainer.create_policy("test", mock_specs, create_graph=True) + trainer.seed = 20 # otherwise graphs are the same + to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True) + + weights = policy.get_weights() + load_weights = to_load_policy.get_weights() + try: + for w, lw in zip(weights, load_weights): + np.testing.assert_array_equal(w, lw) + except AssertionError: + pass + + to_load_policy.load_weights(weights) + load_weights = to_load_policy.get_weights() + + for w, lw in zip(weights, load_weights): + np.testing.assert_array_equal(w, lw) + + +def test_process_trajectory(dummy_config): + mock_specs = mb.setup_test_behavior_specs( + True, False, vector_action_space=[2], vector_obs_space=1 + ) + behavior_id_team0 = "test_brain?team=0" + behavior_id_team1 = "test_brain?team=1" + brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name + + ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") + controller = GhostController(100) + trainer = GhostTrainer( + ppo_trainer, brain_name, controller, 0, dummy_config, True, "0" + ) + + # first policy encountered becomes policy trained by wrapped PPO + parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0) + policy = trainer.create_policy(parsed_behavior_id0, mock_specs) + trainer.add_policy(parsed_behavior_id0, policy) + trajectory_queue0 = AgentManagerQueue(behavior_id_team0) + trainer.subscribe_trajectory_queue(trajectory_queue0) + + # Ghost trainer should ignore this queue because off policy + parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1) + policy = trainer.create_policy(parsed_behavior_id1, mock_specs) + trainer.add_policy(parsed_behavior_id1, policy) + trajectory_queue1 = AgentManagerQueue(behavior_id_team1) + 
trainer.subscribe_trajectory_queue(trajectory_queue1) + + time_horizon = 15 + trajectory = make_fake_trajectory( + length=time_horizon, + max_step_complete=True, + observation_shapes=[(1,)], + action_space=[2], + ) + trajectory_queue0.put(trajectory) + trainer.advance() + + # Check that trainer put trajectory in update buffer + assert trainer.trainer.update_buffer.num_experiences == 15 + + trajectory_queue1.put(trajectory) + trainer.advance() + + # Check that ghost trainer ignored off policy queue + assert trainer.trainer.update_buffer.num_experiences == 15 + # Check that it emptied the queue + assert trajectory_queue1.empty() + + +def test_publish_queue(dummy_config): + mock_specs = mb.setup_test_behavior_specs( + True, False, vector_action_space=[1], vector_obs_space=8 + ) + + behavior_id_team0 = "test_brain?team=0" + behavior_id_team1 = "test_brain?team=1" + + parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0) + + brain_name = parsed_behavior_id0.brain_name + + ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") + controller = GhostController(100) + trainer = GhostTrainer( + ppo_trainer, brain_name, controller, 0, dummy_config, True, "0" + ) + + # First policy encountered becomes policy trained by wrapped PPO + # This queue should remain empty after swap snapshot + policy = trainer.create_policy(parsed_behavior_id0, mock_specs) + trainer.add_policy(parsed_behavior_id0, policy) + policy_queue0 = AgentManagerQueue(behavior_id_team0) + trainer.publish_policy_queue(policy_queue0) + + # Ghost trainer should use this queue for ghost policy swap + parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1) + policy = trainer.create_policy(parsed_behavior_id1, mock_specs) + trainer.add_policy(parsed_behavior_id1, policy) + policy_queue1 = AgentManagerQueue(behavior_id_team1) + trainer.publish_policy_queue(policy_queue1) + + # check ghost trainer swap pushes to ghost queue and not trainer + assert policy_queue0.empty() and policy_queue1.empty() + trainer._swap_snapshots() + assert policy_queue0.empty() and not policy_queue1.empty() + # clear + policy_queue1.get_nowait() + + mock_specs = mb.setup_test_behavior_specs( + False, + False, + vector_action_space=VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs) + # Mock out reward signal eval + buffer["extrinsic_rewards"] = buffer["environment_rewards"] + buffer["extrinsic_returns"] = buffer["environment_rewards"] + buffer["extrinsic_value_estimates"] = buffer["environment_rewards"] + buffer["curiosity_rewards"] = buffer["environment_rewards"] + buffer["curiosity_returns"] = buffer["environment_rewards"] + buffer["curiosity_value_estimates"] = buffer["environment_rewards"] + buffer["advantages"] = buffer["environment_rewards"] + trainer.trainer.update_buffer = buffer + + # when ghost trainer advance and wrapped trainer buffers full + # the wrapped trainer pushes updated policy to correct queue + assert policy_queue0.empty() and policy_queue1.empty() + trainer.advance() + assert not policy_queue0.empty() and policy_queue1.empty() + + +if __name__ == "__main__": + pytest.main() diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index 7ae4f08c21..d920a43279 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -119,7 +119,10 @@ def _is_ready_update(self): return False def 
create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: if self.framework == FrameworkType.PYTORCH and TorchPolicy is None: raise UnityTrainerException( @@ -128,7 +131,9 @@ def create_policy( elif self.framework == FrameworkType.PYTORCH: return self.create_torch_policy(parsed_behavior_id, behavior_spec) else: - return self.create_tf_policy(parsed_behavior_id, behavior_spec) + return self.create_tf_policy( + parsed_behavior_id, behavior_spec, create_graph=create_graph + ) @abc.abstractmethod def create_torch_policy( @@ -141,7 +146,10 @@ def create_torch_policy( @abc.abstractmethod def create_tf_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> TFPolicy: """ Create a Policy object that uses the TensorFlow backend. diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py index a08b2dd6ad..55ac5a9ef1 100644 --- a/ml-agents/mlagents/trainers/trainer/trainer.py +++ b/ml-agents/mlagents/trainers/trainer/trainer.py @@ -125,7 +125,10 @@ def end_episode(self): @abc.abstractmethod def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: """ Creates policy From 16690d90ea0286d5cc0b8878ddc57cd401cdb5c2 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:18:10 -0700 Subject: [PATCH 3/9] fix tf policy for ghosts --- ml-agents/mlagents/trainers/policy/tf_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index ec8e448ac0..91a4ab61f2 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -139,7 +139,6 @@ def create_tf_graph(self) -> None: tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm" ) # LSTMs need to be root scope for Barracuda export # Create assignment ops for Ghost Trainer - self.init_load_weights() self.inference_dict = { "action": self.output, @@ -154,6 +153,7 @@ def create_tf_graph(self) -> None: # We do an initialize to make the Policy usable out of the box. 
If an optimizer is needed, # it will re-load the full graph self.initialize() + self.init_load_weights() def _create_encoder( self, From d04fe05988ea1103e82740c5512dbd95bde3590b Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:20:54 -0700 Subject: [PATCH 4/9] fix tf ghost tests --- ml-agents/mlagents/trainers/tests/test_ghost.py | 7 ++----- ml-agents/mlagents/trainers/tests/torch/test_ghost.py | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_ghost.py b/ml-agents/mlagents/trainers/tests/test_ghost.py index e72f573f36..acc9711830 100644 --- a/ml-agents/mlagents/trainers/tests/test_ghost.py +++ b/ml-agents/mlagents/trainers/tests/test_ghost.py @@ -38,12 +38,9 @@ def test_load_and_set(dummy_config, use_discrete): trainer_params = dummy_config trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") trainer.seed = 1 - policy = trainer.create_policy("test", mock_specs) - policy.create_tf_graph() + policy = trainer.create_policy("test", mock_specs, create_graph=True) trainer.seed = 20 # otherwise graphs are the same - to_load_policy = trainer.create_policy("test", mock_specs) - to_load_policy.create_tf_graph() - to_load_policy.init_load_weights() + to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True) weights = policy.get_weights() load_weights = to_load_policy.get_weights() diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py index de720a1e1e..06f0666cc8 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py @@ -40,9 +40,9 @@ def test_load_and_set(dummy_config, use_discrete): trainer_params = dummy_config trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") trainer.seed = 1 - policy = trainer.create_policy("test", mock_specs, create_graph=True) + policy = trainer.create_policy("test", mock_specs) trainer.seed = 20 # otherwise graphs are the same - to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True) + to_load_policy = trainer.create_policy("test", mock_specs) weights = policy.get_weights() load_weights = to_load_policy.get_weights() From d9640857483092230e722295ac134a24f939aa3f Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:22:51 -0700 Subject: [PATCH 5/9] revert tennis config --- config/ppo/Tennis.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/config/ppo/Tennis.yaml b/config/ppo/Tennis.yaml index e6b1cafe84..ed73a6d83a 100644 --- a/config/ppo/Tennis.yaml +++ b/config/ppo/Tennis.yaml @@ -1,6 +1,5 @@ behaviors: Tennis: - framework: pytorch trainer_type: ppo hyperparameters: batch_size: 2048 From 958836207d40de3aed5d0148aecbaf35c260738c Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:24:01 -0700 Subject: [PATCH 6/9] move tf policy comment --- ml-agents/mlagents/trainers/policy/tf_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index 91a4ab61f2..47789d1e92 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -138,7 +138,6 @@ def create_tf_graph(self) -> None: self.trainable_variables += tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm" ) # LSTMs need to be root scope for Barracuda export - # Create assignment ops for Ghost Trainer 
self.inference_dict = { "action": self.output, @@ -153,6 +152,7 @@ def create_tf_graph(self) -> None: # We do an initialize to make the Policy usable out of the box. If an optimizer is needed, # it will re-load the full graph self.initialize() + # Create assignment ops for Ghost Trainer self.init_load_weights() def _create_encoder( From b2da1090ebca8510017fb3a7c76eb1a92b31757d Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:33:28 -0700 Subject: [PATCH 7/9] fix sac precommit --- ml-agents/mlagents/trainers/sac/trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index e342ad4b03..11cc6762c5 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -228,7 +228,10 @@ def maybe_load_replay_buffer(self): ) def create_tf_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> TFPolicy: policy = TFPolicy( self.seed, @@ -236,7 +239,7 @@ def create_tf_policy( self.trainer_settings, tanh_squash=True, reparameterize=True, - create_tf_graph=False, + create_tf_graph=create_graph, ) self.maybe_load_replay_buffer() return policy From ef4a5a021c1f22ed610b29c8dcad1e85f0c59161 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:50:06 -0700 Subject: [PATCH 8/9] added comments --- ml-agents/mlagents/trainers/ppo/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index e49a83ccd3..a9ca897fbe 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -225,6 +225,7 @@ def create_tf_policy( """ Creates a PPO policy to trainers list of policies. :param behavior_spec: specifications for policy construction + :param create_graph: whether to create the graph when policy is constructed :return policy """ policy = TFPolicy( From 453e1aa317ea2bf4324c67fef9d835b9646b43b7 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Tue, 18 Aug 2020 10:33:40 -0700 Subject: [PATCH 9/9] return copy of state_dict --- ml-agents/mlagents/trainers/policy/torch_policy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 582d154725..e2ed73b25f 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -1,6 +1,7 @@ from typing import Any, Dict, List import numpy as np import torch +import copy from mlagents.trainers.action_info import ActionInfo from mlagents.trainers.behavior_id_utils import get_global_agent_id @@ -255,7 +256,7 @@ def init_load_weights(self) -> None: pass def get_weights(self) -> List[np.ndarray]: - return self.actor_critic.state_dict() + return copy.deepcopy(self.actor_critic.state_dict()) def get_modules(self): return {"Policy": self.actor_critic, "global_step": self.global_step}
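
Reviewer note: the change that runs through PATCH 1/9 and PATCH 9/9 — implementing `TorchPolicy.load_weights` via `load_state_dict` and having `get_weights` return `copy.deepcopy(self.actor_critic.state_dict())` — matters for the ghost trainer because `state_dict()` hands back references to the live parameter tensors. A snapshot taken without the copy would silently drift as the wrapped trainer keeps updating the policy. The sketch below is standalone illustration only (a throwaway `nn.Linear`, not ML-Agents code) showing that aliasing behavior and the copy-then-restore pattern the patches rely on.

```python
# Minimal sketch (not ML-Agents code): why get_weights() returns a deep copy.
import copy
import torch

net = torch.nn.Linear(4, 2)

alias_snapshot = net.state_dict()                  # shares storage with live params
frozen_snapshot = copy.deepcopy(net.state_dict())  # what PATCH 9/9's get_weights returns

with torch.no_grad():
    net.weight.add_(1.0)  # simulate a training update on the live policy

print(torch.equal(alias_snapshot["weight"], net.weight))   # True  -> aliased snapshot drifted
print(torch.equal(frozen_snapshot["weight"], net.weight))  # False -> copied snapshot preserved

# Restoring a snapshot mirrors TorchPolicy.load_weights from PATCH 1/9:
net.load_state_dict(frozen_snapshot)
```

This is also why the TF side of the series moves `init_load_weights()` into `create_tf_graph()` after `initialize()`: the ghost trainer assumes every policy it creates can immediately serve `get_weights`/`load_weights` without the caller remembering extra setup steps.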