From 88b9e40343f85d8687ef10f04b5ff3507ca587e1 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 10:00:51 -0700 Subject: [PATCH 1/9] merge add fire --- config/ppo/Tennis.yaml | 1 + ml-agents/mlagents/trainers/policy/torch_policy.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/config/ppo/Tennis.yaml b/config/ppo/Tennis.yaml index ed73a6d83a..e6b1cafe84 100644 --- a/config/ppo/Tennis.yaml +++ b/config/ppo/Tennis.yaml @@ -1,5 +1,6 @@ behaviors: Tennis: + framework: pytorch trainer_type: ppo hyperparameters: batch_size: 2048 diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 5fa135f6a2..49f057d0ff 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -249,7 +249,7 @@ def increment_step(self, n_steps): return self.get_current_step() def load_weights(self, values: List[np.ndarray]) -> None: - pass + self.actor_critic.load_state_dict(values) def init_load_weights(self) -> None: pass From c180bed0f10995f0ce7a8aec99f17e9caed990f8 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:08:57 -0700 Subject: [PATCH 2/9] ghost trainer tests --- ml-agents/mlagents/trainers/ghost/trainer.py | 12 +- .../mlagents/trainers/policy/tf_policy.py | 2 + .../mlagents/trainers/policy/torch_policy.py | 2 +- ml-agents/mlagents/trainers/ppo/trainer.py | 6 +- .../trainers/tests/torch/test_ghost.py | 177 ++++++++++++++++++ .../mlagents/trainers/trainer/rl_trainer.py | 14 +- .../mlagents/trainers/trainer/trainer.py | 5 +- 7 files changed, 207 insertions(+), 11 deletions(-) create mode 100644 ml-agents/mlagents/trainers/tests/torch/test_ghost.py diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index 849deeae5d..50d9aad74c 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -304,7 +304,10 @@ def save_model(self) -> None: self.trainer.save_model() def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: """ Creates policy with the wrapped trainer's create_policy function @@ -313,10 +316,10 @@ def create_policy( team are grouped. All policies associated with this team are added to the wrapped trainer to be trained. 
""" - policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec) - policy.create_tf_graph() + policy = self.trainer.create_policy( + parsed_behavior_id, behavior_spec, create_graph=True + ) self.trainer.saver.initialize_or_load(policy) - policy.init_load_weights() team_id = parsed_behavior_id.team_id self.controller.subscribe_team_id(team_id, self) @@ -326,7 +329,6 @@ def create_policy( parsed_behavior_id, behavior_spec ) self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy) - internal_trainer_policy.init_load_weights() self.current_policy_snapshot[ parsed_behavior_id.brain_name ] = internal_trainer_policy.get_weights() diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index 707023ab3b..ec8e448ac0 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -138,6 +138,8 @@ def create_tf_graph(self) -> None: self.trainable_variables += tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm" ) # LSTMs need to be root scope for Barracuda export + # Create assignment ops for Ghost Trainer + self.init_load_weights() self.inference_dict = { "action": self.output, diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 49f057d0ff..582d154725 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -255,7 +255,7 @@ def init_load_weights(self) -> None: pass def get_weights(self) -> List[np.ndarray]: - return [] + return self.actor_critic.state_dict() def get_modules(self): return {"Policy": self.actor_critic, "global_step": self.global_step} diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index c16bc3439d..e49a83ccd3 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -217,7 +217,10 @@ def _update_policy(self): return True def create_tf_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> TFPolicy: """ Creates a PPO policy to trainers list of policies. 
@@ -229,6 +232,7 @@ def create_tf_policy( behavior_spec, self.trainer_settings, condition_sigma_on_obs=False, # Faster training for PPO + create_tf_graph=create_graph, ) return policy diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py new file mode 100644 index 0000000000..de720a1e1e --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py @@ -0,0 +1,177 @@ +import pytest + +import numpy as np + +from mlagents.trainers.ghost.trainer import GhostTrainer +from mlagents.trainers.ghost.controller import GhostController +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.ppo.trainer import PPOTrainer +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.tests import mock_brain as mb +from mlagents.trainers.tests.test_trajectory import make_fake_trajectory +from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings, FrameworkType + + +@pytest.fixture +def dummy_config(): + return TrainerSettings( + self_play=SelfPlaySettings(), framework=FrameworkType.PYTORCH + ) + + +VECTOR_ACTION_SPACE = 1 +VECTOR_OBS_SPACE = 8 +DISCRETE_ACTION_SPACE = [3, 3, 3, 2] +BUFFER_INIT_SAMPLES = 513 +NUM_AGENTS = 12 + + +@pytest.mark.parametrize("use_discrete", [True, False]) +def test_load_and_set(dummy_config, use_discrete): + mock_specs = mb.setup_test_behavior_specs( + use_discrete, + False, + vector_action_space=DISCRETE_ACTION_SPACE + if use_discrete + else VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + trainer_params = dummy_config + trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") + trainer.seed = 1 + policy = trainer.create_policy("test", mock_specs, create_graph=True) + trainer.seed = 20 # otherwise graphs are the same + to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True) + + weights = policy.get_weights() + load_weights = to_load_policy.get_weights() + try: + for w, lw in zip(weights, load_weights): + np.testing.assert_array_equal(w, lw) + except AssertionError: + pass + + to_load_policy.load_weights(weights) + load_weights = to_load_policy.get_weights() + + for w, lw in zip(weights, load_weights): + np.testing.assert_array_equal(w, lw) + + +def test_process_trajectory(dummy_config): + mock_specs = mb.setup_test_behavior_specs( + True, False, vector_action_space=[2], vector_obs_space=1 + ) + behavior_id_team0 = "test_brain?team=0" + behavior_id_team1 = "test_brain?team=1" + brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name + + ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") + controller = GhostController(100) + trainer = GhostTrainer( + ppo_trainer, brain_name, controller, 0, dummy_config, True, "0" + ) + + # first policy encountered becomes policy trained by wrapped PPO + parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0) + policy = trainer.create_policy(parsed_behavior_id0, mock_specs) + trainer.add_policy(parsed_behavior_id0, policy) + trajectory_queue0 = AgentManagerQueue(behavior_id_team0) + trainer.subscribe_trajectory_queue(trajectory_queue0) + + # Ghost trainer should ignore this queue because off policy + parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1) + policy = trainer.create_policy(parsed_behavior_id1, mock_specs) + trainer.add_policy(parsed_behavior_id1, policy) + trajectory_queue1 = AgentManagerQueue(behavior_id_team1) + 
trainer.subscribe_trajectory_queue(trajectory_queue1) + + time_horizon = 15 + trajectory = make_fake_trajectory( + length=time_horizon, + max_step_complete=True, + observation_shapes=[(1,)], + action_space=[2], + ) + trajectory_queue0.put(trajectory) + trainer.advance() + + # Check that trainer put trajectory in update buffer + assert trainer.trainer.update_buffer.num_experiences == 15 + + trajectory_queue1.put(trajectory) + trainer.advance() + + # Check that ghost trainer ignored off policy queue + assert trainer.trainer.update_buffer.num_experiences == 15 + # Check that it emptied the queue + assert trajectory_queue1.empty() + + +def test_publish_queue(dummy_config): + mock_specs = mb.setup_test_behavior_specs( + True, False, vector_action_space=[1], vector_obs_space=8 + ) + + behavior_id_team0 = "test_brain?team=0" + behavior_id_team1 = "test_brain?team=1" + + parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0) + + brain_name = parsed_behavior_id0.brain_name + + ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") + controller = GhostController(100) + trainer = GhostTrainer( + ppo_trainer, brain_name, controller, 0, dummy_config, True, "0" + ) + + # First policy encountered becomes policy trained by wrapped PPO + # This queue should remain empty after swap snapshot + policy = trainer.create_policy(parsed_behavior_id0, mock_specs) + trainer.add_policy(parsed_behavior_id0, policy) + policy_queue0 = AgentManagerQueue(behavior_id_team0) + trainer.publish_policy_queue(policy_queue0) + + # Ghost trainer should use this queue for ghost policy swap + parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1) + policy = trainer.create_policy(parsed_behavior_id1, mock_specs) + trainer.add_policy(parsed_behavior_id1, policy) + policy_queue1 = AgentManagerQueue(behavior_id_team1) + trainer.publish_policy_queue(policy_queue1) + + # check ghost trainer swap pushes to ghost queue and not trainer + assert policy_queue0.empty() and policy_queue1.empty() + trainer._swap_snapshots() + assert policy_queue0.empty() and not policy_queue1.empty() + # clear + policy_queue1.get_nowait() + + mock_specs = mb.setup_test_behavior_specs( + False, + False, + vector_action_space=VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs) + # Mock out reward signal eval + buffer["extrinsic_rewards"] = buffer["environment_rewards"] + buffer["extrinsic_returns"] = buffer["environment_rewards"] + buffer["extrinsic_value_estimates"] = buffer["environment_rewards"] + buffer["curiosity_rewards"] = buffer["environment_rewards"] + buffer["curiosity_returns"] = buffer["environment_rewards"] + buffer["curiosity_value_estimates"] = buffer["environment_rewards"] + buffer["advantages"] = buffer["environment_rewards"] + trainer.trainer.update_buffer = buffer + + # when ghost trainer advance and wrapped trainer buffers full + # the wrapped trainer pushes updated policy to correct queue + assert policy_queue0.empty() and policy_queue1.empty() + trainer.advance() + assert not policy_queue0.empty() and policy_queue1.empty() + + +if __name__ == "__main__": + pytest.main() diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index 7ae4f08c21..d920a43279 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -119,7 +119,10 @@ def _is_ready_update(self): return False def 
create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: if self.framework == FrameworkType.PYTORCH and TorchPolicy is None: raise UnityTrainerException( @@ -128,7 +131,9 @@ def create_policy( elif self.framework == FrameworkType.PYTORCH: return self.create_torch_policy(parsed_behavior_id, behavior_spec) else: - return self.create_tf_policy(parsed_behavior_id, behavior_spec) + return self.create_tf_policy( + parsed_behavior_id, behavior_spec, create_graph=create_graph + ) @abc.abstractmethod def create_torch_policy( @@ -141,7 +146,10 @@ def create_torch_policy( @abc.abstractmethod def create_tf_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> TFPolicy: """ Create a Policy object that uses the TensorFlow backend. diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py index a08b2dd6ad..55ac5a9ef1 100644 --- a/ml-agents/mlagents/trainers/trainer/trainer.py +++ b/ml-agents/mlagents/trainers/trainer/trainer.py @@ -125,7 +125,10 @@ def end_episode(self): @abc.abstractmethod def create_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> Policy: """ Creates policy From 16690d90ea0286d5cc0b8878ddc57cd401cdb5c2 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:18:10 -0700 Subject: [PATCH 3/9] fix tf policy for ghosts --- ml-agents/mlagents/trainers/policy/tf_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index ec8e448ac0..91a4ab61f2 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -139,7 +139,6 @@ def create_tf_graph(self) -> None: tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm" ) # LSTMs need to be root scope for Barracuda export # Create assignment ops for Ghost Trainer - self.init_load_weights() self.inference_dict = { "action": self.output, @@ -154,6 +153,7 @@ def create_tf_graph(self) -> None: # We do an initialize to make the Policy usable out of the box. 
If an optimizer is needed, # it will re-load the full graph self.initialize() + self.init_load_weights() def _create_encoder( self, From d04fe05988ea1103e82740c5512dbd95bde3590b Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:20:54 -0700 Subject: [PATCH 4/9] fix tf ghost tests --- ml-agents/mlagents/trainers/tests/test_ghost.py | 7 ++----- ml-agents/mlagents/trainers/tests/torch/test_ghost.py | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_ghost.py b/ml-agents/mlagents/trainers/tests/test_ghost.py index e72f573f36..acc9711830 100644 --- a/ml-agents/mlagents/trainers/tests/test_ghost.py +++ b/ml-agents/mlagents/trainers/tests/test_ghost.py @@ -38,12 +38,9 @@ def test_load_and_set(dummy_config, use_discrete): trainer_params = dummy_config trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") trainer.seed = 1 - policy = trainer.create_policy("test", mock_specs) - policy.create_tf_graph() + policy = trainer.create_policy("test", mock_specs, create_graph=True) trainer.seed = 20 # otherwise graphs are the same - to_load_policy = trainer.create_policy("test", mock_specs) - to_load_policy.create_tf_graph() - to_load_policy.init_load_weights() + to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True) weights = policy.get_weights() load_weights = to_load_policy.get_weights() diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py index de720a1e1e..06f0666cc8 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py @@ -40,9 +40,9 @@ def test_load_and_set(dummy_config, use_discrete): trainer_params = dummy_config trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0") trainer.seed = 1 - policy = trainer.create_policy("test", mock_specs, create_graph=True) + policy = trainer.create_policy("test", mock_specs) trainer.seed = 20 # otherwise graphs are the same - to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True) + to_load_policy = trainer.create_policy("test", mock_specs) weights = policy.get_weights() load_weights = to_load_policy.get_weights() From d9640857483092230e722295ac134a24f939aa3f Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:22:51 -0700 Subject: [PATCH 5/9] revert tennis config --- config/ppo/Tennis.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/config/ppo/Tennis.yaml b/config/ppo/Tennis.yaml index e6b1cafe84..ed73a6d83a 100644 --- a/config/ppo/Tennis.yaml +++ b/config/ppo/Tennis.yaml @@ -1,6 +1,5 @@ behaviors: Tennis: - framework: pytorch trainer_type: ppo hyperparameters: batch_size: 2048 From 958836207d40de3aed5d0148aecbaf35c260738c Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:24:01 -0700 Subject: [PATCH 6/9] move tf policy comment --- ml-agents/mlagents/trainers/policy/tf_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index 91a4ab61f2..47789d1e92 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -138,7 +138,6 @@ def create_tf_graph(self) -> None: self.trainable_variables += tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm" ) # LSTMs need to be root scope for Barracuda export - # Create assignment ops for Ghost Trainer 
self.inference_dict = { "action": self.output, @@ -153,6 +152,7 @@ def create_tf_graph(self) -> None: # We do an initialize to make the Policy usable out of the box. If an optimizer is needed, # it will re-load the full graph self.initialize() + # Create assignment ops for Ghost Trainer self.init_load_weights() def _create_encoder( From b2da1090ebca8510017fb3a7c76eb1a92b31757d Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:33:28 -0700 Subject: [PATCH 7/9] fix sac precommit --- ml-agents/mlagents/trainers/sac/trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index e342ad4b03..11cc6762c5 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -228,7 +228,10 @@ def maybe_load_replay_buffer(self): ) def create_tf_policy( - self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + create_graph: bool = False, ) -> TFPolicy: policy = TFPolicy( self.seed, @@ -236,7 +239,7 @@ def create_tf_policy( self.trainer_settings, tanh_squash=True, reparameterize=True, - create_tf_graph=False, + create_tf_graph=create_graph, ) self.maybe_load_replay_buffer() return policy From ef4a5a021c1f22ed610b29c8dcad1e85f0c59161 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Mon, 17 Aug 2020 14:50:06 -0700 Subject: [PATCH 8/9] added comments --- ml-agents/mlagents/trainers/ppo/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index e49a83ccd3..a9ca897fbe 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -225,6 +225,7 @@ def create_tf_policy( """ Creates a PPO policy to trainers list of policies. :param behavior_spec: specifications for policy construction + :param create_graph: whether to create the graph when policy is constructed :return policy """ policy = TFPolicy( From 453e1aa317ea2bf4324c67fef9d835b9646b43b7 Mon Sep 17 00:00:00 2001 From: Andrew Cohen Date: Tue, 18 Aug 2020 10:33:40 -0700 Subject: [PATCH 9/9] return copy of state_dict --- ml-agents/mlagents/trainers/policy/torch_policy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 582d154725..e2ed73b25f 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -1,6 +1,7 @@ from typing import Any, Dict, List import numpy as np import torch +import copy from mlagents.trainers.action_info import ActionInfo from mlagents.trainers.behavior_id_utils import get_global_agent_id @@ -255,7 +256,7 @@ def init_load_weights(self) -> None: pass def get_weights(self) -> List[np.ndarray]: - return self.actor_critic.state_dict() + return copy.deepcopy(self.actor_critic.state_dict()) def get_modules(self): return {"Policy": self.actor_critic, "global_step": self.global_step}
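
Reviewer note: the change that runs through PATCH 1/9 and PATCH 9/9 — implementing `TorchPolicy.load_weights` via `load_state_dict` and having `get_weights` return `copy.deepcopy(self.actor_critic.state_dict())` — matters for the ghost trainer because `state_dict()` hands back references to the live parameter tensors. A snapshot taken without the copy would silently drift as the wrapped trainer keeps updating the policy. The sketch below is standalone illustration only (a throwaway `nn.Linear`, not ML-Agents code) showing that aliasing behavior and the copy-then-restore pattern the patches rely on.

```python
# Minimal sketch (not ML-Agents code): why get_weights() returns a deep copy.
import copy
import torch

net = torch.nn.Linear(4, 2)

alias_snapshot = net.state_dict()                  # shares storage with live params
frozen_snapshot = copy.deepcopy(net.state_dict())  # what PATCH 9/9's get_weights returns

with torch.no_grad():
    net.weight.add_(1.0)  # simulate a training update on the live policy

print(torch.equal(alias_snapshot["weight"], net.weight))   # True  -> aliased snapshot drifted
print(torch.equal(frozen_snapshot["weight"], net.weight))  # False -> copied snapshot preserved

# Restoring a snapshot mirrors TorchPolicy.load_weights from PATCH 1/9:
net.load_state_dict(frozen_snapshot)
```

This is also why the TF side of the series moves `init_load_weights()` into `create_tf_graph()` after `initialize()`: the ghost trainer assumes every policy it creates can immediately serve `get_weights`/`load_weights` without the caller remembering extra setup steps.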