From 818d37c52d0633bb06131ec65785e890ea24badc Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 21 Oct 2020 15:26:43 -0700
Subject: [PATCH 1/6] Always initialize non-wrapped policy

---
 ml-agents/mlagents/trainers/ghost/trainer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py
index a93b616df5..7d16551e8c 100644
--- a/ml-agents/mlagents/trainers/ghost/trainer.py
+++ b/ml-agents/mlagents/trainers/ghost/trainer.py
@@ -319,7 +319,9 @@ def create_policy(
         policy = self.trainer.create_policy(
             parsed_behavior_id, behavior_spec, create_graph=True
         )
-        self.trainer.model_saver.initialize_or_load(policy)
+        # Initialize policy. You never want to load it, since that should only
+        # be done to the wrapped policy
+        policy.initialize()
         team_id = parsed_behavior_id.team_id
         self.controller.subscribe_team_id(team_id, self)

From 4f4520be208654a14985a08578573ee2d62dcd18 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 21 Oct 2020 15:33:48 -0700
Subject: [PATCH 2/6] Load ghosted policy

---
 ml-agents/mlagents/trainers/ghost/trainer.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py
index 7d16551e8c..fccc9366eb 100644
--- a/ml-agents/mlagents/trainers/ghost/trainer.py
+++ b/ml-agents/mlagents/trainers/ghost/trainer.py
@@ -146,11 +146,11 @@ def get_step(self) -> int:
     @property
     def reward_buffer(self) -> Deque[float]:
         """
-            Returns the reward buffer. The reward buffer contains the cumulative
-            rewards of the most recent episodes completed by agents using this
-            trainer.
-            :return: the reward buffer.
-            """
+        Returns the reward buffer. The reward buffer contains the cumulative
+        rewards of the most recent episodes completed by agents using this
+        trainer.
+        :return: the reward buffer.
+        """
         return self.trainer.reward_buffer

     @property
@@ -339,6 +339,11 @@ def create_policy(
             self._save_snapshot()  # Need to save after trainer initializes policy
             self._learning_team = self.controller.get_learning_team
             self.wrapped_trainer_team = team_id
+        else:
+            # Load the weights of the ghost policy from the wrapped one
+            policy.load_weights(
+                self.trainer.get_policy(parsed_behavior_id).get_weights()
+            )
         return policy

     def add_policy(

From de996869690757c209b07e20292fb2d995bad68e Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 21 Oct 2020 15:36:04 -0700
Subject: [PATCH 3/6] Update changelog

---
 com.unity.ml-agents/CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index c2482f3ed7..abb2880b53 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -24,6 +24,7 @@ and this project adheres to
   if they are called recursively (for example, if they call `Agent.EndEpisode()`).
   Previously, this would result in an infinite loop and cause the editor to hang. (#4573)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed an issue where runs could not be resumed when using TensorFlow and Ghost Training. (#4593)

 ## [1.5.0-preview] - 2020-10-14
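Taken together, patches 1 and 2 move checkpoint loading to a single place: the policy handed to the wrapped trainer is still initialized or loaded from the checkpoint by that trainer, while every ghost copy is now always freshly initialized and then overwritten with the wrapped policy's weights. The snippet below is a runnable toy sketch of that invariant, not ml-agents code; ToyPolicy and its methods are illustrative stand-ins for the real policy interface.

    import numpy as np

    class ToyPolicy:
        """Stand-in for a policy; only the weight plumbing matters here."""

        def __init__(self, size: int = 4):
            self.weights = np.empty(size)

        def initialize(self) -> None:
            # Fresh random init, as the non-wrapped (ghost) copies now always do
            self.weights = np.random.standard_normal(self.weights.shape)

        def get_weights(self) -> np.ndarray:
            return np.copy(self.weights)

        def load_weights(self, weights: np.ndarray) -> None:
            self.weights = np.copy(weights)

    # On resume, only the wrapped policy is restored from the checkpoint
    # (a plain array stands in for the checkpoint here)...
    wrapped = ToyPolicy()
    wrapped.load_weights(np.array([1.0, 2.0, 3.0, 4.0]))

    # ...and each ghost copy is initialized, then synced from the wrapped one,
    # so both end up at the saved weights instead of a random re-init.
    ghost = ToyPolicy()
    ghost.initialize()
    ghost.load_weights(wrapped.get_weights())
    assert np.array_equal(ghost.weights, wrapped.weights)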
From 2cb25e26c2e8fe748924198a1ee857963daf01c5 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 21 Oct 2020 16:09:20 -0700
Subject: [PATCH 4/6] Resume test

---
 .../trainers/tests/tensorflow/test_ghost.py  | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
index acc9711830..11e64474b5 100644
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
@@ -57,6 +57,51 @@ def test_load_and_set(dummy_config, use_discrete):
         np.testing.assert_array_equal(w, lw)


+def test_resume(dummy_config, tmp_path):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[2], vector_obs_space=1
+    )
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
+
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+
+    trainer.save_model()
+
+    # Make a new trainer, check that the policies are the same
+    trainer2 = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
+    trainer2.add_policy(parsed_behavior_id0, policy)
+
+    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
+    trainer2.add_policy(parsed_behavior_id1, policy)
+
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1)
+    weights = trainer1_policy.get_weights()
+    weights2 = trainer2_policy.get_weights()
+    try:
+        for w, lw in zip(weights, weights2):
+            np.testing.assert_array_equal(w, lw)
+    except AssertionError:
+        pass
+
+
 def test_process_trajectory(dummy_config):
     mock_specs = mb.setup_test_behavior_specs(
         True, False, vector_action_space=[2], vector_obs_space=1
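Two notes on the test above. First, the "?team=" suffix is how a single brain name is fanned out into per-team behavior ids, and `BehaviorIdentifiers.from_name_behavior_id` splits it back apart. The runnable snippet below is a simplified re-implementation of that convention for illustration only, not the real class:

    def parse_behavior_id(name_behavior_id: str):
        # "test_brain?team=0" -> ("test_brain", 0); ids without a
        # "?team=" suffix fall back to team 0.
        name, _, attrs = name_behavior_id.partition("?")
        team_id = int(attrs[len("team="):]) if attrs.startswith("team=") else 0
        return name, team_id

    assert parse_behavior_id("test_brain?team=1") == ("test_brain", 1)
    assert parse_behavior_id("test_brain") == ("test_brain", 0)

Second, as committed here the `try`/`except AssertionError: pass` swallows exactly the failure the test is meant to detect, so this version of the test can never fail; patch 5 removes it.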
From b6f90857d371d91aae49971860cd88a928d6d4e4 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 21 Oct 2020 16:32:40 -0700
Subject: [PATCH 5/6] Add test

---
 .../trainers/tests/tensorflow/test_ghost.py  | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
index 11e64474b5..2e72303b56 100644
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
@@ -64,7 +64,7 @@ def test_resume(dummy_config, tmp_path):
     behavior_id_team0 = "test_brain?team=0"
     behavior_id_team1 = "test_brain?team=1"
     brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
-
+    tmp_path = tmp_path.as_posix()
     ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
     controller = GhostController(100)
     trainer = GhostTrainer(
@@ -82,8 +82,9 @@ def test_resume(dummy_config, tmp_path):
     trainer.save_model()

     # Make a new trainer, check that the policies are the same
+    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
     trainer2 = GhostTrainer(
-        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
     )
     policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
     trainer2.add_policy(parsed_behavior_id0, policy)
@@ -91,15 +92,13 @@ def test_resume(dummy_config, tmp_path):
     policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
     trainer2.add_policy(parsed_behavior_id1, policy)

-    trainer1_policy = trainer.get_policy(parsed_behavior_id1)
-    trainer2_policy = trainer2.get_policy(parsed_behavior_id1)
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
     weights = trainer1_policy.get_weights()
     weights2 = trainer2_policy.get_weights()
-    try:
-        for w, lw in zip(weights, weights2):
-            np.testing.assert_array_equal(w, lw)
-    except AssertionError:
-        pass
+
+    for w, lw in zip(weights, weights2):
+        np.testing.assert_array_equal(w, lw)


 def test_process_trajectory(dummy_config):
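Patch 5 makes the resume test meaningful: the second `GhostTrainer` now wraps a second `PPOTrainer` whose load flag is set (the second `True` in its argument list), so it actually restores the checkpoint written by `trainer.save_model()` rather than reusing the still-live `ppo_trainer`; `tmp_path.as_posix()` turns pytest's `pathlib.Path` into the string path the trainers expect; `get_policy` is keyed by the full behavior-id string, hence `parsed_behavior_id1.behavior_id`; and the assertion-swallowing `try`/`except` is gone. The save-then-resume round trip being exercised reduces to the runnable toy below, where a plain `.npy` file stands in for the real checkpoint machinery:

    import os
    import tempfile

    import numpy as np

    ckpt = os.path.join(tempfile.mkdtemp(), "policy.npy")

    # First run: train, then persist the weights (trainer.save_model()).
    weights = np.array([1.0, 2.0, 3.0])
    np.save(ckpt, weights)

    # Resumed run: a trainer constructed with load=True restores them.
    resumed = np.load(ckpt)
    np.testing.assert_array_equal(resumed, weights)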
From 58960627b881a205317fc5e416a801cc6c25865a Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 21 Oct 2020 16:34:47 -0700
Subject: [PATCH 6/6] Add torch test and fix torch.

---
 ml-agents/mlagents/trainers/ghost/trainer.py |  3 --
 .../trainers/tests/torch/test_ghost.py       | 44 +++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py
index fccc9366eb..b2db465c20 100644
--- a/ml-agents/mlagents/trainers/ghost/trainer.py
+++ b/ml-agents/mlagents/trainers/ghost/trainer.py
@@ -319,9 +319,6 @@ def create_policy(
         policy = self.trainer.create_policy(
             parsed_behavior_id, behavior_spec, create_graph=True
         )
-        # Initialize policy. You never want to load it, since that should only
-        # be done to the wrapped policy
-        policy.initialize()
         team_id = parsed_behavior_id.team_id
         self.controller.subscribe_team_id(team_id, self)

diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
index 06f0666cc8..0d96084d47 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
@@ -59,6 +59,50 @@ def test_load_and_set(dummy_config, use_discrete):
         np.testing.assert_array_equal(w, lw)


+def test_resume(dummy_config, tmp_path):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[2], vector_obs_space=1
+    )
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
+    tmp_path = tmp_path.as_posix()
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+
+    trainer.save_model()
+
+    # Make a new trainer, check that the policies are the same
+    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
+    trainer2 = GhostTrainer(
+        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
+    trainer2.add_policy(parsed_behavior_id0, policy)
+
+    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
+    trainer2.add_policy(parsed_behavior_id1, policy)
+
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
+    weights = trainer1_policy.get_weights()
+    weights2 = trainer2_policy.get_weights()
+
+    for w, lw in zip(weights, weights2):
+        np.testing.assert_array_equal(w, lw)
+
+
 def test_process_trajectory(dummy_config):
     mock_specs = mb.setup_test_behavior_specs(
         True, False, vector_action_space=[2], vector_obs_space=1
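The torch test is the TF `test_resume` with the patch-5 fixes already applied, and the `policy.initialize()` call added in patch 1 is dropped from the shared `GhostTrainer.create_policy`, presumably because it broke the torch path (the subject line says "fix torch") and the ghost copy's weights are overwritten from the wrapped policy either way. One caveat if you reuse the weight-comparison idiom from these tests: `zip` stops at the shorter list, so the loop silently assumes both policies expose the same number of weight tensors. A stricter, self-contained variant (illustrative, not part of the patch):

    import numpy as np

    def assert_same_weights(weights, weights2) -> None:
        # Guard against zip() truncating when the architectures differ
        assert len(weights) == len(weights2)
        for w, lw in zip(weights, weights2):
            np.testing.assert_array_equal(w, lw)

    assert_same_weights([np.zeros(2), np.ones(3)], [np.zeros(2), np.ones(3)])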