diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index c2482f3ed7..abb2880b53 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -24,6 +24,7 @@ and this project adheres to
   if they are called recursively (for example, if they call `Agent.EndEpisode()`).
   Previously, this would result in an infinite loop and cause the editor to hang. (#4573)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed an issue where runs could not be resumed when using TensorFlow and Ghost Training. (#4593)
 
 ## [1.5.0-preview] - 2020-10-14
 
diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py
index a93b616df5..b2db465c20 100644
--- a/ml-agents/mlagents/trainers/ghost/trainer.py
+++ b/ml-agents/mlagents/trainers/ghost/trainer.py
@@ -146,11 +146,11 @@ def get_step(self) -> int:
     @property
     def reward_buffer(self) -> Deque[float]:
         """
-    Returns the reward buffer. The reward buffer contains the cumulative
-    rewards of the most recent episodes completed by agents using this
-    trainer.
-    :return: the reward buffer.
-    """
+        Returns the reward buffer. The reward buffer contains the cumulative
+        rewards of the most recent episodes completed by agents using this
+        trainer.
+        :return: the reward buffer.
+        """
         return self.trainer.reward_buffer
 
     @property
@@ -319,7 +319,6 @@ def create_policy(
         policy = self.trainer.create_policy(
             parsed_behavior_id, behavior_spec, create_graph=True
         )
-        self.trainer.model_saver.initialize_or_load(policy)
         team_id = parsed_behavior_id.team_id
         self.controller.subscribe_team_id(team_id, self)
 
@@ -337,6 +336,11 @@ def create_policy(
             self._save_snapshot()  # Need to save after trainer initializes policy
             self._learning_team = self.controller.get_learning_team
             self.wrapped_trainer_team = team_id
+        else:
+            # Load the weights of the ghost policy from the wrapped one
+            policy.load_weights(
+                self.trainer.get_policy(parsed_behavior_id).get_weights()
+            )
         return policy
 
     def add_policy(
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
index acc9711830..2e72303b56 100644
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
@@ -57,6 +57,50 @@ def test_load_and_set(dummy_config, use_discrete):
         np.testing.assert_array_equal(w, lw)
 
 
+def test_resume(dummy_config, tmp_path):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[2], vector_obs_space=1
+    )
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
+    tmp_path = tmp_path.as_posix()
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+
+    trainer.save_model()
+
+    # Make a new trainer, check that the policies are the same
+    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
+    trainer2 = GhostTrainer(
+        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
+    trainer2.add_policy(parsed_behavior_id0, policy)
+
+    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
+    trainer2.add_policy(parsed_behavior_id1, policy)
+
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
+    weights = trainer1_policy.get_weights()
+    weights2 = trainer2_policy.get_weights()
+
+    for w, lw in zip(weights, weights2):
+        np.testing.assert_array_equal(w, lw)
+
+
 def test_process_trajectory(dummy_config):
     mock_specs = mb.setup_test_behavior_specs(
         True, False, vector_action_space=[2], vector_obs_space=1
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
index 06f0666cc8..0d96084d47 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
@@ -59,6 +59,50 @@ def test_load_and_set(dummy_config, use_discrete):
         np.testing.assert_array_equal(w, lw)
 
 
+def test_resume(dummy_config, tmp_path):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[2], vector_obs_space=1
+    )
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
+    tmp_path = tmp_path.as_posix()
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+
+    trainer.save_model()
+
+    # Make a new trainer, check that the policies are the same
+    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
+    trainer2 = GhostTrainer(
+        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
+    trainer2.add_policy(parsed_behavior_id0, policy)
+
+    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
+    trainer2.add_policy(parsed_behavior_id1, policy)
+
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
+    weights = trainer1_policy.get_weights()
+    weights2 = trainer2_policy.get_weights()
+
+    for w, lw in zip(weights, weights2):
+        np.testing.assert_array_equal(w, lw)
+
+
 def test_process_trajectory(dummy_config):
     mock_specs = mb.setup_test_behavior_specs(
         True, False, vector_action_space=[2], vector_obs_space=1
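
A note on what the ghost/trainer.py change does: the unconditional `self.trainer.model_saver.initialize_or_load(policy)` call is removed, and for any team other than the wrapped trainer's own team the ghost policy is instead synchronised by copying the wrapped policy's weights. The following is a minimal, self-contained sketch of that weight-mirroring pattern, not the actual ml-agents implementation; the `StandInPolicy` class and its `get_weights`/`load_weights` methods are hypothetical stand-ins shaped like the policy API used in the tests above.

import numpy as np


class StandInPolicy:
    """Hypothetical stand-in for a trainer policy holding a list of weight arrays."""

    def __init__(self, seed: int) -> None:
        rng = np.random.default_rng(seed)
        self.weights = [rng.standard_normal(4), rng.standard_normal(8)]

    def get_weights(self):
        # Return copies so callers cannot mutate the policy's parameters in place.
        return [w.copy() for w in self.weights]

    def load_weights(self, weights) -> None:
        self.weights = [w.copy() for w in weights]


# The wrapped trainer's policy, e.g. as restored from a checkpoint on resume.
wrapped_policy = StandInPolicy(seed=0)

# The ghost (non-learning) team's policy starts out with different weights...
ghost_policy = StandInPolicy(seed=1)

# ...and is brought in sync by copying the wrapped policy's weights, which is
# the pattern the new else-branch in GhostTrainer.create_policy relies on.
ghost_policy.load_weights(wrapped_policy.get_weights())

for w, g in zip(wrapped_policy.get_weights(), ghost_policy.get_weights()):
    np.testing.assert_array_equal(w, g)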