diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index 5fd7fb0f4b..02942bcd29 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to
 ### Major Changes
 #### com.unity.ml-agents (C#)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
 ### Minor Changes
 #### com.unity.ml-agents (C#)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
diff --git a/docs/Migrating.md b/docs/Migrating.md
index db0f6c83e5..2e4ceb2184 100644
--- a/docs/Migrating.md
+++ b/docs/Migrating.md
@@ -21,6 +21,7 @@ double-check that the versions are in the same. The versions can be found in
   instead of `summaries/` and `models/`.
 - Trainer configuration, curriculum configuration, and parameter randomization
   configuration have all been moved to a single YAML file. (#3791)
+- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
 
 ### Steps to Migrate
 - Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into
@@ -31,6 +32,8 @@ double-check that the versions are in the same. The versions can be found in
   the `Behavior Name` section.
 - If your training uses [parameter randomization](Training-ML-Agents.md#environment-parameter-randomization),
   move the contents of the sampler config to `parameter_randomization` in the main trainer configuration.
+- If you are using `UnityEnvironment` directly, replace `max_step` with `interrupted`
+  in the `TerminalStep` and `TerminalSteps` objects.
 
 ## Migrating from 0.15 to Release 1
 
diff --git a/docs/Python-API.md b/docs/Python-API.md
index 03a97b8670..2026f8158f 100644
--- a/docs/Python-API.md
+++ b/docs/Python-API.md
@@ -200,9 +200,9 @@ A `TerminalSteps` has the following fields :
 - `agent_id` is an int vector of length batch size containing unique
   identifier for the corresponding Agent. This is used to track Agents
   across simulation steps.
-- `max_step` is an array of booleans of length batch size. Is true if the
-  associated Agent reached its maximum number of steps during the last
-  simulation step.
+- `interrupted` is an array of booleans of length batch size. Is true if the
+  associated Agent was interrupted since the last decision step. For example,
+  if the Agent reached the maximum number of steps for the episode.
 
 It also has the two following methods:
 
@@ -218,8 +218,9 @@ A `TerminalStep` has the following fields:
 - `reward` is a float. Corresponds to the rewards collected by the agent
   since the last simulation step.
 - `agent_id` is an int and an unique identifier for the corresponding Agent.
-- `max_step` is a bool. Is true if the Agent reached its maximum number of steps
-  during the last simulation step.
+- `interrupted` is a bool. Is true if the Agent was interrupted since the last
+  decision step. For example, if the Agent reached the maximum number of steps for
+  the episode.
 
 #### BehaviorSpec
 
diff --git a/ml-agents-envs/mlagents_envs/base_env.py b/ml-agents-envs/mlagents_envs/base_env.py
index d62675c7af..713e51083b 100644
--- a/ml-agents-envs/mlagents_envs/base_env.py
+++ b/ml-agents-envs/mlagents_envs/base_env.py
@@ -144,14 +144,15 @@ class TerminalStep(NamedTuple):
     - obs is a list of numpy arrays observations collected by the agent.
     - reward is a float. Corresponds to the rewards collected by the agent
     since the last simulation step.
-    - max_step is a bool. Is true if the Agent reached its maximum number of
-    steps during the last simulation step.
+    - interrupted is a bool. Is true if the Agent was interrupted since the last
+    decision step. For example, if the Agent reached the maximum number of steps for
+    the episode.
     - agent_id is an int and an unique identifier for the corresponding Agent.
     """
 
     obs: List[np.ndarray]
     reward: float
-    max_step: bool
+    interrupted: bool
     agent_id: AgentId
 
 
@@ -165,18 +166,18 @@ class TerminalSteps(Mapping):
     first dimension of the array corresponds to the batch size of the batch.
     - reward is a float vector of length batch size. Corresponds to the
     rewards collected by each agent since the last simulation step.
-    - max_step is an array of booleans of length batch size. Is true if the
-    associated Agent reached its maximum number of steps during the last
-    simulation step.
+    - interrupted is an array of booleans of length batch size. Is true if the
+    associated Agent was interrupted since the last decision step. For example, if the
+    Agent reached the maximum number of steps for the episode.
     - agent_id is an int vector of length batch size containing unique
     identifier for the corresponding Agent. This is used to track Agents
     across simulation steps.
     """
 
-    def __init__(self, obs, reward, max_step, agent_id):
+    def __init__(self, obs, reward, interrupted, agent_id):
         self.obs: List[np.ndarray] = obs
         self.reward: np.ndarray = reward
-        self.max_step: np.ndarray = max_step
+        self.interrupted: np.ndarray = interrupted
         self.agent_id: np.ndarray = agent_id
         self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
 
@@ -213,7 +214,7 @@ def __getitem__(self, agent_id: AgentId) -> TerminalStep:
         return TerminalStep(
             obs=agent_obs,
             reward=self.reward[agent_index],
-            max_step=self.max_step[agent_index],
+            interrupted=self.interrupted[agent_index],
             agent_id=agent_id,
         )
 
@@ -232,7 +233,7 @@ def empty(spec: "BehaviorSpec") -> "TerminalSteps":
         return TerminalSteps(
             obs=obs,
             reward=np.zeros(0, dtype=np.float32),
-            max_step=np.zeros(0, dtype=np.bool),
+            interrupted=np.zeros(0, dtype=np.bool),
             agent_id=np.zeros(0, dtype=np.int32),
         )
 
@@ -381,7 +382,7 @@ def get_steps(
         the rewards, the agent ids and the action masks for the Agents
         of the specified behavior. These Agents need an action this step.
         - A TerminalSteps NamedTuple containing the observations,
-        rewards, agent ids and max_step flags of the agents that had their
+        rewards, agent ids and interrupted flags of the agents that had their
        episode terminated last step.
         """
         pass
diff --git a/ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py b/ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py
index b8f7e41a5c..8216e4f7b1 100644
--- a/ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py
+++ b/ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py
@@ -128,7 +128,7 @@ def proto_from_steps(
         agent_id_index = terminal_steps.agent_id_to_index[agent_id]
         reward = terminal_steps.reward[agent_id_index]
         done = True
-        max_step_reached = terminal_steps.max_step[agent_id_index]
+        max_step_reached = terminal_steps.interrupted[agent_id_index]
 
         final_observations: List[ObservationProto] = []
         for all_observations_of_type in terminal_steps.obs:
@@ -248,7 +248,7 @@ def test_batched_step_result_from_proto():
     for agent_id in range(n_agents):
         assert (agent_id in terminal_steps) == (agent_id % 2 == 0)
         if agent_id in terminal_steps:
-            assert terminal_steps[agent_id].max_step == (agent_id % 4 == 0)
+            assert terminal_steps[agent_id].interrupted == (agent_id % 4 == 0)
     assert decision_steps.obs[0].shape[1] == shapes[0][0]
     assert decision_steps.obs[1].shape[1] == shapes[1][0]
     assert terminal_steps.obs[0].shape[1] == shapes[0][0]
diff --git a/ml-agents/mlagents/trainers/agent_processor.py b/ml-agents/mlagents/trainers/agent_processor.py
index 3e3cce42f1..cab911d4b8 100644
--- a/ml-agents/mlagents/trainers/agent_processor.py
+++ b/ml-agents/mlagents/trainers/agent_processor.py
@@ -125,7 +125,7 @@ def _process_step(
         else:
             memory = None
         done = terminated  # Since this is an ongoing step
-        max_step = step.max_step if terminated else False
+        interrupted = step.interrupted if terminated else False
         # Add the outputs of the last eval
         action = stored_take_action_outputs["action"][idx]
         if self.policy.use_continuous_act:
@@ -144,7 +144,7 @@ def _process_step(
             action_pre=action_pre,
             action_mask=action_mask,
             prev_action=prev_action,
-            max_step=max_step,
+            interrupted=interrupted,
             memory=memory,
         )
         # Add the value outputs if needed
diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index 7b6e6f12d1..6b2f81b9f8 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -102,7 +102,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
             agent_buffer_trajectory,
             trajectory.next_obs,
-            trajectory.done_reached and not trajectory.max_step_reached,
+            trajectory.done_reached and not trajectory.interrupted,
         )
         for name, v in value_estimates.items():
             agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index ea2aeca2d4..9b7603da31 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -193,7 +193,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
 
         # Bootstrap using the last step rather than the bootstrap step if max step is reached.
         # Set last element to duplicate obs and remove dones.
-        if last_step.max_step:
+        if last_step.interrupted:
             vec_vis_obs = SplitObservations.from_observations(last_step.obs)
             for i, obs in enumerate(vec_vis_obs.visual_observations):
                 agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
index 849b2b0dde..fe0e12693c 100644
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -75,7 +75,7 @@ def create_mock_steps(
         ]
 
     reward = np.array(num_agents * [1.0], dtype=np.float32)
-    max_step = np.array(num_agents * [False], dtype=np.bool)
+    interrupted = np.array(num_agents * [False], dtype=np.bool)
     agent_id = np.arange(num_agents, dtype=np.int32)
     behavior_spec = BehaviorSpec(
         [(84, 84, 3)] * num_vis_observations + [(num_vector_observations, 0, 0)],
@@ -85,7 +85,7 @@ def create_mock_steps(
     if done:
         return (
             DecisionSteps.empty(behavior_spec),
-            TerminalSteps(obs_list, reward, max_step, agent_id),
+            TerminalSteps(obs_list, reward, interrupted, agent_id),
         )
     else:
         return (
@@ -156,7 +156,7 @@ def make_fake_trajectory(
             action_pre=action_pre,
             action_mask=action_mask,
             prev_action=prev_action,
-            max_step=max_step,
+            interrupted=max_step,
             memory=memory,
         )
         steps_list.append(experience)
@@ -169,7 +169,7 @@ def make_fake_trajectory(
             action_pre=action_pre,
             action_mask=action_mask,
             prev_action=prev_action,
-            max_step=max_step_complete,
+            interrupted=max_step_complete,
             memory=memory,
         )
         steps_list.append(last_experience)
diff --git a/ml-agents/mlagents/trainers/trajectory.py b/ml-agents/mlagents/trainers/trajectory.py
index c14563ec2f..1eb1e55993 100644
--- a/ml-agents/mlagents/trainers/trajectory.py
+++ b/ml-agents/mlagents/trainers/trajectory.py
@@ -13,7 +13,7 @@ class AgentExperience(NamedTuple):
     action_pre: np.ndarray  # TODO: Remove this
     action_mask: np.ndarray
     prev_action: np.ndarray
-    max_step: bool
+    interrupted: bool
    memory: np.ndarray
 
 
@@ -141,8 +141,8 @@ def done_reached(self) -> bool:
         return self.steps[-1].done
 
     @property
-    def max_step_reached(self) -> bool:
+    def interrupted(self) -> bool:
         """
         Returns true if trajectory was terminated because max steps was reached.
         """
-        return self.steps[-1].max_step
+        return self.steps[-1].interrupted
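
Below is a minimal usage sketch (not part of the patch) showing how LLAPI code reads the renamed field after this change. `count_interruptions` is a hypothetical helper written for illustration; the `env` and `behavior_name` in the commented usage are assumed to come from the usual `UnityEnvironment` setup described in docs/Python-API.md.

    from mlagents_envs.base_env import TerminalSteps

    def count_interruptions(terminal_steps: TerminalSteps) -> int:
        # TerminalSteps is a Mapping: iterating yields agent ids, and indexing
        # returns that agent's TerminalStep.
        total = 0
        for agent_id in terminal_steps:
            step = terminal_steps[agent_id]
            if step.interrupted:  # was step.max_step before this change
                total += 1
        return total

    # Typical use inside a stepping loop, assuming env and behavior_name exist:
    # decision_steps, terminal_steps = env.get_steps(behavior_name)
    # print(count_interruptions(terminal_steps), "agent episode(s) were interrupted")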