diff --git a/experiment_torch.py b/experiment_torch.py
new file mode 100644
index 0000000000..8ccb56be2b
--- /dev/null
+++ b/experiment_torch.py
@@ -0,0 +1,111 @@
+
+import json
+import os
+import torch
+import tensorflow as tf
+import argparse
+from mlagents.trainers.learn import run_cli, parse_command_line
+from mlagents.trainers.settings import RunOptions
+from mlagents.trainers.stats import StatsReporter
+from mlagents.trainers.ppo.trainer import TestingConfiguration
+from mlagents_envs.timers import _thread_timer_stacks
+
+
+
+def run_experiment(name: str, steps: int, use_torch: bool, num_torch_threads: int, use_gpu: bool, num_envs: int = 1, config_name=None):
+    TestingConfiguration.env_name = name
+    TestingConfiguration.max_steps = steps
+    TestingConfiguration.use_torch = use_torch
+    TestingConfiguration.device = "cuda:0" if use_gpu else "cpu"
+    if use_gpu:
+        tf.device("/GPU:0")
+    else:
+        tf.device("/device:CPU:0")
+    if not torch.cuda.is_available() and use_gpu:
+        return name, str(steps), str(use_torch), str(num_torch_threads), str(num_envs), str(use_gpu), "na", "na", "na", "na", "na", "na", "na"
+    if config_name is None:
+        config_name = name
+    run_options = parse_command_line([f"config/ppo/{config_name}.yaml", "--num-envs", f"{num_envs}"])
+    run_options.checkpoint_settings.run_id = f"{name}_test_" + str(steps) + "_" + ("torch" if use_torch else "tf")
+    run_options.checkpoint_settings.force = True
+    # run_options.env_settings.num_envs = num_envs
+    for trainer_settings in run_options.behaviors.values():
+        trainer_settings.threaded = False
+    timers_path = os.path.join("results", run_options.checkpoint_settings.run_id, "run_logs", "timers.json")
+    if use_torch:
+        torch.set_num_threads(num_torch_threads)
+    run_cli(run_options)
+    StatsReporter.writers.clear()
+    StatsReporter.stats_dict.clear()
+    _thread_timer_stacks.clear()
+    with open(timers_path) as timers_json_file:
+        timers_json = json.load(timers_json_file)
+        total = timers_json["total"]
+        tc_advance = timers_json["children"]["TrainerController.start_learning"]["children"]["TrainerController.advance"]
+        evaluate = timers_json["children"]["TrainerController.start_learning"]["children"]["TrainerController.advance"]["children"]["env_step"]["children"]["SubprocessEnvManager._take_step"]["children"]
+        update = timers_json["children"]["TrainerController.start_learning"]["children"]["TrainerController.advance"]["children"]["trainer_advance"]["children"]["_update_policy"]["children"]
+        tc_advance_total = tc_advance["total"]
+        tc_advance_count = tc_advance["count"]
+        if use_torch:
+            update_total = update["TorchPPOOptimizer.update"]["total"]
+            evaluate_total = evaluate["TorchPolicy.evaluate"]["total"]
+            update_count = update["TorchPPOOptimizer.update"]["count"]
+            evaluate_count = evaluate["TorchPolicy.evaluate"]["count"]
+        else:
+            update_total = update["TFPPOOptimizer.update"]["total"]
+            evaluate_total = evaluate["NNPolicy.evaluate"]["total"]
+            update_count = update["TFPPOOptimizer.update"]["count"]
+            evaluate_count = evaluate["NNPolicy.evaluate"]["count"]
+        # todo: do total / count
+        return name, str(steps), str(use_torch), str(num_torch_threads), str(num_envs), str(use_gpu), str(total), str(tc_advance_total), str(tc_advance_count), str(update_total), str(update_count), str(evaluate_total), str(evaluate_count)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--steps", default=25000, type=int, help="The number of steps")
+    parser.add_argument("--num-envs", default=1, type=int, help="The number of envs")
parser.add_argument("--gpu", default = False, action="store_true", help="If true, will use the GPU") + parser.add_argument("--threads", default=False, action="store_true", help="If true, will try both 1 and 8 threads for torch") + parser.add_argument("--ball", default=False, action="store_true", help="If true, will only do 3dball") + args = parser.parse_args() + + if args.gpu: + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + else: + os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + + envs_config_tuples = [("3DBall", "3DBall"), ("GridWorld", "GridWorld"), ("PushBlock", "PushBlock"), ("Hallway", "Hallway"), ("CrawlerStaticTarget", "CrawlerStatic"), ("VisualHallway", "VisualHallway")] + if args.ball: + envs_config_tuples=[("3DBall", "3DBall")] + + + labels = ("name", "steps", "use_torch", "num_torch_threads", "num_envs", "use_gpu" , "total", "tc_advance_total", "tc_advance_count", "update_total", "update_count", "evaluate_total", "evaluate_count") + + results = [] + results.append(labels) + f = open(f"result_data_steps_{args.steps}_envs_{args.num_envs}_gpu_{args.gpu}_thread_{args.threads}.txt", "w") + f.write(" ".join(labels)+ "\n") + + for env_config in envs_config_tuples: + data = run_experiment(name = env_config[0], steps=args.steps, use_torch=True, num_torch_threads=1, use_gpu=args.gpu, num_envs = args.num_envs, config_name=env_config[1]) + results.append(data) + f.write(" ".join(data) + "\n") + + if args.threads: + data = run_experiment(name = env_config[0], steps=args.steps, use_torch=True, num_torch_threads=8, use_gpu=args.gpu, num_envs = args.num_envs, config_name=env_config[1]) + results.append(data) + f.write(" ".join(data)+ "\n") + + + data = run_experiment(name = env_config[0], steps=args.steps, use_torch=False, num_torch_threads=1, use_gpu=args.gpu, num_envs = args.num_envs, config_name=env_config[1]) + results.append(data) + f.write(" ".join(data)+ "\n") + for r in results: + print(*r) + f.close() + + +if __name__ == "__main__": + main() + diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index ead4b92cd8..0e9188e66d 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -35,6 +35,9 @@ ) from mlagents_envs import logging_util +from mlagents.trainers.ppo.trainer import TestingConfiguration +from mlagents_envs.registry import default_registry + logger = logging_util.get_logger(__name__) TRAINING_STATUS_FILE_NAME = "training_status.json" @@ -233,16 +236,27 @@ def create_unity_environment( ) -> UnityEnvironment: # Make sure that each environment gets a different seed env_seed = seed + worker_id - return UnityEnvironment( - file_name=env_path, - worker_id=worker_id, - seed=env_seed, - no_graphics=no_graphics, - base_port=start_port, - additional_args=env_args, - side_channels=side_channels, - log_folder=log_folder, - ) + if TestingConfiguration.env_name == "": + return UnityEnvironment( + file_name=env_path, + worker_id=worker_id, + seed=env_seed, + no_graphics=no_graphics, + base_port=start_port, + additional_args=env_args, + side_channels=side_channels, + log_folder=log_folder, + ) + else: + return default_registry[TestingConfiguration.env_name].make( + seed=env_seed, + no_graphics=no_graphics, + base_port=start_port, + worker_id=worker_id, + additional_args=env_args, + side_channels=side_channels, + log_folder=log_folder, + ) return create_unity_environment diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py index 434634fbce..c2e0fc27de 100644 --- 
--- a/ml-agents/mlagents/trainers/models_torch.py
+++ b/ml-agents/mlagents/trainers/models_torch.py
@@ -136,7 +136,7 @@ def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1):
         if self.use_lstm:
             embedding = embedding.view([sequence_length, -1, self.h_size])
             memories = torch.split(memories, self.m_size // 2, dim=-1)
-            embedding, memories = self.lstm(embedding, memories)
+            embedding, memories = self.lstm(embedding.contiguous(), (memories[0].contiguous(), memories[1].contiguous()))
             embedding = embedding.view([-1, self.m_size // 2])
             memories = torch.cat(memories, dim=-1)
         return embedding, memories
@@ -407,7 +407,8 @@ def __init__(self, height, width, initial_channels, output_size):
 
     def forward(self, visual_obs):
         conv_1 = torch.relu(self.conv1(visual_obs))
         conv_2 = torch.relu(self.conv2(conv_1))
-        hidden = torch.relu(self.dense(conv_2.view([-1, self.final_flat])))
+        # hidden = torch.relu(self.dense(conv_2.view([-1, self.final_flat])))
+        hidden = torch.relu(self.dense(torch.reshape(conv_2, (-1, self.final_flat))))
         return hidden
 
diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
index 762ad00357..80cefaee39 100644
--- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
+++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
@@ -106,8 +106,8 @@ def get_trajectory_value_estimates(
         )
 
         for name, estimate in value_estimates.items():
-            value_estimates[name] = estimate.detach().numpy()
-            next_value_estimate[name] = next_value_estimate[name].detach().numpy()
+            value_estimates[name] = estimate.detach().cpu().numpy()
+            next_value_estimate[name] = next_value_estimate[name].detach().cpu().numpy()
 
         if done:
             for k in next_value_estimate:
diff --git a/ml-agents/mlagents/trainers/policy/nn_policy.py b/ml-agents/mlagents/trainers/policy/nn_policy.py
index ea550cb652..879065d3bc 100644
--- a/ml-agents/mlagents/trainers/policy/nn_policy.py
+++ b/ml-agents/mlagents/trainers/policy/nn_policy.py
@@ -12,6 +12,8 @@
     MultiCategoricalDistribution,
 )
 
+from mlagents.trainers.ppo.trainer import TestingConfiguration
+
 EPSILON = 1e-6  # Small value to avoid divide by zero
 
 
diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py
index d3fb2db022..5ea2f729ee 100644
--- a/ml-agents/mlagents/trainers/policy/torch_policy.py
+++ b/ml-agents/mlagents/trainers/policy/torch_policy.py
@@ -17,6 +17,8 @@
 from mlagents.trainers.brain import BrainParameters
 from mlagents.trainers.models_torch import ActorCritic
 
+from mlagents.trainers.ppo.trainer import TestingConfiguration
+
 EPSILON = 1e-7  # Small value to avoid divide by zero
 
 
@@ -91,6 +93,12 @@ def __init__(
         self.log_std_min = -20
         self.log_std_max = 2
 
+        if TestingConfiguration.device != "cpu":
+            torch.set_default_tensor_type(torch.cuda.FloatTensor)
+        else:
+            torch.set_default_tensor_type(torch.FloatTensor)
+
+
         self.inference_dict: Dict[str, tf.Tensor] = {}
         self.update_dict: Dict[str, tf.Tensor] = {}
 
@@ -117,6 +125,8 @@ def __init__(
             separate_critic=self.use_continuous_act,
         )
 
+        self.actor_critic.to(TestingConfiguration.device)
+
     def split_decision_step(self, decision_requests):
         vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
         mask = None
@@ -190,18 +200,18 @@ def evaluate(
         action, log_probs, entropy, value_heads, memories = self.sample_actions(
             vec_obs, vis_obs, masks=masks, memories=memories
         )
-        run_out["action"] = action.detach().numpy()
-        run_out["pre_action"] = action.detach().numpy()
run_out["action"] = action.detach().cpu().numpy() + run_out["pre_action"] = action.detach().cpu().numpy() # Todo - make pre_action difference - run_out["log_probs"] = log_probs.detach().numpy() - run_out["entropy"] = entropy.detach().numpy() + run_out["log_probs"] = log_probs.detach().cpu().numpy() + run_out["entropy"] = entropy.detach().cpu().numpy() run_out["value_heads"] = { - name: t.detach().numpy() for name, t in value_heads.items() + name: t.detach().cpu().numpy() for name, t in value_heads.items() } run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0) run_out["learning_rate"] = 0.0 if self.use_recurrent: - run_out["memories"] = memories.detach().numpy() + run_out["memories"] = memories.detach().cpu().numpy() self.actor_critic.update_normalization(vec_obs) return run_out @@ -249,24 +259,28 @@ def load_model(self, step=0): self.actor_critic.load_state_dict(torch.load(load_path)) def export_model(self, step=0): - fake_vec_obs = [torch.zeros([1] + [self.brain.vector_observation_space_size])] - fake_vis_obs = [torch.zeros([1] + [84, 84, 3])] - fake_masks = torch.ones([1] + self.actor_critic.act_size) - # fake_memories = torch.zeros([1] + [self.m_size]) - export_path = "./model-" + str(step) + ".onnx" - output_names = ["action", "action_probs"] - input_names = ["vector_observation", "action_mask"] - dynamic_axes = {"vector_observation": [0], "action": [0], "action_probs": [0]} - onnx.export( - self.actor_critic, - (fake_vec_obs, fake_vis_obs, fake_masks), - export_path, - verbose=True, - opset_version=12, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - ) + try: + fake_vec_obs = [torch.zeros([1] + [self.brain.vector_observation_space_size])] + fake_vis_obs = [torch.zeros([1] + [84, 84, 3])] + fake_masks = torch.ones([1] + self.actor_critic.act_size) + # fake_memories = torch.zeros([1] + [self.m_size]) + export_path = "./model-" + str(step) + ".onnx" + output_names = ["action", "action_probs"] + input_names = ["vector_observation", "action_mask"] + dynamic_axes = {"vector_observation": [0], "action": [0], "action_probs": [0]} + onnx.export( + self.actor_critic, + (fake_vec_obs, fake_vis_obs, fake_masks), + export_path, + verbose=True, + opset_version=12, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + ) + except: + print("Could not export torch model") + return @property def vis_obs_size(self): diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 2e8ece92dc..afe3f91712 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -143,8 +143,8 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: self.optimizer.step() update_stats = { - "Losses/Policy Loss": abs(policy_loss.detach().numpy()), - "Losses/Value Loss": value_loss.detach().numpy(), + "Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()), + "Losses/Value Loss": value_loss.detach().cpu().numpy(), } return update_stats diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 19519aaeb8..365a7634de 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -2,6 +2,14 @@ # ## ML-Agent Learning (PPO) # Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 +class TestingConfiguration: + use_torch = False + max_steps = 0 + env_name = "" + device = "cpu" 
+
+
+
 from collections import defaultdict
 from typing import cast
 
@@ -22,6 +30,8 @@
 
 logger = get_logger(__name__)
 
+
+
 class PPOTrainer(RLTrainer):
     """The PPOTrainer is an implementation of the PPO algorithm."""
 
@@ -53,7 +63,9 @@ def __init__(
         )
         self.load = load
         self.seed = seed
-        self.framework = "torch"
+        self.framework = "torch" if TestingConfiguration.use_torch else "tf"
+        if TestingConfiguration.max_steps > 0:
+            self.trainer_settings.max_steps = TestingConfiguration.max_steps
         self.policy: Policy = None  # type: ignore
 
     def _process_trajectory(self, trajectory: Trajectory) -> None:
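
Usage sketch (supplementary note, not part of the patch above): the benchmark added in experiment_torch.py is normally driven through its CLI flags, e.g. python experiment_torch.py --steps 25000 --num-envs 1 --ball, but run_experiment can also be imported and called directly. The snippet below is a minimal sketch of that; the step count of 5000 and the presence of config/ppo/3DBall.yaml in the working directory (plus an install of ml-agents containing this patch) are assumptions for illustration, not something the diff guarantees.

# Illustrative only: drive the benchmark in-process instead of via the CLI.
# Assumes ml-agents with this patch is installed and config/ppo/3DBall.yaml exists.
from experiment_torch import run_experiment

# Short CPU-only comparison of the torch and TensorFlow code paths on 3DBall.
torch_row = run_experiment(name="3DBall", steps=5000, use_torch=True,
                           num_torch_threads=1, use_gpu=False, num_envs=1,
                           config_name="3DBall")
tf_row = run_experiment(name="3DBall", steps=5000, use_torch=False,
                        num_torch_threads=1, use_gpu=False, num_envs=1,
                        config_name="3DBall")
print(*torch_row)
print(*tf_row)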