From f2ce4c7a1b10cf0ca3d6e61e95c2c2acea64520a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 29 Apr 2020 21:12:06 -0700 Subject: [PATCH 01/54] Use attrs for RunOptions and CLI --- ml-agents/mlagents/trainers/cli_utils.py | 208 +++++++++++++ ml-agents/mlagents/trainers/learn.py | 371 +++-------------------- ml-agents/mlagents/trainers/settings.py | 106 +++++++ ml-agents/setup.py | 2 + 4 files changed, 365 insertions(+), 322 deletions(-) create mode 100644 ml-agents/mlagents/trainers/settings.py diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index b84969b70a..df1dd7d9af 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -1,4 +1,5 @@ from typing import Set +from mlagents_envs.environment import UnityEnvironment import argparse @@ -39,3 +40,210 @@ class StoreConfigFile(argparse.Action): def __call__(self, arg_parser, namespace, values, option_string=None): delattr(namespace, self.dest) StoreConfigFile.trainer_config_path = values + + +def _create_parser() -> argparse.ArgumentParser: + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + argparser.add_argument("trainer_config_path", action=StoreConfigFile) + argparser.add_argument( + "--env", + default=None, + dest="env_path", + help="Path to the Unity executable to train", + action=DetectDefault, + ) + argparser.add_argument( + "--lesson", + default=0, + type=int, + help="The lesson to start with when performing curriculum training", + action=DetectDefault, + ) + argparser.add_argument( + "--keep-checkpoints", + default=5, + type=int, + help="The maximum number of model checkpoints to keep. Checkpoints are saved after the" + "number of steps specified by the save-freq option. Once the maximum number of checkpoints" + "has been reached, the oldest checkpoint is deleted when saving a new checkpoint.", + action=DetectDefault, + ) + argparser.add_argument( + "--load", + default=False, + dest="load_model", + action=DetectDefaultStoreTrue, + help=argparse.SUPPRESS, # Deprecated but still usable for now. + ) + argparser.add_argument( + "--resume", + default=False, + dest="resume", + action=DetectDefaultStoreTrue, + help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. " + "If set, the training code loads an already trained model to initialize the neural network " + "before resuming training. This option is only valid when the models exist, and have the same " + "behavior names as the current agents in your scene.", + ) + argparser.add_argument( + "--force", + default=False, + dest="force", + action=DetectDefaultStoreTrue, + help="Whether to force-overwrite this run-id's existing summary and model data. (Without " + "this flag, attempting to train a model with a run-id that has been used before will throw " + "an error.", + ) + argparser.add_argument( + "--run-id", + default="ppo", + help="The identifier for the training run. This identifier is used to name the " + "subdirectories in which the trained model and summary statistics are saved as well " + "as the saved model itself. If you use TensorBoard to view the training statistics, " + "always set a unique run-id for each training run. 
(The statistics for all runs with the " + "same id are combined as if they were produced by a the same session.)", + action=DetectDefault, + ) + argparser.add_argument( + "--initialize-from", + metavar="RUN_ID", + default=None, + help="Specify a previously saved run ID from which to initialize the model from. " + "This can be used, for instance, to fine-tune an existing model on a new environment. " + "Note that the previously saved models must have the same behavior parameters as your " + "current environment.", + action=DetectDefault, + ) + argparser.add_argument( + "--save-freq", + default=50000, + type=int, + help="How often (in steps) to save the model during training", + action=DetectDefault, + ) + argparser.add_argument( + "--seed", + default=-1, + type=int, + help="A number to use as a seed for the random number generator used by the training code", + action=DetectDefault, + ) + argparser.add_argument( + "--train", + default=False, + dest="train_model", + action=DetectDefaultStoreTrue, + help=argparse.SUPPRESS, + ) + argparser.add_argument( + "--inference", + default=False, + dest="inference", + action=DetectDefaultStoreTrue, + help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load " + "a model trained with an existing run ID.", + ) + argparser.add_argument( + "--base-port", + default=UnityEnvironment.BASE_ENVIRONMENT_PORT, + type=int, + help="The starting port for environment communication. Each concurrent Unity environment " + "instance will get assigned a port sequentially, starting from the base-port. Each instance " + "will use the port (base_port + worker_id), where the worker_id is sequential IDs given to " + "each instance from 0 to (num_envs - 1). Note that when training using the Editor rather " + "than an executable, the base port will be ignored.", + action=DetectDefault, + ) + argparser.add_argument( + "--num-envs", + default=1, + type=int, + help="The number of concurrent Unity environment instances to collect experiences " + "from when training", + action=DetectDefault, + ) + argparser.add_argument( + "--debug", + default=False, + action=DetectDefaultStoreTrue, + help="Whether to enable debug-level logging for some parts of the code", + ) + argparser.add_argument( + "--env-args", + default=None, + nargs=argparse.REMAINDER, + help="Arguments passed to the Unity executable. Be aware that the standalone build will also " + "process these as Unity Command Line Arguments. You should choose different argument names if " + "you want to create environment-specific arguments. All arguments after this flag will be " + "passed to the executable.", + action=DetectDefault, + ) + argparser.add_argument( + "--cpu", + default=False, + action=DetectDefaultStoreTrue, + help="Forces training using CPU only", + ) + + eng_conf = argparser.add_argument_group(title="Engine Configuration") + eng_conf.add_argument( + "--width", + default=84, + type=int, + help="The width of the executable window of the environment(s) in pixels " + "(ignored for editor training).", + action=DetectDefault, + ) + eng_conf.add_argument( + "--height", + default=84, + type=int, + help="The height of the executable window of the environment(s) in pixels " + "(ignored for editor training)", + action=DetectDefault, + ) + eng_conf.add_argument( + "--quality-level", + default=5, + type=int, + help="The quality level of the environment(s). 
Equivalent to calling " + "QualitySettings.SetQualityLevel in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--time-scale", + default=20, + type=float, + help="The time scale of the Unity environment(s). Equivalent to setting " + "Time.timeScale in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--target-frame-rate", + default=-1, + type=int, + help="The target frame rate of the Unity environment(s). Equivalent to setting " + "Application.targetFrameRate in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--capture-frame-rate", + default=60, + type=int, + help="The capture frame rate of the Unity environment(s). Equivalent to setting " + "Time.captureFramerate in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--no-graphics", + default=False, + action=DetectDefaultStoreTrue, + help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing " + "the graphics driver. Use this only if your agents don't use visual observations.", + ) + return argparser + + +parser = _create_parser() diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index f465d32b83..b0febc1c66 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -1,12 +1,13 @@ # # Unity ML-Agents Toolkit -import argparse import yaml import os import numpy as np import json -from typing import Callable, Optional, List, NamedTuple, Dict +from typing import Callable, Optional, List, Dict +import attr +import cattr import mlagents.trainers import mlagents_envs @@ -14,7 +15,6 @@ from mlagents.trainers.trainer_controller import TrainerController from mlagents.trainers.meta_curriculum import MetaCurriculum from mlagents.trainers.trainer_util import ( - load_config, TrainerFactory, handle_existing_directories, assemble_curriculum_config, @@ -26,14 +26,11 @@ GaugeWriter, ConsoleWriter, ) -from mlagents.trainers.cli_utils import ( - StoreConfigFile, - DetectDefault, - DetectDefaultStoreTrue, -) +from mlagents.trainers.cli_utils import parser from mlagents_envs.environment import UnityEnvironment from mlagents.trainers.sampler_class import SamplerManager -from mlagents.trainers.exception import SamplerException, TrainerConfigError +from mlagents.trainers.exception import SamplerException +from mlagents.trainers.settings import RunOptions from mlagents_envs.base_env import BaseEnv from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager from mlagents_envs.side_channel.side_channel import SideChannel @@ -49,286 +46,6 @@ logger = logging_util.get_logger(__name__) -def _create_parser(): - argparser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - argparser.add_argument("trainer_config_path", action=StoreConfigFile) - argparser.add_argument( - "--env", - default=None, - dest="env_path", - help="Path to the Unity executable to train", - action=DetectDefault, - ) - argparser.add_argument( - "--lesson", - default=0, - type=int, - help="The lesson to start with when performing curriculum training", - action=DetectDefault, - ) - argparser.add_argument( - "--keep-checkpoints", - default=5, - type=int, - help="The maximum number of model checkpoints to keep. Checkpoints are saved after the" - "number of steps specified by the save-freq option. 
Once the maximum number of checkpoints" - "has been reached, the oldest checkpoint is deleted when saving a new checkpoint.", - action=DetectDefault, - ) - argparser.add_argument( - "--load", - default=False, - dest="load_model", - action=DetectDefaultStoreTrue, - help=argparse.SUPPRESS, # Deprecated but still usable for now. - ) - argparser.add_argument( - "--resume", - default=False, - dest="resume", - action=DetectDefaultStoreTrue, - help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. " - "If set, the training code loads an already trained model to initialize the neural network " - "before resuming training. This option is only valid when the models exist, and have the same " - "behavior names as the current agents in your scene.", - ) - argparser.add_argument( - "--force", - default=False, - dest="force", - action=DetectDefaultStoreTrue, - help="Whether to force-overwrite this run-id's existing summary and model data. (Without " - "this flag, attempting to train a model with a run-id that has been used before will throw " - "an error.", - ) - argparser.add_argument( - "--run-id", - default="ppo", - help="The identifier for the training run. This identifier is used to name the " - "subdirectories in which the trained model and summary statistics are saved as well " - "as the saved model itself. If you use TensorBoard to view the training statistics, " - "always set a unique run-id for each training run. (The statistics for all runs with the " - "same id are combined as if they were produced by a the same session.)", - action=DetectDefault, - ) - argparser.add_argument( - "--initialize-from", - metavar="RUN_ID", - default=None, - help="Specify a previously saved run ID from which to initialize the model from. " - "This can be used, for instance, to fine-tune an existing model on a new environment. " - "Note that the previously saved models must have the same behavior parameters as your " - "current environment.", - action=DetectDefault, - ) - argparser.add_argument( - "--save-freq", - default=50000, - type=int, - help="How often (in steps) to save the model during training", - action=DetectDefault, - ) - argparser.add_argument( - "--seed", - default=-1, - type=int, - help="A number to use as a seed for the random number generator used by the training code", - action=DetectDefault, - ) - argparser.add_argument( - "--train", - default=False, - dest="train_model", - action=DetectDefaultStoreTrue, - help=argparse.SUPPRESS, - ) - argparser.add_argument( - "--inference", - default=False, - dest="inference", - action=DetectDefaultStoreTrue, - help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load " - "a model trained with an existing run ID.", - ) - argparser.add_argument( - "--base-port", - default=UnityEnvironment.BASE_ENVIRONMENT_PORT, - type=int, - help="The starting port for environment communication. Each concurrent Unity environment " - "instance will get assigned a port sequentially, starting from the base-port. Each instance " - "will use the port (base_port + worker_id), where the worker_id is sequential IDs given to " - "each instance from 0 to (num_envs - 1). 
Note that when training using the Editor rather " - "than an executable, the base port will be ignored.", - action=DetectDefault, - ) - argparser.add_argument( - "--num-envs", - default=1, - type=int, - help="The number of concurrent Unity environment instances to collect experiences " - "from when training", - action=DetectDefault, - ) - argparser.add_argument( - "--no-graphics", - default=False, - action=DetectDefaultStoreTrue, - help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing " - "the graphics driver. Use this only if your agents don't use visual observations.", - ) - argparser.add_argument( - "--debug", - default=False, - action=DetectDefaultStoreTrue, - help="Whether to enable debug-level logging for some parts of the code", - ) - argparser.add_argument( - "--env-args", - default=None, - nargs=argparse.REMAINDER, - help="Arguments passed to the Unity executable. Be aware that the standalone build will also " - "process these as Unity Command Line Arguments. You should choose different argument names if " - "you want to create environment-specific arguments. All arguments after this flag will be " - "passed to the executable.", - action=DetectDefault, - ) - argparser.add_argument( - "--cpu", - default=False, - action=DetectDefaultStoreTrue, - help="Forces training using CPU only", - ) - - argparser.add_argument("--version", action="version", version="") - - eng_conf = argparser.add_argument_group(title="Engine Configuration") - eng_conf.add_argument( - "--width", - default=84, - type=int, - help="The width of the executable window of the environment(s) in pixels " - "(ignored for editor training).", - action=DetectDefault, - ) - eng_conf.add_argument( - "--height", - default=84, - type=int, - help="The height of the executable window of the environment(s) in pixels " - "(ignored for editor training)", - action=DetectDefault, - ) - eng_conf.add_argument( - "--quality-level", - default=5, - type=int, - help="The quality level of the environment(s). Equivalent to calling " - "QualitySettings.SetQualityLevel in Unity.", - action=DetectDefault, - ) - eng_conf.add_argument( - "--time-scale", - default=20, - type=float, - help="The time scale of the Unity environment(s). Equivalent to setting " - "Time.timeScale in Unity.", - action=DetectDefault, - ) - eng_conf.add_argument( - "--target-frame-rate", - default=-1, - type=int, - help="The target frame rate of the Unity environment(s). Equivalent to setting " - "Application.targetFrameRate in Unity.", - action=DetectDefault, - ) - eng_conf.add_argument( - "--capture-frame-rate", - default=60, - type=int, - help="The capture frame rate of the Unity environment(s). 
Equivalent to setting " - "Time.captureFramerate in Unity.", - action=DetectDefault, - ) - return argparser - - -parser = _create_parser() - - -class RunOptions(NamedTuple): - behaviors: Dict - debug: bool = parser.get_default("debug") - seed: int = parser.get_default("seed") - env_path: Optional[str] = parser.get_default("env_path") - run_id: str = parser.get_default("run_id") - initialize_from: str = parser.get_default("initialize_from") - load_model: bool = parser.get_default("load_model") - resume: bool = parser.get_default("resume") - force: bool = parser.get_default("force") - train_model: bool = parser.get_default("train_model") - inference: bool = parser.get_default("inference") - save_freq: int = parser.get_default("save_freq") - keep_checkpoints: int = parser.get_default("keep_checkpoints") - base_port: int = parser.get_default("base_port") - num_envs: int = parser.get_default("num_envs") - curriculum_config: Optional[Dict] = None - lesson: int = parser.get_default("lesson") - no_graphics: bool = parser.get_default("no_graphics") - multi_gpu: bool = parser.get_default("multi_gpu") - parameter_randomization: Optional[Dict] = None - env_args: Optional[List[str]] = parser.get_default("env_args") - cpu: bool = parser.get_default("cpu") - width: int = parser.get_default("width") - height: int = parser.get_default("height") - quality_level: int = parser.get_default("quality_level") - time_scale: float = parser.get_default("time_scale") - target_frame_rate: int = parser.get_default("target_frame_rate") - capture_frame_rate: int = parser.get_default("capture_frame_rate") - - @staticmethod - def from_argparse(args: argparse.Namespace) -> "RunOptions": - """ - Takes an argparse.Namespace as specified in `parse_command_line`, loads input configuration files - from file paths, and converts to a CommandLineOptions instance. - :param args: collection of command-line parameters passed to mlagents-learn - :return: CommandLineOptions representing the passed in arguments, with trainer config, curriculum and sampler - configs loaded from files. - """ - argparse_args = vars(args) - run_options_dict = {} - run_options_dict.update(argparse_args) - config_path = StoreConfigFile.trainer_config_path - - # Load YAML - yaml_config = load_config(config_path) - # This is the only option that is not optional and has no defaults. - if "behaviors" not in yaml_config: - raise TrainerConfigError( - "Trainer configurations not found. Make sure your YAML file has a section for behaviors." - ) - # Use the YAML file values for all values not specified in the CLI. - for key, val in yaml_config.items(): - # Detect bad config options - if not hasattr(RunOptions, key): - raise TrainerConfigError( - "The option {} was specified in your YAML file, but is invalid.".format( - key - ) - ) - if key not in DetectDefault.non_default_args: - run_options_dict[key] = val - - # Keep deprecated --load working, TODO: remove - run_options_dict["resume"] = ( - run_options_dict["resume"] or run_options_dict["load_model"] - ) - - return RunOptions(**run_options_dict) - - def get_version_string() -> str: # pylint: disable=no-member return f""" Version information: @@ -351,16 +68,24 @@ def run_training(run_seed: int, options: RunOptions) -> None: :param run_options: Command line arguments for training. 
""" with hierarchical_timer("run_training.setup"): + checkpoint_settings = options.checkpoint_settings + env_settings = options.env_settings + engine_settings = options.engine_settings base_path = "results" - write_path = os.path.join(base_path, options.run_id) + write_path = os.path.join(base_path, checkpoint_settings.run_id) maybe_init_path = ( - os.path.join(base_path, options.run_id) if options.initialize_from else None + os.path.join(base_path, checkpoint_settings.run_id) + if checkpoint_settings.initialize_from + else None ) run_logs_dir = os.path.join(write_path, "run_logs") - port = options.base_port + port = env_settings.base_port # Check if directory exists handle_existing_directories( - write_path, options.resume, options.force, maybe_init_path + write_path, + checkpoint_settings.resume, + checkpoint_settings.force, + maybe_init_path, ) # Make run logs directory os.makedirs(run_logs_dir, exist_ok=True) @@ -373,7 +98,9 @@ def run_training(run_seed: int, options: RunOptions) -> None: "Environment/Episode Length", ], ) - tb_writer = TensorboardWriter(write_path, clear_past_data=not options.resume) + tb_writer = TensorboardWriter( + write_path, clear_past_data=not checkpoint_settings.resume + ) gauge_write = GaugeWriter() console_writer = ConsoleWriter() StatsReporter.add_writer(tb_writer) @@ -381,39 +108,41 @@ def run_training(run_seed: int, options: RunOptions) -> None: StatsReporter.add_writer(gauge_write) StatsReporter.add_writer(console_writer) - if options.env_path is None: + if env_settings.env_path is None: port = UnityEnvironment.DEFAULT_EDITOR_PORT env_factory = create_environment_factory( - options.env_path, - options.no_graphics, + env_settings.env_path, + engine_settings.no_graphics, run_seed, port, - options.env_args, + env_settings.env_args, os.path.abspath(run_logs_dir), # Unity environment requires absolute path ) engine_config = EngineConfig( - width=options.width, - height=options.height, - quality_level=options.quality_level, - time_scale=options.time_scale, - target_frame_rate=options.target_frame_rate, - capture_frame_rate=options.capture_frame_rate, + width=engine_settings.width, + height=engine_settings.height, + quality_level=engine_settings.quality_level, + time_scale=engine_settings.time_scale, + target_frame_rate=engine_settings.target_frame_rate, + capture_frame_rate=engine_settings.capture_frame_rate, + ) + env_manager = SubprocessEnvManager( + env_factory, engine_config, env_settings.num_envs ) - env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs) curriculum_config = assemble_curriculum_config(options.behaviors) maybe_meta_curriculum = try_create_meta_curriculum( - curriculum_config, env_manager, options.lesson + curriculum_config, env_manager, checkpoint_settings.lesson ) sampler_manager, resampling_interval = create_sampler_manager( options.parameter_randomization, run_seed ) trainer_factory = TrainerFactory( options.behaviors, - options.run_id, + checkpoint_settings.run_id, write_path, - options.keep_checkpoints, - not options.inference, - options.resume, + checkpoint_settings.keep_checkpoints, + not checkpoint_settings.inference, + checkpoint_settings.resume, run_seed, maybe_init_path, maybe_meta_curriculum, @@ -423,10 +152,10 @@ def run_training(run_seed: int, options: RunOptions) -> None: tc = TrainerController( trainer_factory, write_path, - options.run_id, - options.save_freq, + checkpoint_settings.run_id, + checkpoint_settings.save_freq, maybe_meta_curriculum, - not options.inference, + not 
checkpoint_settings.inference, run_seed, sampler_manager, resampling_interval, @@ -446,9 +175,9 @@ def write_run_options(output_dir: str, run_options: RunOptions) -> None: try: with open(run_options_path, "w") as f: try: - yaml.dump(dict(run_options._asdict()), f, sort_keys=False) + yaml.dump(cattr.unstructure(run_options), f, sort_keys=False) except TypeError: # Older versions of pyyaml don't support sort_keys - yaml.dump(dict(run_options._asdict()), f) + yaml.dump(cattr.unstructure(run_options), f) except FileNotFoundError: logger.warning( f"Unable to save configuration to {run_options_path}. Make sure the directory exists" @@ -569,22 +298,20 @@ def run_cli(options: RunOptions) -> None: logging_util.set_log_level(log_level) logger.debug("Configuration for this run:") - logger.debug(json.dumps(options._asdict(), indent=4)) + logger.debug(json.dumps(attr.asdict(options), indent=4)) # Options deprecation warnings - if options.load_model: + if options.checkpoint_settings.load_model: logger.warning( "The --load option has been deprecated. Please use the --resume option instead." ) - if options.train_model: + if options.checkpoint_settings.train_model: logger.warning( "The --train option has been deprecated. Train mode is now the default. Use " "--inference to run in inference mode." ) - run_seed = options.seed - if options.cpu: - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + run_seed = options.env_settings.seed # Add some timer metadata add_timer_metadata("mlagents_version", mlagents.trainers.__version__) @@ -592,7 +319,7 @@ def run_cli(options: RunOptions) -> None: add_timer_metadata("communication_protocol_version", UnityEnvironment.API_VERSION) add_timer_metadata("tensorflow_version", tf_utils.tf.__version__) - if options.seed == -1: + if options.env_settings.seed == -1: run_seed = np.random.randint(0, 10000) run_training(run_seed, options) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py new file mode 100644 index 0000000000..9ff09c871f --- /dev/null +++ b/ml-agents/mlagents/trainers/settings.py @@ -0,0 +1,106 @@ +import attr +import cattr +from typing import Dict, Optional, List, Any +import argparse + +from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser +from mlagents.trainers.trainer_util import load_config +from mlagents.trainers.exception import TrainerConfigError + + +@attr.s(auto_attribs=True) +class CheckpointSettings: + save_freq: int = parser.get_default("save_freq") + keep_checkpoints: int = parser.get_default("keep_checkpoints") + run_id: str = parser.get_default("run_id") + initialize_from: str = parser.get_default("initialize_from") + load_model: bool = parser.get_default("load_model") + resume: bool = parser.get_default("resume") + force: bool = parser.get_default("force") + train_model: bool = parser.get_default("train_model") + inference: bool = parser.get_default("inference") + lesson: int = parser.get_default("lesson") + + +@attr.s(auto_attribs=True) +class EnvironmentSettings: + env_path: Optional[str] = parser.get_default("env_path") + env_args: Optional[List[str]] = parser.get_default("env_args") + base_port: int = parser.get_default("base_port") + num_envs: int = parser.get_default("num_envs") + seed: int = parser.get_default("seed") + + +@attr.s(auto_attribs=True) +class EngineSettings: + width: int = parser.get_default("width") + height: int = parser.get_default("height") + quality_level: int = parser.get_default("quality_level") + time_scale: float = parser.get_default("time_scale") + 
target_frame_rate: int = parser.get_default("target_frame_rate") + capture_frame_rate: int = parser.get_default("capture_frame_rate") + no_graphics: bool = parser.get_default("no_graphics") + + +@attr.s(auto_attribs=True) +class RunOptions: + behaviors: Dict[str, Dict] + env_settings: EnvironmentSettings = EnvironmentSettings() + engine_settings: EngineSettings = EngineSettings() + environment_settings: EnvironmentSettings = EnvironmentSettings() + parameter_randomization: Optional[Dict] = None + curriculum_config: Optional[Dict] = None + checkpoint_settings: CheckpointSettings = CheckpointSettings() + + # These are options that are relevant to the run itself, and not the engine or environment. + # They will be left here. + debug: bool = parser.get_default("debug") + multi_gpu: bool = False + + @staticmethod + def from_argparse(args: argparse.Namespace) -> "RunOptions": + """ + Takes an argparse.Namespace as specified in `parse_command_line`, loads input configuration files + from file paths, and converts to a RunOptions instance. + :param args: collection of command-line parameters passed to mlagents-learn + :return: RunOptions representing the passed in arguments, with trainer config, curriculum and sampler + configs loaded from files. + """ + argparse_args = vars(args) + config_path = StoreConfigFile.trainer_config_path + + # Load YAML + configured_dict: Dict[str, Any] = { + "checkpoint_settings": {}, + "env_settings": {}, + "engine_settings": {}, + } + configured_dict.update(load_config(config_path)) + # This is the only option that is not optional and has no defaults. + if "behaviors" not in configured_dict: + raise TrainerConfigError( + "Trainer configurations not found. Make sure your YAML file has a section for behaviors." + ) + # Use the YAML file values for all values not specified in the CLI. + for key in configured_dict.keys(): + # Detect bad config options + if key not in attr.fields_dict(RunOptions): + raise TrainerConfigError( + "The option {} was specified in your YAML file, but is invalid.".format( + key + ) + ) + # Override with CLI args + # Keep deprecated --load working, TODO: remove + argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"] + for key, val in argparse_args.items(): + if key in DetectDefault.non_default_args: + if key in attr.fields_dict(CheckpointSettings): + configured_dict["checkpoint_settings"][key] = val + elif key in attr.fields_dict(EnvironmentSettings): + configured_dict["env_settings"][key] = val + elif key in attr.fields_dict(EngineSettings): + configured_dict["engine_settings"][key] = val + else: # Base options + configured_dict[key] = val + return cattr.structure(configured_dict, RunOptions) diff --git a/ml-agents/setup.py b/ml-agents/setup.py index 08e03dc73c..22143f4e03 100644 --- a/ml-agents/setup.py +++ b/ml-agents/setup.py @@ -63,6 +63,8 @@ def run(self): "protobuf>=3.6", "pyyaml", "tensorflow>=1.7,<3.0", + "cattr>=1.0.0", + "attr>=19.3.0", 'pypiwin32==223;platform_system=="Windows"', # We don't actually need six, but tensorflow does, and pip seems # to get confused and install the wrong version. 
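
The patch above replaces the NamedTuple-based RunOptions with nested attrs classes and uses cattr to build them from the merged CLI/YAML dictionary. A trimmed-down, self-contained sketch of that cattr round trip — only a few of the fields from settings.py, with illustrative values rather than real ml-agents configuration — is:

import attr
import cattr
from typing import Optional


@attr.s(auto_attribs=True)
class EngineSettings:
    width: int = 84
    height: int = 84
    no_graphics: bool = False


@attr.s(auto_attribs=True)
class RunOptions:
    run_id: str = "ppo"
    env_path: Optional[str] = None
    engine_settings: EngineSettings = EngineSettings()


# A nested plain dict, e.g. YAML values merged with non-default CLI args.
configured_dict = {
    "run_id": "3DBall-test",
    "engine_settings": {"width": 640, "height": 480},
}
# cattr recursively instantiates the attrs classes; keys that are missing
# fall back to the field defaults declared above.
options = cattr.structure(configured_dict, RunOptions)
assert options.engine_settings.width == 640 and options.env_path is None
# unstructure() round-trips back to plain dicts, which is what
# write_run_options() dumps to YAML.
print(cattr.unstructure(options))
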
From 124d777780114b3607f295a5fb4af41e9254b70f Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 30 Apr 2020 11:34:33 -0700 Subject: [PATCH 02/54] Add example of strict type conversion --- ml-agents/mlagents/trainers/settings.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 9ff09c871f..dcf86a2087 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -8,6 +8,13 @@ from mlagents.trainers.exception import TrainerConfigError +def strict_to_cls(d, t): + if d is None: + return None + + return t(**d) + + @attr.s(auto_attribs=True) class CheckpointSettings: save_freq: int = parser.get_default("save_freq") @@ -47,7 +54,6 @@ class RunOptions: behaviors: Dict[str, Dict] env_settings: EnvironmentSettings = EnvironmentSettings() engine_settings: EngineSettings = EngineSettings() - environment_settings: EnvironmentSettings = EnvironmentSettings() parameter_randomization: Optional[Dict] = None curriculum_config: Optional[Dict] = None checkpoint_settings: CheckpointSettings = CheckpointSettings() From 7b39baabbe493d859a165606195a85396541c123 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 30 Apr 2020 11:47:04 -0700 Subject: [PATCH 03/54] Recursively apply cattr with being strict --- ml-agents/mlagents/trainers/settings.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index dcf86a2087..3f89aed04a 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -11,8 +11,17 @@ def strict_to_cls(d, t): if d is None: return None - - return t(**d) + d_copy = {} + d_copy.update(d) + for key, val in d_copy.items(): + _fd = attr.fields_dict(t) + if key not in attr.fields_dict(t): + raise TrainerConfigError( + f"The option {key} was specified in your YAML file for {t.__name__}, but is invalid." + ) + # Apply cattr structure to the values + d_copy[key] = cattr.structure(val, _fd[key].type) + return t(**d_copy) @attr.s(auto_attribs=True) @@ -62,6 +71,10 @@ class RunOptions: # They will be left here. 
debug: bool = parser.get_default("debug") multi_gpu: bool = False + # Strict conversion + cattr.register_structure_hook(EnvironmentSettings, strict_to_cls) + cattr.register_structure_hook(EngineSettings, strict_to_cls) + cattr.register_structure_hook(CheckpointSettings, strict_to_cls) @staticmethod def from_argparse(args: argparse.Namespace) -> "RunOptions": From b5121affb5d700d4d40d3c9c0b92e7794acdd00b Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 1 May 2020 11:35:50 -0700 Subject: [PATCH 04/54] PPO trains --- config/ppo/3DBall.yaml | 52 ++++--- ml-agents/mlagents/trainers/cli_utils.py | 32 +++- ml-agents/mlagents/trainers/learn.py | 4 +- .../trainers/optimizer/tf_optimizer.py | 4 +- .../mlagents/trainers/policy/nn_policy.py | 11 +- .../mlagents/trainers/policy/tf_policy.py | 39 +++-- ml-agents/mlagents/trainers/ppo/optimizer.py | 31 ++-- ml-agents/mlagents/trainers/ppo/trainer.py | 17 +- ml-agents/mlagents/trainers/settings.py | 147 ++++++++++++++++-- .../mlagents/trainers/trainer/rl_trainer.py | 12 +- .../mlagents/trainers/trainer/trainer.py | 13 +- .../mlagents/trainers/trainer_controller.py | 3 +- ml-agents/mlagents/trainers/trainer_util.py | 83 ++-------- 13 files changed, 277 insertions(+), 171 deletions(-) diff --git a/config/ppo/3DBall.yaml b/config/ppo/3DBall.yaml index 9f2767f73d..fffdd4efeb 100644 --- a/config/ppo/3DBall.yaml +++ b/config/ppo/3DBall.yaml @@ -1,25 +1,31 @@ behaviors: 3DBall: - trainer: ppo - batch_size: 64 - beta: 0.001 - buffer_size: 12000 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.99 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 5.0e5 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 12000 - use_recurrent: false - vis_encode_type: simple - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 + trainer_type: ppo + + # hyperparameters: + # batch_size: 64 + # beta: 0.001 + # buffer_size: 12000 + # epsilon: 0.2 + # lambd: 0.99 + # learning_rate: 0.0003 + # learning_rate_schedule: linear + # max_steps: 5.0e5 + # num_epoch: 3 + + # time_horizon: 1000 + # sequence_length: 64 + # summary_freq: 12000 + network_settings: + num_layers: 2 + normalize: true + hidden_units: 128 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + time_horizon: 1000 + sequence_length: 64 + summary_freq: 12000 + + diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index df1dd7d9af..c2112ab454 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -1,4 +1,7 @@ -from typing import Set +from typing import Set, Dict, Any, TextIO +import os +import yaml +from mlagents.trainers.exception import TrainerConfigError from mlagents_envs.environment import UnityEnvironment import argparse @@ -246,4 +249,31 @@ def _create_parser() -> argparse.ArgumentParser: return argparser +def load_config(config_path: str) -> Dict[str, Any]: + try: + with open(config_path) as data_file: + return _load_config(data_file) + except IOError: + abs_path = os.path.abspath(config_path) + raise TrainerConfigError(f"Config file could not be found at {abs_path}.") + except UnicodeDecodeError: + raise TrainerConfigError( + f"There was an error decoding Config file from {config_path}. " + f"Make sure your file is save using UTF-8" + ) + + +def _load_config(fp: TextIO) -> Dict[str, Any]: + """ + Load the yaml config from the file-like object. 
+ """ + try: + return yaml.safe_load(fp) + except yaml.parser.ParserError as e: + raise TrainerConfigError( + "Error parsing yaml file. Please check for formatting errors. " + "A tool such as http://www.yamllint.com/ can be helpful with this." + ) from e + + parser = _create_parser() diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index b0febc1c66..cdb9e87503 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -6,7 +6,6 @@ import json from typing import Callable, Optional, List, Dict -import attr import cattr import mlagents.trainers @@ -298,7 +297,8 @@ def run_cli(options: RunOptions) -> None: logging_util.set_log_level(log_level) logger.debug("Configuration for this run:") - logger.debug(json.dumps(attr.asdict(options), indent=4)) + print(options) + logger.debug(json.dumps(cattr.unstructure(options), indent=4)) # Options deprecation warnings if options.checkpoint_settings.load_model: diff --git a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py index ba2f2001ab..392eb78724 100644 --- a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py @@ -18,13 +18,13 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): self.policy = policy self.update_dict: Dict[str, tf.Tensor] = {} self.value_heads: Dict[str, tf.Tensor] = {} - self.create_reward_signals(trainer_params["reward_signals"]) + self.create_reward_signals(trainer_params.reward_signals) self.memory_in: tf.Tensor = None self.memory_out: tf.Tensor = None self.m_size: int = 0 self.bc_module: Optional[BCModule] = None # Create pretrainer if needed - if "behavioral_cloning" in trainer_params: + if trainer_params.behavioral_cloning is not None: BCModule.check_config(trainer_params["behavioral_cloning"]) self.bc_module = BCModule( self.policy, diff --git a/ml-agents/mlagents/trainers/policy/nn_policy.py b/ml-agents/mlagents/trainers/policy/nn_policy.py index 07573a2e08..3b098352ee 100644 --- a/ml-agents/mlagents/trainers/policy/nn_policy.py +++ b/ml-agents/mlagents/trainers/policy/nn_policy.py @@ -6,6 +6,7 @@ from mlagents.trainers.models import EncoderType from mlagents.trainers.models import ModelUtils from mlagents.trainers.policy.tf_policy import TFPolicy +from mlagents.trainers.settings import TrainerSettings from mlagents.trainers.distributions import ( GaussianDistribution, MultiCategoricalDistribution, @@ -19,7 +20,7 @@ def __init__( self, seed: int, brain: BrainParameters, - trainer_params: Dict[str, Any], + trainer_params: TrainerSettings, is_training: bool, load: bool, tanh_squash: bool = False, @@ -42,14 +43,12 @@ def __init__( super().__init__(seed, brain, trainer_params, load) self.grads = None self.update_batch: Optional[tf.Operation] = None - num_layers = trainer_params["num_layers"] - self.h_size = trainer_params["hidden_units"] + num_layers = self.network_settings.num_layers + self.h_size = self.network_settings.hidden_units if num_layers < 1: num_layers = 1 self.num_layers = num_layers - self.vis_encode_type = EncoderType( - trainer_params.get("vis_encode_type", "simple") - ) + self.vis_encode_type = self.network_settings.vis_encode_type self.tanh_squash = tanh_squash self.reparameterize = reparameterize self.condition_sigma_on_obs = condition_sigma_on_obs diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index f24f7acb25..26c7864c2a 100644 --- 
a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -12,6 +12,8 @@ from mlagents.trainers.brain_conversion_utils import get_global_agent_id from mlagents_envs.base_env import DecisionSteps from mlagents.trainers.models import ModelUtils +from mlagents.trainers.settings import TrainerSettings, NetworkSettings +from mlagents.trainers.brain import BrainParameters logger = get_logger(__name__) @@ -31,7 +33,13 @@ class TFPolicy(Policy): functions to save/load models and create the input placeholders. """ - def __init__(self, seed, brain, trainer_parameters, load=False): + def __init__( + self, + seed: int, + brain: BrainParameters, + trainer_parameters: TrainerSettings, + load: bool = False, + ): """ Initialized the policy. :param seed: Random seed to use for TensorFlow. @@ -40,13 +48,14 @@ def __init__(self, seed, brain, trainer_parameters, load=False): """ self._version_number_ = 2 self.m_size = 0 - + self.trainer_parameters = trainer_parameters + self.network_settings: NetworkSettings = trainer_parameters.network_settings # for ghost trainer save/load snapshots - self.assign_phs = [] - self.assign_ops = [] + self.assign_phs: List[tf.Tensor] = [] + self.assign_ops: List[tf.Operation] = [] - self.inference_dict = {} - self.update_dict = {} + self.inference_dict: Dict[str, tf.Tensor] = {} + self.update_dict: Dict[str, tf.Tensor] = {} self.sequence_length = 1 self.seed = seed self.brain = brain @@ -55,26 +64,26 @@ def __init__(self, seed, brain, trainer_parameters, load=False): self.vec_obs_size = brain.vector_observation_space_size self.vis_obs_size = brain.number_visual_observations - self.use_recurrent = trainer_parameters["use_recurrent"] + self.use_recurrent = self.network_settings.memory is not None self.memory_dict: Dict[str, np.ndarray] = {} self.num_branches = len(self.brain.vector_action_space_size) self.previous_action_dict: Dict[str, np.array] = {} - self.normalize = trainer_parameters.get("normalize", False) + self.normalize = self.network_settings.normalize self.use_continuous_act = brain.vector_action_space_type == "continuous" if self.use_continuous_act: self.num_branches = self.brain.vector_action_space_size[0] - self.model_path = trainer_parameters["output_path"] - self.initialize_path = trainer_parameters.get("init_path", None) - self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5) + self.model_path = self.trainer_parameters.output_path + self.initialize_path = self.trainer_parameters.init_path + self.keep_checkpoints = self.trainer_parameters.keep_checkpoints self.graph = tf.Graph() self.sess = tf.Session( config=tf_utils.generate_session_config(), graph=self.graph ) - self.saver = None + self.saver: Optional[tf.Operation] = None self.seed = seed - if self.use_recurrent: - self.m_size = trainer_parameters["memory_size"] - self.sequence_length = trainer_parameters["sequence_length"] + if self.network_settings.memory is not None: + self.m_size = self.network_settings.memory.memory_size + self.sequence_length = self.network_settings.memory.sequence_length if self.m_size == 0: raise UnityPolicyException( "The memory size for brain {0} is 0 even " diff --git a/ml-agents/mlagents/trainers/ppo/optimizer.py b/ml-agents/mlagents/trainers/ppo/optimizer.py index 2151cf5707..42751c3c0d 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer.py @@ -2,14 +2,15 @@ import numpy as np from mlagents.tf_utils import tf from mlagents_envs.timers import timed -from 
mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule +from mlagents.trainers.models import ModelUtils, EncoderType from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import TrainerSettings, PPOSettings class PPOOptimizer(TFOptimizer): - def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): + def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): """ Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy. The PPO optimizer has a value estimator and a loss function. @@ -22,20 +23,18 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): with policy.graph.as_default(): with tf.variable_scope("optimizer/"): super().__init__(policy, trainer_params) - - lr = float(trainer_params["learning_rate"]) - lr_schedule = LearningRateSchedule( - trainer_params.get("learning_rate_schedule", "linear") - ) - h_size = int(trainer_params["hidden_units"]) - epsilon = float(trainer_params["epsilon"]) - beta = float(trainer_params["beta"]) - max_step = float(trainer_params["max_steps"]) - num_layers = int(trainer_params["num_layers"]) - vis_encode_type = EncoderType( - trainer_params.get("vis_encode_type", "simple") - ) - self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0)) + hyperparameters: PPOSettings = trainer_params.hyperparameters + lr = float(hyperparameters.learning_rate) + lr_schedule = hyperparameters.learning_rate_schedule + epsilon = float(hyperparameters.epsilon) + beta = float(hyperparameters.beta) + max_step = float(trainer_params.max_steps) + + policy_network_settings = policy.network_settings + h_size = int(policy_network_settings.hidden_units) + num_layers = policy_network_settings.num_layers + vis_encode_type = policy_network_settings.vis_encode_type + self.burn_in_ratio = 0.0 self.stream_names = list(self.reward_signals.keys()) diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 7b6e6f12d1..085f05381a 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -15,6 +15,7 @@ from mlagents.trainers.trajectory import Trajectory from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings, PPOSettings logger = get_logger(__name__) @@ -27,7 +28,7 @@ def __init__( self, brain_name: str, reward_buff_cap: int, - trainer_parameters: dict, + trainer_parameters: TrainerSettings, training: bool, load: bool, seed: int, @@ -66,7 +67,7 @@ def __init__( "output_path", "reward_signals", ] - self._check_param_keys() + self.hyperparameters: PPOSettings = self.trainer_parameters.hyperparameters self.load = load self.seed = seed self.policy: NNPolicy = None # type: ignore @@ -139,7 +140,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: value_estimates=local_value_estimates, value_next=bootstrap_value, gamma=self.optimizer.reward_signals[name].gamma, - lambd=self.trainer_parameters["lambd"], + lambd=self.hyperparameters.lambd, ) local_return = local_advantage + local_value_estimates # This is later use as target for the different value estimates @@ -170,7 +171,7 @@ def _is_ready_update(self): :return: A boolean corresponding to whether or not update_model() can be run """ size_of_buffer = 
self.update_buffer.num_experiences - return size_of_buffer > self.trainer_parameters["buffer_size"] + return size_of_buffer > self.hyperparameters.buffer_size def _update_policy(self): """ @@ -183,21 +184,21 @@ def _update_policy(self): # Make sure batch_size is a multiple of sequence length. During training, we # will need to reshape the data into a batch_size x sequence_length tensor. batch_size = ( - self.trainer_parameters["batch_size"] - - self.trainer_parameters["batch_size"] % self.policy.sequence_length + self.hyperparameters.batch_size + - self.hyperparameters.batch_size % self.policy.sequence_length ) # Make sure there is at least one sequence batch_size = max(batch_size, self.policy.sequence_length) n_sequences = max( - int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1 + int(self.hyperparameters.batch_size / self.policy.sequence_length), 1 ) advantages = self.update_buffer["advantages"].get_batch() self.update_buffer["advantages"].set( (advantages - advantages.mean()) / (advantages.std() + 1e-10) ) - num_epoch = self.trainer_parameters["num_epoch"] + num_epoch = self.hyperparameters.num_epoch batch_update_stats = defaultdict(list) for _ in range(num_epoch): self.update_buffer.shuffle(sequence_length=self.policy.sequence_length) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 3f89aed04a..d7f5d5a298 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -1,29 +1,150 @@ import attr import cattr -from typing import Dict, Optional, List, Any +from typing import Dict, Optional, List, Any, DefaultDict, Mapping +from enum import Enum +import collections import argparse from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser -from mlagents.trainers.trainer_util import load_config +from mlagents.trainers.cli_utils import load_config from mlagents.trainers.exception import TrainerConfigError +from mlagents.trainers.models import LearningRateSchedule -def strict_to_cls(d, t): +def check_and_structure(key: str, value: Any, class_type: type) -> Any: + attr_fields_dict = attr.fields_dict(class_type) + if key not in attr_fields_dict: + raise TrainerConfigError( + f"The option {key} was specified in your YAML file for {class_type.__name__}, but is invalid." + ) + # Apply cattr structure to the values + return cattr.structure(value, attr_fields_dict[key].type) + + +def strict_to_cls(d: Mapping, t: type) -> Any: if d is None: return None - d_copy = {} + d_copy: Dict[str, Any] = {} d_copy.update(d) for key, val in d_copy.items(): - _fd = attr.fields_dict(t) - if key not in attr.fields_dict(t): - raise TrainerConfigError( - f"The option {key} was specified in your YAML file for {t.__name__}, but is invalid." - ) - # Apply cattr structure to the values - d_copy[key] = cattr.structure(val, _fd[key].type) + d_copy[key] = check_and_structure(key, val, t) return t(**d_copy) +def trainer_settings_to_cls(d: Mapping, t: type) -> Any: + if d is None: + return None + d_copy: Dict[str, Any] = {} + d_copy.update(d) + + for key, val in d_copy.items(): + if key == "hyperparameters": + if "trainer_type" not in d_copy: + raise TrainerConfigError( + "Hyperparameters were specified but no trainer_type was given." 
+ ) + else: + d_copy[key] = strict_to_cls( + d_copy[key], TrainerSettings.to_settings(d_copy["trainer_type"]) + ) + else: + d_copy[key] = check_and_structure(key, val, t) + return t(**d_copy) + + +@attr.s(auto_attribs=True) +class NetworkSettings: + @attr.s(auto_attribs=True) + class MemorySettings: + sequence_length: int = 64 + memory_size: int = 128 + + normalize: bool = False + hidden_units: int = 3 + num_layers: int = 2 + vis_encode_type: str = "simple" + memory: Optional[MemorySettings] = None + + +@attr.s(auto_attribs=True) +class HyperparamSettings: + pass + + +@attr.s(auto_attribs=True) +class PPOSettings(HyperparamSettings): + batch_size: int = 1024 + beta: float = 5.0e-3 + buffer_size: int = 10240 + epsilon: float = 0.2 + lambd: float = 0.95 + learning_rate: float = 3.0e-4 + num_epoch: int = 3 + learning_rate_schedule: LearningRateSchedule = LearningRateSchedule.CONSTANT + + +@attr.s(auto_attribs=True) +class SACSettings(HyperparamSettings): + batch_size: int = 1024 + beta: float = 5.0e-3 + buffer_size: int = 10240 + epsilon: float = 0.2 + lambd: float = 0.95 + learning_rate: float = 3.0e-4 + learning_rate_schedule: LearningRateSchedule = LearningRateSchedule.CONSTANT + + +@attr.s(auto_attribs=True) +class RewardSignalSettings: + gamma: float = 0.99 + strength: float = 1.0 + + +@attr.s(auto_attribs=True) +class SelfPlaySettings: + hi: int = 0 + + +@attr.s(auto_attribs=True) +class TrainerSettings: + # Edit these two fields to add new trainers # + class TrainerType(Enum): + PPO: str = "ppo" + SAC: str = "sac" + + @staticmethod + def to_settings(ttype: TrainerType) -> type: + _mapping = { + TrainerSettings.TrainerType.PPO: PPOSettings, + TrainerSettings.TrainerType.SAC: SACSettings, + } + return _mapping[ttype] + + ############################################### + + trainer_type: TrainerType = TrainerType.PPO + hyperparameters: HyperparamSettings = attr.ib() + + @hyperparameters.default + def _set_default_hyperparameters(self): + return TrainerSettings.to_settings(self.trainer_type)() + + network_settings: NetworkSettings = NetworkSettings() + reward_signals: Dict[str, Dict] = { + "extrinsic": cattr.unstructure(RewardSignalSettings()) + } + init_path: Optional[str] = None + output_path: str = "default" + # TODO: Remove parser default and remove from CLI + keep_checkpoints: int = parser.get_default("keep_checkpoints") + max_steps: int = 500000 + time_horizon: int = 64 + summary_freq: int = 50000 + threaded: bool = False + self_play: Optional[SelfPlaySettings] = None + behavioral_cloning: Optional[SelfPlaySettings] = None + + @attr.s(auto_attribs=True) class CheckpointSettings: save_freq: int = parser.get_default("save_freq") @@ -60,7 +181,9 @@ class EngineSettings: @attr.s(auto_attribs=True) class RunOptions: - behaviors: Dict[str, Dict] + behaviors: DefaultDict[str, TrainerSettings] = attr.ib( + default=attr.Factory(lambda: collections.defaultdict(TrainerSettings)) + ) env_settings: EnvironmentSettings = EnvironmentSettings() engine_settings: EngineSettings = EngineSettings() parameter_randomization: Optional[Dict] = None diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index a2443f2704..d9b077af8b 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -2,12 +2,12 @@ from typing import Dict, List from collections import defaultdict import abc +import cattr import time from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer from 
mlagents.trainers.buffer import AgentBuffer from mlagents.trainers.trainer import Trainer -from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.components.reward_signals import RewardSignalResult from mlagents_envs.timers import hierarchical_timer from mlagents.trainers.agent_processor import AgentManagerQueue @@ -24,13 +24,6 @@ class RLTrainer(Trainer): # pylint: disable=abstract-method def __init__(self, *args, **kwargs): super(RLTrainer, self).__init__(*args, **kwargs) - # Make sure we have at least one reward_signal - if not self.trainer_parameters["reward_signals"]: - raise UnityTrainerException( - "No reward signals were defined. At least one must be used with {}.".format( - self.__class__.__name__ - ) - ) # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward # used for reporting only. We always want to report the environment reward to Tensorboard, regardless # of what reward signals are actually present. @@ -40,7 +33,8 @@ def __init__(self, *args, **kwargs): } self.update_buffer: AgentBuffer = AgentBuffer() self._stats_reporter.add_property( - StatsPropertyType.HYPERPARAMETERS, self.trainer_parameters + StatsPropertyType.HYPERPARAMETERS, + cattr.unstructure(self.trainer_parameters), ) def end_episode(self) -> None: diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py index 735b44b8e7..ffeeb2b05a 100644 --- a/ml-agents/mlagents/trainers/trainer/trainer.py +++ b/ml-agents/mlagents/trainers/trainer/trainer.py @@ -1,5 +1,5 @@ # # Unity ML-Agents Toolkit -from typing import Dict, List, Deque, Any +from typing import List, Deque import abc from collections import deque @@ -14,6 +14,7 @@ from mlagents.trainers.policy import Policy from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings logger = get_logger(__name__) @@ -25,7 +26,7 @@ class Trainer(abc.ABC): def __init__( self, brain_name: str, - trainer_parameters: dict, + trainer_parameters: TrainerSettings, training: bool, run_id: str, reward_buff_cap: int = 1, @@ -42,14 +43,14 @@ def __init__( self.brain_name = brain_name self.run_id = run_id self.trainer_parameters = trainer_parameters - self._threaded = trainer_parameters.get("threaded", True) + self._threaded = trainer_parameters.threaded self._stats_reporter = StatsReporter(brain_name) self.is_training = training self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap) self.policy_queues: List[AgentManagerQueue[Policy]] = [] self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = [] self.step: int = 0 - self.summary_freq = self.trainer_parameters["summary_freq"] + self.summary_freq = self.trainer_parameters.summary_freq self.next_summary_step = self.summary_freq @property @@ -68,7 +69,7 @@ def _check_param_keys(self): ) @property - def parameters(self) -> Dict[str, Any]: + def parameters(self) -> TrainerSettings: """ Returns the trainer parameters of the trainer. """ @@ -80,7 +81,7 @@ def get_max_steps(self) -> int: Returns the maximum number of steps. Is used to know when the trainer should be stopped. 
:return: The maximum number of steps of the trainer """ - return int(float(self.trainer_parameters["max_steps"])) + return int(float(self.trainer_parameters.max_steps)) @property def get_step(self) -> int: diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py index 0bac684c7a..61387b1ca9 100644 --- a/ml-agents/mlagents/trainers/trainer_controller.py +++ b/ml-agents/mlagents/trainers/trainer_controller.py @@ -3,7 +3,6 @@ """Launches trainers for each External Brains in a Unity Environment.""" import os -import sys import threading from typing import Dict, Optional, Set, List from collections import defaultdict @@ -184,7 +183,7 @@ def _create_trainer_and_manager( policy, name_behavior_id, trainer.stats_reporter, - trainer.parameters.get("time_horizon", sys.maxsize), + trainer.parameters.time_horizon, threaded=trainer.threaded, ) env_manager.set_agent_manager(name_behavior_id, agent_manager) diff --git a/ml-agents/mlagents/trainers/trainer_util.py b/ml-agents/mlagents/trainers/trainer_util.py index b02d92be34..07ac0abaed 100644 --- a/ml-agents/mlagents/trainers/trainer_util.py +++ b/ml-agents/mlagents/trainers/trainer_util.py @@ -1,6 +1,5 @@ import os -import yaml -from typing import Any, Dict, TextIO +from typing import Any, Dict from mlagents_envs.logging_util import get_logger from mlagents.trainers.meta_curriculum import MetaCurriculum @@ -11,6 +10,7 @@ from mlagents.trainers.sac.trainer import SACTrainer from mlagents.trainers.ghost.trainer import GhostTrainer from mlagents.trainers.ghost.controller import GhostController +from mlagents.trainers.settings import TrainerSettings logger = get_logger(__name__) @@ -44,7 +44,7 @@ def __init__( def generate(self, brain_name: str) -> Trainer: return initialize_trainer( - self.trainer_config, + self.trainer_config[brain_name], brain_name, self.run_id, self.output_path, @@ -60,7 +60,7 @@ def generate(self, brain_name: str) -> Trainer: def initialize_trainer( - trainer_config: Any, + trainer_config: TrainerSettings, brain_name: str, run_id: str, output_path: str, @@ -90,27 +90,10 @@ def initialize_trainer( :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer :return: """ - if "default" not in trainer_config and brain_name not in trainer_config: - raise TrainerConfigError( - f'Trainer config must have either a "default" section, or a section for the brain name {brain_name}. ' - "See the config/ directory for examples." 
- ) - - trainer_parameters = trainer_config.get("default", {}).copy() - trainer_parameters["output_path"] = os.path.join(output_path, brain_name) - if init_path is not None: - trainer_parameters["init_path"] = os.path.join(init_path, brain_name) - trainer_parameters["keep_checkpoints"] = keep_checkpoints - if brain_name in trainer_config: - _brain_key: Any = brain_name - while not isinstance(trainer_config[_brain_key], dict): - _brain_key = trainer_config[_brain_key] - trainer_parameters.update(trainer_config[_brain_key]) - + print(trainer_config) + trainer_config.output_path = os.path.join(output_path, brain_name) if init_path is not None: - trainer_parameters["init_path"] = "{basedir}/{name}".format( - basedir=init_path, name=brain_name - ) + trainer_config.init_path = os.path.join(init_path, brain_name) min_lesson_length = 1 if meta_curriculum: @@ -125,84 +108,46 @@ def initialize_trainer( ) trainer: Trainer = None # type: ignore # will be set to one of these, or raise - if "trainer" not in trainer_parameters: - raise TrainerConfigError( - f'The "trainer" key must be set in your trainer config for brain {brain_name} (or the default brain).' - ) - trainer_type = trainer_parameters["trainer"] + trainer_type = trainer_config.trainer_type - if trainer_type == "offline_bc": - raise UnityTrainerException( - "The offline_bc trainer has been removed. To train with demonstrations, " - "please use a PPO or SAC trainer with the GAIL Reward Signal and/or the " - "Behavioral Cloning feature enabled." - ) - elif trainer_type == "ppo": + if trainer_type == TrainerSettings.TrainerType.PPO: trainer = PPOTrainer( brain_name, min_lesson_length, - trainer_parameters, + trainer_config, train_model, load_model, seed, run_id, ) - elif trainer_type == "sac": + elif trainer_type == TrainerSettings.TrainerType.SAC: trainer = SACTrainer( brain_name, min_lesson_length, - trainer_parameters, + trainer_config, train_model, load_model, seed, run_id, ) - else: raise TrainerConfigError( f'The trainer config contains an unknown trainer type "{trainer_type}" for brain {brain_name}' ) - if "self_play" in trainer_parameters: + if trainer_config.self_play is not None: trainer = GhostTrainer( trainer, brain_name, ghost_controller, min_lesson_length, - trainer_parameters, + trainer_config, train_model, run_id, ) return trainer -def load_config(config_path: str) -> Dict[str, Any]: - try: - with open(config_path) as data_file: - return _load_config(data_file) - except IOError: - abs_path = os.path.abspath(config_path) - raise TrainerConfigError(f"Config file could not be found at {abs_path}.") - except UnicodeDecodeError: - raise TrainerConfigError( - f"There was an error decoding Config file from {config_path}. " - f"Make sure your file is save using UTF-8" - ) - - -def _load_config(fp: TextIO) -> Dict[str, Any]: - """ - Load the yaml config from the file-like object. - """ - try: - return yaml.safe_load(fp) - except yaml.parser.ParserError as e: - raise TrainerConfigError( - "Error parsing yaml file. Please check for formatting errors. " - "A tool such as http://www.yamllint.com/ can be helpful with this." - ) from e - - def assemble_curriculum_config(trainer_config: Dict[str, Any]) -> Dict[str, Any]: """ Assembles a curriculum config Dict from a trainer config. 
The resulting From 3dfe312a7d8788671b1a4aebdff100efec0bee12 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 1 May 2020 17:59:02 -0700 Subject: [PATCH 05/54] Use new settings for BC module --- .../mlagents/trainers/components/bc/module.py | 39 +++++-------------- .../trainers/optimizer/tf_optimizer.py | 10 ++--- ml-agents/mlagents/trainers/settings.py | 26 ++++++++----- 3 files changed, 32 insertions(+), 43 deletions(-) diff --git a/ml-agents/mlagents/trainers/components/bc/module.py b/ml-agents/mlagents/trainers/components/bc/module.py index cfd3ccdfe9..b8ef9baec1 100644 --- a/ml-agents/mlagents/trainers/components/bc/module.py +++ b/ml-agents/mlagents/trainers/components/bc/module.py @@ -4,22 +4,17 @@ from mlagents.trainers.policy.tf_policy import TFPolicy from .model import BCModel from mlagents.trainers.demo_loader import demo_to_buffer -from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import BehavioralCloningSettings class BCModule: def __init__( self, policy: TFPolicy, + settings: BehavioralCloningSettings, policy_learning_rate: float, default_batch_size: int, default_num_epoch: int, - strength: float, - demo_path: str, - steps: int, - batch_size: int = None, - num_epoch: int = None, - samples_per_update: int = 0, ): """ A BC trainer that can be used inline with RL. @@ -36,14 +31,16 @@ def __init__( :param samples_per_update: Maximum number of samples to train on during each BC update. """ self.policy = policy - self.current_lr = policy_learning_rate * strength - self.model = BCModel(policy, self.current_lr, steps) + self.current_lr = policy_learning_rate * settings.strength + self.model = BCModel(policy, self.current_lr, settings.steps) _, self.demonstration_buffer = demo_to_buffer( - demo_path, policy.sequence_length, policy.brain + settings.demo_path, policy.sequence_length, policy.brain ) - self.batch_size = batch_size if batch_size else default_batch_size - self.num_epoch = num_epoch if num_epoch else default_num_epoch + self.batch_size = ( + settings.batch_size if settings.batch_size else default_batch_size + ) + self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch self.n_sequences = max( min(self.batch_size, self.demonstration_buffer.num_experiences) // policy.sequence_length, @@ -52,29 +49,13 @@ def __init__( self.has_updated = False self.use_recurrent = self.policy.use_recurrent - self.samples_per_update = samples_per_update + self.samples_per_update = settings.samples_per_update self.out_dict = { "loss": self.model.loss, "update": self.model.update_batch, "learning_rate": self.model.annealed_learning_rate, } - @staticmethod - def check_config(config_dict: Dict[str, Any]) -> None: - """ - Check the behavioral_cloning config for the required keys. - :param config_dict: Pretraining section of trainer_config - """ - param_keys = ["strength", "demo_path", "steps"] - for k in param_keys: - if k not in config_dict: - raise UnityTrainerException( - "The required pre-training hyper-parameter {0} was not defined. Please check your \ - trainer YAML file.".format( - k - ) - ) - def update(self) -> Dict[str, Any]: """ Updates model using buffer. 
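
For illustration, a minimal sketch of how a behavioral_cloning config section is expected to become a typed settings object under this change. The demo path and numbers are placeholders, and the conversion relies on cattr's stock support for attrs classes:

    import cattr
    from mlagents.trainers.settings import BehavioralCloningSettings

    # Placeholder behavioral_cloning section, already parsed from YAML into a dict.
    bc_dict = {"demo_path": "demos/ExpertPyramid.demo", "strength": 0.5, "steps": 150000}
    bc_settings = cattr.structure(bc_dict, BehavioralCloningSettings)

    # Omitted fields keep their attrs defaults, so BCModule can still fall back
    # to the trainer's own batch_size and num_epoch when these are None.
    assert bc_settings.batch_size is None and bc_settings.num_epoch is None
    assert bc_settings.samples_per_update == 0
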
diff --git a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py index 392eb78724..2916948165 100644 --- a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py @@ -9,11 +9,12 @@ from mlagents.trainers.components.reward_signals.reward_signal_factory import ( create_reward_signal, ) +from mlagents.trainers.settings import TrainerSettings from mlagents.trainers.components.bc.module import BCModule class TFOptimizer(Optimizer): # pylint: disable=W0223 - def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): + def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): self.sess = policy.sess self.policy = policy self.update_dict: Dict[str, tf.Tensor] = {} @@ -25,13 +26,12 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): self.bc_module: Optional[BCModule] = None # Create pretrainer if needed if trainer_params.behavioral_cloning is not None: - BCModule.check_config(trainer_params["behavioral_cloning"]) self.bc_module = BCModule( self.policy, - policy_learning_rate=trainer_params["learning_rate"], - default_batch_size=trainer_params["batch_size"], + trainer_params.behavioral_cloning, + policy_learning_rate=trainer_params.hyperparameters.learning_rate, + default_batch_size=trainer_params.hyperparameters.batch_size, default_num_epoch=3, - **trainer_params["behavioral_cloning"], ) def get_trajectory_value_estimates( diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index d7f5d5a298..6e2beace5a 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -66,21 +66,31 @@ class MemorySettings: memory: Optional[MemorySettings] = None +@attr.s(auto_attribs=True) +class BehavioralCloningSettings: + demo_path: str + steps: int = 0 + strength: float = 1.0 + samples_per_update: int = 0 + num_epoch: Optional[int] = None + batch_size: Optional[int] = None + + @attr.s(auto_attribs=True) class HyperparamSettings: - pass + batch_size: int = 1024 + buffer_size: int = 10240 + learning_rate: float = 3.0e-4 + learning_rate_schedule: LearningRateSchedule = LearningRateSchedule.CONSTANT @attr.s(auto_attribs=True) class PPOSettings(HyperparamSettings): - batch_size: int = 1024 beta: float = 5.0e-3 - buffer_size: int = 10240 epsilon: float = 0.2 lambd: float = 0.95 - learning_rate: float = 3.0e-4 num_epoch: int = 3 - learning_rate_schedule: LearningRateSchedule = LearningRateSchedule.CONSTANT + learning_rate_schedule: LearningRateSchedule = LearningRateSchedule.LINEAR @attr.s(auto_attribs=True) @@ -90,8 +100,6 @@ class SACSettings(HyperparamSettings): buffer_size: int = 10240 epsilon: float = 0.2 lambd: float = 0.95 - learning_rate: float = 3.0e-4 - learning_rate_schedule: LearningRateSchedule = LearningRateSchedule.CONSTANT @attr.s(auto_attribs=True) @@ -140,9 +148,9 @@ def _set_default_hyperparameters(self): max_steps: int = 500000 time_horizon: int = 64 summary_freq: int = 50000 - threaded: bool = False + threaded: bool = True self_play: Optional[SelfPlaySettings] = None - behavioral_cloning: Optional[SelfPlaySettings] = None + behavioral_cloning: Optional[BehavioralCloningSettings] = None @attr.s(auto_attribs=True) From cd23b0a6c44160e7d1f3eca4d3c9175901b7e681 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 1 May 2020 18:11:20 -0700 Subject: [PATCH 06/54] Use correct enum typing --- ml-agents/mlagents/trainers/settings.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 6e2beace5a..5dbc922a65 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -8,7 +8,7 @@ from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser from mlagents.trainers.cli_utils import load_config from mlagents.trainers.exception import TrainerConfigError -from mlagents.trainers.models import LearningRateSchedule +from mlagents.trainers.models import LearningRateSchedule, EncoderType def check_and_structure(key: str, value: Any, class_type: type) -> Any: @@ -62,7 +62,7 @@ class MemorySettings: normalize: bool = False hidden_units: int = 3 num_layers: int = 2 - vis_encode_type: str = "simple" + vis_encode_type: EncoderType = EncoderType.SIMPLE memory: Optional[MemorySettings] = None From 0ba816d42638d8fbb6c68bafbe01168c7aff9212 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 1 May 2020 18:54:21 -0700 Subject: [PATCH 07/54] SAC now works --- ml-agents/mlagents/trainers/sac/optimizer.py | 38 +++++----- ml-agents/mlagents/trainers/sac/trainer.py | 80 ++++++-------------- ml-agents/mlagents/trainers/settings.py | 13 +++- 3 files changed, 56 insertions(+), 75 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer.py b/ml-agents/mlagents/trainers/sac/optimizer.py index 72494f4fe9..c26a736465 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer.py +++ b/ml-agents/mlagents/trainers/sac/optimizer.py @@ -1,15 +1,16 @@ import numpy as np -from typing import Dict, List, Optional, Any, Mapping +from typing import Dict, List, Optional, Any, Mapping, cast from mlagents.tf_utils import tf from mlagents_envs.logging_util import get_logger from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork -from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils +from mlagents.trainers.models import ModelUtils from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.buffer import AgentBuffer from mlagents_envs.timers import timed +from mlagents.trainers.settings import TrainerSettings, SACSettings EPSILON = 1e-6 # Small value to avoid divide by zero @@ -20,7 +21,7 @@ class SACOptimizer(TFOptimizer): - def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): + def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): """ Takes a Unity environment and model-specific hyper-parameters and returns the appropriate PPO agent model for the environment. 
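
The cast(SACSettings, ...) added in the next hunk is needed because TrainerSettings types its hyperparameters field as the shared HyperparamSettings base class, while a SAC trainer actually carries a SACSettings instance. A rough sketch of that typed access, assuming the SACSettings fields defined in settings.py later in this patch:

    from mlagents.trainers.settings import TrainerSettings, SACSettings

    sac_params = TrainerSettings(
        trainer_type=TrainerSettings.TrainerType.SAC,
        hyperparameters=SACSettings(tau=0.01, buffer_init_steps=1000),
    )
    hyper = sac_params.hyperparameters           # statically a HyperparamSettings
    assert isinstance(hyper, SACSettings)        # concretely SACSettings, hence the cast
    print(hyper.learning_rate)                   # 3e-4, inherited from HyperparamSettings
    print(hyper.reward_signal_steps_per_update)  # defaults to steps_per_update
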
@@ -44,20 +45,24 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): with policy.graph.as_default(): with tf.variable_scope(""): super().__init__(policy, trainer_params) - lr = float(trainer_params["learning_rate"]) - lr_schedule = LearningRateSchedule( - trainer_params.get("learning_rate_schedule", "constant") + hyperparameters: SACSettings = cast( + SACSettings, trainer_params.hyperparameters ) + lr = hyperparameters.learning_rate + lr_schedule = hyperparameters.learning_rate_schedule + max_step = trainer_params.max_steps + self.tau = hyperparameters.tau + self.init_entcoef = hyperparameters.init_entcoef + self.policy = policy - self.act_size = self.policy.act_size - h_size = int(trainer_params["hidden_units"]) - max_step = float(trainer_params["max_steps"]) - num_layers = int(trainer_params["num_layers"]) - vis_encode_type = EncoderType( - trainer_params.get("vis_encode_type", "simple") - ) - self.tau = trainer_params.get("tau", 0.005) - self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0)) + self.act_size = policy.act_size + policy_network_settings = policy.network_settings + h_size = policy_network_settings.hidden_units + num_layers = policy_network_settings.num_layers + vis_encode_type = policy_network_settings.vis_encode_type + + self.tau = hyperparameters.tau + self.burn_in_ratio = 0.0 # Non-exposed SAC parameters self.discrete_target_entropy_scale = ( @@ -65,11 +70,10 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): ) # Roughly equal to e-greedy 0.05 self.continuous_target_entropy_scale = 1.0 - self.init_entcoef = trainer_params.get("init_entcoef", 1.0) stream_names = list(self.reward_signals.keys()) # Use to reduce "survivor bonus" when using Curiosity or GAIL. self.gammas = [ - _val["gamma"] for _val in trainer_params["reward_signals"].values() + _val["gamma"] for _val in trainer_params.reward_signals.values() ] self.use_dones_in_backup = { name: tf.Variable(1.0) for name in stream_names diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index ea2aeca2d4..3de982c3dc 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -3,7 +3,7 @@ # and implemented in https://github.com/hill-a/stable-baselines from collections import defaultdict -from typing import Dict +from typing import Dict, cast import os import numpy as np @@ -19,12 +19,12 @@ from mlagents.trainers.brain import BrainParameters from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings, SACSettings logger = get_logger(__name__) BUFFER_TRUNCATE_PERCENT = 0.8 -DEFAULT_STEPS_PER_UPDATE = 1 class SACTrainer(RLTrainer): @@ -37,7 +37,7 @@ def __init__( self, brain_name: str, reward_buff_cap: int, - trainer_parameters: dict, + trainer_parameters: TrainerSettings, training: bool, load: bool, seed: int, @@ -56,57 +56,26 @@ def __init__( super().__init__( brain_name, trainer_parameters, training, run_id, reward_buff_cap ) - self.param_keys = [ - "batch_size", - "buffer_size", - "buffer_init_steps", - "hidden_units", - "learning_rate", - "init_entcoef", - "max_steps", - "normalize", - "num_layers", - "time_horizon", - "steps_per_update", - "sequence_length", - "summary_freq", - "tau", - "use_recurrent", - "memory_size", - "output_path", - "reward_signals", - ] - - self._check_param_keys() + self.load = load self.seed = seed self.policy: NNPolicy = 
None # type: ignore self.optimizer: SACOptimizer = None # type: ignore - + self.hyperparameters: SACSettings = cast( + SACSettings, trainer_parameters.hyperparameters + ) self.step = 0 # Don't count buffer_init_steps in steps_per_update ratio, but also don't divide-by-0 - self.update_steps = max(1, self.trainer_parameters["buffer_init_steps"]) - self.reward_signal_update_steps = max( - 1, self.trainer_parameters["buffer_init_steps"] - ) + self.update_steps = max(1, self.hyperparameters.buffer_init_steps) + self.reward_signal_update_steps = max(1, self.hyperparameters.buffer_init_steps) - self.steps_per_update = ( - trainer_parameters["steps_per_update"] - if "steps_per_update" in trainer_parameters - else DEFAULT_STEPS_PER_UPDATE - ) + self.steps_per_update = self.hyperparameters.steps_per_update self.reward_signal_steps_per_update = ( - trainer_parameters["reward_signals"]["reward_signal_steps_per_update"] - if "reward_signal_steps_per_update" in trainer_parameters["reward_signals"] - else self.steps_per_update + self.hyperparameters.reward_signal_steps_per_update ) - self.checkpoint_replay_buffer = ( - trainer_parameters["save_replay_buffer"] - if "save_replay_buffer" in trainer_parameters - else False - ) + self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer def _check_param_keys(self): super()._check_param_keys() @@ -135,7 +104,7 @@ def save_replay_buffer(self) -> None: Save the training buffer's update buffer to a pickle file. """ filename = os.path.join( - self.trainer_parameters["output_path"], "last_replay_buffer.hdf5" + self.trainer_parameters.output_path, "last_replay_buffer.hdf5" ) logger.info("Saving Experience Replay Buffer to {}".format(filename)) with open(filename, "wb") as file_object: @@ -146,7 +115,7 @@ def load_replay_buffer(self) -> None: Loads the last saved replay buffer from a file. """ filename = os.path.join( - self.trainer_parameters["output_path"], "last_replay_buffer.hdf5" + self.trainer_parameters.output_path, "last_replay_buffer.hdf5" ) logger.info("Loading Experience Replay Buffer from {}".format(filename)) with open(filename, "rb+") as file_object: @@ -217,8 +186,8 @@ def _is_ready_update(self) -> bool: :return: A boolean corresponding to whether or not _update_policy() can be run """ return ( - self.update_buffer.num_experiences >= self.trainer_parameters["batch_size"] - and self.step >= self.trainer_parameters["buffer_init_steps"] + self.update_buffer.num_experiences >= self.hyperparameters.batch_size + and self.step >= self.hyperparameters.buffer_init_steps ) @timed @@ -270,19 +239,16 @@ def _update_sac_policy(self) -> bool: has_updated = False self.cumulative_returns_since_policy_update.clear() n_sequences = max( - int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1 + int(self.hyperparameters.batch_size / self.policy.sequence_length), 1 ) batch_update_stats: Dict[str, list] = defaultdict(list) while self.step / self.update_steps > self.steps_per_update: logger.debug("Updating SAC policy at step {}".format(self.step)) buffer = self.update_buffer - if ( - self.update_buffer.num_experiences - >= self.trainer_parameters["batch_size"] - ): + if self.update_buffer.num_experiences >= self.hyperparameters.batch_size: sampled_minibatch = buffer.sample_mini_batch( - self.trainer_parameters["batch_size"], + self.hyperparameters.batch_size, sequence_length=self.policy.sequence_length, ) # Get rewards for each reward @@ -308,9 +274,9 @@ def _update_sac_policy(self) -> bool: # Truncate update buffer if neccessary. 
Truncate more than we need to to avoid truncating # a large buffer at each update. - if self.update_buffer.num_experiences > self.trainer_parameters["buffer_size"]: + if self.update_buffer.num_experiences > self.hyperparameters.buffer_size: self.update_buffer.truncate( - int(self.trainer_parameters["buffer_size"] * BUFFER_TRUNCATE_PERCENT) + int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT) ) return has_updated @@ -326,7 +292,7 @@ def _update_reward_signals(self) -> None: """ buffer = self.update_buffer n_sequences = max( - int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1 + int(self.hyperparameters.batch_size / self.policy.sequence_length), 1 ) batch_update_stats: Dict[str, list] = defaultdict(list) while ( @@ -340,7 +306,7 @@ def _update_reward_signals(self) -> None: # Some signals don't need a minibatch to be sampled - so we don't! if signal.update_dict: reward_signal_minibatches[name] = buffer.sample_mini_batch( - self.trainer_parameters["batch_size"], + self.hyperparameters.batch_size, sequence_length=self.policy.sequence_length, ) update_stats = self.optimizer.update_reward_signals( diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 5dbc922a65..b15d902eda 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -98,8 +98,18 @@ class SACSettings(HyperparamSettings): batch_size: int = 1024 beta: float = 5.0e-3 buffer_size: int = 10240 + buffer_init_steps: int = 0 epsilon: float = 0.2 - lambd: float = 0.95 + tau: float = 0.005 + steps_per_update: float = 1 + save_replay_buffer: bool = False + reward_signal_steps_per_update: float = attr.ib() + + @reward_signal_steps_per_update.default + def _reward_signal_steps_per_update_default(self): + return self.steps_per_update + + init_entcoef: float = 1.0 @attr.s(auto_attribs=True) @@ -206,6 +216,7 @@ class RunOptions: cattr.register_structure_hook(EnvironmentSettings, strict_to_cls) cattr.register_structure_hook(EngineSettings, strict_to_cls) cattr.register_structure_hook(CheckpointSettings, strict_to_cls) + cattr.register_structure_hook(TrainerSettings, trainer_settings_to_cls) @staticmethod def from_argparse(args: argparse.Namespace) -> "RunOptions": From ad33ab12efbfaa4c81d976cdb6f58b7e5ffbab6c Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 1 May 2020 19:01:02 -0700 Subject: [PATCH 08/54] Better SAC defaults --- ml-agents/mlagents/trainers/settings.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index b15d902eda..9629ecc2ec 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -95,22 +95,19 @@ class PPOSettings(HyperparamSettings): @attr.s(auto_attribs=True) class SACSettings(HyperparamSettings): - batch_size: int = 1024 - beta: float = 5.0e-3 - buffer_size: int = 10240 + batch_size: int = 128 + buffer_size: int = 50000 buffer_init_steps: int = 0 - epsilon: float = 0.2 tau: float = 0.005 steps_per_update: float = 1 save_replay_buffer: bool = False + init_entcoef: float = 1.0 reward_signal_steps_per_update: float = attr.ib() @reward_signal_steps_per_update.default def _reward_signal_steps_per_update_default(self): return self.steps_per_update - init_entcoef: float = 1.0 - @attr.s(auto_attribs=True) class RewardSignalSettings: From a8406d9b61c9c98bcc57ff410e4fcf3e393d5770 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 4 May 2020 14:48:39 
-0700 Subject: [PATCH 09/54] Reward Signals and GhostTrainer to new settings --- config/imitation/Pyramids.yaml | 30 ++++---- .../components/reward_signals/__init__.py | 26 ++----- .../reward_signals/curiosity/signal.py | 29 ++------ .../reward_signals/extrinsic/signal.py | 12 ---- .../components/reward_signals/gail/signal.py | 37 +++------- .../reward_signals/reward_signal_factory.py | 24 +++---- ml-agents/mlagents/trainers/ghost/trainer.py | 14 ++-- .../trainers/optimizer/tf_optimizer.py | 11 +-- ml-agents/mlagents/trainers/settings.py | 69 +++++++++++++++++-- 9 files changed, 125 insertions(+), 127 deletions(-) diff --git a/config/imitation/Pyramids.yaml b/config/imitation/Pyramids.yaml index 699e571513..826a9f683e 100644 --- a/config/imitation/Pyramids.yaml +++ b/config/imitation/Pyramids.yaml @@ -1,22 +1,20 @@ behaviors: Pyramids: - trainer: ppo - batch_size: 128 - beta: 0.01 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - max_steps: 1.0e7 - memory_size: 256 - normalize: false - num_epoch: 3 - num_layers: 2 + trainer_type: ppo time_horizon: 128 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false + max_steps: 1.0e7 + hyperparameters: + batch_size: 128 + beta: 0.01 + buffer_size: 2048 + epsilon: 0.2 + lambd: 0.95 + learning_rate: 0.0003 + num_epoch: 3 + network_settings: + num_layers: 2 + normalize: false + hidden_units: 512 reward_signals: extrinsic: strength: 1.0 diff --git a/ml-agents/mlagents/trainers/components/reward_signals/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py index 9c721b4d64..a1e46ed5b3 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/__init__.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict from collections import namedtuple import numpy as np import abc @@ -6,9 +6,9 @@ from mlagents.tf_utils import tf from mlagents_envs.logging_util import get_logger -from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import RewardSignalSettings logger = get_logger(__name__) @@ -19,7 +19,7 @@ class RewardSignal(abc.ABC): - def __init__(self, policy: TFPolicy, strength: float, gamma: float): + def __init__(self, policy: TFPolicy, settings: RewardSignalSettings): """ Initializes a reward signal. At minimum, you must pass in the policy it is being applied to, the reward strength, and the gamma (discount factor.) @@ -36,9 +36,9 @@ def __init__(self, policy: TFPolicy, strength: float, gamma: float): # no natural end, e.g. GAIL or Curiosity self.use_terminal_states = True self.update_dict: Dict[str, tf.Tensor] = {} - self.gamma = gamma + self.gamma = settings.gamma self.policy = policy - self.strength = strength + self.strength = settings.strength self.stats_name_to_update_name: Dict[str, str] = {} def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult: @@ -66,19 +66,3 @@ def prepare_update( :return: A dict that corresponds to the feed_dict needed for the update. """ return {} - - @classmethod - def check_config( - cls, config_dict: Dict[str, Any], param_keys: List[str] = None - ) -> None: - """ - Check the config dict, and throw an error if there are missing hyperparameters. 
- """ - param_keys = param_keys or [] - for k in param_keys: - if k not in config_dict: - raise UnityTrainerException( - "The hyper-parameter {0} could not be found for {1}.".format( - k, cls.__name__ - ) - ) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py index e780f30de9..8408ab2a0d 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict import numpy as np from mlagents.tf_utils import tf @@ -6,17 +6,11 @@ from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import CuriositySettings class CuriosityRewardSignal(RewardSignal): - def __init__( - self, - policy: TFPolicy, - strength: float, - gamma: float, - encoding_size: int = 128, - learning_rate: float = 3e-4, - ): + def __init__(self, policy: TFPolicy, settings: CuriositySettings): """ Creates the Curiosity reward generator :param policy: The Learning Policy @@ -26,9 +20,11 @@ def __init__( :param encoding_size: The size of the hidden encoding layer for the ICM :param learning_rate: The learning rate for the ICM. """ - super().__init__(policy, strength, gamma) + super().__init__(policy, settings) self.model = CuriosityModel( - policy, encoding_size=encoding_size, learning_rate=learning_rate + policy, + encoding_size=settings.encoding_size, + learning_rate=settings.learning_rate, ) self.use_terminal_states = False self.update_dict = { @@ -69,17 +65,6 @@ def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult: ) return RewardSignalResult(scaled_reward, unscaled_reward) - @classmethod - def check_config( - cls, config_dict: Dict[str, Any], param_keys: List[str] = None - ) -> None: - """ - Checks the config and throw an exception if a hyperparameter is missing. Curiosity requires strength, - gamma, and encoding size at minimum. - """ - param_keys = ["strength", "gamma", "encoding_size"] - super().check_config(config_dict, param_keys) - def prepare_update( self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int ) -> Dict[tf.Tensor, Any]: diff --git a/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py index 2eeb5e6a06..cbe2c1d4f5 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py @@ -1,4 +1,3 @@ -from typing import Any, Dict, List import numpy as np from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult @@ -6,17 +5,6 @@ class ExtrinsicRewardSignal(RewardSignal): - @classmethod - def check_config( - cls, config_dict: Dict[str, Any], param_keys: List[str] = None - ) -> None: - """ - Checks the config and throw an exception if a hyperparameter is missing. Extrinsic requires strength and gamma - at minimum. 
- """ - param_keys = ["strength", "gamma"] - super().check_config(config_dict, param_keys) - def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult: env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32) return RewardSignalResult(self.strength * env_rews, env_rews) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py index 2fe17c569d..89a6b8bc64 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict import numpy as np from mlagents.tf_utils import tf @@ -7,20 +7,11 @@ from .model import GAILModel from mlagents.trainers.demo_loader import demo_to_buffer from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import GAILSettings class GAILRewardSignal(RewardSignal): - def __init__( - self, - policy: TFPolicy, - strength: float, - gamma: float, - demo_path: str, - encoding_size: int = 64, - learning_rate: float = 3e-4, - use_actions: bool = False, - use_vail: bool = False, - ): + def __init__(self, policy: TFPolicy, settings: GAILSettings): """ The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476 :param policy: The policy of the learning model @@ -35,14 +26,19 @@ def __init__( :param use_vail: Whether or not to use a variational bottleneck for the discriminator. See https://arxiv.org/abs/1810.00821. """ - super().__init__(policy, strength, gamma) + super().__init__(policy, settings) self.use_terminal_states = False self.model = GAILModel( - policy, 128, learning_rate, encoding_size, use_actions, use_vail + policy, + 128, + settings.learning_rate, + settings.encoding_size, + settings.use_actions, + settings.use_vail, ) _, self.demonstration_buffer = demo_to_buffer( - demo_path, policy.sequence_length, policy.brain + settings.demo_path, policy.sequence_length, policy.brain ) self.has_updated = False self.update_dict: Dict[str, tf.Tensor] = { @@ -92,17 +88,6 @@ def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult: scaled_reward = unscaled_reward * float(self.has_updated) * self.strength return RewardSignalResult(scaled_reward, unscaled_reward) - @classmethod - def check_config( - cls, config_dict: Dict[str, Any], param_keys: List[str] = None - ) -> None: - """ - Checks the config and throw an exception if a hyperparameter is missing. GAIL requires strength and gamma - at minimum. 
- """ - param_keys = ["strength", "gamma", "demo_path"] - super().check_config(config_dict, param_keys) - def prepare_update( self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int ) -> Dict[tf.Tensor, Any]: diff --git a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py index e781efe289..ab91c653fc 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Type +from typing import Dict, Type from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.components.reward_signals import RewardSignal from mlagents.trainers.components.reward_signals.extrinsic.signal import ( @@ -9,17 +9,20 @@ CuriosityRewardSignal, ) from mlagents.trainers.policy.tf_policy import TFPolicy +from mlagents.trainers.settings import RewardSignalSettings -NAME_TO_CLASS: Dict[str, Type[RewardSignal]] = { - "extrinsic": ExtrinsicRewardSignal, - "curiosity": CuriosityRewardSignal, - "gail": GAILRewardSignal, +NAME_TO_CLASS: Dict[RewardSignalSettings.RewardSignalType, Type[RewardSignal]] = { + RewardSignalSettings.RewardSignalType.EXTRINSIC: ExtrinsicRewardSignal, + RewardSignalSettings.RewardSignalType.CURIOSITY: CuriosityRewardSignal, + RewardSignalSettings.RewardSignalType.GAIL: GAILRewardSignal, } def create_reward_signal( - policy: TFPolicy, name: str, config_entry: Dict[str, Any] + policy: TFPolicy, + name: RewardSignalSettings.RewardSignalType, + settings: RewardSignalSettings, ) -> RewardSignal: """ Creates a reward signal class based on the name and config entry provided as a dict. @@ -31,11 +34,6 @@ def create_reward_signal( rcls = NAME_TO_CLASS.get(name) if not rcls: raise UnityTrainerException("Unknown reward signal type {0}".format(name)) - rcls.check_config(config_entry) - try: - class_inst = rcls(policy, **config_entry) - except TypeError: - raise UnityTrainerException( - "Unknown parameters given for reward signal {0}".format(name) - ) + + class_inst = rcls(policy, settings) return class_inst diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index 5de4d22832..8a4a5bdd80 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -79,10 +79,10 @@ def __init__( # Set the logging to print ELO in the console self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY, True) - self_play_parameters = trainer_parameters["self_play"] - self.window = self_play_parameters.get("window", 10) - self.play_against_latest_model_ratio = self_play_parameters.get( - "play_against_latest_model_ratio", 0.5 + self_play_parameters = trainer_parameters.self_play + self.window = self_play_parameters.window + self.play_against_latest_model_ratio = ( + self_play_parameters.play_against_latest_model_ratio ) if ( self.play_against_latest_model_ratio > 1.0 @@ -92,9 +92,9 @@ def __init__( "The play_against_latest_model_ratio is not between 0 and 1." 
) - self.steps_between_save = self_play_parameters.get("save_steps", 20000) - self.steps_between_swap = self_play_parameters.get("swap_steps", 20000) - self.steps_to_train_team = self_play_parameters.get("team_change", 100000) + self.steps_between_save = self_play_parameters.save_steps + self.steps_between_swap = self_play_parameters.swap_steps + self.steps_to_train_team = self_play_parameters.team_change if self.steps_to_train_team > self.get_max_steps: logger.warning( "The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \ diff --git a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py index 2916948165..c6891f0605 100644 --- a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py @@ -9,7 +9,7 @@ from mlagents.trainers.components.reward_signals.reward_signal_factory import ( create_reward_signal, ) -from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.settings import TrainerSettings, RewardSignalSettings from mlagents.trainers.components.bc.module import BCModule @@ -122,16 +122,19 @@ def _get_value_estimates( return value_estimates - def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None: + def create_reward_signals( + self, reward_signal_configs: Dict[RewardSignalSettings.RewardSignalType, Any] + ) -> None: """ Create reward signals :param reward_signal_configs: Reward signal config. """ + print(reward_signal_configs) self.reward_signals = {} # Create reward signals - for reward_signal, config in reward_signal_configs.items(): + for reward_signal, settings in reward_signal_configs.items(): self.reward_signals[reward_signal] = create_reward_signal( - self.policy, reward_signal, config + self.policy, reward_signal, settings ) self.update_dict.update(self.reward_signals[reward_signal].update_dict) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 9629ecc2ec..83edf6ec95 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -47,11 +47,28 @@ def trainer_settings_to_cls(d: Mapping, t: type) -> Any: d_copy[key] = strict_to_cls( d_copy[key], TrainerSettings.to_settings(d_copy["trainer_type"]) ) + elif key == "reward_signals": + d_copy[key] = rewardsignal_settings_to_cls(val) else: d_copy[key] = check_and_structure(key, val, t) return t(**d_copy) +def rewardsignal_settings_to_cls(d: Mapping) -> Any: + if d is None: + return None + d_final: Dict[RewardSignalSettings.RewardSignalType, RewardSignalSettings] = {} + + for key, val in d.items(): + try: + enum_key = RewardSignalSettings.RewardSignalType(key) + t = RewardSignalSettings.to_settings(enum_key) + d_final[enum_key] = strict_to_cls(val, t) + except KeyError: + raise TrainerConfigError(f"Unknown reward signal type {key}") + return d_final + + @attr.s(auto_attribs=True) class NetworkSettings: @attr.s(auto_attribs=True) @@ -111,13 +128,52 @@ def _reward_signal_steps_per_update_default(self): @attr.s(auto_attribs=True) class RewardSignalSettings: + class RewardSignalType(Enum): + EXTRINSIC: str = "extrinsic" + GAIL: str = "gail" + CURIOSITY: str = "curiosity" + + @staticmethod + def to_settings(ttype: RewardSignalType) -> type: + _mapping = { + RewardSignalSettings.RewardSignalType.EXTRINSIC: RewardSignalSettings, + RewardSignalSettings.RewardSignalType.GAIL: GAILSettings, + RewardSignalSettings.RewardSignalType.CURIOSITY: 
CuriositySettings, + } + return _mapping[ttype] + gamma: float = 0.99 strength: float = 1.0 +@attr.s(auto_attribs=True) +class GAILSettings(RewardSignalSettings): + encoding_size: int = 64 + learning_rate: float = 3e-4 + use_actions: bool = False + use_vail: bool = False + demo_path: str = attr.ib(kw_only=True) + + +@attr.s(auto_attribs=True) +class CuriositySettings(RewardSignalSettings): + encoding_size: int = 128 + learning_rate: float = 3e-4 + + @attr.s(auto_attribs=True) class SelfPlaySettings: - hi: int = 0 + save_steps: int = 20000 + team_change: int = attr.ib() + + @team_change.default + def _team_change_default(self): + # Assign team_change to about 4x save_steps + return self.save_steps * 5 + + swap_steps: int = 10000 + window: int = 10 + play_against_latest_model_ratio: float = 0.5 @attr.s(auto_attribs=True) @@ -145,20 +201,22 @@ def _set_default_hyperparameters(self): return TrainerSettings.to_settings(self.trainer_type)() network_settings: NetworkSettings = NetworkSettings() - reward_signals: Dict[str, Dict] = { - "extrinsic": cattr.unstructure(RewardSignalSettings()) - } + reward_signals: Dict[ + RewardSignalSettings.RewardSignalType, RewardSignalSettings + ] = {RewardSignalSettings.RewardSignalType.EXTRINSIC: RewardSignalSettings()} init_path: Optional[str] = None output_path: str = "default" # TODO: Remove parser default and remove from CLI keep_checkpoints: int = parser.get_default("keep_checkpoints") - max_steps: int = 500000 + max_steps: float = 500000 time_horizon: int = 64 summary_freq: int = 50000 threaded: bool = True self_play: Optional[SelfPlaySettings] = None behavioral_cloning: Optional[BehavioralCloningSettings] = None + cattr.register_structure_hook(RewardSignalSettings, rewardsignal_settings_to_cls) + @attr.s(auto_attribs=True) class CheckpointSettings: @@ -208,7 +266,6 @@ class RunOptions: # These are options that are relevant to the run itself, and not the engine or environment. # They will be left here. debug: bool = parser.get_default("debug") - multi_gpu: bool = False # Strict conversion cattr.register_structure_hook(EnvironmentSettings, strict_to_cls) cattr.register_structure_hook(EngineSettings, strict_to_cls) From a826bb4018b0733db626fa8f60d1019273e22083 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 4 May 2020 16:41:52 -0700 Subject: [PATCH 10/54] Conversion script and fix mypy --- config/upgrade_config.py | 97 +++++++++++++++++++ ml-agents/mlagents/trainers/learn.py | 2 +- .../trainers/optimizer/tf_optimizer.py | 7 +- ml-agents/mlagents/trainers/ppo/optimizer.py | 8 +- ml-agents/mlagents/trainers/ppo/trainer.py | 5 +- ml-agents/mlagents/trainers/sac/optimizer.py | 4 +- ml-agents/mlagents/trainers/settings.py | 5 + 7 files changed, 119 insertions(+), 9 deletions(-) create mode 100644 config/upgrade_config.py diff --git a/config/upgrade_config.py b/config/upgrade_config.py new file mode 100644 index 0000000000..c3ee60dc7c --- /dev/null +++ b/config/upgrade_config.py @@ -0,0 +1,97 @@ +import attr +import cattr +import yaml +from typing import Dict, Any +import argparse +from mlagents.trainers.settings import TrainerSettings, NetworkSettings +from mlagents.trainers.cli_utils import load_config + + +# Take an existing trainer config (e.g. trainer_config.yaml) and turn it into the new format. 
+def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]: + all_behavior_config_dict = {} + default_config = old_config.get("default", {}) + for behavior_name, config in old_config.items(): + if behavior_name != "default": + config = default_config.copy() + config.update(old_config[behavior_name]) + + # Convert to split TrainerSettings, Hyperparameters, NetworkSettings + # Set trainer_type and get appropriate hyperparameter settings + trainer_type = config["trainer"] + new_config = {} + new_config["trainer_type"] = trainer_type + hyperparam_cls = TrainerSettings.to_settings( + TrainerSettings.TrainerType(trainer_type) + ) + # Try to absorb as much as possible into the hyperparam_cls + new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls) + + # Try to absorb as much as possible into the network settings + new_config["network_settings"] = cattr.structure(config, NetworkSettings) + # Deal with recurrent + if config["use_recurrent"]: + new_config["network_settings"].memory = NetworkSettings.MemorySettings( + sequence_length=config["sequence_length"], + memory_size=config["memory_size"], + ) + + # Absorb the rest into the base TrainerSettings + for key, val in config.items(): + if key in attr.fields_dict(TrainerSettings): + new_config[key] = val + + # Structure the whole thing + all_behavior_config_dict[behavior_name] = cattr.structure( + new_config, TrainerSettings + ) + return all_behavior_config_dict + + +def write_to_yaml_file(config: Dict[str, Any], output_config: str): + with open(output_config, "w") as f: + try: + yaml.dump(cattr.unstructure(config), f, sort_keys=False) + except TypeError: # Older versions of pyyaml don't support sort_keys + yaml.dump(cattr.unstructure(config), f) + + +if __name__ == "__main__": + + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + argparser.add_argument( + "trainer_config_path", + help="Path to old format (<=0.16.X) trainer configuration YAML.", + ) + argparser.add_argument( + "curriculum", + help="Path to old format (<=0.16.X) curriculum configuration YAML.", + default=None, + ) + argparser.add_argument( + "sampler", + help="Path to old format (<=0.16.X) parameter randomization configuration YAML.", + default=None, + ) + argparser.add_argument( + "output_config_path", help="Path to write converted YAML file." + ) + args = argparser.parse_args() + + old_config = load_config(args.trainer_config_path) + behavior_config_dict = convert_behaviors(old_config) + full_config = {"behaviors": behavior_config_dict} + + # Convert curriculum and sampler. note that we don't validate these; if it was correct + # before it should be correct now. + if args.curriculum is not None: + curriculum_config_dict = load_config(args.curriculum) + full_config["curriculum"] = curriculum_config_dict + + if args.sampler is not None: + sampler_config_dict = load_config(args.curriculum) + full_config["parameter_randomization"] = sampler_config_dict + + write_to_yaml_file(full_config, args.output_config_path) diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index cdb9e87503..146778a8dd 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -145,7 +145,7 @@ def run_training(run_seed: int, options: RunOptions) -> None: run_seed, maybe_init_path, maybe_meta_curriculum, - options.multi_gpu, + False, ) # Create controller and begin training. 
tc = TrainerController( diff --git a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py index c6891f0605..98f61e54d5 100644 --- a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py @@ -133,10 +133,13 @@ def create_reward_signals( self.reward_signals = {} # Create reward signals for reward_signal, settings in reward_signal_configs.items(): - self.reward_signals[reward_signal] = create_reward_signal( + # Name reward signals by string in case we have duplicates later + self.reward_signals[reward_signal.value] = create_reward_signal( self.policy, reward_signal, settings ) - self.update_dict.update(self.reward_signals[reward_signal].update_dict) + self.update_dict.update( + self.reward_signals[reward_signal.value].update_dict + ) def create_optimizer_op( self, learning_rate: tf.Tensor, name: str = "Adam" diff --git a/ml-agents/mlagents/trainers/ppo/optimizer.py b/ml-agents/mlagents/trainers/ppo/optimizer.py index 42751c3c0d..49b171c981 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer.py @@ -1,4 +1,4 @@ -from typing import Optional, Any, Dict +from typing import Optional, Any, Dict, cast import numpy as np from mlagents.tf_utils import tf from mlagents_envs.timers import timed @@ -23,7 +23,9 @@ def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): with policy.graph.as_default(): with tf.variable_scope("optimizer/"): super().__init__(policy, trainer_params) - hyperparameters: PPOSettings = trainer_params.hyperparameters + hyperparameters: PPOSettings = cast( + PPOSettings, trainer_params.hyperparameters + ) lr = float(hyperparameters.learning_rate) lr_schedule = hyperparameters.learning_rate_schedule epsilon = float(hyperparameters.epsilon) @@ -36,7 +38,7 @@ def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): vis_encode_type = policy_network_settings.vis_encode_type self.burn_in_ratio = 0.0 - self.stream_names = list(self.reward_signals.keys()) + self.stream_names = [key.value for key in self.reward_signals.keys()] self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None self.grads = None diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 085f05381a..c60446691c 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -3,6 +3,7 @@ # Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 from collections import defaultdict +from typing import cast import numpy as np @@ -67,7 +68,9 @@ def __init__( "output_path", "reward_signals", ] - self.hyperparameters: PPOSettings = self.trainer_parameters.hyperparameters + self.hyperparameters: PPOSettings = cast( + PPOSettings, self.trainer_parameters.hyperparameters + ) self.load = load self.seed = seed self.policy: NNPolicy = None # type: ignore diff --git a/ml-agents/mlagents/trainers/sac/optimizer.py b/ml-agents/mlagents/trainers/sac/optimizer.py index c26a736465..7ae18b27e8 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer.py +++ b/ml-agents/mlagents/trainers/sac/optimizer.py @@ -70,10 +70,10 @@ def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): ) # Roughly equal to e-greedy 0.05 self.continuous_target_entropy_scale = 1.0 - stream_names = list(self.reward_signals.keys()) + stream_names = [key.value for key in self.reward_signals.keys()] # Use to reduce "survivor bonus" when using 
Curiosity or GAIL. self.gammas = [ - _val["gamma"] for _val in trainer_params.reward_signals.values() + _val.gamma for _val in trainer_params.reward_signals.values() ] self.use_dones_in_backup = { name: tf.Variable(1.0) for name in stream_names diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 83edf6ec95..277cb59987 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -38,12 +38,17 @@ def trainer_settings_to_cls(d: Mapping, t: type) -> Any: d_copy.update(d) for key, val in d_copy.items(): + if attr.has(type(val)): + # Don't convert already-converted attrs classes. + continue if key == "hyperparameters": if "trainer_type" not in d_copy: raise TrainerConfigError( "Hyperparameters were specified but no trainer_type was given." ) + else: + print(d_copy[key]) d_copy[key] = strict_to_cls( d_copy[key], TrainerSettings.to_settings(d_copy["trainer_type"]) ) From 65a0e1316c3764df58d617d7fe6a0a7dc4f8ff68 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 4 May 2020 16:57:48 -0700 Subject: [PATCH 11/54] Update curriculum to new settings --- ml-agents/mlagents/trainers/curriculum.py | 66 +++++-------------- .../mlagents/trainers/meta_curriculum.py | 7 +- ml-agents/mlagents/trainers/settings.py | 15 ++++- .../mlagents/trainers/trainer_controller.py | 5 +- 4 files changed, 36 insertions(+), 57 deletions(-) diff --git a/ml-agents/mlagents/trainers/curriculum.py b/ml-agents/mlagents/trainers/curriculum.py index 12eaa7a5e7..f81fb26c38 100644 --- a/ml-agents/mlagents/trainers/curriculum.py +++ b/ml-agents/mlagents/trainers/curriculum.py @@ -1,16 +1,16 @@ -import json import math -from typing import Dict, Any, TextIO +from typing import Dict, Any -from .exception import CurriculumConfigError, CurriculumLoadingError +from mlagents.trainers.exception import CurriculumConfigError from mlagents_envs.logging_util import get_logger +from mlagents.trainers.settings import CurriculumSettings logger = get_logger(__name__) class Curriculum: - def __init__(self, brain_name: str, config: Dict): + def __init__(self, brain_name: str, settings: CurriculumSettings): """ Initializes a Curriculum object. :param brain_name: Name of the brain this Curriculum is associated with @@ -20,26 +20,14 @@ def __init__(self, brain_name: str, config: Dict): self.measure = None self._lesson_num = 0 self.brain_name = brain_name - self.config = config + self.settings = settings self.smoothing_value = 0.0 - for key in [ - "parameters", - "measure", - "thresholds", - "min_lesson_length", - "signal_smoothing", - ]: - if key not in self.config: - raise CurriculumConfigError( - f"{brain_name} curriculum config does not contain a {key} field." - ) - self.smoothing_value = 0 - self.measure = self.config["measure"] - self.min_lesson_length = self.config["min_lesson_length"] - self.max_lesson_num = len(self.config["thresholds"]) + self.measure = self.settings.measure + self.min_lesson_length = self.settings.min_lesson_length + self.max_lesson_num = len(self.settings.thresholds) - parameters = self.config["parameters"] + parameters = self.settings.parameters for key in parameters: if len(parameters[key]) != self.max_lesson_num + 1: raise CurriculumConfigError( @@ -62,16 +50,16 @@ def increment_lesson(self, measure_val: float) -> bool: steps completed). :return Whether the lesson was incremented. 
""" - if not self.config or not measure_val or math.isnan(measure_val): + if not self.settings or not measure_val or math.isnan(measure_val): return False - if self.config["signal_smoothing"]: + if self.settings.signal_smoothing: measure_val = self.smoothing_value * 0.25 + 0.75 * measure_val self.smoothing_value = measure_val if self.lesson_num < self.max_lesson_num: - if measure_val > self.config["thresholds"][self.lesson_num]: + if measure_val > self.settings.thresholds[self.lesson_num]: self.lesson_num += 1 config = {} - parameters = self.config["parameters"] + parameters = self.settings.parameters for key in parameters: config[key] = parameters[key][self.lesson_num] logger.info( @@ -91,37 +79,13 @@ def get_config(self, lesson: int = None) -> Dict[str, Any]: current lesson is returned. :return: The configuration of the reset parameters. """ - if not self.config: + if not self.settings: return {} if lesson is None: lesson = self.lesson_num lesson = max(0, min(lesson, self.max_lesson_num)) config = {} - parameters = self.config["parameters"] + parameters = self.settings.parameters for key in parameters: config[key] = parameters[key][lesson] return config - - @staticmethod - def load_curriculum_file(config_path: str) -> Dict: - try: - with open(config_path) as data_file: - return Curriculum._load_curriculum(data_file) - except IOError: - raise CurriculumLoadingError( - "The file {0} could not be found.".format(config_path) - ) - except UnicodeDecodeError: - raise CurriculumLoadingError( - "There was an error decoding {}".format(config_path) - ) - - @staticmethod - def _load_curriculum(fp: TextIO) -> Dict: - try: - return json.load(fp) - except json.decoder.JSONDecodeError as e: - raise CurriculumLoadingError( - "Error parsing JSON file. Please check for formatting errors. " - "A tool such as https://jsonlint.com/ can be helpful with this." - ) from e diff --git a/ml-agents/mlagents/trainers/meta_curriculum.py b/ml-agents/mlagents/trainers/meta_curriculum.py index 699890359f..0f4000605c 100644 --- a/ml-agents/mlagents/trainers/meta_curriculum.py +++ b/ml-agents/mlagents/trainers/meta_curriculum.py @@ -2,6 +2,7 @@ from typing import Dict, Set from mlagents.trainers.curriculum import Curriculum +from mlagents.trainers.settings import CurriculumSettings from mlagents_envs.logging_util import get_logger @@ -13,7 +14,7 @@ class MetaCurriculum: particular brain in the environment. """ - def __init__(self, curriculum_configs: Dict[str, Dict]): + def __init__(self, curriculum_configs: Dict[str, CurriculumSettings]): """Initializes a MetaCurriculum object. 
:param curriculum_folder: Dictionary of brain_name to the @@ -21,9 +22,9 @@ def __init__(self, curriculum_configs: Dict[str, Dict]): """ self._brains_to_curricula: Dict[str, Curriculum] = {} used_reset_parameters: Set[str] = set() - for brain_name, curriculum_config in curriculum_configs.items(): + for brain_name, curriculum_settings in curriculum_configs.items(): self._brains_to_curricula[brain_name] = Curriculum( - brain_name, curriculum_config + brain_name, curriculum_settings ) config_keys: Set[str] = set( self._brains_to_curricula[brain_name].get_config().keys() diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 277cb59987..e5cbbabe2d 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -223,6 +223,19 @@ def _set_default_hyperparameters(self): cattr.register_structure_hook(RewardSignalSettings, rewardsignal_settings_to_cls) +@attr.s(auto_attribs=True) +class CurriculumSettings: + class MeasureType: + PROGRESS: str = "progress" + REWARD: str = "reward" + + measure: str = attr.ib(default=MeasureType.REWARD) + thresholds: List[int] = attr.Factory(list) + min_lesson_length: int = 0 + signal_smoothing: bool = True + parameters: Dict[str, List[float]] = attr.ib(kw_only=True) + + @attr.s(auto_attribs=True) class CheckpointSettings: save_freq: int = parser.get_default("save_freq") @@ -265,7 +278,7 @@ class RunOptions: env_settings: EnvironmentSettings = EnvironmentSettings() engine_settings: EngineSettings = EngineSettings() parameter_randomization: Optional[Dict] = None - curriculum_config: Optional[Dict] = None + curriculum_config: Optional[Dict[str, CurriculumSettings]] = None checkpoint_settings: CheckpointSettings = CheckpointSettings() # These are options that are relevant to the run itself, and not the engine or environment. diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py index 61387b1ca9..db804243f7 100644 --- a/ml-agents/mlagents/trainers/trainer_controller.py +++ b/ml-agents/mlagents/trainers/trainer_controller.py @@ -23,6 +23,7 @@ from mlagents.trainers.trainer_util import TrainerFactory from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers from mlagents.trainers.agent_processor import AgentManager +from mlagents.trainers.settings import CurriculumSettings class TrainerController(object): @@ -78,12 +79,12 @@ def _get_measure_vals(self): # Skip brains that are in the metacurriculum but no trainer yet. 
if brain_name not in self.trainers: continue - if curriculum.measure == "progress": + if curriculum.measure == CurriculumSettings.MeasureType.PROGRESS: measure_val = self.trainers[brain_name].get_step / float( self.trainers[brain_name].get_max_steps ) brain_names_to_measure_vals[brain_name] = measure_val - elif curriculum.measure == "reward": + elif curriculum.measure == CurriculumSettings.MeasureType.REWARD: measure_val = np.mean(self.trainers[brain_name].reward_buffer) brain_names_to_measure_vals[brain_name] = measure_val else: From cf7990d0088c8f262fb776865364d821821de8a0 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 4 May 2020 18:41:04 -0700 Subject: [PATCH 12/54] Fix issue with mypy fix --- ml-agents/mlagents/trainers/ppo/optimizer.py | 2 +- ml-agents/mlagents/trainers/sac/optimizer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer.py b/ml-agents/mlagents/trainers/ppo/optimizer.py index 49b171c981..ef479a7680 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer.py @@ -38,7 +38,7 @@ def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): vis_encode_type = policy_network_settings.vis_encode_type self.burn_in_ratio = 0.0 - self.stream_names = [key.value for key in self.reward_signals.keys()] + self.stream_names = list(self.reward_signals.keys()) self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None self.grads = None diff --git a/ml-agents/mlagents/trainers/sac/optimizer.py b/ml-agents/mlagents/trainers/sac/optimizer.py index 7ae18b27e8..52d08c5c41 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer.py +++ b/ml-agents/mlagents/trainers/sac/optimizer.py @@ -70,7 +70,7 @@ def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings): ) # Roughly equal to e-greedy 0.05 self.continuous_target_entropy_scale = 1.0 - stream_names = [key.value for key in self.reward_signals.keys()] + stream_names = list(self.reward_signals.keys()) # Use to reduce "survivor bonus" when using Curiosity or GAIL. 
self.gammas = [ _val.gamma for _val in trainer_params.reward_signals.values() From 5060638d3502e5d1ce018c08bba5967adc115dbc Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 4 May 2020 19:16:00 -0700 Subject: [PATCH 13/54] Enable running without config file --- ml-agents/mlagents/trainers/cli_utils.py | 4 +++- ml-agents/mlagents/trainers/learn.py | 1 - ml-agents/mlagents/trainers/settings.py | 23 +++++++++++++++++------ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index c2112ab454..c740b83bf2 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -49,7 +49,9 @@ def _create_parser() -> argparse.ArgumentParser: argparser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - argparser.add_argument("trainer_config_path", action=StoreConfigFile) + argparser.add_argument( + "trainer_config_path", action=StoreConfigFile, nargs="?", default=None + ) argparser.add_argument( "--env", default=None, diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index 146778a8dd..dff6290100 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -297,7 +297,6 @@ def run_cli(options: RunOptions) -> None: logging_util.set_log_level(log_level) logger.debug("Configuration for this run:") - print(options) logger.debug(json.dumps(cattr.unstructure(options), indent=4)) # Options deprecation warnings diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index e5cbbabe2d..0aad3ca7d4 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -74,6 +74,16 @@ def rewardsignal_settings_to_cls(d: Mapping) -> Any: return d_final +def defaultdict_to_dict(d: DefaultDict) -> Dict: + return {key: cattr.unstructure(val) for key, val in d.items()} + + +def dict_to_defaultdict(d: Dict, t: type) -> DefaultDict: + return collections.defaultdict( + TrainerSettings, cattr.structure(d, Dict[str, TrainerSettings]) + ) + + @attr.s(auto_attribs=True) class NetworkSettings: @attr.s(auto_attribs=True) @@ -289,6 +299,10 @@ class RunOptions: cattr.register_structure_hook(EngineSettings, strict_to_cls) cattr.register_structure_hook(CheckpointSettings, strict_to_cls) cattr.register_structure_hook(TrainerSettings, trainer_settings_to_cls) + cattr.register_structure_hook( + DefaultDict[str, TrainerSettings], dict_to_defaultdict + ) + cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict) @staticmethod def from_argparse(args: argparse.Namespace) -> "RunOptions": @@ -308,12 +322,9 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": "env_settings": {}, "engine_settings": {}, } - configured_dict.update(load_config(config_path)) - # This is the only option that is not optional and has no defaults. - if "behaviors" not in configured_dict: - raise TrainerConfigError( - "Trainer configurations not found. Make sure your YAML file has a section for behaviors." - ) + if config_path is not None: + configured_dict.update(load_config(config_path)) + # Use the YAML file values for all values not specified in the CLI. 
for key in configured_dict.keys(): # Detect bad config options From 69ebbfbddce8ad7b52869afedcc6b13429d281cb Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 4 May 2020 19:29:00 -0700 Subject: [PATCH 14/54] Fix issue with upgrade script --- config/upgrade_config.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config/upgrade_config.py b/config/upgrade_config.py index c3ee60dc7c..c2c716db5d 100644 --- a/config/upgrade_config.py +++ b/config/upgrade_config.py @@ -66,12 +66,12 @@ def write_to_yaml_file(config: Dict[str, Any], output_config: str): help="Path to old format (<=0.16.X) trainer configuration YAML.", ) argparser.add_argument( - "curriculum", + "--curriculum", help="Path to old format (<=0.16.X) curriculum configuration YAML.", default=None, ) argparser.add_argument( - "sampler", + "--sampler", help="Path to old format (<=0.16.X) parameter randomization configuration YAML.", default=None, ) @@ -79,6 +79,9 @@ def write_to_yaml_file(config: Dict[str, Any], output_config: str): "output_config_path", help="Path to write converted YAML file." ) args = argparser.parse_args() + print( + f"Converting {args.trainer_config_path} and saving to {args.output_config_path}." + ) old_config = load_config(args.trainer_config_path) behavior_config_dict = convert_behaviors(old_config) From 9e7c32c3bf82144f1bb410d0f1bf70367a257b80 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 5 May 2020 18:10:21 -0700 Subject: [PATCH 15/54] Fix some tests --- .../tests/test_barracuda_converter.py | 39 +---------- .../mlagents/trainers/tests/test_bcmodule.py | 67 +++++++++++-------- .../mlagents/trainers/tests/test_learn.py | 3 +- .../mlagents/trainers/tests/test_nn_policy.py | 49 +++----------- .../trainers/tests/test_trainer_util.py | 7 +- 5 files changed, 55 insertions(+), 110 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_barracuda_converter.py b/ml-agents/mlagents/trainers/tests/test_barracuda_converter.py index 0b4d4d49f0..79006ded31 100644 --- a/ml-agents/mlagents/trainers/tests/test_barracuda_converter.py +++ b/ml-agents/mlagents/trainers/tests/test_barracuda_converter.py @@ -1,10 +1,10 @@ import os import tempfile import pytest -import yaml import mlagents.trainers.tensorflow_to_barracuda as tf2bc from mlagents.trainers.tests.test_nn_policy import create_policy_mock +from mlagents.trainers.settings import TrainerSettings from mlagents.tf_utils import tf from mlagents.model_serialization import SerializationSettings, export_policy_model @@ -31,45 +31,12 @@ def test_barracuda_converter(): os.remove(tmpfile) -@pytest.fixture -def dummy_config(): - return yaml.safe_load( - """ - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - normalize: true - memory_size: 8 - curiosity_strength: 0.0 - curiosity_enc_size: 1 - output_path: test - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) - - @pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) @pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"]) @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) -def test_policy_conversion(dummy_config, tmpdir, rnn, visual, discrete): +def test_policy_conversion(tmpdir, rnn, visual, discrete): tf.reset_default_graph() - dummy_config["output_path"] = 
os.path.join(tmpdir, "test") + dummy_config = TrainerSettings(output_path=os.path.join(tmpdir, "test")) policy = create_policy_mock( dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual ) diff --git a/ml-agents/mlagents/trainers/tests/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/test_bcmodule.py index 60d05f96fd..d685080df1 100644 --- a/ml-agents/mlagents/trainers/tests/test_bcmodule.py +++ b/ml-agents/mlagents/trainers/tests/test_bcmodule.py @@ -7,6 +7,11 @@ from mlagents.trainers.policy.nn_policy import NNPolicy from mlagents.trainers.components.bc.module import BCModule +from mlagents.trainers.settings import ( + TrainerSettings, + BehavioralCloningSettings, + NetworkSettings, +) def ppo_dummy_config(): @@ -41,25 +46,22 @@ def ppo_dummy_config(): ) -def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample): +def create_bc_module(mock_brain, bc_settings, use_rnn, tanhresample): # model_path = env.external_brain_names[0] - trainer_config["output_path"] = "testpath" - trainer_config["keep_checkpoints"] = 3 - trainer_config["use_recurrent"] = use_rnn - trainer_config["behavioral_cloning"]["demo_path"] = ( - os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file + trainer_config = TrainerSettings() + trainer_config.network_settings.memory = ( + NetworkSettings.MemorySettings() if use_rnn else None ) - policy = NNPolicy( 0, mock_brain, trainer_config, False, False, tanhresample, tanhresample ) with policy.graph.as_default(): bc_module = BCModule( policy, - policy_learning_rate=trainer_config["learning_rate"], - default_batch_size=trainer_config["batch_size"], + policy_learning_rate=trainer_config.hyperparameters.learning_rate, + default_batch_size=trainer_config.hyperparameters.batch_size, default_num_epoch=3, - **trainer_config["behavioral_cloning"], + settings=bc_settings, ) policy.initialize_or_load() # Normally the optimizer calls this after the BCModule is created return bc_module @@ -69,14 +71,19 @@ def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresampl def test_bcmodule_defaults(): # See if default values match mock_brain = mb.create_mock_3dball_brain() - trainer_config = ppo_dummy_config() - bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False) + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo" + ) + bc_module = create_bc_module(mock_brain, bc_settings, False, False) assert bc_module.num_epoch == 3 - assert bc_module.batch_size == trainer_config["batch_size"] + assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size # Assign strange values and see if it overrides properly - trainer_config["behavioral_cloning"]["num_epoch"] = 100 - trainer_config["behavioral_cloning"]["batch_size"] = 10000 - bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False) + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo", + num_epoch=100, + batch_size=10000, + ) + bc_module = create_bc_module(mock_brain, bc_settings, False, False) assert bc_module.num_epoch == 100 assert bc_module.batch_size == 10000 @@ -85,9 +92,10 @@ def test_bcmodule_defaults(): @pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) def test_bcmodule_update(is_sac): mock_brain = mb.create_mock_3dball_brain() - bc_module = create_bc_module( - mock_brain, ppo_dummy_config(), False, "test.demo", is_sac + bc_settings = 
BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo" ) + bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac) stats = bc_module.update() for _, item in stats.items(): assert isinstance(item, np.float32) @@ -96,10 +104,12 @@ def test_bcmodule_update(is_sac): # Test with constant pretraining learning rate @pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) def test_bcmodule_constant_lr_update(is_sac): - trainer_config = ppo_dummy_config() mock_brain = mb.create_mock_3dball_brain() - trainer_config["behavioral_cloning"]["steps"] = 0 - bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", is_sac) + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo", + steps=0, + ) + bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac) stats = bc_module.update() for _, item in stats.items(): assert isinstance(item, np.float32) @@ -113,9 +123,10 @@ def test_bcmodule_constant_lr_update(is_sac): @pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) def test_bcmodule_rnn_update(is_sac): mock_brain = mb.create_mock_3dball_brain() - bc_module = create_bc_module( - mock_brain, ppo_dummy_config(), True, "test.demo", is_sac + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo" ) + bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac) stats = bc_module.update() for _, item in stats.items(): assert isinstance(item, np.float32) @@ -125,9 +136,10 @@ def test_bcmodule_rnn_update(is_sac): @pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) def test_bcmodule_dc_visual_update(is_sac): mock_brain = mb.create_mock_banana_brain() - bc_module = create_bc_module( - mock_brain, ppo_dummy_config(), False, "testdcvis.demo", is_sac + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo" ) + bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac) stats = bc_module.update() for _, item in stats.items(): assert isinstance(item, np.float32) @@ -137,9 +149,10 @@ def test_bcmodule_dc_visual_update(is_sac): @pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"]) def test_bcmodule_rnn_dc_update(is_sac): mock_brain = mb.create_mock_banana_brain() - bc_module = create_bc_module( - mock_brain, ppo_dummy_config(), True, "testdcvis.demo", is_sac + bc_settings = BehavioralCloningSettings( + demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo" ) + bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac) stats = bc_module.update() for _, item in stats.items(): assert isinstance(item, np.float32) diff --git a/ml-agents/mlagents/trainers/tests/test_learn.py b/ml-agents/mlagents/trainers/tests/test_learn.py index 14fbd7ad4e..3746ee9fbc 100644 --- a/ml-agents/mlagents/trainers/tests/test_learn.py +++ b/ml-agents/mlagents/trainers/tests/test_learn.py @@ -3,7 +3,8 @@ from unittest.mock import MagicMock, patch, mock_open from mlagents.trainers import learn from mlagents.trainers.trainer_controller import TrainerController -from mlagents.trainers.learn import parse_command_line, DetectDefault +from mlagents.trainers.learn import parse_command_line +from mlagents.trainers.cli_utils import DetectDefault from mlagents_envs.exception import UnityEnvironmentException from mlagents.trainers.stats import StatsReporter diff --git 
a/ml-agents/mlagents/trainers/tests/test_nn_policy.py b/ml-agents/mlagents/trainers/tests/test_nn_policy.py index 17b15231bd..ea6c2fa6c6 100644 --- a/ml-agents/mlagents/trainers/tests/test_nn_policy.py +++ b/ml-agents/mlagents/trainers/tests/test_nn_policy.py @@ -1,53 +1,19 @@ import pytest import os -from typing import Dict, Any import numpy as np from mlagents.tf_utils import tf -import yaml from mlagents.trainers.policy.nn_policy import NNPolicy from mlagents.trainers.models import EncoderType, ModelUtils from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.brain import BrainParameters, CameraResolution from mlagents.trainers.tests import mock_brain as mb +from mlagents.trainers.settings import TrainerSettings, NetworkSettings from mlagents.trainers.tests.test_trajectory import make_fake_trajectory -@pytest.fixture -def dummy_config(): - return yaml.safe_load( - """ - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - normalize: true - memory_size: 8 - curiosity_strength: 0.0 - curiosity_enc_size: 1 - output_path: test - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) - - VECTOR_ACTION_SPACE = [2] VECTOR_OBS_SPACE = 8 DISCRETE_ACTION_SPACE = [3, 3, 3, 2] @@ -56,7 +22,7 @@ def dummy_config(): def create_policy_mock( - dummy_config: Dict[str, Any], + dummy_config: TrainerSettings, use_rnn: bool = False, use_discrete: bool = True, use_visual: bool = False, @@ -72,17 +38,18 @@ def create_policy_mock( ) trainer_parameters = dummy_config - trainer_parameters["keep_checkpoints"] = 3 - trainer_parameters["use_recurrent"] = use_rnn + trainer_parameters.keep_checkpoints = 3 + trainer_parameters.network_settings.memory = ( + NetworkSettings.MemorySettings() if use_rnn else None + ) policy = NNPolicy(seed, mock_brain, trainer_parameters, False, load) return policy -def test_load_save(dummy_config, tmp_path): +def test_load_save(tmp_path): path1 = os.path.join(tmp_path, "runid1") path2 = os.path.join(tmp_path, "runid2") - trainer_params = dummy_config - trainer_params["output_path"] = path1 + trainer_params = TrainerSettings(output_path=path1) policy = create_policy_mock(trainer_params) policy.initialize_or_load() policy.save_model(2000) diff --git a/ml-agents/mlagents/trainers/tests/test_trainer_util.py b/ml-agents/mlagents/trainers/tests/test_trainer_util.py index 5c48fa3df3..a1490518ad 100644 --- a/ml-agents/mlagents/trainers/tests/test_trainer_util.py +++ b/ml-agents/mlagents/trainers/tests/test_trainer_util.py @@ -5,11 +5,8 @@ from unittest.mock import patch from mlagents.trainers import trainer_util -from mlagents.trainers.trainer_util import ( - load_config, - _load_config, - assemble_curriculum_config, -) +from mlagents.trainers.trainer_util import assemble_curriculum_config +from mlagents.trainers.cli_utils import load_config, _load_config from mlagents.trainers.ppo.trainer import PPOTrainer from mlagents.trainers.exception import TrainerConfigError, UnityTrainerException from mlagents.trainers.brain import BrainParameters From d29b4b7a960a51b3671747589831dc41236d523d Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 6 May 2020 12:53:15 -0700 Subject: [PATCH 16/54] Fix most of simple_rl tests --- ml-agents/mlagents/trainers/ghost/trainer.py | 2 +- ml-agents/mlagents/trainers/settings.py | 3 +- 
.../trainers/tests/test_curriculum.py | 100 ++---- .../trainers/tests/test_meta_curriculum.py | 19 +- .../mlagents/trainers/tests/test_simple_rl.py | 329 ++++++++++-------- 5 files changed, 230 insertions(+), 223 deletions(-) diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index 8a4a5bdd80..f4c236d8b3 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -130,7 +130,7 @@ def __init__( self.last_team_change: int = 0 # Chosen because it is the initial ELO in Chess - self.initial_elo: float = self_play_parameters.get("initial_elo", 1200.0) + self.initial_elo: float = self_play_parameters.initial_elo self.policy_elos: List[float] = [self.initial_elo] * ( self.window + 1 ) # for learning policy diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 0aad3ca7d4..112b6b8fbb 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -92,7 +92,7 @@ class MemorySettings: memory_size: int = 128 normalize: bool = False - hidden_units: int = 3 + hidden_units: int = 128 num_layers: int = 2 vis_encode_type: EncoderType = EncoderType.SIMPLE memory: Optional[MemorySettings] = None @@ -189,6 +189,7 @@ def _team_change_default(self): swap_steps: int = 10000 window: int = 10 play_against_latest_model_ratio: float = 0.5 + initial_elo: float = 1200.0 @attr.s(auto_attribs=True) diff --git a/ml-agents/mlagents/trainers/tests/test_curriculum.py b/ml-agents/mlagents/trainers/tests/test_curriculum.py index ca65c822cb..2740206924 100644 --- a/ml-agents/mlagents/trainers/tests/test_curriculum.py +++ b/ml-agents/mlagents/trainers/tests/test_curriculum.py @@ -1,45 +1,33 @@ -import io -import json import pytest -from unittest.mock import patch, mock_open -from mlagents.trainers.exception import CurriculumConfigError, CurriculumLoadingError +from mlagents.trainers.exception import CurriculumConfigError from mlagents.trainers.curriculum import Curriculum - -dummy_curriculum_json_str = """ - { - "measure" : "reward", - "thresholds" : [10, 20, 50], - "min_lesson_length" : 3, - "signal_smoothing" : true, - "parameters" : - { - "param1" : [0.7, 0.5, 0.3, 0.1], - "param2" : [100, 50, 20, 15], - "param3" : [0.2, 0.3, 0.7, 0.9] - } - } - """ - -dummy_curriculum_config = json.loads(dummy_curriculum_json_str) - -bad_curriculum_json_str = """ - { - "measure" : "reward", - "thresholds" : [10, 20, 50], - "min_lesson_length" : 3, - "signal_smoothing" : false, - "parameters" : - { - "param1" : [0.7, 0.5, 0.3, 0.1], - "param2" : [100, 50, 20], - "param3" : [0.2, 0.3, 0.7, 0.9] - } - } - """ - - -dummy_curriculum_config_path = "TestBrain.json" +from mlagents.trainers.settings import CurriculumSettings + + +dummy_curriculum_config = CurriculumSettings( + measure="reward", + thresholds=[10, 20, 50], + min_lesson_length=3, + signal_smoothing=True, + parameters={ + "param1": [0.7, 0.5, 0.3, 0.1], + "param2": [100, 50, 20, 15], + "param3": [0.2, 0.3, 0.7, 0.9], + }, +) + +bad_curriculum_config = CurriculumSettings( + measure="reward", + thresholds=[10, 20, 50], + min_lesson_length=3, + signal_smoothing=False, + parameters={ + "param1": [0.7, 0.5, 0.3, 0.1], + "param2": [100, 50, 20], + "param3": [0.2, 0.3, 0.7, 0.9], + }, +) @pytest.fixture @@ -55,14 +43,6 @@ def test_init_curriculum_happy_path(): assert curriculum.measure == "reward" -@patch("builtins.open", new_callable=mock_open, read_data=bad_curriculum_json_str) -def 
test_load_bad_curriculum_file_raises_error(mock_file): - with pytest.raises(CurriculumConfigError): - Curriculum( - "TestBrain", Curriculum.load_curriculum_file(dummy_curriculum_config_path) - ) - - def test_increment_lesson(): curriculum = Curriculum("TestBrain", dummy_curriculum_config) assert curriculum.lesson_num == 0 @@ -92,26 +72,6 @@ def test_get_parameters(): assert curriculum.get_config(0) == {"param1": 0.7, "param2": 100, "param3": 0.2} -# Test json loading and error handling. These examples don't need to valid config files. -def test_curriculum_load_good(): - expected = {"x": 1} - value = json.dumps(expected) - fp = io.StringIO(value) - assert expected == Curriculum._load_curriculum(fp) - - -def test_curriculum_load_missing_file(): - with pytest.raises(CurriculumLoadingError): - Curriculum.load_curriculum_file("notAValidFile.json") - - -def test_curriculum_load_invalid_json(): - # This isn't valid json because of the trailing comma - contents = """ -{ - "x": [1, 2, 3,] -} -""" - fp = io.StringIO(contents) - with pytest.raises(CurriculumLoadingError): - Curriculum._load_curriculum(fp) +def test_load_bad_curriculum_file_raises_error(): + with pytest.raises(CurriculumConfigError): + Curriculum("TestBrain", bad_curriculum_config) diff --git a/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py b/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py index 02f7d9f445..1ccacaaf6e 100644 --- a/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py +++ b/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py @@ -2,12 +2,12 @@ from unittest.mock import patch, Mock from mlagents.trainers.meta_curriculum import MetaCurriculum -import json import yaml from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME -from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str +from mlagents.trainers.tests.test_curriculum import dummy_curriculum_config +from mlagents.trainers.settings import CurriculumSettings @pytest.fixture @@ -21,13 +21,11 @@ def reward_buff_sizes(): def test_curriculum_config(param_name="test_param1", min_lesson_length=100): - return { - "measure": "progress", - "thresholds": [0.1, 0.3, 0.5], - "min_lesson_length": min_lesson_length, - "signal_smoothing": True, - "parameters": {f"{param_name}": [0.0, 4.0, 6.0, 8.0]}, - } + return CurriculumSettings( + thresholds=[0.1, 0.3, 0.5], + min_lesson_length=min_lesson_length, + parameters={f"{param_name}": [0.0, 4.0, 6.0, 8.0]}, + ) test_meta_curriculum_config = { @@ -119,8 +117,7 @@ def test_get_config(): @pytest.mark.parametrize("curriculum_brain_name", [BRAIN_NAME, "WrongBrainName"]) def test_simple_metacurriculum(curriculum_brain_name): env = SimpleEnvironment([BRAIN_NAME], use_discrete=False) - curriculum_config = json.loads(dummy_curriculum_json_str) - mc = MetaCurriculum({curriculum_brain_name: curriculum_config}) + mc = MetaCurriculum({curriculum_brain_name: dummy_curriculum_config}) trainer_config = yaml.safe_load(TRAINER_CONFIG) _check_environment_trains( env, trainer_config, meta_curriculum=mc, success_threshold=None diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index f3d9703b4e..12561c66e9 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -3,6 +3,7 @@ import pytest import yaml import numpy as np +import attr from typing import Dict, Any from 
mlagents.trainers.tests.simple_test_envs import ( @@ -16,6 +17,17 @@ from mlagents.trainers.sampler_class import SamplerManager from mlagents.trainers.demo_loader import write_demo from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary +from mlagents.trainers.settings import ( + TrainerSettings, + PPOSettings, + SACSettings, + NetworkSettings, + SelfPlaySettings, + BehavioralCloningSettings, + RewardSignalSettings, + GAILSettings, +) +from mlagents.trainers.models import LearningRateSchedule, EncoderType from mlagents_envs.side_channel.environment_parameters_channel import ( EnvironmentParametersChannel, ) @@ -27,7 +39,22 @@ BRAIN_NAME = "1D" -PPO_CONFIG = f""" + +PPO_CONFIG = TrainerSettings( + trainer_type=TrainerSettings.TrainerType.PPO, + hyperparameters=PPOSettings( + learning_rate=5.0e-3, + learning_rate_schedule=LearningRateSchedule.CONSTANT, + batch_size=16, + buffer_size=64, + ), + network_settings=NetworkSettings(num_layers=1, hidden_units=32), + summary_freq=500, + max_steps=3000, + threaded=False, +) + +PPO_CONFIG2 = f""" {BRAIN_NAME}: trainer: ppo batch_size: 16 @@ -54,7 +81,24 @@ gamma: 0.99 """ -SAC_CONFIG = f""" +SAC_CONFIG = TrainerSettings( + trainer_type=TrainerSettings.TrainerType.SAC, + hyperparameters=SACSettings( + learning_rate=5.0e-3, + learning_rate_schedule=LearningRateSchedule.CONSTANT, + batch_size=8, + buffer_init_steps=100, + buffer_size=5000, + tau=0.01, + init_entcoef=0.01, + ), + network_settings=NetworkSettings(num_layers=1, hidden_units=16), + summary_freq=100, + max_steps=1000, + threaded=False, +) + +SAC_CONFIG2 = f""" {BRAIN_NAME}: trainer: sac batch_size: 8 @@ -141,8 +185,6 @@ def _check_environment_trains( StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file debug_writer = DebugWriter() StatsReporter.add_writer(debug_writer) - # Make sure threading is turned off for determinism - trainer_config["threading"] = False if env_manager is None: env_manager = SimpleEnvManager(env, EnvironmentParametersChannel()) trainer_factory = TrainerFactory( @@ -184,7 +226,7 @@ def _check_environment_trains( @pytest.mark.parametrize("use_discrete", [True, False]) def test_simple_ppo(use_discrete): env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete) - config = generate_config(PPO_CONFIG) + config = attr.evolve(PPO_CONFIG) _check_environment_trains(env, config) @@ -193,8 +235,8 @@ def test_2d_ppo(use_discrete): env = SimpleEnvironment( [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.5 ) - config = generate_config(PPO_CONFIG) - _check_environment_trains(env, config) + config = attr.evolve(PPO_CONFIG) + _check_environment_trains(env, {BRAIN_NAME: config}) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -207,9 +249,9 @@ def test_visual_ppo(num_visual, use_discrete): num_vector=0, step_size=0.2, ) - override_vals = {"learning_rate": 3.0e-4} - config = generate_config(PPO_CONFIG, override_vals) - _check_environment_trains(env, config) + new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4) + config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams) + _check_environment_trains(env, {BRAIN_NAME: config}) @pytest.mark.parametrize("num_visual", [1, 2]) @@ -223,36 +265,42 @@ def test_visual_advanced_ppo(vis_encode_type, num_visual): step_size=0.5, vis_obs_size=(36, 36, 3), ) - override_vals = { - "learning_rate": 3.0e-4, - "vis_encode_type": vis_encode_type, - "max_steps": 500, - "summary_freq": 100, - } - config = generate_config(PPO_CONFIG, override_vals) 
+ new_hyperparams = attr.evolve( + PPO_CONFIG.hyperparameters, + learning_rate=3.0e-4, + vis_encode_type=EncoderType(vis_encode_type), + ) + config = attr.evolve( + PPO_CONFIG, hyperparameters=new_hyperparams, max_steps=500, summary_freq=100 + ) # The number of steps is pretty small for these encoders - _check_environment_trains(env, config, success_threshold=0.5) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5) @pytest.mark.parametrize("use_discrete", [True, False]) def test_recurrent_ppo(use_discrete): env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete) - override_vals = { - "max_steps": 5000, - "batch_size": 64, - "buffer_size": 128, - "learning_rate": 1e-3, - "use_recurrent": True, - } - config = generate_config(PPO_CONFIG, override_vals) - _check_environment_trains(env, config, success_threshold=0.9) + new_network_settings = attr.evolve( + PPO_CONFIG.network_settings, + memory=NetworkSettings.MemorySettings(memory_size=16), + ) + new_hyperparams = attr.evolve( + PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128 + ) + config = attr.evolve( + PPO_CONFIG, + hyperparameters=new_hyperparams, + network_settings=new_network_settings, + max_steps=5000, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) @pytest.mark.parametrize("use_discrete", [True, False]) def test_simple_sac(use_discrete): env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete) - config = generate_config(SAC_CONFIG) - _check_environment_trains(env, config) + config = attr.evolve(SAC_CONFIG) + _check_environment_trains(env, {BRAIN_NAME: config}) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -260,9 +308,9 @@ def test_2d_sac(use_discrete): env = SimpleEnvironment( [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8 ) - override_vals = {"buffer_init_steps": 2000, "max_steps": 10000} - config = generate_config(SAC_CONFIG, override_vals) - _check_environment_trains(env, config, success_threshold=0.8) + new_hyperparams = attr.evolve(SAC_CONFIG.hyperparameters, buffer_init_steps=2000) + config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams, max_steps=10000) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -275,9 +323,11 @@ def test_visual_sac(num_visual, use_discrete): num_vector=0, step_size=0.2, ) - override_vals = {"batch_size": 16, "learning_rate": 3e-4} - config = generate_config(SAC_CONFIG, override_vals) - _check_environment_trains(env, config) + new_hyperparams = attr.evolve( + SAC_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4 + ) + config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams) + _check_environment_trains(env, {BRAIN_NAME: config}) @pytest.mark.parametrize("num_visual", [1, 2]) @@ -291,31 +341,46 @@ def test_visual_advanced_sac(vis_encode_type, num_visual): step_size=0.5, vis_obs_size=(36, 36, 3), ) - override_vals = { - "batch_size": 16, - "learning_rate": 3.0e-4, - "vis_encode_type": vis_encode_type, - "buffer_init_steps": 0, - "max_steps": 100, - } - config = generate_config(SAC_CONFIG, override_vals) + new_networksettings = attr.evolve( + SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type) + ) + new_hyperparams = attr.evolve( + SAC_CONFIG.hyperparameters, + batch_size=16, + learning_rate=3e-4, + buffer_init_steps=0, + ) + config = attr.evolve( + SAC_CONFIG, + hyperparameters=new_hyperparams, + 
network_settings=new_networksettings, + max_steps=100, + ) # The number of steps is pretty small for these encoders - _check_environment_trains(env, config, success_threshold=0.5) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5) @pytest.mark.parametrize("use_discrete", [True, False]) def test_recurrent_sac(use_discrete): env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete) - override_vals = { - "batch_size": 64, - "use_recurrent": True, - "max_steps": 5000, - "learning_rate": 1e-3, - "buffer_init_steps": 500, - "steps_per_update": 2, - } - config = generate_config(SAC_CONFIG, override_vals) - _check_environment_trains(env, config) + new_networksettings = attr.evolve( + SAC_CONFIG.network_settings, + memory=NetworkSettings.MemorySettings(memory_size=16), + ) + new_hyperparams = attr.evolve( + SAC_CONFIG.hyperparameters, + batch_size=64, + learning_rate=1e-3, + buffer_init_steps=500, + steps_per_update=2, + ) + config = attr.evolve( + SAC_CONFIG, + hyperparameters=new_hyperparams, + network_settings=new_networksettings, + max_steps=5000, + ) + _check_environment_trains(env, {BRAIN_NAME: config}) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -323,16 +388,11 @@ def test_simple_ghost(use_discrete): env = SimpleEnvironment( [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete ) - override_vals = { - "max_steps": 2500, - "self_play": { - "play_against_latest_model_ratio": 1.0, - "save_steps": 2000, - "swap_steps": 2000, - }, - } - config = generate_config(PPO_CONFIG, override_vals) - _check_environment_trains(env, config) + self_play_settings = SelfPlaySettings( + play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000 + ) + config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500) + _check_environment_trains(env, {BRAIN_NAME: config}) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -342,16 +402,11 @@ def test_simple_ghost_fails(use_discrete): ) # This config should fail because the ghosted policy is never swapped with a competent policy. # Swap occurs after max step is reached. 
- override_vals = { - "max_steps": 2500, - "self_play": { - "play_against_latest_model_ratio": 1.0, - "save_steps": 2000, - "swap_steps": 4000, - }, - } - config = generate_config(PPO_CONFIG, override_vals) - _check_environment_trains(env, config, success_threshold=None) + self_play_settings = SelfPlaySettings( + play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000 + ) + config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None) processed_rewards = [ default_reward_processor(rewards) for rewards in env.final_rewards.values() ] @@ -368,18 +423,14 @@ def test_simple_asymm_ghost(use_discrete): env = SimpleEnvironment( [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete ) - override_vals = { - "max_steps": 4000, - "self_play": { - "play_against_latest_model_ratio": 1.0, - "save_steps": 10000, - "swap_steps": 10000, - "team_change": 4000, - }, - } - config = generate_config(PPO_CONFIG, override_vals) - config[brain_name_opp] = config[BRAIN_NAME] - _check_environment_trains(env, config) + self_play_settings = SelfPlaySettings( + play_against_latest_model_ratio=1.0, + save_steps=10000, + swap_steps=10000, + team_change=400, + ) + config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=4000) + _check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config}) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -391,18 +442,16 @@ def test_simple_asymm_ghost_fails(use_discrete): ) # This config should fail because the team that us not learning when both have reached # max step should be executing the initial, untrained poliy. - override_vals = { - "max_steps": 2000, - "self_play": { - "play_against_latest_model_ratio": 0.0, - "save_steps": 5000, - "swap_steps": 5000, - "team_change": 2000, - }, - } - config = generate_config(PPO_CONFIG, override_vals) - config[brain_name_opp] = config[BRAIN_NAME] - _check_environment_trains(env, config, success_threshold=None) + self_play_settings = SelfPlaySettings( + play_against_latest_model_ratio=1.0, + save_steps=5000, + swap_steps=5000, + team_change=2000, + ) + config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2000) + _check_environment_trains( + env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None + ) processed_rewards = [ default_reward_processor(rewards) for rewards in env.final_rewards.values() ] @@ -448,20 +497,19 @@ def record_demo(use_discrete, num_visual=0, num_vector=1): def test_gail(simple_record, use_discrete, trainer_config): demo_path = simple_record(use_discrete) env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2) - override_vals = { - "max_steps": 500, - "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000}, - "reward_signals": { - "gail": { - "strength": 1.0, - "gamma": 0.99, - "encoding_size": 32, - "demo_path": demo_path, - } - }, + bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) + reward_signals = { + RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( + encoding_size=32, demo_path=demo_path + ) } - config = generate_config(trainer_config, override_vals) - _check_environment_trains(env, config, success_threshold=0.9) + config = attr.evolve( + trainer_config, + reward_signals=reward_signals, + behavioral_cloning=bc_settings, + max_steps=500, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) 
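The test conversions above all follow the same pattern: start from a module-level TrainerSettings constant and derive a per-test variant with attr.evolve instead of mutating a parsed YAML dict. A minimal sketch of that pattern, using the settings classes introduced on this branch (the helper name make_gail_variant is illustrative and not part of the change):

import attr

from mlagents.trainers.settings import (
    BehavioralCloningSettings,
    GAILSettings,
    RewardSignalSettings,
    TrainerSettings,
)


def make_gail_variant(base: TrainerSettings, demo_path: str) -> TrainerSettings:
    # attr.evolve returns a copy of `base` with only the named fields replaced,
    # so shared config constants are never mutated between tests.
    return attr.evolve(
        base,
        reward_signals={
            RewardSignalSettings.RewardSignalType.GAIL: GAILSettings(
                encoding_size=32, demo_path=demo_path
            )
        },
        behavioral_cloning=BehavioralCloningSettings(demo_path=demo_path, steps=1000),
        max_steps=500,
    )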
@pytest.mark.parametrize("use_discrete", [True, False]) @@ -474,21 +522,21 @@ def test_gail_visual_ppo(simple_record, use_discrete): use_discrete=use_discrete, step_size=0.2, ) - override_vals = { - "max_steps": 750, - "learning_rate": 3.0e-4, - "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000}, - "reward_signals": { - "gail": { - "strength": 1.0, - "gamma": 0.99, - "encoding_size": 32, - "demo_path": demo_path, - } - }, + bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) + reward_signals = { + RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( + encoding_size=32, demo_path=demo_path + ) } - config = generate_config(PPO_CONFIG, override_vals) - _check_environment_trains(env, config, success_threshold=0.9) + hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4) + config = attr.evolve( + PPO_CONFIG, + reward_signals=reward_signals, + hyperparameters=hyperparams, + behavioral_cloning=bc_settings, + max_steps=750, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -501,19 +549,20 @@ def test_gail_visual_sac(simple_record, use_discrete): use_discrete=use_discrete, step_size=0.2, ) - override_vals = { - "max_steps": 500, - "batch_size": 16, - "learning_rate": 3.0e-4, - "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000}, - "reward_signals": { - "gail": { - "strength": 1.0, - "gamma": 0.99, - "encoding_size": 32, - "demo_path": demo_path, - } - }, + bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) + reward_signals = { + RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( + encoding_size=32, demo_path=demo_path + ) } - config = generate_config(SAC_CONFIG, override_vals) - _check_environment_trains(env, config, success_threshold=0.9) + hyperparams = attr.evolve( + SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16 + ) + config = attr.evolve( + SAC_CONFIG, + reward_signals=reward_signals, + hyperparameters=hyperparams, + behavioral_cloning=bc_settings, + max_steps=500, + ) + _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9) From c9c66138524ff98ce981f8a5e4ed8ce0dab12a68 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 6 May 2020 14:34:03 -0700 Subject: [PATCH 17/54] Fix remaining simple_rl tests --- .../mlagents/trainers/tests/test_simple_rl.py | 60 +------------------ 1 file changed, 2 insertions(+), 58 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index 12561c66e9..6b1d5935c6 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -54,33 +54,6 @@ threaded=False, ) -PPO_CONFIG2 = f""" - {BRAIN_NAME}: - trainer: ppo - batch_size: 16 - beta: 5.0e-3 - buffer_size: 64 - epsilon: 0.2 - hidden_units: 32 - lambd: 0.95 - learning_rate: 5.0e-3 - learning_rate_schedule: constant - max_steps: 3000 - memory_size: 16 - normalize: false - num_epoch: 3 - num_layers: 1 - time_horizon: 64 - sequence_length: 64 - summary_freq: 500 - use_recurrent: false - threaded: false - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - SAC_CONFIG = TrainerSettings( trainer_type=TrainerSettings.TrainerType.SAC, hyperparameters=SACSettings( @@ -98,35 +71,6 @@ threaded=False, ) -SAC_CONFIG2 = f""" - {BRAIN_NAME}: - trainer: sac - batch_size: 8 - buffer_size: 5000 - buffer_init_steps: 100 - 
hidden_units: 16 - init_entcoef: 0.01 - learning_rate: 5.0e-3 - max_steps: 1000 - memory_size: 16 - normalize: false - steps_per_update: 1 - num_layers: 1 - time_horizon: 64 - sequence_length: 32 - summary_freq: 100 - tau: 0.01 - use_recurrent: false - curiosity_enc_size: 128 - demo_path: None - vis_encode_type: simple - threaded: false - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - def generate_config( config: str, override_vals: Dict[str, Any] = None @@ -365,7 +309,7 @@ def test_recurrent_sac(use_discrete): env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete) new_networksettings = attr.evolve( SAC_CONFIG.network_settings, - memory=NetworkSettings.MemorySettings(memory_size=16), + memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=32), ) new_hyperparams = attr.evolve( SAC_CONFIG.hyperparameters, @@ -443,7 +387,7 @@ def test_simple_asymm_ghost_fails(use_discrete): # This config should fail because the team that us not learning when both have reached # max step should be executing the initial, untrained poliy. self_play_settings = SelfPlaySettings( - play_against_latest_model_ratio=1.0, + play_against_latest_model_ratio=0.0, save_steps=5000, swap_steps=5000, team_change=2000, From 32b934d643d48495908490bd668333e2547c4906 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 6 May 2020 14:34:59 -0700 Subject: [PATCH 18/54] Remove unneeded methods --- ml-agents/mlagents/trainers/tests/test_simple_rl.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index 6b1d5935c6..2b35603b17 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -1,10 +1,9 @@ import math import tempfile import pytest -import yaml import numpy as np import attr -from typing import Dict, Any +from typing import Dict from mlagents.trainers.tests.simple_test_envs import ( SimpleEnvironment, @@ -72,15 +71,6 @@ ) -def generate_config( - config: str, override_vals: Dict[str, Any] = None -) -> Dict[str, Any]: - trainer_config = yaml.safe_load(config) - if override_vals is not None: - trainer_config[BRAIN_NAME].update(override_vals) - return trainer_config - - # The reward processor is passed as an argument to _check_environment_trains. # It is applied to the list pf all final rewards for each brain individually. # This is so that we can process all final rewards in different ways for different algorithms. 
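With the YAML strings and generate_config removed, the typed PPO_CONFIG/SAC_CONFIG objects are the single source of truth for these tests. To inspect what such a config expands to, the same cattr round-trip that learn.py uses when logging the resolved run configuration can be applied directly; a small sketch, assuming the structure/unstructure hooks registered in settings.py on this branch:

import json

import cattr

from mlagents.trainers.settings import TrainerSettings

# Serializes the attrs-based defaults back to plain dicts and primitive values
# (enums become their string values), which makes it easy to compare against an
# old-style YAML block.
print(json.dumps(cattr.unstructure(TrainerSettings()), indent=4))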
From d0c3bd382682033a96d4db62602551f3ff58dbe0 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 6 May 2020 19:32:30 -0700 Subject: [PATCH 19/54] Fix some more tests --- ml-agents/mlagents/trainers/learn.py | 9 +- ml-agents/mlagents/trainers/settings.py | 2 +- .../mlagents/trainers/tests/test_ghost.py | 40 +----- .../mlagents/trainers/tests/test_learn.py | 133 +++++++++--------- .../tests/test_subprocess_env_manager.py | 4 +- 5 files changed, 74 insertions(+), 114 deletions(-) diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index dff6290100..7574a7bc02 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -13,11 +13,7 @@ from mlagents import tf_utils from mlagents.trainers.trainer_controller import TrainerController from mlagents.trainers.meta_curriculum import MetaCurriculum -from mlagents.trainers.trainer_util import ( - TrainerFactory, - handle_existing_directories, - assemble_curriculum_config, -) +from mlagents.trainers.trainer_util import TrainerFactory, handle_existing_directories from mlagents.trainers.stats import ( TensorboardWriter, CSVWriter, @@ -128,9 +124,8 @@ def run_training(run_seed: int, options: RunOptions) -> None: env_manager = SubprocessEnvManager( env_factory, engine_config, env_settings.num_envs ) - curriculum_config = assemble_curriculum_config(options.behaviors) maybe_meta_curriculum = try_create_meta_curriculum( - curriculum_config, env_manager, checkpoint_settings.lesson + options.curriculum, env_manager, checkpoint_settings.lesson ) sampler_manager, resampling_interval = create_sampler_manager( options.parameter_randomization, run_seed diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 112b6b8fbb..78aede4ae5 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -289,7 +289,7 @@ class RunOptions: env_settings: EnvironmentSettings = EnvironmentSettings() engine_settings: EngineSettings = EngineSettings() parameter_randomization: Optional[Dict] = None - curriculum_config: Optional[Dict[str, CurriculumSettings]] = None + curriculum: Optional[Dict[str, CurriculumSettings]] = None checkpoint_settings: CheckpointSettings = CheckpointSettings() # These are options that are relevant to the run itself, and not the engine or environment. 
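Since RunOptions now carries `curriculum` as Optional[Dict[str, CurriculumSettings]], the curriculum section of a run-options YAML is structured straight into settings objects rather than passed around as raw dicts. A minimal sketch of that conversion, assuming cattr's default handling of attrs classes (the behavior name and parameter values here are illustrative):

from typing import Dict

import cattr

from mlagents.trainers.settings import CurriculumSettings

raw_curriculum = {
    "behavior1": {
        "measure": "reward",
        "thresholds": [10, 20, 50],
        "parameters": {"param1": [0.7, 0.5, 0.3, 0.1]},
    }
}
# Each per-behavior block becomes a CurriculumSettings instance; unspecified fields
# (min_lesson_length, signal_smoothing) keep their attrs defaults.
curricula = cattr.structure(raw_curriculum, Dict[str, CurriculumSettings])
assert curricula["behavior1"].thresholds == [10, 20, 50]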
diff --git a/ml-agents/mlagents/trainers/tests/test_ghost.py b/ml-agents/mlagents/trainers/tests/test_ghost.py index 433988467e..f7fa23b09e 100644 --- a/ml-agents/mlagents/trainers/tests/test_ghost.py +++ b/ml-agents/mlagents/trainers/tests/test_ghost.py @@ -2,8 +2,6 @@ import numpy as np -import yaml - from mlagents.trainers.ghost.trainer import GhostTrainer from mlagents.trainers.ghost.controller import GhostController from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers @@ -12,44 +10,12 @@ from mlagents.trainers.agent_processor import AgentManagerQueue from mlagents.trainers.tests import mock_brain as mb from mlagents.trainers.tests.test_trajectory import make_fake_trajectory +from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings @pytest.fixture def dummy_config(): - return yaml.safe_load( - """ - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - normalize: true - memory_size: 8 - curiosity_strength: 0.0 - curiosity_enc_size: 1 - output_path: test - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - self_play: - window: 5 - play_against_current_self_ratio: 0.5 - save_steps: 1000 - swap_steps: 1000 - """ - ) + return TrainerSettings(self_play=SelfPlaySettings()) VECTOR_ACTION_SPACE = [1] @@ -116,7 +82,6 @@ def test_process_trajectory(dummy_config): vector_action_descriptions=[], vector_action_space_type=0, ) - dummy_config["output_path"] = "./results/test_trainer_models/TestModel" ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") controller = GhostController(100) trainer = GhostTrainer( @@ -188,7 +153,6 @@ def test_publish_queue(dummy_config): vector_action_descriptions=[], vector_action_space_type=0, ) - dummy_config["output_path"] = "./results/test_trainer_models/TestModel" ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0") controller = GhostController(100) trainer = GhostTrainer( diff --git a/ml-agents/mlagents/trainers/tests/test_learn.py b/ml-agents/mlagents/trainers/tests/test_learn.py index 3746ee9fbc..3476aedd99 100644 --- a/ml-agents/mlagents/trainers/tests/test_learn.py +++ b/ml-agents/mlagents/trainers/tests/test_learn.py @@ -25,28 +25,30 @@ def basic_options(extra_args=None): MOCK_PARAMETER_YAML = """ behaviors: {} - env_path: "./oldenvfile" - keep_checkpoints: 34 - lesson: 2 - run_id: uselessrun - save_freq: 654321 - seed: 9870 - base_port: 4001 - num_envs: 4 + env_settings: + env_path: "./oldenvfile" + num_envs: 4 + base_port: 4001 + seed: 9870 + checkpoint_settings: + lesson: 2 + run_id: uselessrun + save_freq: 654321 + keep_checkpoints: 34 debug: false """ MOCK_SAMPLER_CURRICULUM_YAML = """ - behaviors: + parameter_randomization: + sampler1: foo + + curriculum: behavior1: - curriculum: - curriculum1 + parameters: + foo: [0.2, 0.5] behavior2: - curriculum: - curriculum2 - - parameter_randomization: - sampler1 + parameters: + foo: [0.2, 0.5] """ @@ -57,7 +59,7 @@ def basic_options(extra_args=None): @patch("mlagents.trainers.learn.SamplerManager") @patch("mlagents.trainers.learn.SubprocessEnvManager") @patch("mlagents.trainers.learn.create_environment_factory") -@patch("mlagents.trainers.learn.load_config") +@patch("mlagents.trainers.settings.load_config") def test_run_training( load_config, create_environment_factory, @@ -111,25 +113,25 @@ def 
test_bad_env_path(): @patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML) def test_commandline_args(mock_file): # No args raises - with pytest.raises(SystemExit): - parse_command_line([]) + # with pytest.raises(SystemExit): + # parse_command_line([]) # Test with defaults opt = parse_command_line(["mytrainerpath"]) assert opt.behaviors == {} - assert opt.env_path is None + assert opt.env_settings.env_path is None assert opt.parameter_randomization is None - assert opt.keep_checkpoints == 5 - assert opt.lesson == 0 - assert opt.resume is False - assert opt.inference is False - assert opt.run_id == "ppo" - assert opt.save_freq == 50000 - assert opt.seed == -1 - assert opt.base_port == 5005 - assert opt.num_envs == 1 - assert opt.no_graphics is False + assert opt.checkpoint_settings.keep_checkpoints == 5 + assert opt.checkpoint_settings.lesson == 0 + assert opt.checkpoint_settings.resume is False + assert opt.checkpoint_settings.inference is False + assert opt.checkpoint_settings.run_id == "ppo" + assert opt.checkpoint_settings.save_freq == 50000 + assert opt.env_settings.seed == -1 + assert opt.env_settings.base_port == 5005 + assert opt.env_settings.num_envs == 1 + assert opt.engine_settings.no_graphics is False assert opt.debug is False - assert opt.env_args is None + assert opt.env_settings.env_args is None full_args = [ "mytrainerpath", @@ -150,19 +152,19 @@ def test_commandline_args(mock_file): opt = parse_command_line(full_args) assert opt.behaviors == {} - assert opt.env_path == "./myenvfile" + assert opt.env_settings.env_path == "./myenvfile" assert opt.parameter_randomization is None - assert opt.keep_checkpoints == 42 - assert opt.lesson == 3 - assert opt.run_id == "myawesomerun" - assert opt.save_freq == 123456 - assert opt.seed == 7890 - assert opt.base_port == 4004 - assert opt.num_envs == 2 - assert opt.no_graphics is True + assert opt.checkpoint_settings.keep_checkpoints == 42 + assert opt.checkpoint_settings.lesson == 3 + assert opt.checkpoint_settings.run_id == "myawesomerun" + assert opt.checkpoint_settings.save_freq == 123456 + assert opt.env_settings.seed == 7890 + assert opt.env_settings.base_port == 4004 + assert opt.env_settings.num_envs == 2 + assert opt.engine_settings.no_graphics is True assert opt.debug is True - assert opt.inference is True - assert opt.resume is True + assert opt.checkpoint_settings.inference is True + assert opt.checkpoint_settings.resume is True @patch("builtins.open", new_callable=mock_open, read_data=MOCK_PARAMETER_YAML) @@ -171,18 +173,18 @@ def test_yaml_args(mock_file): DetectDefault.non_default_args.clear() opt = parse_command_line(["mytrainerpath"]) assert opt.behaviors == {} - assert opt.env_path == "./oldenvfile" + assert opt.env_settings.env_path == "./oldenvfile" assert opt.parameter_randomization is None - assert opt.keep_checkpoints == 34 - assert opt.lesson == 2 - assert opt.run_id == "uselessrun" - assert opt.save_freq == 654321 - assert opt.seed == 9870 - assert opt.base_port == 4001 - assert opt.num_envs == 4 - assert opt.no_graphics is False + assert opt.checkpoint_settings.keep_checkpoints == 34 + assert opt.checkpoint_settings.lesson == 2 + assert opt.checkpoint_settings.run_id == "uselessrun" + assert opt.checkpoint_settings.save_freq == 654321 + assert opt.env_settings.seed == 9870 + assert opt.env_settings.base_port == 4001 + assert opt.env_settings.num_envs == 4 + assert opt.engine_settings.no_graphics is False assert opt.debug is False - assert opt.env_args is None + assert opt.env_settings.env_args is 
None # Test that CLI overrides YAML full_args = [ "mytrainerpath", @@ -203,25 +205,26 @@ def test_yaml_args(mock_file): opt = parse_command_line(full_args) assert opt.behaviors == {} - assert opt.env_path == "./myenvfile" + assert opt.env_settings.env_path == "./myenvfile" assert opt.parameter_randomization is None - assert opt.keep_checkpoints == 42 - assert opt.lesson == 3 - assert opt.run_id == "myawesomerun" - assert opt.save_freq == 123456 - assert opt.seed == 7890 - assert opt.base_port == 4004 - assert opt.num_envs == 2 - assert opt.no_graphics is True + assert opt.checkpoint_settings.keep_checkpoints == 42 + assert opt.checkpoint_settings.lesson == 3 + assert opt.checkpoint_settings.run_id == "myawesomerun" + assert opt.checkpoint_settings.save_freq == 123456 + assert opt.env_settings.seed == 7890 + assert opt.env_settings.base_port == 4004 + assert opt.env_settings.num_envs == 2 + assert opt.engine_settings.no_graphics is True assert opt.debug is True - assert opt.inference is True - assert opt.resume is True + assert opt.checkpoint_settings.inference is True + assert opt.checkpoint_settings.resume is True @patch("builtins.open", new_callable=mock_open, read_data=MOCK_SAMPLER_CURRICULUM_YAML) def test_sampler_configs(mock_file): opt = parse_command_line(["mytrainerpath"]) - assert opt.parameter_randomization == "sampler1" + assert opt.parameter_randomization == {"sampler1": "foo"} + assert len(opt.curriculum.keys()) == 2 @patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML) @@ -237,4 +240,4 @@ def test_env_args(mock_file): ] opt = parse_command_line(full_args) - assert opt.env_args == ["--foo=bar", "--blah", "baz", "100"] + assert opt.env_settings.env_args == ["--foo=bar", "--blah", "baz", "100"] diff --git a/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py b/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py index c66c9343e2..2b60ade95a 100644 --- a/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py +++ b/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py @@ -21,7 +21,6 @@ from mlagents.trainers.tests.test_simple_rl import ( _check_environment_trains, PPO_CONFIG, - generate_config, DebugWriter, ) @@ -193,11 +192,10 @@ def simple_env_factory(worker_id, config): env_manager = SubprocessEnvManager( simple_env_factory, EngineConfig.default_config(), num_envs ) - trainer_config = generate_config(PPO_CONFIG, override_vals={"max_steps": 5000}) # Run PPO using env_manager _check_environment_trains( simple_env_factory(0, []), - trainer_config, + PPO_CONFIG, env_manager=env_manager, success_threshold=None, ) From a2bb9a041bb1e744b7affe4dbfaac1c88d1ede0c Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 6 May 2020 19:51:52 -0700 Subject: [PATCH 20/54] Fix meta curriculum test --- .../trainers/tests/test_meta_curriculum.py | 36 ++++--------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py b/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py index 1ccacaaf6e..440df2ae0b 100644 --- a/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py +++ b/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py @@ -2,10 +2,13 @@ from unittest.mock import patch, Mock from mlagents.trainers.meta_curriculum import MetaCurriculum -import yaml from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment -from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME +from mlagents.trainers.tests.test_simple_rl 
import ( + _check_environment_trains, + BRAIN_NAME, + PPO_CONFIG, +) from mlagents.trainers.tests.test_curriculum import dummy_curriculum_config from mlagents.trainers.settings import CurriculumSettings @@ -88,37 +91,10 @@ def test_get_config(): assert meta_curriculum.get_config() == {"test_param1": 0.0, "test_param2": 0.0} -TRAINER_CONFIG = """ - default: - trainer: ppo - batch_size: 16 - beta: 5.0e-3 - buffer_size: 64 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 5.0e-3 - max_steps: 100 - memory_size: 256 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 50 - use_recurrent: false - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - - @pytest.mark.parametrize("curriculum_brain_name", [BRAIN_NAME, "WrongBrainName"]) def test_simple_metacurriculum(curriculum_brain_name): env = SimpleEnvironment([BRAIN_NAME], use_discrete=False) mc = MetaCurriculum({curriculum_brain_name: dummy_curriculum_config}) - trainer_config = yaml.safe_load(TRAINER_CONFIG) _check_environment_trains( - env, trainer_config, meta_curriculum=mc, success_threshold=None + env, {BRAIN_NAME: PPO_CONFIG}, meta_curriculum=mc, success_threshold=None ) From 8885cb0239279989e692dce222c6004a4e781c14 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 7 May 2020 00:02:14 -0700 Subject: [PATCH 21/54] Fix remaining tests --- .../trainers/optimizer/tf_optimizer.py | 1 - ml-agents/mlagents/trainers/settings.py | 13 +- .../mlagents/trainers/tests/test_nn_policy.py | 19 +- .../mlagents/trainers/tests/test_policy.py | 11 +- ml-agents/mlagents/trainers/tests/test_ppo.py | 95 +++---- .../trainers/tests/test_reward_signals.py | 134 +++------ .../trainers/tests/test_rl_trainer.py | 18 +- ml-agents/mlagents/trainers/tests/test_sac.py | 77 ++--- .../tests/test_subprocess_env_manager.py | 2 +- .../trainers/tests/test_trainer_util.py | 262 +----------------- 10 files changed, 132 insertions(+), 500 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py index 98f61e54d5..de40056f8c 100644 --- a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py @@ -129,7 +129,6 @@ def create_reward_signals( Create reward signals :param reward_signal_configs: Reward signal config. 
""" - print(reward_signal_configs) self.reward_signals = {} # Create reward signals for reward_signal, settings in reward_signal_configs.items(): diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 78aede4ae5..789cc39a1b 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -216,7 +216,7 @@ def to_settings(ttype: TrainerType) -> type: def _set_default_hyperparameters(self): return TrainerSettings.to_settings(self.trainer_type)() - network_settings: NetworkSettings = NetworkSettings() + network_settings: NetworkSettings = attr.ib(default=NetworkSettings()) reward_signals: Dict[ RewardSignalSettings.RewardSignalType, RewardSignalSettings ] = {RewardSignalSettings.RewardSignalType.EXTRINSIC: RewardSignalSettings()} @@ -231,6 +231,17 @@ def _set_default_hyperparameters(self): self_play: Optional[SelfPlaySettings] = None behavioral_cloning: Optional[BehavioralCloningSettings] = None + @network_settings.validator + def _check_batch_size_seq_length(self, attribute, value): + if self.network_settings.memory is not None: + if ( + self.network_settings.memory.sequence_length + > self.hyperparameters.batch_size + ): + raise TrainerConfigError( + "When using memory, sequence length must be less than or equal to batch size. " + ) + cattr.register_structure_hook(RewardSignalSettings, rewardsignal_settings_to_cls) diff --git a/ml-agents/mlagents/trainers/tests/test_nn_policy.py b/ml-agents/mlagents/trainers/tests/test_nn_policy.py index ea6c2fa6c6..d6cb96ed0e 100644 --- a/ml-agents/mlagents/trainers/tests/test_nn_policy.py +++ b/ml-agents/mlagents/trainers/tests/test_nn_policy.py @@ -62,8 +62,8 @@ def test_load_save(tmp_path): _compare_two_policies(policy, policy2) # Try initialize from path 1 - trainer_params["model_path"] = path2 - trainer_params["init_path"] = path1 + trainer_params.output_path = path2 + trainer_params.init_path = path1 policy3 = create_policy_mock(trainer_params, load=False, seed=2) policy3.initialize_or_load() @@ -84,11 +84,11 @@ def _compare_two_policies(policy1: NNPolicy, policy2: NNPolicy) -> None: @pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) @pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"]) @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) -def test_policy_evaluate(dummy_config, rnn, visual, discrete): +def test_policy_evaluate(rnn, visual, discrete): # Test evaluate tf.reset_default_graph() policy = create_policy_mock( - dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual + TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual ) decision_step, terminal_step = mb.create_steps_from_brainparams( policy.brain, num_agents=NUM_AGENTS @@ -101,7 +101,7 @@ def test_policy_evaluate(dummy_config, rnn, visual, discrete): assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0]) -def test_normalization(dummy_config): +def test_normalization(): brain_params = BrainParameters( brain_name="test_brain", vector_observation_space_size=1, @@ -110,7 +110,6 @@ def test_normalization(dummy_config): vector_action_descriptions=[], vector_action_space_type=0, ) - dummy_config["output_path"] = "./results/test_trainer_models/TestModel" time_horizon = 6 trajectory = make_fake_trajectory( @@ -123,7 +122,13 @@ def test_normalization(dummy_config): # Change half of the obs to 0 for i in range(3): trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32) - policy = policy = 
NNPolicy(0, brain_params, dummy_config, False, False) + policy = NNPolicy( + 0, + brain_params, + TrainerSettings(network_settings=NetworkSettings(normalize=True)), + False, + False, + ) trajectory_buffer = trajectory.to_agentbuffer() policy.update_normalization(trajectory_buffer["vector_obs"]) diff --git a/ml-agents/mlagents/trainers/tests/test_policy.py b/ml-agents/mlagents/trainers/tests/test_policy.py index 497cc60db4..86c1112aec 100644 --- a/ml-agents/mlagents/trainers/tests/test_policy.py +++ b/ml-agents/mlagents/trainers/tests/test_policy.py @@ -2,6 +2,7 @@ from mlagents_envs.base_env import DecisionSteps, BehaviorSpec from mlagents.trainers.action_info import ActionInfo from unittest.mock import MagicMock +from mlagents.trainers.settings import TrainerSettings import numpy as np @@ -13,10 +14,6 @@ def basic_mock_brain(): return mock_brain -def basic_params(): - return {"use_recurrent": False, "output_path": "my/path"} - - class FakePolicy(TFPolicy): def create_tf_graph(self): pass @@ -27,7 +24,7 @@ def get_trainable_variables(self): def test_take_action_returns_empty_with_no_agents(): test_seed = 3 - policy = FakePolicy(test_seed, basic_mock_brain(), basic_params()) + policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings()) # Doesn't really matter what this is dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1) no_agent_step = DecisionSteps.empty(dummy_groupspec) @@ -37,7 +34,7 @@ def test_take_action_returns_empty_with_no_agents(): def test_take_action_returns_nones_on_missing_values(): test_seed = 3 - policy = FakePolicy(test_seed, basic_mock_brain(), basic_params()) + policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings()) policy.evaluate = MagicMock(return_value={}) policy.save_memories = MagicMock() step_with_agents = DecisionSteps( @@ -49,7 +46,7 @@ def test_take_action_returns_nones_on_missing_values(): def test_take_action_returns_action_info_when_available(): test_seed = 3 - policy = FakePolicy(test_seed, basic_mock_brain(), basic_params()) + policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings()) policy_eval_out = { "action": np.array([1.0], dtype=np.float32), "memory_out": np.array([[2.5]], dtype=np.float32), diff --git a/ml-agents/mlagents/trainers/tests/test_ppo.py b/ml-agents/mlagents/trainers/tests/test_ppo.py index 6c546c26c5..633a01c388 100644 --- a/ml-agents/mlagents/trainers/tests/test_ppo.py +++ b/ml-agents/mlagents/trainers/tests/test_ppo.py @@ -3,8 +3,8 @@ import numpy as np from mlagents.tf_utils import tf - -import yaml +import copy +import attr from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards from mlagents.trainers.ppo.optimizer import PPOOptimizer @@ -14,7 +14,9 @@ from mlagents.trainers.tests import mock_brain as mb from mlagents.trainers.tests.mock_brain import make_brain_parameters from mlagents.trainers.tests.test_trajectory import make_fake_trajectory -from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import NetworkSettings, TrainerSettings, PPOSettings +from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG +from mlagents.trainers.exception import TrainerConfigError from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable curiosity_dummy_config, gail_dummy_config, @@ -23,35 +25,7 @@ @pytest.fixture def dummy_config(): - return yaml.safe_load( - """ - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - 
max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 16 - summary_freq: 1000 - use_recurrent: false - normalize: true - memory_size: 10 - curiosity_strength: 0.0 - curiosity_enc_size: 1 - output_path: test - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) + return copy.deepcopy(PPO_CONFIG) VECTOR_ACTION_SPACE = [2] @@ -70,11 +44,12 @@ def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visu discrete_action_space=DISCRETE_ACTION_SPACE, ) - trainer_parameters = dummy_config - model_path = "testmodel" - trainer_parameters["model_path"] = model_path - trainer_parameters["keep_checkpoints"] = 3 - trainer_parameters["use_recurrent"] = use_rnn + trainer_parameters = attr.evolve(dummy_config) + trainer_parameters.network_settings.memory = ( + NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) + if use_rnn + else None + ) policy = NNPolicy( 0, mock_brain, trainer_parameters, False, False, create_tf_graph=False ) @@ -131,11 +106,11 @@ def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete): @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) # We need to test this separately from test_reward_signals.py to ensure no interactions def test_ppo_optimizer_update_curiosity( - curiosity_dummy_config, dummy_config, rnn, visual, discrete # noqa: F811 + dummy_config, curiosity_dummy_config, rnn, visual, discrete # noqa: F811 ): # Test evaluate tf.reset_default_graph() - dummy_config["reward_signals"].update(curiosity_dummy_config) + dummy_config.reward_signals = curiosity_dummy_config optimizer = _create_ppo_optimizer_ops_mock( dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual ) @@ -157,9 +132,9 @@ def test_ppo_optimizer_update_curiosity( def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811 # Test evaluate tf.reset_default_graph() - dummy_config["reward_signals"].update(gail_dummy_config) + dummy_config.reward_signals = gail_dummy_config optimizer = _create_ppo_optimizer_ops_mock( - dummy_config, use_rnn=False, use_discrete=False, use_visual=False + PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False ) # Test update update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain) @@ -233,8 +208,8 @@ def test_rl_functions(): @mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer") -def test_trainer_increment_step(ppo_optimizer, dummy_config): - trainer_params = dummy_config +def test_trainer_increment_step(ppo_optimizer): + trainer_params = PPO_CONFIG mock_optimizer = mock.Mock() mock_optimizer.reward_signals = {} ppo_optimizer.return_value = mock_optimizer @@ -254,8 +229,8 @@ def test_trainer_increment_step(ppo_optimizer, dummy_config): policy_mock = mock.Mock(spec=NNPolicy) policy_mock.get_current_step.return_value = 0 step_count = ( - 5 - ) # 10 hacked because this function is no longer called through trainer + 5 # 10 hacked because this function is no longer called through trainer + ) policy_mock.increment_step = mock.Mock(return_value=step_count) trainer.add_policy("testbehavior", policy_mock) @@ -265,7 +240,9 @@ def test_trainer_increment_step(ppo_optimizer, dummy_config): @pytest.mark.parametrize("use_discrete", [True, False]) -def test_trainer_update_policy(dummy_config, use_discrete): +def test_trainer_update_policy( + dummy_config, curiosity_dummy_config, use_discrete # noqa: F811 +): mock_brain = mb.setup_mock_brain( use_discrete, False, @@ -275,14 +252,12 @@ def 
test_trainer_update_policy(dummy_config, use_discrete): ) trainer_params = dummy_config - trainer_params["use_recurrent"] = True + trainer_params.network_settings.memory = NetworkSettings.MemorySettings( + memory_size=10, sequence_length=16 + ) # Test curiosity reward signal - trainer_params["reward_signals"]["curiosity"] = {} - trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0 - trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99 - trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128 - + trainer_params.reward_signals = curiosity_dummy_config trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0") policy = trainer.create_policy(mock_brain.brain_name, mock_brain) trainer.add_policy(mock_brain.brain_name, policy) @@ -310,7 +285,6 @@ def test_process_trajectory(dummy_config): vector_action_descriptions=[], vector_action_space_type=0, ) - dummy_config["output_path"] = "./results/test_trainer_models/TestModel" trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0") policy = trainer.create_policy(brain_params.brain_name, brain_params) trainer.add_policy(brain_params.brain_name, policy) @@ -368,7 +342,6 @@ def test_add_get_policy(ppo_optimizer, dummy_config): mock_optimizer.reward_signals = {} ppo_optimizer.return_value = mock_optimizer - dummy_config["output_path"] = "./results/test_trainer_models/TestModel" trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0") policy = mock.Mock(spec=NNPolicy) policy.get_current_step.return_value = 2000 @@ -386,15 +359,19 @@ def test_add_get_policy(ppo_optimizer, dummy_config): trainer.add_policy(brain_params, policy) -def test_bad_config(dummy_config): +# TODO: Move this to test_settings.py +def test_bad_config(): brain_params = make_brain_parameters( discrete_action=False, visual_inputs=0, vec_obs_size=6 ) # Test that we throw an error if we have sequence length greater than batch size - dummy_config["sequence_length"] = 64 - dummy_config["batch_size"] = 32 - dummy_config["use_recurrent"] = True - with pytest.raises(UnityTrainerException): + with pytest.raises(TrainerConfigError): + TrainerSettings( + network_settings=NetworkSettings( + memory=NetworkSettings.MemorySettings(sequence_length=64) + ), + hyperparameters=PPOSettings(batch_size=32), + ) _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0") diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py index e6acba95df..cf660e1a20 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py @@ -1,88 +1,49 @@ import pytest -import yaml +import copy import os import mlagents.trainers.tests.mock_brain as mb from mlagents.trainers.policy.nn_policy import NNPolicy from mlagents.trainers.sac.optimizer import SACOptimizer from mlagents.trainers.ppo.optimizer import PPOOptimizer +from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG +from mlagents.trainers.settings import ( + GAILSettings, + CuriositySettings, + RewardSignalSettings, + BehavioralCloningSettings, + NetworkSettings, + TrainerSettings, +) CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo" DISCRETE_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo" def ppo_dummy_config(): - return yaml.safe_load( - """ - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 
3.0e-4 - max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - memory_size: 8 - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) + return copy.deepcopy(PPO_CONFIG) def sac_dummy_config(): - return yaml.safe_load( - """ - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 128 - init_entcoef: 1.0 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - memory_size: 256 - normalize: false - steps_per_update: 1 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) + return copy.deepcopy(SAC_CONFIG) @pytest.fixture def gail_dummy_config(): return { - "gail": { - "strength": 0.1, - "gamma": 0.9, - "encoding_size": 128, - "use_vail": True, - "demo_path": CONTINUOUS_PATH, - } + RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( + demo_path=CONTINUOUS_PATH + ) } @pytest.fixture def curiosity_dummy_config(): - return {"curiosity": {"strength": 0.1, "gamma": 0.9, "encoding_size": 128}} + return {RewardSignalSettings.RewardSignalType.CURIOSITY: CuriositySettings()} + + +@pytest.fixture +def extrinsic_dummy_config(): + return {RewardSignalSettings.RewardSignalType.EXTRINSIC: RewardSignalSettings()} VECTOR_ACTION_SPACE = [2] @@ -104,15 +65,16 @@ def create_optimizer_mock( discrete_action_space=DISCRETE_ACTION_SPACE, ) trainer_parameters = trainer_config - model_path = "testpath" - trainer_parameters["output_path"] = model_path - trainer_parameters["keep_checkpoints"] = 3 - trainer_parameters["reward_signals"].update(reward_signal_config) - trainer_parameters["use_recurrent"] = use_rnn + trainer_parameters.reward_signals = reward_signal_config + trainer_parameters.network_settings.memory = ( + NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) + if use_rnn + else None + ) policy = NNPolicy( 0, mock_brain, trainer_parameters, False, False, create_tf_graph=False ) - if trainer_parameters["trainer"] == "sac": + if trainer_parameters.trainer_type == TrainerSettings.TrainerType.SAC: optimizer = SACOptimizer(policy, trainer_parameters) else: optimizer = PPOOptimizer(policy, trainer_parameters) @@ -142,14 +104,8 @@ def reward_signal_update(optimizer, reward_signal_name): "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"] ) def test_gail_cc(trainer_config, gail_dummy_config): - trainer_config.update( - { - "behavioral_cloning": { - "demo_path": CONTINUOUS_PATH, - "strength": 1.0, - "steps": 10000000, - } - } + trainer_config.behavioral_cloning = BehavioralCloningSettings( + demo_path=CONTINUOUS_PATH ) optimizer = create_optimizer_mock( trainer_config, gail_dummy_config, False, False, False @@ -162,18 +118,13 @@ def test_gail_cc(trainer_config, gail_dummy_config): "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"] ) def test_gail_dc_visual(trainer_config, gail_dummy_config): - gail_dummy_config["gail"]["demo_path"] = DISCRETE_PATH - trainer_config.update( - { - "behavioral_cloning": { - "demo_path": DISCRETE_PATH, - "strength": 1.0, - "steps": 10000000, - } - } - ) + gail_dummy_config_discrete = { + RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( + demo_path=DISCRETE_PATH + ) + } optimizer = create_optimizer_mock( - trainer_config, gail_dummy_config, False, True, True + trainer_config, gail_dummy_config_discrete, False, True, 
True ) reward_signal_eval(optimizer, "gail") reward_signal_update(optimizer, "gail") @@ -183,15 +134,6 @@ def test_gail_dc_visual(trainer_config, gail_dummy_config): "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"] ) def test_gail_rnn(trainer_config, gail_dummy_config): - trainer_config.update( - { - "behavioral_cloning": { - "demo_path": CONTINUOUS_PATH, - "strength": 1.0, - "steps": 10000000, - } - } - ) policy = create_optimizer_mock( trainer_config, gail_dummy_config, True, False, False ) @@ -246,9 +188,9 @@ def test_curiosity_rnn(trainer_config, curiosity_dummy_config): @pytest.mark.parametrize( "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"] ) -def test_extrinsic(trainer_config, curiosity_dummy_config): +def test_extrinsic(trainer_config, extrinsic_dummy_config): policy = create_optimizer_mock( - trainer_config, curiosity_dummy_config, False, False, False + trainer_config, extrinsic_dummy_config, False, False, False ) reward_signal_eval(policy, "extrinsic") reward_signal_update(policy, "extrinsic") diff --git a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py index 9ee983e331..854bb99266 100644 --- a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py +++ b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py @@ -1,24 +1,10 @@ -import yaml from unittest import mock import pytest import mlagents.trainers.tests.mock_brain as mb from mlagents.trainers.trainer.rl_trainer import RLTrainer from mlagents.trainers.tests.test_buffer import construct_fake_buffer from mlagents.trainers.agent_processor import AgentManagerQueue - - -def dummy_config(): - return yaml.safe_load( - """ - output_path: "test/" - summary_freq: 1000 - max_steps: 100 - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) +from mlagents.trainers.settings import TrainerSettings def create_mock_brain(): @@ -57,7 +43,7 @@ def _process_trajectory(self, trajectory): def create_rl_trainer(): mock_brainparams = create_mock_brain() - trainer = FakeTrainer(mock_brainparams, dummy_config(), True, 0) + trainer = FakeTrainer(mock_brainparams, TrainerSettings(max_steps=100), True, 0) trainer.set_is_policy_updating(True) return trainer diff --git a/ml-agents/mlagents/trainers/tests/test_sac.py b/ml-agents/mlagents/trainers/tests/test_sac.py index 66756f8640..1fad099048 100644 --- a/ml-agents/mlagents/trainers/tests/test_sac.py +++ b/ml-agents/mlagents/trainers/tests/test_sac.py @@ -1,6 +1,6 @@ import pytest from unittest import mock -import yaml +import copy from mlagents.tf_utils import tf @@ -12,39 +12,16 @@ from mlagents.trainers.tests import mock_brain as mb from mlagents.trainers.tests.mock_brain import make_brain_parameters from mlagents.trainers.tests.test_trajectory import make_fake_trajectory -from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.tests.test_simple_rl import SAC_CONFIG +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable + curiosity_dummy_config, +) @pytest.fixture def dummy_config(): - return yaml.safe_load( - """ - trainer: sac - batch_size: 8 - buffer_size: 10240 - buffer_init_steps: 0 - hidden_units: 32 - init_entcoef: 0.1 - learning_rate: 3.0e-4 - max_steps: 1024 - memory_size: 10 - normalize: true - steps_per_update: 1 - num_layers: 1 - time_horizon: 64 - sequence_length: 16 - summary_freq: 1000 - tau: 0.005 - use_recurrent: false - 
curiosity_enc_size: 128 - demo_path: None - vis_encode_type: simple - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) + return copy.deepcopy(SAC_CONFIG) VECTOR_ACTION_SPACE = [2] @@ -62,12 +39,12 @@ def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual): vector_obs_space=VECTOR_OBS_SPACE, discrete_action_space=DISCRETE_ACTION_SPACE, ) - trainer_parameters = dummy_config - model_path = "testmodel" - trainer_parameters["output_path"] = model_path - trainer_parameters["keep_checkpoints"] = 3 - trainer_parameters["use_recurrent"] = use_rnn + trainer_parameters.network_settings.memory = ( + NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) + if use_rnn + else None + ) policy = NNPolicy( 0, mock_brain, trainer_parameters, False, False, create_tf_graph=False ) @@ -95,14 +72,13 @@ def test_sac_optimizer_update(dummy_config, rnn, visual, discrete): @pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) -def test_sac_update_reward_signals(dummy_config, discrete): +def test_sac_update_reward_signals( + dummy_config, curiosity_dummy_config, discrete # noqa: F811 +): # Test evaluate tf.reset_default_graph() # Add a Curiosity module - dummy_config["reward_signals"]["curiosity"] = {} - dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0 - dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99 - dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128 + dummy_config.reward_signals = curiosity_dummy_config optimizer = create_sac_optimizer_mock( dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False ) @@ -127,8 +103,7 @@ def test_sac_save_load_buffer(tmpdir, dummy_config): discrete_action_space=DISCRETE_ACTION_SPACE, ) trainer_params = dummy_config - trainer_params["output_path"] = str(tmpdir) - trainer_params["save_replay_buffer"] = True + trainer_params.hyperparameters.save_replay_buffer = True trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0) policy = trainer.create_policy(mock_brain.brain_name, mock_brain) trainer.add_policy(mock_brain.brain_name, policy) @@ -154,7 +129,6 @@ def test_add_get_policy(sac_optimizer, dummy_config): mock_optimizer.reward_signals = {} sac_optimizer.return_value = mock_optimizer - dummy_config["output_path"] = "./results/test_trainer_models/TestModel" trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0") policy = mock.Mock(spec=NNPolicy) policy.get_current_step.return_value = 2000 @@ -176,8 +150,8 @@ def test_advance(dummy_config): brain_params = make_brain_parameters( discrete_action=False, visual_inputs=0, vec_obs_size=6 ) - dummy_config["output_path"] = "./results/test_trainer_models/TestModel" - dummy_config["steps_per_update"] = 20 + dummy_config.hyperparameters.steps_per_update = 20 + dummy_config.hyperparameters.buffer_init_steps = 0 trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0") policy = trainer.create_policy(brain_params.brain_name, brain_params) trainer.add_policy(brain_params.brain_name, policy) @@ -247,18 +221,5 @@ def test_advance(dummy_config): policy_queue.get_nowait() -def test_bad_config(dummy_config): - brain_params = make_brain_parameters( - discrete_action=False, visual_inputs=0, vec_obs_size=6 - ) - # Test that we throw an error if we have sequence length greater than batch size - dummy_config["sequence_length"] = 64 - dummy_config["batch_size"] = 32 - dummy_config["use_recurrent"] = True - dummy_config["output_path"] = 
"./results/test_trainer_models/TestModel" - with pytest.raises(UnityTrainerException): - _ = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0") - - if __name__ == "__main__": pytest.main() diff --git a/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py b/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py index 2b60ade95a..7ad1026563 100644 --- a/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py +++ b/ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py @@ -195,7 +195,7 @@ def simple_env_factory(worker_id, config): # Run PPO using env_manager _check_environment_trains( simple_env_factory(0, []), - PPO_CONFIG, + {"1D": PPO_CONFIG}, env_manager=env_manager, success_threshold=None, ) diff --git a/ml-agents/mlagents/trainers/tests/test_trainer_util.py b/ml-agents/mlagents/trainers/tests/test_trainer_util.py index a1490518ad..2c0dbd3193 100644 --- a/ml-agents/mlagents/trainers/tests/test_trainer_util.py +++ b/ml-agents/mlagents/trainers/tests/test_trainer_util.py @@ -1,140 +1,20 @@ import pytest -import yaml import io import os from unittest.mock import patch from mlagents.trainers import trainer_util -from mlagents.trainers.trainer_util import assemble_curriculum_config from mlagents.trainers.cli_utils import load_config, _load_config from mlagents.trainers.ppo.trainer import PPOTrainer from mlagents.trainers.exception import TrainerConfigError, UnityTrainerException from mlagents.trainers.brain import BrainParameters +from mlagents.trainers.settings import RunOptions +from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG @pytest.fixture def dummy_config(): - return yaml.safe_load( - """ - default: - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - gamma: 0.99 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - memory_size: 8 - use_curiosity: false - curiosity_strength: 0.0 - curiosity_enc_size: 1 - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) - - -@pytest.fixture -def dummy_config_with_override(dummy_config): - base = dummy_config - base["testbrain"] = {} - base["testbrain"]["normalize"] = False - return base - - -@pytest.fixture -def dummy_bad_config(): - return yaml.safe_load( - """ - default: - trainer: incorrect_trainer - brain_to_imitate: ExpertBrain - batches_per_epoch: 16 - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - gamma: 0.99 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - memory_size: 8 - """ - ) - - -@patch("mlagents.trainers.brain.BrainParameters") -def test_initialize_trainer_parameters_override_defaults( - BrainParametersMock, dummy_config_with_override -): - run_id = "testrun" - output_path = "model_dir" - keep_checkpoints = 1 - train_model = True - load_model = False - seed = 11 - expected_reward_buff_cap = 1 - - base_config = dummy_config_with_override - expected_config = base_config["default"] - expected_config["output_path"] = output_path + "/testbrain" - expected_config["keep_checkpoints"] = keep_checkpoints - - # Override value from specific brain config - expected_config["normalize"] = False - - brain_params_mock = BrainParametersMock() - BrainParametersMock.return_value.brain_name = "testbrain" - 
external_brains = {"testbrain": brain_params_mock} - - def mock_constructor( - self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id - ): - assert brain == brain_params_mock.brain_name - assert trainer_parameters == expected_config - assert reward_buff_cap == expected_reward_buff_cap - assert training == train_model - assert load == load_model - assert seed == seed - assert run_id == run_id - - with patch.object(PPOTrainer, "__init__", mock_constructor): - trainer_factory = trainer_util.TrainerFactory( - trainer_config=base_config, - run_id=run_id, - output_path=output_path, - keep_checkpoints=keep_checkpoints, - train_model=train_model, - load_model=load_model, - seed=seed, - ) - trainers = {} - for _, brain_parameters in external_brains.items(): - trainers["testbrain"] = trainer_factory.generate( - brain_parameters.brain_name - ) - assert "testbrain" in trainers - assert isinstance(trainers["testbrain"], PPOTrainer) + return RunOptions(behaviors={"testbrain": PPO_CONFIG}) @patch("mlagents.trainers.brain.BrainParameters") @@ -150,10 +30,8 @@ def test_initialize_ppo_trainer(BrainParametersMock, dummy_config): seed = 11 expected_reward_buff_cap = 1 - base_config = dummy_config - expected_config = base_config["default"] - expected_config["output_path"] = output_path + "/testbrain" - expected_config["keep_checkpoints"] = keep_checkpoints + base_config = dummy_config.behaviors + expected_config = PPO_CONFIG def mock_constructor( self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id @@ -184,72 +62,12 @@ def mock_constructor( @patch("mlagents.trainers.brain.BrainParameters") -def test_initialize_invalid_trainer_raises_exception( - BrainParametersMock, dummy_bad_config -): - run_id = "testrun" - output_path = "results_dir" - keep_checkpoints = 1 - train_model = True - load_model = False - seed = 11 - bad_config = dummy_bad_config - BrainParametersMock.return_value.brain_name = "testbrain" - external_brains = {"testbrain": BrainParametersMock()} - - with pytest.raises(TrainerConfigError): - trainer_factory = trainer_util.TrainerFactory( - trainer_config=bad_config, - run_id=run_id, - output_path=output_path, - keep_checkpoints=keep_checkpoints, - train_model=train_model, - load_model=load_model, - seed=seed, - ) - trainers = {} - for brain_name, brain_parameters in external_brains.items(): - trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name) - - # Test no trainer specified - del bad_config["default"]["trainer"] - with pytest.raises(TrainerConfigError): - trainer_factory = trainer_util.TrainerFactory( - trainer_config=bad_config, - run_id=run_id, - output_path=output_path, - keep_checkpoints=keep_checkpoints, - train_model=train_model, - load_model=load_model, - seed=seed, - ) - trainers = {} - for brain_name, brain_parameters in external_brains.items(): - trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name) - - # Test BC trainer specified - bad_config["default"]["trainer"] = "offline_bc" - with pytest.raises(UnityTrainerException): - trainer_factory = trainer_util.TrainerFactory( - trainer_config=bad_config, - run_id=run_id, - output_path=output_path, - keep_checkpoints=keep_checkpoints, - train_model=train_model, - load_model=load_model, - seed=seed, - ) - trainers = {} - for brain_name, brain_parameters in external_brains.items(): - trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name) - - -def test_handles_no_default_section(dummy_config): +def 
test_handles_no_config_provided(BrainParametersMock): """ - Make sure the trainer setup handles a missing "default" in the config. + Make sure the trainer setup handles no configs provided at all. """ brain_name = "testbrain" - no_default_config = {brain_name: dummy_config["default"]} + no_default_config = RunOptions().behaviors brain_parameters = BrainParameters( brain_name=brain_name, vector_observation_space_size=1, @@ -271,35 +89,6 @@ def test_handles_no_default_section(dummy_config): trainer_factory.generate(brain_parameters.brain_name) -def test_raise_if_no_config_for_brain(dummy_config): - """ - Make sure the trainer setup raises a friendlier exception if both "default" and the brain name - are missing from the config. - """ - brain_name = "testbrain" - bad_config = {"some_other_brain": dummy_config["default"]} - brain_parameters = BrainParameters( - brain_name=brain_name, - vector_observation_space_size=1, - camera_resolutions=[], - vector_action_space_size=[2], - vector_action_descriptions=[], - vector_action_space_type=0, - ) - - trainer_factory = trainer_util.TrainerFactory( - trainer_config=bad_config, - run_id="testrun", - output_path="output_path", - keep_checkpoints=1, - train_model=True, - load_model=False, - seed=42, - ) - with pytest.raises(TrainerConfigError): - trainer_factory.generate(brain_parameters) - - def test_load_config_missing_file(): with pytest.raises(TrainerConfigError): load_config("thisFileDefinitelyDoesNotExist.yaml") @@ -327,41 +116,6 @@ def test_load_config_invalid_yaml(): _load_config(fp) -def test_assemble_curriculum_config(): - file_contents = """ -behavior1: - curriculum: - foo: 5 -behavior2: - curriculum: - foo: 6 - """ - trainer_config = _load_config(file_contents) - curriculum_config = assemble_curriculum_config(trainer_config) - assert curriculum_config == {"behavior1": {"foo": 5}, "behavior2": {"foo": 6}} - - # Check that nothing is returned if no curriculum. - file_contents = """ -behavior1: - foo: 3 -behavior2: - foo: 4 - """ - trainer_config = _load_config(file_contents) - curriculum_config = assemble_curriculum_config(trainer_config) - assert curriculum_config == {} - - # Check that method doesn't break if 1st level entity isn't a dict. - # Note: this is a malformed configuration. - file_contents = """ -behavior1: 3 -behavior2: 4 - """ - trainer_config = _load_config(file_contents) - curriculum_config = assemble_curriculum_config(trainer_config) - assert curriculum_config == {} - - def test_existing_directories(tmp_path): output_path = os.path.join(tmp_path, "runid") # Test fresh new unused path - should do nothing. From f5a97c84e31bfd2dd846123f96a45bc251d438c4 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 7 May 2020 14:25:38 -0700 Subject: [PATCH 22/54] Fix update config script --- config/upgrade_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/upgrade_config.py b/config/upgrade_config.py index c2c716db5d..ec4ce7bdf7 100644 --- a/config/upgrade_config.py +++ b/config/upgrade_config.py @@ -10,11 +10,11 @@ # Take an existing trainer config (e.g. trainer_config.yaml) and turn it into the new format. 
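# (Hedged editor's sketch, not part of the original patch.) The regrouping that the
# upgrade script performs can be read off the converted YAML files later in this series
# (e.g. config/ppo/3DBall.yaml): the old flat per-behavior keys are split into
# "trainer_type", a "hyperparameters" block, a "network_settings" block, and top-level
# run keys, with "reward_signals" carried over as-is. The helper below is a minimal,
# hypothetical illustration of that split; the key groupings are taken from the converted
# configs in this patch series, and the real script's handling (e.g. of recurrent/memory
# settings) may differ.
def _example_regroup(old_behavior: dict) -> dict:
    network_keys = {"normalize", "hidden_units", "num_layers", "vis_encode_type"}
    top_level_keys = {"reward_signals", "max_steps", "time_horizon", "summary_freq", "keep_checkpoints"}
    new_behavior: dict = {"trainer_type": old_behavior.get("trainer", "ppo"), "hyperparameters": {}, "network_settings": {}}
    for key, value in old_behavior.items():
        if key == "trainer":
            continue  # already captured as trainer_type
        elif key in network_keys:
            new_behavior["network_settings"][key] = value
        elif key in top_level_keys:
            new_behavior[key] = value
        else:
            new_behavior["hyperparameters"][key] = value
    return new_behavior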
def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]: all_behavior_config_dict = {} - default_config = old_config.get("default", {}) - for behavior_name, config in old_config.items(): + default_config = old_trainer_config.get("default", {}) + for behavior_name, config in old_trainer_config.items(): if behavior_name != "default": config = default_config.copy() - config.update(old_config[behavior_name]) + config.update(old_trainer_config[behavior_name]) # Convert to split TrainerSettings, Hyperparameters, NetworkSettings # Set trainer_type and get appropriate hyperparameter settings From 85827ce3a87b9abd92ba9fd2edc29618ca5e7756 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 7 May 2020 15:01:01 -0700 Subject: [PATCH 23/54] Revert 3DBall.yaml --- config/ppo/3DBall.yaml | 52 +++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/config/ppo/3DBall.yaml b/config/ppo/3DBall.yaml index fffdd4efeb..9f2767f73d 100644 --- a/config/ppo/3DBall.yaml +++ b/config/ppo/3DBall.yaml @@ -1,31 +1,25 @@ behaviors: 3DBall: - trainer_type: ppo - - # hyperparameters: - # batch_size: 64 - # beta: 0.001 - # buffer_size: 12000 - # epsilon: 0.2 - # lambd: 0.99 - # learning_rate: 0.0003 - # learning_rate_schedule: linear - # max_steps: 5.0e5 - # num_epoch: 3 - - # time_horizon: 1000 - # sequence_length: 64 - # summary_freq: 12000 - network_settings: - num_layers: 2 - normalize: true - hidden_units: 128 - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 12000 - - + trainer: ppo + batch_size: 64 + beta: 0.001 + buffer_size: 12000 + epsilon: 0.2 + hidden_units: 128 + lambd: 0.99 + learning_rate: 0.0003 + learning_rate_schedule: linear + max_steps: 5.0e5 + memory_size: 128 + normalize: true + num_epoch: 3 + num_layers: 2 + time_horizon: 1000 + sequence_length: 64 + summary_freq: 12000 + use_recurrent: false + vis_encode_type: simple + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 From b3bb2699579c0b3d0e3471e54925e29747a66747 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 7 May 2020 15:22:27 -0700 Subject: [PATCH 24/54] Convert PPO configs to new format --- config/ppo/3DBall.yaml | 42 ++++----- config/ppo/3DBallHard.yaml | 42 ++++----- config/ppo/3DBall_randomize.yaml | 76 ++++++++-------- config/ppo/Basic.yaml | 42 ++++----- config/ppo/Bouncer.yaml | 42 ++++----- config/ppo/CrawlerDynamic.yaml | 42 ++++----- config/ppo/CrawlerStatic.yaml | 42 ++++----- config/ppo/FoodCollector.yaml | 42 ++++----- config/ppo/GridWorld.yaml | 42 ++++----- config/ppo/Hallway.yaml | 45 +++++----- config/ppo/PushBlock.yaml | 42 ++++----- config/ppo/Pyramids.yaml | 45 +++++----- config/ppo/Reacher.yaml | 42 ++++----- config/ppo/SoccerTwos.yaml | 56 ++++++------ config/ppo/StrikersVsGoalie.yaml | 99 ++++++++++---------- config/ppo/Tennis.yaml | 49 +++++----- config/ppo/VisualHallway.yaml | 45 +++++----- config/ppo/VisualPushBlock.yaml | 45 +++++----- config/ppo/VisualPyramids.yaml | 45 +++++----- config/ppo/Walker.yaml | 42 ++++----- config/ppo/WallJump.yaml | 83 ++++++++--------- config/ppo/WallJump_curriculum.yaml | 115 ++++++++++++------------ config/ppo/WormDynamic.yaml | 42 ++++----- config/ppo/WormStatic.yaml | 42 ++++----- ml-agents/mlagents/trainers/settings.py | 7 +- 25 files changed, 660 insertions(+), 596 deletions(-) diff --git a/config/ppo/3DBall.yaml b/config/ppo/3DBall.yaml index 9f2767f73d..a76e60656a 100644 --- a/config/ppo/3DBall.yaml +++ b/config/ppo/3DBall.yaml 
@@ -1,25 +1,27 @@ behaviors: 3DBall: - trainer: ppo - batch_size: 64 - beta: 0.001 - buffer_size: 12000 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.99 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 5.0e5 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 12000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 64 + buffer_size: 12000 + learning_rate: 0.0003 + beta: 0.001 + epsilon: 0.2 + lambd: 0.99 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 1000 + summary_freq: 12000 + threaded: true diff --git a/config/ppo/3DBallHard.yaml b/config/ppo/3DBallHard.yaml index 9054b21de5..d4155223dd 100644 --- a/config/ppo/3DBallHard.yaml +++ b/config/ppo/3DBallHard.yaml @@ -1,25 +1,27 @@ behaviors: 3DBallHard: - trainer: ppo - batch_size: 1200 - beta: 0.001 - buffer_size: 12000 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 5.0e6 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 12000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 1200 + buffer_size: 12000 + learning_rate: 0.0003 + beta: 0.001 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 5000000 + time_horizon: 1000 + summary_freq: 12000 + threaded: true diff --git a/config/ppo/3DBall_randomize.yaml b/config/ppo/3DBall_randomize.yaml index d01efad8c0..f7a4c6c2fb 100644 --- a/config/ppo/3DBall_randomize.yaml +++ b/config/ppo/3DBall_randomize.yaml @@ -1,40 +1,42 @@ behaviors: - 3DBall: - trainer: ppo - batch_size: 64 - beta: 0.001 - buffer_size: 12000 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.99 - learning_rate: 3.0e-4 - learning_rate_schedule: linear - max_steps: 5.0e5 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 12000 - use_recurrent: false - vis_encode_type: simple - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 + 3DBall: + trainer_type: ppo + hyperparameters: + batch_size: 64 + buffer_size: 12000 + learning_rate: 0.0003 + beta: 0.001 + epsilon: 0.2 + lambd: 0.99 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple + reward_signals: + extrinsic: + gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 1000 + summary_freq: 12000 + threaded: true parameter_randomization: - resampling-interval: 500 - mass: - sampler-type: "uniform" - min_value: 0.5 - max_value: 10 - gravity: - sampler-type: "uniform" - min_value: 7 - max_value: 12 - scale: - sampler-type: "uniform" - min_value: 0.75 - max_value: 3 + resampling-interval: 500 + mass: + sampler-type: uniform + min_value: 0.5 + max_value: 10 + gravity: + sampler-type: uniform + min_value: 7 + max_value: 12 + 
scale: + sampler-type: uniform + min_value: 0.75 + max_value: 3 diff --git a/config/ppo/Basic.yaml b/config/ppo/Basic.yaml index 07bd93c12c..5b054a6612 100644 --- a/config/ppo/Basic.yaml +++ b/config/ppo/Basic.yaml @@ -1,25 +1,27 @@ behaviors: Basic: - trainer: ppo - batch_size: 32 - beta: 0.005 - buffer_size: 256 - epsilon: 0.2 - hidden_units: 20 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 5.0e5 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 1 - time_horizon: 3 - sequence_length: 64 - summary_freq: 2000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 32 + buffer_size: 256 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 20 + num_layers: 1 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.9 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 3 + summary_freq: 2000 + threaded: true diff --git a/config/ppo/Bouncer.yaml b/config/ppo/Bouncer.yaml index 7deee97ac3..56629e0a10 100644 --- a/config/ppo/Bouncer.yaml +++ b/config/ppo/Bouncer.yaml @@ -1,25 +1,27 @@ behaviors: Bouncer: - trainer: ppo - batch_size: 1024 - beta: 0.005 - buffer_size: 10240 - epsilon: 0.2 - hidden_units: 64 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 4.0e6 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 1024 + buffer_size: 10240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 64 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 4000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/ppo/CrawlerDynamic.yaml b/config/ppo/CrawlerDynamic.yaml index 0922eb7e01..82268ac509 100644 --- a/config/ppo/CrawlerDynamic.yaml +++ b/config/ppo/CrawlerDynamic.yaml @@ -1,25 +1,27 @@ behaviors: CrawlerDynamic: - trainer: ppo - batch_size: 2024 - beta: 0.005 - buffer_size: 20240 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 1e7 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2024 + buffer_size: 20240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/ppo/CrawlerStatic.yaml b/config/ppo/CrawlerStatic.yaml index e532c1c198..1d38e26623 100644 --- a/config/ppo/CrawlerStatic.yaml +++ b/config/ppo/CrawlerStatic.yaml @@ -1,25 +1,27 @@ behaviors: CrawlerStatic: - trainer: ppo - batch_size: 2024 - beta: 0.005 - buffer_size: 20240 - 
epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 1e7 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2024 + buffer_size: 20240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/ppo/FoodCollector.yaml b/config/ppo/FoodCollector.yaml index 53abc801db..95c9d10703 100644 --- a/config/ppo/FoodCollector.yaml +++ b/config/ppo/FoodCollector.yaml @@ -1,25 +1,27 @@ behaviors: FoodCollector: - trainer: ppo - batch_size: 1024 - beta: 0.005 - buffer_size: 10240 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 2.0e6 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 1024 + buffer_size: 10240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 2000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/ppo/GridWorld.yaml b/config/ppo/GridWorld.yaml index e7ccd13434..b124fb46c1 100644 --- a/config/ppo/GridWorld.yaml +++ b/config/ppo/GridWorld.yaml @@ -1,25 +1,27 @@ behaviors: GridWorld: - trainer: ppo - batch_size: 32 - beta: 0.005 - buffer_size: 256 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 500000 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 1 - time_horizon: 5 - sequence_length: 64 - summary_freq: 20000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 32 + buffer_size: 256 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 1 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.9 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 5 + summary_freq: 20000 + threaded: true diff --git a/config/ppo/Hallway.yaml b/config/ppo/Hallway.yaml index 29247505ba..38a941689a 100644 --- a/config/ppo/Hallway.yaml +++ b/config/ppo/Hallway.yaml @@ -1,25 +1,30 @@ behaviors: Hallway: - trainer: ppo - batch_size: 128 - beta: 0.01 - buffer_size: 1024 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 1.0e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: true - vis_encode_type: simple + trainer_type: ppo + 
hyperparameters: + batch_size: 128 + buffer_size: 1024 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple + memory: + sequence_length: 64 + memory_size: 128 reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/ppo/PushBlock.yaml b/config/ppo/PushBlock.yaml index 246d07aa11..062f8fd364 100644 --- a/config/ppo/PushBlock.yaml +++ b/config/ppo/PushBlock.yaml @@ -1,25 +1,27 @@ behaviors: PushBlock: - trainer: ppo - batch_size: 128 - beta: 0.01 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 2.0e6 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 60000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 2048 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 2000000 + time_horizon: 64 + summary_freq: 60000 + threaded: true diff --git a/config/ppo/Pyramids.yaml b/config/ppo/Pyramids.yaml index 6f385ae16e..85317963bf 100644 --- a/config/ppo/Pyramids.yaml +++ b/config/ppo/Pyramids.yaml @@ -1,29 +1,32 @@ behaviors: Pyramids: - trainer: ppo - batch_size: 128 - beta: 0.01 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 1.0e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 128 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 2048 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 512 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 curiosity: - strength: 0.02 gamma: 0.99 + strength: 0.02 encoding_size: 256 + learning_rate: 0.0003 + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 128 + summary_freq: 30000 + threaded: true diff --git a/config/ppo/Reacher.yaml b/config/ppo/Reacher.yaml index 18ebb35580..2b17116430 100644 --- a/config/ppo/Reacher.yaml +++ b/config/ppo/Reacher.yaml @@ -1,25 +1,27 @@ behaviors: Reacher: - trainer: ppo - batch_size: 2024 - beta: 0.005 - buffer_size: 20240 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 2e7 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 60000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2024 + buffer_size: 20240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + 
hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 + time_horizon: 1000 + summary_freq: 60000 + threaded: true diff --git a/config/ppo/SoccerTwos.yaml b/config/ppo/SoccerTwos.yaml index 4b10dc911a..4e9f7861b8 100644 --- a/config/ppo/SoccerTwos.yaml +++ b/config/ppo/SoccerTwos.yaml @@ -1,38 +1,34 @@ behaviors: SoccerTwos: - trainer: ppo - batch_size: 2048 - beta: 0.005 - buffer_size: 20480 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2048 + buffer_size: 20480 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: constant + network_settings: + normalize: false + hidden_units: 512 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 50000000 + time_horizon: 1000 + summary_freq: 10000 + threaded: true self_play: - window: 10 - play_against_latest_model_ratio: 0.5 save_steps: 50000 - swap_steps: 50000 team_change: 200000 - curriculum: - measure: progress - thresholds: [0.05, 0.1] - min_lesson_length: 100 - signal_smoothing: true - parameters: - ball_touch: [1.0, 0.5, 0.0] + swap_steps: 50000 + window: 10 + play_against_latest_model_ratio: 0.5 + initial_elo: 1200.0 diff --git a/config/ppo/StrikersVsGoalie.yaml b/config/ppo/StrikersVsGoalie.yaml index bd93f3a733..cf69e1dde2 100644 --- a/config/ppo/StrikersVsGoalie.yaml +++ b/config/ppo/StrikersVsGoalie.yaml @@ -1,62 +1,67 @@ behaviors: Goalie: - trainer: ppo - batch_size: 2048 - beta: 0.005 - buffer_size: 20480 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2048 + buffer_size: 20480 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: constant + network_settings: + normalize: false + hidden_units: 512 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 50000000 + time_horizon: 1000 + summary_freq: 10000 + threaded: true self_play: - window: 10 - play_against_latest_model_ratio: 0.5 save_steps: 50000 - swap_steps: 25000 team_change: 200000 - + swap_steps: 25000 + window: 10 + play_against_latest_model_ratio: 0.5 + initial_elo: 1200.0 Striker: - trainer: ppo - batch_size: 2048 - beta: 0.005 - buffer_size: 20480 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2048 + buffer_size: 20480 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 
0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: constant + network_settings: + normalize: false + hidden_units: 512 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 50000000 + time_horizon: 1000 + summary_freq: 10000 + threaded: true self_play: - window: 10 - play_against_latest_model_ratio: 0.5 save_steps: 50000 - swap_steps: 100000 team_change: 200000 + swap_steps: 100000 + window: 10 + play_against_latest_model_ratio: 0.5 + initial_elo: 1200.0 diff --git a/config/ppo/Tennis.yaml b/config/ppo/Tennis.yaml index 3ad006fa18..9463f8b23c 100644 --- a/config/ppo/Tennis.yaml +++ b/config/ppo/Tennis.yaml @@ -1,31 +1,34 @@ behaviors: Tennis: - trainer: ppo - batch_size: 1024 - beta: 0.005 - buffer_size: 10240 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e7 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 1024 + buffer_size: 10240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: constant + network_settings: + normalize: true + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 50000000 + time_horizon: 1000 + summary_freq: 10000 + threaded: true self_play: - window: 10 - play_against_latest_model_ratio: 0.5 save_steps: 50000 - swap_steps: 50000 team_change: 100000 + swap_steps: 50000 + window: 10 + play_against_latest_model_ratio: 0.5 + initial_elo: 1200.0 diff --git a/config/ppo/VisualHallway.yaml b/config/ppo/VisualHallway.yaml index abcbfc3ee3..bf953bba40 100644 --- a/config/ppo/VisualHallway.yaml +++ b/config/ppo/VisualHallway.yaml @@ -1,25 +1,30 @@ behaviors: VisualHallway: - trainer: ppo - batch_size: 64 - beta: 0.01 - buffer_size: 1024 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 1.0e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 1 - time_horizon: 64 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: true - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 64 + buffer_size: 1024 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 128 + num_layers: 1 + vis_encode_type: simple + memory: + sequence_length: 64 + memory_size: 128 reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/ppo/VisualPushBlock.yaml b/config/ppo/VisualPushBlock.yaml index 1ea0f1fa6f..260a31f3ff 100644 --- a/config/ppo/VisualPushBlock.yaml +++ b/config/ppo/VisualPushBlock.yaml @@ -1,25 +1,30 @@ behaviors: VisualPushBlock: - trainer: ppo - batch_size: 64 - beta: 0.01 - buffer_size: 1024 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 3.0e6 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 1 - time_horizon: 64 - sequence_length: 32 - summary_freq: 
60000 - use_recurrent: true - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 64 + buffer_size: 1024 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 128 + num_layers: 1 + vis_encode_type: simple + memory: + sequence_length: 32 + memory_size: 128 reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 3000000 + time_horizon: 64 + summary_freq: 60000 + threaded: true diff --git a/config/ppo/VisualPyramids.yaml b/config/ppo/VisualPyramids.yaml index 2447c44c48..34f1b8843c 100644 --- a/config/ppo/VisualPyramids.yaml +++ b/config/ppo/VisualPyramids.yaml @@ -1,29 +1,32 @@ behaviors: VisualPyramids: - trainer: ppo - batch_size: 64 - beta: 0.01 - buffer_size: 2024 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 1.0e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 1 - time_horizon: 128 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 64 + buffer_size: 2024 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 1 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 curiosity: - strength: 0.01 gamma: 0.99 + strength: 0.01 encoding_size: 256 + learning_rate: 0.0003 + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 128 + summary_freq: 10000 + threaded: true diff --git a/config/ppo/Walker.yaml b/config/ppo/Walker.yaml index 52648fedff..fde187be69 100644 --- a/config/ppo/Walker.yaml +++ b/config/ppo/Walker.yaml @@ -1,25 +1,27 @@ behaviors: Walker: - trainer: ppo - batch_size: 2048 - beta: 0.005 - buffer_size: 20480 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 2e7 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2048 + buffer_size: 20480 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/ppo/WallJump.yaml b/config/ppo/WallJump.yaml index b4fdb65b7e..f150863489 100644 --- a/config/ppo/WallJump.yaml +++ b/config/ppo/WallJump.yaml @@ -1,50 +1,53 @@ behaviors: BigWallJump: - trainer: ppo - batch_size: 128 - beta: 0.005 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 2e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 128 - sequence_length: 64 - summary_freq: 20000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 2048 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + 
lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 - - SmallWallJump: - trainer: ppo - batch_size: 128 - beta: 0.005 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 5e6 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 time_horizon: 128 - sequence_length: 64 summary_freq: 20000 - use_recurrent: false - vis_encode_type: simple + threaded: true + SmallWallJump: + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 2048 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 5000000 + time_horizon: 128 + summary_freq: 20000 + threaded: true diff --git a/config/ppo/WallJump_curriculum.yaml b/config/ppo/WallJump_curriculum.yaml index 48640a767d..de13b5efcf 100644 --- a/config/ppo/WallJump_curriculum.yaml +++ b/config/ppo/WallJump_curriculum.yaml @@ -1,65 +1,70 @@ behaviors: BigWallJump: - trainer: ppo - batch_size: 128 - beta: 0.005 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 2e7 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 128 - sequence_length: 64 - summary_freq: 20000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 2048 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 - curriculum: - measure: progress - thresholds: [0.1, 0.3, 0.5] - min_lesson_length: 100 - signal_smoothing: true - parameters: - big_wall_min_height: [0.0, 4.0, 6.0, 8.0] - big_wall_max_height: [4.0, 7.0, 8.0, 8.0] - - SmallWallJump: - trainer: ppo - batch_size: 128 - beta: 0.005 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 5e6 - memory_size: 128 - normalize: false - num_epoch: 3 - num_layers: 2 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 time_horizon: 128 - sequence_length: 64 summary_freq: 20000 - use_recurrent: false - vis_encode_type: simple + threaded: true + SmallWallJump: + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 2048 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 - curriculum: - measure: progress - thresholds: [0.1, 0.3, 0.5] - min_lesson_length: 100 - signal_smoothing: true - parameters: - small_wall_height: [1.5, 2.0, 2.5, 4.0] + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 5000000 + time_horizon: 128 + summary_freq: 20000 + threaded: 
true + +curriculum: + BigWallJump: + measure: progress + thresholds: [0.1, 0.3, 0.5] + min_lesson_length: 100 + signal_smoothing: true + parameters: + big_wall_min_height: [0.0, 4.0, 6.0, 8.0] + big_wall_max_height: [4.0, 7.0, 8.0, 8.0] + SmallWallJump: + measure: progress + thresholds: [0.1, 0.3, 0.5] + min_lesson_length: 100 + signal_smoothing: true + parameters: + small_wall_height: [1.5, 2.0, 2.5, 4.0] diff --git a/config/ppo/WormDynamic.yaml b/config/ppo/WormDynamic.yaml index 6a1d5d862c..0c0331209f 100644 --- a/config/ppo/WormDynamic.yaml +++ b/config/ppo/WormDynamic.yaml @@ -1,25 +1,27 @@ behaviors: WormDynamic: - trainer: ppo - batch_size: 2024 - beta: 0.005 - buffer_size: 20240 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 3.5e6 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2024 + buffer_size: 20240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 3500000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/ppo/WormStatic.yaml b/config/ppo/WormStatic.yaml index 07ebaacd85..5bbcbc58ea 100644 --- a/config/ppo/WormStatic.yaml +++ b/config/ppo/WormStatic.yaml @@ -1,25 +1,27 @@ behaviors: WormStatic: - trainer: ppo - batch_size: 2024 - beta: 0.005 - buffer_size: 20240 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - learning_rate_schedule: linear - max_steps: 3.5e6 - memory_size: 128 - normalize: true - num_epoch: 3 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false - vis_encode_type: simple + trainer_type: ppo + hyperparameters: + batch_size: 2024 + buffer_size: 20240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 3500000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 789cc39a1b..da48c86503 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -48,12 +48,15 @@ def trainer_settings_to_cls(d: Mapping, t: type) -> Any: ) else: - print(d_copy[key]) d_copy[key] = strict_to_cls( d_copy[key], TrainerSettings.to_settings(d_copy["trainer_type"]) ) elif key == "reward_signals": d_copy[key] = rewardsignal_settings_to_cls(val) + elif key == "max_steps": + d_copy[key] = int( + float(val) + ) # In some configs, max steps was specified as a float else: d_copy[key] = check_and_structure(key, val, t) return t(**d_copy) @@ -224,7 +227,7 @@ def _set_default_hyperparameters(self): output_path: str = "default" # TODO: Remove parser default and remove from CLI keep_checkpoints: int = parser.get_default("keep_checkpoints") - max_steps: float = 500000 + max_steps: int = 500000 time_horizon: int = 64 
summary_freq: int = 50000 threaded: bool = True From 41b11f1d749db9b60c882b5099d92f096b7a2752 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 7 May 2020 15:26:04 -0700 Subject: [PATCH 25/54] Update SAC configs --- config/sac/3DBall.yaml | 44 +++++++++-------- config/sac/3DBallHard.yaml | 44 +++++++++-------- config/sac/Basic.yaml | 44 +++++++++-------- config/sac/Bouncer.yaml | 44 +++++++++-------- config/sac/CrawlerDynamic.yaml | 44 +++++++++-------- config/sac/CrawlerStatic.yaml | 44 +++++++++-------- config/sac/FoodCollector.yaml | 44 +++++++++-------- config/sac/GridWorld.yaml | 44 +++++++++-------- config/sac/Hallway.yaml | 47 ++++++++++-------- config/sac/PushBlock.yaml | 44 +++++++++-------- config/sac/Pyramids.yaml | 48 ++++++++++-------- config/sac/Reacher.yaml | 44 +++++++++-------- config/sac/Tennis.yaml | 50 ++++++++++--------- config/sac/VisualHallway.yaml | 48 ++++++++++-------- config/sac/VisualPushBlock.yaml | 48 ++++++++++-------- config/sac/VisualPyramids.yaml | 48 ++++++++++-------- config/sac/Walker.yaml | 44 +++++++++-------- config/sac/WallJump.yaml | 87 ++++++++++++++++++--------------- config/sac/WormDynamic.yaml | 44 +++++++++-------- config/sac/WormStatic.yaml | 44 +++++++++-------- 20 files changed, 522 insertions(+), 426 deletions(-) diff --git a/config/sac/3DBall.yaml b/config/sac/3DBall.yaml index 498062df0e..3949be48c3 100644 --- a/config/sac/3DBall.yaml +++ b/config/sac/3DBall.yaml @@ -1,25 +1,29 @@ behaviors: 3DBall: - trainer: sac - batch_size: 64 - buffer_size: 12000 - buffer_init_steps: 0 - hidden_units: 64 - init_entcoef: 0.5 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e5 - memory_size: 128 - normalize: true - steps_per_update: 10 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 12000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 64 + buffer_size: 12000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.5 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: true + hidden_units: 64 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 1000 + summary_freq: 12000 + threaded: true diff --git a/config/sac/3DBallHard.yaml b/config/sac/3DBallHard.yaml index b72c13897d..2ce4183e70 100644 --- a/config/sac/3DBallHard.yaml +++ b/config/sac/3DBallHard.yaml @@ -1,25 +1,29 @@ behaviors: 3DBallHard: - trainer: sac - batch_size: 256 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 128 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e5 - memory_size: 128 - normalize: true - steps_per_update: 10 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 12000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 256 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: true + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + 
keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 1000 + summary_freq: 12000 + threaded: true diff --git a/config/sac/Basic.yaml b/config/sac/Basic.yaml index 401159a683..51cab0ebce 100644 --- a/config/sac/Basic.yaml +++ b/config/sac/Basic.yaml @@ -1,25 +1,29 @@ behaviors: Basic: - trainer: sac - batch_size: 64 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 20 - init_entcoef: 0.01 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e5 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 2 - time_horizon: 10 - sequence_length: 64 - summary_freq: 2000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 64 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.01 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 20 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 10 + summary_freq: 2000 + threaded: true diff --git a/config/sac/Bouncer.yaml b/config/sac/Bouncer.yaml index 81dee56cb5..0503a1eb4c 100644 --- a/config/sac/Bouncer.yaml +++ b/config/sac/Bouncer.yaml @@ -1,25 +1,29 @@ behaviors: Bouncer: - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 64 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 1.0e6 - memory_size: 128 - normalize: true - steps_per_update: 10 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 20000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: true + hidden_units: 64 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 1000000 + time_horizon: 64 + summary_freq: 20000 + threaded: true diff --git a/config/sac/CrawlerDynamic.yaml b/config/sac/CrawlerDynamic.yaml index ef6001b9b4..9324162947 100644 --- a/config/sac/CrawlerDynamic.yaml +++ b/config/sac/CrawlerDynamic.yaml @@ -1,25 +1,29 @@ behaviors: CrawlerDynamic: - trainer: sac - batch_size: 256 - buffer_size: 500000 - buffer_init_steps: 0 - hidden_units: 512 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5e6 - memory_size: 128 - normalize: true - steps_per_update: 20 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 256 + buffer_size: 500000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 20.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 20.0 + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + 
keep_checkpoints: 5 + max_steps: 5000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/sac/CrawlerStatic.yaml b/config/sac/CrawlerStatic.yaml index e12f354f75..5935bb4fb3 100644 --- a/config/sac/CrawlerStatic.yaml +++ b/config/sac/CrawlerStatic.yaml @@ -1,25 +1,29 @@ behaviors: CrawlerStatic: - trainer: sac - batch_size: 256 - buffer_size: 500000 - buffer_init_steps: 2000 - hidden_units: 512 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 3e6 - memory_size: 128 - normalize: true - steps_per_update: 20 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 256 + buffer_size: 500000 + buffer_init_steps: 2000 + tau: 0.005 + steps_per_update: 20.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 20.0 + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 3000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/sac/FoodCollector.yaml b/config/sac/FoodCollector.yaml index c415a011d5..1ce4d6cb11 100644 --- a/config/sac/FoodCollector.yaml +++ b/config/sac/FoodCollector.yaml @@ -1,25 +1,29 @@ behaviors: FoodCollector: - trainer: sac - batch_size: 256 - buffer_size: 500000 - buffer_init_steps: 0 - hidden_units: 128 - init_entcoef: 0.05 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 2.0e6 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 10000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 256 + buffer_size: 500000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.05 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 2000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/sac/GridWorld.yaml b/config/sac/GridWorld.yaml index ac60928197..084e821a87 100644 --- a/config/sac/GridWorld.yaml +++ b/config/sac/GridWorld.yaml @@ -1,25 +1,29 @@ behaviors: GridWorld: - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 1000 - hidden_units: 128 - init_entcoef: 0.5 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 500000 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 1 - time_horizon: 5 - sequence_length: 64 - summary_freq: 20000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 50000 + buffer_init_steps: 1000 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.5 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 128 + num_layers: 1 + vis_encode_type: simple reward_signals: extrinsic: - 
strength: 1.0 gamma: 0.9 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 500000 + time_horizon: 5 + summary_freq: 20000 + threaded: true diff --git a/config/sac/Hallway.yaml b/config/sac/Hallway.yaml index 26a6b4af3a..30a507d8dd 100644 --- a/config/sac/Hallway.yaml +++ b/config/sac/Hallway.yaml @@ -1,25 +1,32 @@ behaviors: Hallway: - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 128 - init_entcoef: 0.1 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5.0e6 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 2 - time_horizon: 64 - sequence_length: 32 - summary_freq: 10000 - tau: 0.005 - use_recurrent: true - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.1 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple + memory: + sequence_length: 32 + memory_size: 128 reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 5000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/sac/PushBlock.yaml b/config/sac/PushBlock.yaml index b87851a0bd..bb0319885e 100644 --- a/config/sac/PushBlock.yaml +++ b/config/sac/PushBlock.yaml @@ -1,25 +1,29 @@ behaviors: PushBlock: - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 256 - init_entcoef: 0.05 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 2e6 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 100000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.05 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 2000000 + time_horizon: 64 + summary_freq: 100000 + threaded: true diff --git a/config/sac/Pyramids.yaml b/config/sac/Pyramids.yaml index 6f42e6df52..476273d681 100644 --- a/config/sac/Pyramids.yaml +++ b/config/sac/Pyramids.yaml @@ -1,31 +1,37 @@ behaviors: Pyramids: - trainer: sac - batch_size: 128 - buffer_size: 500000 - buffer_init_steps: 10000 - hidden_units: 256 - init_entcoef: 0.01 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 1.0e7 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 2 - time_horizon: 128 - sequence_length: 16 - summary_freq: 30000 - tau: 0.01 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 500000 + buffer_init_steps: 10000 + tau: 0.01 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.01 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + 
vis_encode_type: simple reward_signals: extrinsic: - strength: 2.0 gamma: 0.99 + strength: 2.0 gail: - strength: 0.02 gamma: 0.99 + strength: 0.02 encoding_size: 128 + learning_rate: 0.0003 use_actions: true + use_vail: false demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 128 + summary_freq: 30000 + threaded: true diff --git a/config/sac/Reacher.yaml b/config/sac/Reacher.yaml index 258c26582e..91d4e02a59 100644 --- a/config/sac/Reacher.yaml +++ b/config/sac/Reacher.yaml @@ -1,25 +1,29 @@ behaviors: Reacher: - trainer: sac - batch_size: 128 - buffer_size: 500000 - buffer_init_steps: 0 - hidden_units: 128 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 2e7 - memory_size: 128 - normalize: true - steps_per_update: 20 - num_layers: 2 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 60000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 500000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 20.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 20.0 + network_settings: + normalize: true + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 + time_horizon: 1000 + summary_freq: 60000 + threaded: true diff --git a/config/sac/Tennis.yaml b/config/sac/Tennis.yaml index bad7ab4feb..f5e258b655 100644 --- a/config/sac/Tennis.yaml +++ b/config/sac/Tennis.yaml @@ -1,30 +1,36 @@ behaviors: Tennis: - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 256 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 2e7 - memory_size: 128 - normalize: true - steps_per_update: 10 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 10000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: true + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true self_play: - window: 10 - play_against_current_self_ratio: 0.5 save_steps: 50000 + team_change: 250000 swap_steps: 50000 + window: 10 + play_against_latest_model_ratio: 0.5 + initial_elo: 1200.0 diff --git a/config/sac/VisualHallway.yaml b/config/sac/VisualHallway.yaml index 1d1bf6826b..e27d1a0298 100644 --- a/config/sac/VisualHallway.yaml +++ b/config/sac/VisualHallway.yaml @@ -1,26 +1,32 @@ behaviors: VisualHallway: - trainer: sac - batch_size: 64 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 128 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 1.0e7 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 1 - time_horizon: 64 - sequence_length: 32 - summary_freq: 10000 - tau: 0.005 - 
use_recurrent: true - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 64 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 128 + num_layers: 1 + vis_encode_type: simple + memory: + sequence_length: 32 + memory_size: 128 reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 - gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/sac/VisualPushBlock.yaml b/config/sac/VisualPushBlock.yaml index 88c622fb84..315773656b 100644 --- a/config/sac/VisualPushBlock.yaml +++ b/config/sac/VisualPushBlock.yaml @@ -1,26 +1,32 @@ behaviors: VisualPushBlock: - trainer: sac - batch_size: 64 - buffer_size: 1024 - buffer_init_steps: 0 - hidden_units: 128 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 3.0e6 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 1 - time_horizon: 64 - sequence_length: 32 - summary_freq: 60000 - tau: 0.005 - use_recurrent: true - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 64 + buffer_size: 1024 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 128 + num_layers: 1 + vis_encode_type: simple + memory: + sequence_length: 32 + memory_size: 128 reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 - gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 3000000 + time_horizon: 64 + summary_freq: 60000 + threaded: true diff --git a/config/sac/VisualPyramids.yaml b/config/sac/VisualPyramids.yaml index 7bcc61f289..ce4af8b017 100644 --- a/config/sac/VisualPyramids.yaml +++ b/config/sac/VisualPyramids.yaml @@ -1,31 +1,37 @@ behaviors: VisualPyramids: - trainer: sac - batch_size: 64 - buffer_size: 500000 - buffer_init_steps: 1000 - hidden_units: 256 - init_entcoef: 0.01 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 1.0e7 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 1 - time_horizon: 128 - sequence_length: 64 - summary_freq: 10000 - tau: 0.01 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 64 + buffer_size: 500000 + buffer_init_steps: 1000 + tau: 0.01 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.01 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 256 + num_layers: 1 + vis_encode_type: simple reward_signals: extrinsic: - strength: 2.0 gamma: 0.99 + strength: 2.0 gail: - strength: 0.02 gamma: 0.99 + strength: 0.02 encoding_size: 128 + learning_rate: 0.0003 use_actions: true + use_vail: false demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 128 + summary_freq: 10000 + threaded: true diff --git a/config/sac/Walker.yaml b/config/sac/Walker.yaml index bb5023ca18..7216208ace 100644 --- a/config/sac/Walker.yaml +++ b/config/sac/Walker.yaml @@ 
-1,25 +1,29 @@ behaviors: Walker: - trainer: sac - batch_size: 256 - buffer_size: 500000 - buffer_init_steps: 0 - hidden_units: 512 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 2e7 - memory_size: 128 - normalize: true - steps_per_update: 30 - num_layers: 4 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 256 + buffer_size: 500000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 30.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 30.0 + network_settings: + normalize: true + hidden_units: 512 + num_layers: 4 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/sac/WallJump.yaml b/config/sac/WallJump.yaml index 2b8991dd8c..cb2962ded4 100644 --- a/config/sac/WallJump.yaml +++ b/config/sac/WallJump.yaml @@ -1,50 +1,57 @@ behaviors: BigWallJump: - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 256 - init_entcoef: 0.1 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 2e7 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 2 - time_horizon: 128 - sequence_length: 64 - summary_freq: 20000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.1 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 - - SmallWallJump: - trainer: sac - batch_size: 128 - buffer_size: 50000 - buffer_init_steps: 0 - hidden_units: 256 - init_entcoef: 0.1 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 5e6 - memory_size: 128 - normalize: false - steps_per_update: 10 - num_layers: 2 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 20000000 time_horizon: 128 - sequence_length: 64 summary_freq: 20000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + threaded: true + SmallWallJump: + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 128 + buffer_size: 50000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.1 + reward_signal_steps_per_update: 10.0 + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 5000000 + time_horizon: 128 + summary_freq: 20000 + threaded: true diff --git a/config/sac/WormDynamic.yaml b/config/sac/WormDynamic.yaml index e5bbef7e16..9af3fe2e66 100644 --- a/config/sac/WormDynamic.yaml +++ b/config/sac/WormDynamic.yaml @@ -1,25 +1,29 @@ behaviors: WormDynamic: - trainer: sac - batch_size: 256 - buffer_size: 500000 - buffer_init_steps: 0 - hidden_units: 512 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: 
constant - max_steps: 5e6 - memory_size: 128 - normalize: true - steps_per_update: 20 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 256 + buffer_size: 500000 + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 20.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 20.0 + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 5000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true diff --git a/config/sac/WormStatic.yaml b/config/sac/WormStatic.yaml index b0645a8bc6..48a688afe8 100644 --- a/config/sac/WormStatic.yaml +++ b/config/sac/WormStatic.yaml @@ -1,25 +1,29 @@ behaviors: WormStatic: - trainer: sac - batch_size: 256 - buffer_size: 500000 - buffer_init_steps: 2000 - hidden_units: 512 - init_entcoef: 1.0 - learning_rate: 0.0003 - learning_rate_schedule: constant - max_steps: 3e6 - memory_size: 128 - normalize: true - steps_per_update: 20 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - tau: 0.005 - use_recurrent: false - vis_encode_type: simple + trainer_type: sac + hyperparameters: + learning_rate: 0.0003 + learning_rate_schedule: constant + batch_size: 256 + buffer_size: 500000 + buffer_init_steps: 2000 + tau: 0.005 + steps_per_update: 20.0 + save_replay_buffer: false + init_entcoef: 1.0 + reward_signal_steps_per_update: 20.0 + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: extrinsic: - strength: 1.0 gamma: 0.995 + strength: 1.0 + output_path: default + keep_checkpoints: 5 + max_steps: 3000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true From 02b54fc616f801379975f2e2faf69e380889befb Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 7 May 2020 15:28:55 -0700 Subject: [PATCH 26/54] Remove nulls from configs, update imitation --- config/imitation/CrawlerStatic.yaml | 46 +++++++++++++++------------ config/imitation/FoodCollector.yaml | 46 +++++++++++++++------------ config/imitation/Hallway.yaml | 48 +++++++++++++++++------------ config/imitation/PushBlock.yaml | 43 +++++++++++++++----------- config/upgrade_config.py | 16 ++++++++-- 5 files changed, 122 insertions(+), 77 deletions(-) diff --git a/config/imitation/CrawlerStatic.yaml b/config/imitation/CrawlerStatic.yaml index 57705f7815..c69ed5571f 100644 --- a/config/imitation/CrawlerStatic.yaml +++ b/config/imitation/CrawlerStatic.yaml @@ -1,29 +1,37 @@ behaviors: CrawlerStatic: - trainer: ppo - batch_size: 2024 - beta: 0.005 - buffer_size: 20240 - epsilon: 0.2 - hidden_units: 512 - lambd: 0.95 - learning_rate: 0.0003 - max_steps: 1e7 - memory_size: 256 - normalize: true - num_epoch: 3 - num_layers: 3 - time_horizon: 1000 - sequence_length: 64 - summary_freq: 30000 - use_recurrent: false + trainer_type: ppo + hyperparameters: + batch_size: 2024 + buffer_size: 20240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: true + hidden_units: 512 + num_layers: 3 + vis_encode_type: simple reward_signals: gail: - strength: 1.0 gamma: 0.99 + strength: 1.0 encoding_size: 128 + learning_rate: 0.0003 + use_actions: 
false + use_vail: false demo_path: Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 1000 + summary_freq: 30000 + threaded: true behavioral_cloning: demo_path: Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo - strength: 0.5 steps: 50000 + strength: 0.5 + samples_per_update: 0 diff --git a/config/imitation/FoodCollector.yaml b/config/imitation/FoodCollector.yaml index f5682763be..3d6328269e 100644 --- a/config/imitation/FoodCollector.yaml +++ b/config/imitation/FoodCollector.yaml @@ -1,29 +1,37 @@ behaviors: FoodCollector: - trainer: ppo - batch_size: 64 - beta: 0.005 - buffer_size: 10240 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - max_steps: 2.0e6 - memory_size: 256 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - sequence_length: 32 - summary_freq: 10000 - use_recurrent: false + trainer_type: ppo + hyperparameters: + batch_size: 64 + buffer_size: 10240 + learning_rate: 0.0003 + beta: 0.005 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple reward_signals: gail: - strength: 0.1 gamma: 0.99 + strength: 0.1 encoding_size: 128 + learning_rate: 0.0003 + use_actions: false + use_vail: false demo_path: Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo + output_path: default + keep_checkpoints: 5 + max_steps: 2000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true behavioral_cloning: demo_path: Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo - strength: 1.0 steps: 0 + strength: 1.0 + samples_per_update: 0 diff --git a/config/imitation/Hallway.yaml b/config/imitation/Hallway.yaml index 235b310877..27baeacdd4 100644 --- a/config/imitation/Hallway.yaml +++ b/config/imitation/Hallway.yaml @@ -1,28 +1,38 @@ behaviors: Hallway: - trainer: ppo - batch_size: 128 - beta: 0.01 - buffer_size: 1024 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 0.0003 - max_steps: 1.0e7 - memory_size: 256 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 10000 - use_recurrent: true + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 1024 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 128 + num_layers: 2 + vis_encode_type: simple + memory: + sequence_length: 64 + memory_size: 256 reward_signals: extrinsic: - strength: 1.0 gamma: 0.99 + strength: 1.0 gail: - strength: 0.1 gamma: 0.99 + strength: 0.1 encoding_size: 128 + learning_rate: 0.0003 + use_actions: false + use_vail: false demo_path: Project/Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo + output_path: default + keep_checkpoints: 5 + max_steps: 10000000 + time_horizon: 64 + summary_freq: 10000 + threaded: true diff --git a/config/imitation/PushBlock.yaml b/config/imitation/PushBlock.yaml index 2d1e996733..ffddc01278 100644 --- a/config/imitation/PushBlock.yaml +++ b/config/imitation/PushBlock.yaml @@ -1,25 +1,32 @@ behaviors: PushBlock: - trainer: ppo - batch_size: 128 - beta: 0.01 - buffer_size: 2048 - epsilon: 0.2 - hidden_units: 256 - lambd: 0.95 - learning_rate: 0.0003 - max_steps: 1.5e7 - memory_size: 256 - normalize: false - num_epoch: 3 - num_layers: 2 - time_horizon: 64 - 
sequence_length: 64 - summary_freq: 60000 - use_recurrent: false + trainer_type: ppo + hyperparameters: + batch_size: 128 + buffer_size: 2048 + learning_rate: 0.0003 + beta: 0.01 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 + learning_rate_schedule: linear + network_settings: + normalize: false + hidden_units: 256 + num_layers: 2 + vis_encode_type: simple reward_signals: gail: - strength: 1.0 gamma: 0.99 + strength: 1.0 encoding_size: 128 + learning_rate: 0.0003 + use_actions: false + use_vail: false demo_path: Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo + output_path: default + keep_checkpoints: 5 + max_steps: 15000000 + time_horizon: 64 + summary_freq: 60000 + threaded: true diff --git a/config/upgrade_config.py b/config/upgrade_config.py index ec4ce7bdf7..f2b105e865 100644 --- a/config/upgrade_config.py +++ b/config/upgrade_config.py @@ -49,11 +49,23 @@ def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]: def write_to_yaml_file(config: Dict[str, Any], output_config: str): + unstructed_config = cattr.unstructure(config) + unstructed_config = remove_nones(unstructed_config) with open(output_config, "w") as f: try: - yaml.dump(cattr.unstructure(config), f, sort_keys=False) + yaml.dump(unstructed_config, f, sort_keys=False) except TypeError: # Older versions of pyyaml don't support sort_keys - yaml.dump(cattr.unstructure(config), f) + yaml.dump(unstructed_config, f) + + +def remove_nones(config: Dict[Any, Any]): + new_config = {} + for key, val in config.items(): + if isinstance(val, dict): + new_config[key] = remove_nones(val) + elif val is not None: + new_config[key] = val + return new_config if __name__ == "__main__": From bb88ff21c9f848316d3ab45644cd48ee43541985 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 7 May 2020 16:09:28 -0700 Subject: [PATCH 27/54] Fix setup.py --- ml-agents/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-agents/setup.py b/ml-agents/setup.py index 64db9a3a5e..1b12489899 100644 --- a/ml-agents/setup.py +++ b/ml-agents/setup.py @@ -65,8 +65,8 @@ def run(self): "protobuf>=3.6", "pyyaml", "tensorflow>=1.7,<3.0", - "cattr>=1.0.0", - "attr>=19.3.0", + "cattrs>=1.0.0", + "attrs>=19.3.0", 'pypiwin32==223;platform_system=="Windows"', # We don't actually need six, but tensorflow does, and pip seems # to get confused and install the wrong version. 
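For context on the config/upgrade_config.py change in the patch above: the new remove_nones helper recursively drops keys whose value is None before the upgraded config is dumped, which is what eliminates the "key: null" entries referred to in the commit subject ("Remove nulls from configs"). A minimal standalone sketch follows; the sample input dict is hypothetical and only illustrates the behavior of the helper shown in the hunk above.

    from typing import Any, Dict

    def remove_nones(config: Dict[Any, Any]) -> Dict[Any, Any]:
        # Recursively drop None-valued keys so the emitted YAML has no "null" lines.
        new_config: Dict[Any, Any] = {}
        for key, val in config.items():
            if isinstance(val, dict):
                new_config[key] = remove_nones(val)
            elif val is not None:
                new_config[key] = val
        return new_config

    if __name__ == "__main__":
        # Hypothetical upgraded config with an unset optional field.
        upgraded = {"behaviors": {"3DBall": {"trainer_type": "ppo", "init_path": None}}}
        print(remove_nones(upgraded))
        # -> {'behaviors': {'3DBall': {'trainer_type': 'ppo'}}}

Note that this prunes None values only inside nested dicts; list entries are passed through unchanged, which matches the helper as added in the diff.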
From df3ed19d219ca1e2f89c1295d45e4fcffeb3c9f5 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 11 May 2020 18:35:57 -0700 Subject: [PATCH 28/54] Clean up typing, variable names --- ml-agents/mlagents/trainers/ghost/trainer.py | 8 ++-- .../mlagents/trainers/policy/tf_policy.py | 14 +++---- ml-agents/mlagents/trainers/ppo/trainer.py | 18 ++++----- ml-agents/mlagents/trainers/sac/trainer.py | 22 +++++------ .../mlagents/trainers/tests/test_nn_policy.py | 8 ++-- ml-agents/mlagents/trainers/tests/test_ppo.py | 8 ++-- .../trainers/tests/test_reward_signals.py | 14 +++---- ml-agents/mlagents/trainers/tests/test_sac.py | 8 ++-- .../trainers/tests/test_trainer_util.py | 4 +- .../mlagents/trainers/trainer/rl_trainer.py | 3 +- .../mlagents/trainers/trainer/trainer.py | 16 ++++---- ml-agents/mlagents/trainers/trainer_util.py | 39 ++++++------------- 12 files changed, 72 insertions(+), 90 deletions(-) diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index f4c236d8b3..089b9c61a1 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -43,7 +43,7 @@ def __init__( brain_name, controller, reward_buff_cap, - trainer_parameters, + trainer_settings, training, run_id, ): @@ -53,13 +53,13 @@ def __init__( :param brain_name: The name of the brain associated with trainer config :param controller: GhostController that coordinates all ghost trainers and calculates ELO :param reward_buff_cap: Max reward history to track in the reward buffer - :param trainer_parameters: The parameters for the trainer (dictionary). + :param trainer_settings: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param run_id: The identifier of the current run """ super(GhostTrainer, self).__init__( - brain_name, trainer_parameters, training, run_id, reward_buff_cap + brain_name, trainer_settings, training, run_id, reward_buff_cap ) self.trainer = trainer @@ -79,7 +79,7 @@ def __init__( # Set the logging to print ELO in the console self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY, True) - self_play_parameters = trainer_parameters.self_play + self_play_parameters = trainer_settings.self_play self.window = self_play_parameters.window self.play_against_latest_model_ratio = ( self_play_parameters.play_against_latest_model_ratio diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index 26c7864c2a..ad22a80cac 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -37,19 +37,19 @@ def __init__( self, seed: int, brain: BrainParameters, - trainer_parameters: TrainerSettings, + trainer_settings: TrainerSettings, load: bool = False, ): """ Initialized the policy. :param seed: Random seed to use for TensorFlow. :param brain: The corresponding Brain for this policy. - :param trainer_parameters: The trainer parameters. + :param trainer_settings: The trainer parameters. 
""" self._version_number_ = 2 self.m_size = 0 - self.trainer_parameters = trainer_parameters - self.network_settings: NetworkSettings = trainer_parameters.network_settings + self.trainer_settings = trainer_settings + self.network_settings: NetworkSettings = trainer_settings.network_settings # for ghost trainer save/load snapshots self.assign_phs: List[tf.Tensor] = [] self.assign_ops: List[tf.Operation] = [] @@ -72,9 +72,9 @@ def __init__( self.use_continuous_act = brain.vector_action_space_type == "continuous" if self.use_continuous_act: self.num_branches = self.brain.vector_action_space_size[0] - self.model_path = self.trainer_parameters.output_path - self.initialize_path = self.trainer_parameters.init_path - self.keep_checkpoints = self.trainer_parameters.keep_checkpoints + self.model_path = self.trainer_settings.output_path + self.initialize_path = self.trainer_settings.init_path + self.keep_checkpoints = self.trainer_settings.keep_checkpoints self.graph = tf.Graph() self.sess = tf.Session( config=tf_utils.generate_session_config(), graph=self.graph diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 5467fbb8c7..ec5d1cf383 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -29,7 +29,7 @@ def __init__( self, brain_name: str, reward_buff_cap: int, - trainer_parameters: TrainerSettings, + trainer_settings: TrainerSettings, training: bool, load: bool, seed: int, @@ -39,14 +39,14 @@ def __init__( Responsible for collecting experiences and training PPO model. :param brain_name: The name of the brain associated with trainer config :param reward_buff_cap: Max reward history to track in the reward buffer - :param trainer_parameters: The parameters for the trainer (dictionary). + :param trainer_settings: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ super(PPOTrainer, self).__init__( - brain_name, trainer_parameters, training, run_id, reward_buff_cap + brain_name, trainer_settings, training, run_id, reward_buff_cap ) self.param_keys = [ "batch_size", @@ -69,7 +69,7 @@ def __init__( "reward_signals", ] self.hyperparameters: PPOSettings = cast( - PPOSettings, self.trainer_parameters.hyperparameters + PPOSettings, self.trainer_settings.hyperparameters ) self.load = load self.seed = seed @@ -80,9 +80,9 @@ def _check_param_keys(self): # Check that batch size is greater than sequence length. Else, throw # an exception. if ( - self.trainer_parameters["sequence_length"] - > self.trainer_parameters["batch_size"] - and self.trainer_parameters["use_recurrent"] + self.trainer_settings["sequence_length"] + > self.trainer_settings["batch_size"] + and self.trainer_settings["use_recurrent"] ): raise UnityTrainerException( "batch_size must be greater than or equal to sequence_length when use_recurrent is True." 
@@ -235,7 +235,7 @@ def create_policy( policy = NNPolicy( self.seed, brain_parameters, - self.trainer_parameters, + self.trainer_settings, self.is_training, self.load, condition_sigma_on_obs=False, # Faster training for PPO @@ -262,7 +262,7 @@ def add_policy( if not isinstance(policy, NNPolicy): raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()") self.policy = policy - self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters) + self.optimizer = PPOOptimizer(self.policy, self.trainer_settings) for _reward_signal in self.optimizer.reward_signals.keys(): self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) # Needed to resume loads properly diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index 6b1b435b98..45e52fff6e 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -37,7 +37,7 @@ def __init__( self, brain_name: str, reward_buff_cap: int, - trainer_parameters: TrainerSettings, + trainer_settings: TrainerSettings, training: bool, load: bool, seed: int, @@ -47,14 +47,14 @@ def __init__( Responsible for collecting experiences and training SAC model. :param brain_name: The name of the brain associated with trainer config :param reward_buff_cap: Max reward history to track in the reward buffer - :param trainer_parameters: The parameters for the trainer (dictionary). + :param trainer_settings: The parameters for the trainer (dictionary). :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The The identifier of the current run """ super().__init__( - brain_name, trainer_parameters, training, run_id, reward_buff_cap + brain_name, trainer_settings, training, run_id, reward_buff_cap ) self.load = load @@ -62,7 +62,7 @@ def __init__( self.policy: NNPolicy = None # type: ignore self.optimizer: SACOptimizer = None # type: ignore self.hyperparameters: SACSettings = cast( - SACSettings, trainer_parameters.hyperparameters + SACSettings, trainer_settings.hyperparameters ) self.step = 0 @@ -82,9 +82,9 @@ def _check_param_keys(self): # Check that batch size is greater than sequence length. Else, throw # an exception. if ( - self.trainer_parameters["sequence_length"] - > self.trainer_parameters["batch_size"] - and self.trainer_parameters["use_recurrent"] + self.trainer_settings["sequence_length"] + > self.trainer_settings["batch_size"] + and self.trainer_settings["use_recurrent"] ): raise UnityTrainerException( "batch_size must be greater than or equal to sequence_length when use_recurrent is True." @@ -104,7 +104,7 @@ def save_replay_buffer(self) -> None: Save the training buffer's update buffer to a pickle file. """ filename = os.path.join( - self.trainer_parameters.output_path, "last_replay_buffer.hdf5" + self.trainer_settings.output_path, "last_replay_buffer.hdf5" ) logger.info("Saving Experience Replay Buffer to {}".format(filename)) with open(filename, "wb") as file_object: @@ -115,7 +115,7 @@ def load_replay_buffer(self) -> None: Loads the last saved replay buffer from a file. 
""" filename = os.path.join( - self.trainer_parameters.output_path, "last_replay_buffer.hdf5" + self.trainer_settings.output_path, "last_replay_buffer.hdf5" ) logger.info("Loading Experience Replay Buffer from {}".format(filename)) with open(filename, "rb+") as file_object: @@ -208,7 +208,7 @@ def create_policy( policy = NNPolicy( self.seed, brain_parameters, - self.trainer_parameters, + self.trainer_settings, self.is_training, self.load, tanh_squash=True, @@ -336,7 +336,7 @@ def add_policy( if not isinstance(policy, NNPolicy): raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()") self.policy = policy - self.optimizer = SACOptimizer(self.policy, self.trainer_parameters) + self.optimizer = SACOptimizer(self.policy, self.trainer_settings) for _reward_signal in self.optimizer.reward_signals.keys(): self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) # Needed to resume loads properly diff --git a/ml-agents/mlagents/trainers/tests/test_nn_policy.py b/ml-agents/mlagents/trainers/tests/test_nn_policy.py index d6cb96ed0e..30e760b716 100644 --- a/ml-agents/mlagents/trainers/tests/test_nn_policy.py +++ b/ml-agents/mlagents/trainers/tests/test_nn_policy.py @@ -37,12 +37,12 @@ def create_policy_mock( discrete_action_space=DISCRETE_ACTION_SPACE, ) - trainer_parameters = dummy_config - trainer_parameters.keep_checkpoints = 3 - trainer_parameters.network_settings.memory = ( + trainer_settings = dummy_config + trainer_settings.keep_checkpoints = 3 + trainer_settings.network_settings.memory = ( NetworkSettings.MemorySettings() if use_rnn else None ) - policy = NNPolicy(seed, mock_brain, trainer_parameters, False, load) + policy = NNPolicy(seed, mock_brain, trainer_settings, False, load) return policy diff --git a/ml-agents/mlagents/trainers/tests/test_ppo.py b/ml-agents/mlagents/trainers/tests/test_ppo.py index 633a01c388..29247cc1a4 100644 --- a/ml-agents/mlagents/trainers/tests/test_ppo.py +++ b/ml-agents/mlagents/trainers/tests/test_ppo.py @@ -44,16 +44,16 @@ def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visu discrete_action_space=DISCRETE_ACTION_SPACE, ) - trainer_parameters = attr.evolve(dummy_config) - trainer_parameters.network_settings.memory = ( + trainer_settings = attr.evolve(dummy_config) + trainer_settings.network_settings.memory = ( NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) if use_rnn else None ) policy = NNPolicy( - 0, mock_brain, trainer_parameters, False, False, create_tf_graph=False + 0, mock_brain, trainer_settings, False, False, create_tf_graph=False ) - optimizer = PPOOptimizer(policy, trainer_parameters) + optimizer = PPOOptimizer(policy, trainer_settings) return optimizer diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py index cf660e1a20..4bfe43a3a9 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py @@ -64,20 +64,20 @@ def create_optimizer_mock( vector_obs_space=VECTOR_OBS_SPACE, discrete_action_space=DISCRETE_ACTION_SPACE, ) - trainer_parameters = trainer_config - trainer_parameters.reward_signals = reward_signal_config - trainer_parameters.network_settings.memory = ( + trainer_settings = trainer_config + trainer_settings.reward_signals = reward_signal_config + trainer_settings.network_settings.memory = ( NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) if use_rnn else None ) policy = NNPolicy( - 0, mock_brain, 
trainer_parameters, False, False, create_tf_graph=False + 0, mock_brain, trainer_settings, False, False, create_tf_graph=False ) - if trainer_parameters.trainer_type == TrainerSettings.TrainerType.SAC: - optimizer = SACOptimizer(policy, trainer_parameters) + if trainer_settings.trainer_type == TrainerSettings.TrainerType.SAC: + optimizer = SACOptimizer(policy, trainer_settings) else: - optimizer = PPOOptimizer(policy, trainer_parameters) + optimizer = PPOOptimizer(policy, trainer_settings) return optimizer diff --git a/ml-agents/mlagents/trainers/tests/test_sac.py b/ml-agents/mlagents/trainers/tests/test_sac.py index 1fad099048..62fa154cc3 100644 --- a/ml-agents/mlagents/trainers/tests/test_sac.py +++ b/ml-agents/mlagents/trainers/tests/test_sac.py @@ -39,16 +39,16 @@ def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual): vector_obs_space=VECTOR_OBS_SPACE, discrete_action_space=DISCRETE_ACTION_SPACE, ) - trainer_parameters = dummy_config - trainer_parameters.network_settings.memory = ( + trainer_settings = dummy_config + trainer_settings.network_settings.memory = ( NetworkSettings.MemorySettings(sequence_length=16, memory_size=10) if use_rnn else None ) policy = NNPolicy( - 0, mock_brain, trainer_parameters, False, False, create_tf_graph=False + 0, mock_brain, trainer_settings, False, False, create_tf_graph=False ) - optimizer = SACOptimizer(policy, trainer_parameters) + optimizer = SACOptimizer(policy, trainer_settings) return optimizer diff --git a/ml-agents/mlagents/trainers/tests/test_trainer_util.py b/ml-agents/mlagents/trainers/tests/test_trainer_util.py index 2c0dbd3193..79bc8c2f94 100644 --- a/ml-agents/mlagents/trainers/tests/test_trainer_util.py +++ b/ml-agents/mlagents/trainers/tests/test_trainer_util.py @@ -34,10 +34,10 @@ def test_initialize_ppo_trainer(BrainParametersMock, dummy_config): expected_config = PPO_CONFIG def mock_constructor( - self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id + self, brain, reward_buff_cap, trainer_settings, training, load, seed, run_id ): assert brain == brain_params_mock.brain_name - assert trainer_parameters == expected_config + assert trainer_settings == expected_config assert reward_buff_cap == expected_reward_buff_cap assert training == train_model assert load == load_model diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index d9b077af8b..44f4f993aa 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -33,8 +33,7 @@ def __init__(self, *args, **kwargs): } self.update_buffer: AgentBuffer = AgentBuffer() self._stats_reporter.add_property( - StatsPropertyType.HYPERPARAMETERS, - cattr.unstructure(self.trainer_parameters), + StatsPropertyType.HYPERPARAMETERS, cattr.unstructure(self.trainer_settings) ) def end_episode(self) -> None: diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py index ffeeb2b05a..e5c010984f 100644 --- a/ml-agents/mlagents/trainers/trainer/trainer.py +++ b/ml-agents/mlagents/trainers/trainer/trainer.py @@ -26,7 +26,7 @@ class Trainer(abc.ABC): def __init__( self, brain_name: str, - trainer_parameters: TrainerSettings, + trainer_settings: TrainerSettings, training: bool, run_id: str, reward_buff_cap: int = 1, @@ -34,7 +34,7 @@ def __init__( """ Responsible for collecting experiences and training a neural network model. :BrainParameters brain: Brain to be trained. 
- :dict trainer_parameters: The parameters for the trainer (dictionary). + :dict trainer_settings: The parameters for the trainer (dictionary). :bool training: Whether the trainer is set for training. :str run_id: The identifier of the current run :int reward_buff_cap: @@ -42,15 +42,15 @@ def __init__( self.param_keys: List[str] = [] self.brain_name = brain_name self.run_id = run_id - self.trainer_parameters = trainer_parameters - self._threaded = trainer_parameters.threaded + self.trainer_settings = trainer_settings + self._threaded = trainer_settings.threaded self._stats_reporter = StatsReporter(brain_name) self.is_training = training self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap) self.policy_queues: List[AgentManagerQueue[Policy]] = [] self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = [] self.step: int = 0 - self.summary_freq = self.trainer_parameters.summary_freq + self.summary_freq = self.trainer_settings.summary_freq self.next_summary_step = self.summary_freq @property @@ -62,7 +62,7 @@ def stats_reporter(self): def _check_param_keys(self): for k in self.param_keys: - if k not in self.trainer_parameters: + if k not in self.trainer_settings: raise UnityTrainerException( "The hyper-parameter {0} could not be found for the {1} trainer of " "brain {2}.".format(k, self.__class__, self.brain_name) @@ -73,7 +73,7 @@ def parameters(self) -> TrainerSettings: """ Returns the trainer parameters of the trainer. """ - return self.trainer_parameters + return self.trainer_settings @property def get_max_steps(self) -> int: @@ -81,7 +81,7 @@ def get_max_steps(self) -> int: Returns the maximum number of steps. Is used to know when the trainer should be stopped. :return: The maximum number of steps of the trainer """ - return int(float(self.trainer_parameters.max_steps)) + return int(float(self.trainer_settings.max_steps)) @property def get_step(self) -> int: diff --git a/ml-agents/mlagents/trainers/trainer_util.py b/ml-agents/mlagents/trainers/trainer_util.py index 07ac0abaed..d986be0292 100644 --- a/ml-agents/mlagents/trainers/trainer_util.py +++ b/ml-agents/mlagents/trainers/trainer_util.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict +from typing import Dict from mlagents_envs.logging_util import get_logger from mlagents.trainers.meta_curriculum import MetaCurriculum @@ -19,7 +19,7 @@ class TrainerFactory: def __init__( self, - trainer_config: Any, + trainer_config: Dict[str, TrainerSettings], run_id: str, output_path: str, keep_checkpoints: int, @@ -60,7 +60,7 @@ def generate(self, brain_name: str) -> Trainer: def initialize_trainer( - trainer_config: TrainerSettings, + trainer_settings: TrainerSettings, brain_name: str, run_id: str, output_path: str, @@ -77,7 +77,7 @@ def initialize_trainer( Initializes a trainer given a provided trainer configuration and brain parameters, as well as some general training session options. 
- :param trainer_config: Original trainer configuration loaded from YAML + :param trainer_settings: Original trainer configuration loaded from YAML :param brain_name: Name of the brain to be associated with trainer :param run_id: Run ID to associate with this training run :param output_path: Path to save the model and summary statistics @@ -90,10 +90,9 @@ def initialize_trainer( :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer :return: """ - print(trainer_config) - trainer_config.output_path = os.path.join(output_path, brain_name) + trainer_settings.output_path = os.path.join(output_path, brain_name) if init_path is not None: - trainer_config.init_path = os.path.join(init_path, brain_name) + trainer_settings.init_path = os.path.join(init_path, brain_name) min_lesson_length = 1 if meta_curriculum: @@ -108,13 +107,13 @@ def initialize_trainer( ) trainer: Trainer = None # type: ignore # will be set to one of these, or raise - trainer_type = trainer_config.trainer_type + trainer_type = trainer_settings.trainer_type if trainer_type == TrainerSettings.TrainerType.PPO: trainer = PPOTrainer( brain_name, min_lesson_length, - trainer_config, + trainer_settings, train_model, load_model, seed, @@ -124,7 +123,7 @@ def initialize_trainer( trainer = SACTrainer( brain_name, min_lesson_length, - trainer_config, + trainer_settings, train_model, load_model, seed, @@ -135,35 +134,19 @@ def initialize_trainer( f'The trainer config contains an unknown trainer type "{trainer_type}" for brain {brain_name}' ) - if trainer_config.self_play is not None: + if trainer_settings.self_play is not None: trainer = GhostTrainer( trainer, brain_name, ghost_controller, min_lesson_length, - trainer_config, + trainer_settings, train_model, run_id, ) return trainer -def assemble_curriculum_config(trainer_config: Dict[str, Any]) -> Dict[str, Any]: - """ - Assembles a curriculum config Dict from a trainer config. The resulting - dictionary should have a mapping of {brain_name: config}, where config is another - Dict that - :param trainer_config: Dict of trainer configurations (keys are brain_names). - :return: Dict of curriculum configurations. Returns empty dict if none are found. - """ - curriculum_config: Dict[str, Any] = {} - for behavior_name, behavior_config in trainer_config.items(): - # Don't try to iterate non-Dicts. This probably means your config is malformed. - if isinstance(behavior_config, dict) and "curriculum" in behavior_config: - curriculum_config[behavior_name] = behavior_config["curriculum"] - return curriculum_config - - def handle_existing_directories( output_path: str, resume: bool, force: bool, init_path: str = None ) -> None: From b88717016e7afa8cd340a34250202d26caf7c6e9 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 11 May 2020 18:38:26 -0700 Subject: [PATCH 29/54] Remove unneeded cast --- ml-agents/mlagents/trainers/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py index e5c010984f..975bb77576 100644 --- a/ml-agents/mlagents/trainers/trainer/trainer.py +++ b/ml-agents/mlagents/trainers/trainer/trainer.py @@ -81,7 +81,7 @@ def get_max_steps(self) -> int: Returns the maximum number of steps. Is used to know when the trainer should be stopped. 
:return: The maximum number of steps of the trainer """ - return int(float(self.trainer_settings.max_steps)) + return self.trainer_settings.max_steps @property def get_step(self) -> int: From 1e252e85b294764ff050dc94467fae91ee1d32b1 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 11 May 2020 18:48:03 -0700 Subject: [PATCH 30/54] Move cattr.unstructure to settings.py --- ml-agents/mlagents/trainers/learn.py | 7 +++---- ml-agents/mlagents/trainers/settings.py | 13 +++++++++++-- ml-agents/mlagents/trainers/trainer/rl_trainer.py | 3 +-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index 7574a7bc02..09f99d01a6 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -6,7 +6,6 @@ import json from typing import Callable, Optional, List, Dict -import cattr import mlagents.trainers import mlagents_envs @@ -169,9 +168,9 @@ def write_run_options(output_dir: str, run_options: RunOptions) -> None: try: with open(run_options_path, "w") as f: try: - yaml.dump(cattr.unstructure(run_options), f, sort_keys=False) + yaml.dump(run_options.as_dict(), f, sort_keys=False) except TypeError: # Older versions of pyyaml don't support sort_keys - yaml.dump(cattr.unstructure(run_options), f) + yaml.dump(run_options.as_dict(), f) except FileNotFoundError: logger.warning( f"Unable to save configuration to {run_options_path}. Make sure the directory exists" @@ -292,7 +291,7 @@ def run_cli(options: RunOptions) -> None: logging_util.set_log_level(log_level) logger.debug("Configuration for this run:") - logger.debug(json.dumps(cattr.unstructure(options), indent=4)) + logger.debug(json.dumps(options.as_dict(), indent=4)) # Options deprecation warnings if options.checkpoint_settings.load_model: diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index da48c86503..4db1ef0604 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -87,6 +87,12 @@ def dict_to_defaultdict(d: Dict, t: type) -> DefaultDict: ) +@attr.s(auto_attribs=True) +class ExportableSettings: + def as_dict(self): + return cattr.unstructure(self) + + @attr.s(auto_attribs=True) class NetworkSettings: @attr.s(auto_attribs=True) @@ -196,7 +202,7 @@ def _team_change_default(self): @attr.s(auto_attribs=True) -class TrainerSettings: +class TrainerSettings(ExportableSettings): # Edit these two fields to add new trainers # class TrainerType(Enum): PPO: str = "ppo" @@ -296,7 +302,7 @@ class EngineSettings: @attr.s(auto_attribs=True) -class RunOptions: +class RunOptions(ExportableSettings): behaviors: DefaultDict[str, TrainerSettings] = attr.ib( default=attr.Factory(lambda: collections.defaultdict(TrainerSettings)) ) @@ -319,6 +325,9 @@ class RunOptions: ) cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict) + def as_dict(self): + return cattr.unstructure(self) + @staticmethod def from_argparse(args: argparse.Namespace) -> "RunOptions": """ diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index 44f4f993aa..7dfb4163a1 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -2,7 +2,6 @@ from typing import Dict, List from collections import defaultdict import abc -import cattr import time from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer @@ -33,7 +32,7 @@ def __init__(self, *args, 
**kwargs): } self.update_buffer: AgentBuffer = AgentBuffer() self._stats_reporter.add_property( - StatsPropertyType.HYPERPARAMETERS, cattr.unstructure(self.trainer_settings) + StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict() ) def end_episode(self) -> None: From 65d451f924de5e01340ff177c6ce15c0a0679187 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 12 May 2020 18:06:27 -0700 Subject: [PATCH 31/54] Make Type enums standalone Move dict_to_defaultdict to TrainerSettings --- .../reward_signals/reward_signal_factory.py | 14 ++- .../trainers/optimizer/tf_optimizer.py | 4 +- ml-agents/mlagents/trainers/settings.py | 102 ++++++++---------- .../trainers/tests/test_reward_signals.py | 19 ++-- .../mlagents/trainers/tests/test_simple_rl.py | 19 ++-- ml-agents/mlagents/trainers/trainer_util.py | 6 +- 6 files changed, 71 insertions(+), 93 deletions(-) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py index ab91c653fc..c3d79c5387 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py @@ -9,20 +9,18 @@ CuriosityRewardSignal, ) from mlagents.trainers.policy.tf_policy import TFPolicy -from mlagents.trainers.settings import RewardSignalSettings +from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType -NAME_TO_CLASS: Dict[RewardSignalSettings.RewardSignalType, Type[RewardSignal]] = { - RewardSignalSettings.RewardSignalType.EXTRINSIC: ExtrinsicRewardSignal, - RewardSignalSettings.RewardSignalType.CURIOSITY: CuriosityRewardSignal, - RewardSignalSettings.RewardSignalType.GAIL: GAILRewardSignal, +NAME_TO_CLASS: Dict[RewardSignalType, Type[RewardSignal]] = { + RewardSignalType.EXTRINSIC: ExtrinsicRewardSignal, + RewardSignalType.CURIOSITY: CuriosityRewardSignal, + RewardSignalType.GAIL: GAILRewardSignal, } def create_reward_signal( - policy: TFPolicy, - name: RewardSignalSettings.RewardSignalType, - settings: RewardSignalSettings, + policy: TFPolicy, name: RewardSignalType, settings: RewardSignalSettings ) -> RewardSignal: """ Creates a reward signal class based on the name and config entry provided as a dict. 
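For orientation (not part of the diff): with RewardSignalType promoted to a standalone enum, a behavior's reward signal configuration becomes a plain Dict[RewardSignalType, RewardSignalSettings], and the factory above dispatches on the enum key. A minimal sketch, assuming an already-constructed TFPolicy; the helper name and demo path below are illustrative, not taken from the patch:

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import (
    GAILSettings,
    RewardSignalSettings,
    RewardSignalType,
)
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)


def build_reward_signals(policy: TFPolicy, demo_path: str):
    # Keys are enum members rather than strings; values are typed settings objects.
    configs = {
        RewardSignalType.EXTRINSIC: RewardSignalSettings(gamma=0.99, strength=1.0),
        RewardSignalType.GAIL: GAILSettings(demo_path=demo_path),
    }
    # create_reward_signal() looks each type up in NAME_TO_CLASS shown above.
    return {
        name: create_reward_signal(policy, name, settings)
        for name, settings in configs.items()
    }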
diff --git a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py index de40056f8c..b3cf8f7de7 100644 --- a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py @@ -9,7 +9,7 @@ from mlagents.trainers.components.reward_signals.reward_signal_factory import ( create_reward_signal, ) -from mlagents.trainers.settings import TrainerSettings, RewardSignalSettings +from mlagents.trainers.settings import TrainerSettings, RewardSignalType from mlagents.trainers.components.bc.module import BCModule @@ -123,7 +123,7 @@ def _get_value_estimates( return value_estimates def create_reward_signals( - self, reward_signal_configs: Dict[RewardSignalSettings.RewardSignalType, Any] + self, reward_signal_configs: Dict[RewardSignalType, Any] ) -> None: """ Create reward signals diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 4db1ef0604..9fc61d45df 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -46,13 +46,12 @@ def trainer_settings_to_cls(d: Mapping, t: type) -> Any: raise TrainerConfigError( "Hyperparameters were specified but no trainer_type was given." ) - else: d_copy[key] = strict_to_cls( - d_copy[key], TrainerSettings.to_settings(d_copy["trainer_type"]) + d_copy[key], TrainerType(d_copy["trainer_type"]).to_settings() ) elif key == "reward_signals": - d_copy[key] = rewardsignal_settings_to_cls(val) + d_copy[key] = RewardSignalSettings.structure_from_dict(val) elif key == "max_steps": d_copy[key] = int( float(val) @@ -62,31 +61,10 @@ def trainer_settings_to_cls(d: Mapping, t: type) -> Any: return t(**d_copy) -def rewardsignal_settings_to_cls(d: Mapping) -> Any: - if d is None: - return None - d_final: Dict[RewardSignalSettings.RewardSignalType, RewardSignalSettings] = {} - - for key, val in d.items(): - try: - enum_key = RewardSignalSettings.RewardSignalType(key) - t = RewardSignalSettings.to_settings(enum_key) - d_final[enum_key] = strict_to_cls(val, t) - except KeyError: - raise TrainerConfigError(f"Unknown reward signal type {key}") - return d_final - - def defaultdict_to_dict(d: DefaultDict) -> Dict: return {key: cattr.unstructure(val) for key, val in d.items()} -def dict_to_defaultdict(d: Dict, t: type) -> DefaultDict: - return collections.defaultdict( - TrainerSettings, cattr.structure(d, Dict[str, TrainerSettings]) - ) - - @attr.s(auto_attribs=True) class ExportableSettings: def as_dict(self): @@ -150,25 +128,39 @@ def _reward_signal_steps_per_update_default(self): return self.steps_per_update -@attr.s(auto_attribs=True) -class RewardSignalSettings: - class RewardSignalType(Enum): - EXTRINSIC: str = "extrinsic" - GAIL: str = "gail" - CURIOSITY: str = "curiosity" +class RewardSignalType(Enum): + EXTRINSIC: str = "extrinsic" + GAIL: str = "gail" + CURIOSITY: str = "curiosity" - @staticmethod - def to_settings(ttype: RewardSignalType) -> type: + def to_settings(self) -> type: _mapping = { - RewardSignalSettings.RewardSignalType.EXTRINSIC: RewardSignalSettings, - RewardSignalSettings.RewardSignalType.GAIL: GAILSettings, - RewardSignalSettings.RewardSignalType.CURIOSITY: CuriositySettings, + RewardSignalType.EXTRINSIC: RewardSignalSettings, + RewardSignalType.GAIL: GAILSettings, + RewardSignalType.CURIOSITY: CuriositySettings, } - return _mapping[ttype] + return _mapping[self] + +@attr.s(auto_attribs=True) +class RewardSignalSettings: gamma: float = 0.99 strength: float = 1.0 + 
@staticmethod + def structure_from_dict(d: Mapping) -> Any: + if d is None: + return None + d_final: Dict[RewardSignalType, RewardSignalSettings] = {} + for key, val in d.items(): + try: + enum_key = RewardSignalType(key) + t = enum_key.to_settings() + d_final[enum_key] = strict_to_cls(val, t) + except KeyError: + raise TrainerConfigError(f"Unknown reward signal type {key}") + return d_final + @attr.s(auto_attribs=True) class GAILSettings(RewardSignalSettings): @@ -201,34 +193,28 @@ def _team_change_default(self): initial_elo: float = 1200.0 -@attr.s(auto_attribs=True) -class TrainerSettings(ExportableSettings): - # Edit these two fields to add new trainers # - class TrainerType(Enum): - PPO: str = "ppo" - SAC: str = "sac" +class TrainerType(Enum): + PPO: str = "ppo" + SAC: str = "sac" - @staticmethod - def to_settings(ttype: TrainerType) -> type: - _mapping = { - TrainerSettings.TrainerType.PPO: PPOSettings, - TrainerSettings.TrainerType.SAC: SACSettings, - } - return _mapping[ttype] + def to_settings(self) -> type: + _mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings} + return _mapping[self] - ############################################### +@attr.s(auto_attribs=True) +class TrainerSettings(ExportableSettings): trainer_type: TrainerType = TrainerType.PPO hyperparameters: HyperparamSettings = attr.ib() @hyperparameters.default def _set_default_hyperparameters(self): - return TrainerSettings.to_settings(self.trainer_type)() + return self.trainer_type.to_settings() network_settings: NetworkSettings = attr.ib(default=NetworkSettings()) - reward_signals: Dict[ - RewardSignalSettings.RewardSignalType, RewardSignalSettings - ] = {RewardSignalSettings.RewardSignalType.EXTRINSIC: RewardSignalSettings()} + reward_signals: Dict[RewardSignalType, RewardSignalSettings] = { + RewardSignalType.EXTRINSIC: RewardSignalSettings() + } init_path: Optional[str] = None output_path: str = "default" # TODO: Remove parser default and remove from CLI @@ -251,7 +237,11 @@ def _check_batch_size_seq_length(self, attribute, value): "When using memory, sequence length must be less than or equal to batch size. 
" ) - cattr.register_structure_hook(RewardSignalSettings, rewardsignal_settings_to_cls) + @staticmethod + def dict_to_defaultdict(d: Dict, t: type) -> DefaultDict: + return collections.defaultdict( + TrainerSettings, cattr.structure(d, Dict[str, TrainerSettings]) + ) @attr.s(auto_attribs=True) @@ -321,7 +311,7 @@ class RunOptions(ExportableSettings): cattr.register_structure_hook(CheckpointSettings, strict_to_cls) cattr.register_structure_hook(TrainerSettings, trainer_settings_to_cls) cattr.register_structure_hook( - DefaultDict[str, TrainerSettings], dict_to_defaultdict + DefaultDict[str, TrainerSettings], TrainerSettings.dict_to_defaultdict ) cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict) diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py index 4bfe43a3a9..d0b8f21038 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py @@ -12,7 +12,8 @@ RewardSignalSettings, BehavioralCloningSettings, NetworkSettings, - TrainerSettings, + TrainerType, + RewardSignalType, ) CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo" @@ -29,21 +30,17 @@ def sac_dummy_config(): @pytest.fixture def gail_dummy_config(): - return { - RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( - demo_path=CONTINUOUS_PATH - ) - } + return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_PATH)} @pytest.fixture def curiosity_dummy_config(): - return {RewardSignalSettings.RewardSignalType.CURIOSITY: CuriositySettings()} + return {RewardSignalType.CURIOSITY: CuriositySettings()} @pytest.fixture def extrinsic_dummy_config(): - return {RewardSignalSettings.RewardSignalType.EXTRINSIC: RewardSignalSettings()} + return {RewardSignalType.EXTRINSIC: RewardSignalSettings()} VECTOR_ACTION_SPACE = [2] @@ -74,7 +71,7 @@ def create_optimizer_mock( policy = NNPolicy( 0, mock_brain, trainer_settings, False, False, create_tf_graph=False ) - if trainer_settings.trainer_type == TrainerSettings.TrainerType.SAC: + if trainer_settings.trainer_type == TrainerType.SAC: optimizer = SACOptimizer(policy, trainer_settings) else: optimizer = PPOOptimizer(policy, trainer_settings) @@ -119,9 +116,7 @@ def test_gail_cc(trainer_config, gail_dummy_config): ) def test_gail_dc_visual(trainer_config, gail_dummy_config): gail_dummy_config_discrete = { - RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( - demo_path=DISCRETE_PATH - ) + RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_PATH) } optimizer = create_optimizer_mock( trainer_config, gail_dummy_config_discrete, False, True, True diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index 2b35603b17..629100b501 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -23,8 +23,9 @@ NetworkSettings, SelfPlaySettings, BehavioralCloningSettings, - RewardSignalSettings, GAILSettings, + TrainerType, + RewardSignalType, ) from mlagents.trainers.models import LearningRateSchedule, EncoderType from mlagents_envs.side_channel.environment_parameters_channel import ( @@ -40,7 +41,7 @@ PPO_CONFIG = TrainerSettings( - trainer_type=TrainerSettings.TrainerType.PPO, + trainer_type=TrainerType.PPO, hyperparameters=PPOSettings( learning_rate=5.0e-3, learning_rate_schedule=LearningRateSchedule.CONSTANT, @@ -54,7 +55,7 @@ ) SAC_CONFIG = TrainerSettings( - 
trainer_type=TrainerSettings.TrainerType.SAC, + trainer_type=TrainerType.SAC, hyperparameters=SACSettings( learning_rate=5.0e-3, learning_rate_schedule=LearningRateSchedule.CONSTANT, @@ -433,9 +434,7 @@ def test_gail(simple_record, use_discrete, trainer_config): env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2) bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) reward_signals = { - RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( - encoding_size=32, demo_path=demo_path - ) + RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path) } config = attr.evolve( trainer_config, @@ -458,9 +457,7 @@ def test_gail_visual_ppo(simple_record, use_discrete): ) bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) reward_signals = { - RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( - encoding_size=32, demo_path=demo_path - ) + RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path) } hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4) config = attr.evolve( @@ -485,9 +482,7 @@ def test_gail_visual_sac(simple_record, use_discrete): ) bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) reward_signals = { - RewardSignalSettings.RewardSignalType.GAIL: GAILSettings( - encoding_size=32, demo_path=demo_path - ) + RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path) } hyperparams = attr.evolve( SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16 diff --git a/ml-agents/mlagents/trainers/trainer_util.py b/ml-agents/mlagents/trainers/trainer_util.py index d986be0292..0648cdbaf7 100644 --- a/ml-agents/mlagents/trainers/trainer_util.py +++ b/ml-agents/mlagents/trainers/trainer_util.py @@ -10,7 +10,7 @@ from mlagents.trainers.sac.trainer import SACTrainer from mlagents.trainers.ghost.trainer import GhostTrainer from mlagents.trainers.ghost.controller import GhostController -from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.settings import TrainerSettings, TrainerType logger = get_logger(__name__) @@ -109,7 +109,7 @@ def initialize_trainer( trainer: Trainer = None # type: ignore # will be set to one of these, or raise trainer_type = trainer_settings.trainer_type - if trainer_type == TrainerSettings.TrainerType.PPO: + if trainer_type == TrainerType.PPO: trainer = PPOTrainer( brain_name, min_lesson_length, @@ -119,7 +119,7 @@ def initialize_trainer( seed, run_id, ) - elif trainer_type == TrainerSettings.TrainerType.SAC: + elif trainer_type == TrainerType.SAC: trainer = SACTrainer( brain_name, min_lesson_length, From 6beda1fba5b45f7a5a3f9190aa5e1ff1e252af16 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 10:45:45 -0700 Subject: [PATCH 32/54] Update upgrade_config script --- config/upgrade_config.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/config/upgrade_config.py b/config/upgrade_config.py index f2b105e865..5ee060e5d8 100644 --- a/config/upgrade_config.py +++ b/config/upgrade_config.py @@ -3,7 +3,7 @@ import yaml from typing import Dict, Any import argparse -from mlagents.trainers.settings import TrainerSettings, NetworkSettings +from mlagents.trainers.settings import TrainerSettings, NetworkSettings, TrainerType from mlagents.trainers.cli_utils import load_config @@ -21,9 +21,7 @@ def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]: trainer_type = config["trainer"] new_config = {} new_config["trainer_type"] = trainer_type - 
hyperparam_cls = TrainerSettings.to_settings( - TrainerSettings.TrainerType(trainer_type) - ) + hyperparam_cls = TrainerType(trainer_type).to_settings() # Try to absorb as much as possible into the hyperparam_cls new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls) From 4edef4627738282094979dca7c1aa06a2904b8ea Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 16:10:49 -0700 Subject: [PATCH 33/54] Fix issue with default hyperparams --- ml-agents/mlagents/trainers/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 9fc61d45df..4b072e37e6 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -209,7 +209,7 @@ class TrainerSettings(ExportableSettings): @hyperparameters.default def _set_default_hyperparameters(self): - return self.trainer_type.to_settings() + return self.trainer_type.to_settings()() network_settings: NetworkSettings = attr.ib(default=NetworkSettings()) reward_signals: Dict[RewardSignalType, RewardSignalSettings] = { From 52b23f82c572b5119bc5ab50dd3c014e28846ff6 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 16:11:14 -0700 Subject: [PATCH 34/54] Fix simple RL test --- ml-agents/mlagents/trainers/tests/test_simple_rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index 629100b501..7eca2236bd 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -162,7 +162,7 @@ def _check_environment_trains( def test_simple_ppo(use_discrete): env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete) config = attr.evolve(PPO_CONFIG) - _check_environment_trains(env, config) + _check_environment_trains(env, {BRAIN_NAME: config}) @pytest.mark.parametrize("use_discrete", [True, False]) From ce20517d4db9e411584b9c83277729b55d216c00 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 17:11:44 -0700 Subject: [PATCH 35/54] Refactor structure methods into appropriate classes --- ml-agents/mlagents/trainers/settings.py | 85 +++++++++++++------------ 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 4b072e37e6..1c8f602bf8 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -22,8 +22,8 @@ def check_and_structure(key: str, value: Any, class_type: type) -> Any: def strict_to_cls(d: Mapping, t: type) -> Any: - if d is None: - return None + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.") d_copy: Dict[str, Any] = {} d_copy.update(d) for key, val in d_copy.items(): @@ -31,36 +31,6 @@ def strict_to_cls(d: Mapping, t: type) -> Any: return t(**d_copy) -def trainer_settings_to_cls(d: Mapping, t: type) -> Any: - if d is None: - return None - d_copy: Dict[str, Any] = {} - d_copy.update(d) - - for key, val in d_copy.items(): - if attr.has(type(val)): - # Don't convert already-converted attrs classes. - continue - if key == "hyperparameters": - if "trainer_type" not in d_copy: - raise TrainerConfigError( - "Hyperparameters were specified but no trainer_type was given." 
- ) - else: - d_copy[key] = strict_to_cls( - d_copy[key], TrainerType(d_copy["trainer_type"]).to_settings() - ) - elif key == "reward_signals": - d_copy[key] = RewardSignalSettings.structure_from_dict(val) - elif key == "max_steps": - d_copy[key] = int( - float(val) - ) # In some configs, max steps was specified as a float - else: - d_copy[key] = check_and_structure(key, val, t) - return t(**d_copy) - - def defaultdict_to_dict(d: DefaultDict) -> Dict: return {key: cattr.unstructure(val) for key, val in d.items()} @@ -148,9 +118,13 @@ class RewardSignalSettings: strength: float = 1.0 @staticmethod - def structure_from_dict(d: Mapping) -> Any: - if d is None: - return None + def structure(d: Mapping, t: type) -> Any: + """ + Helper method to structure a TrainerSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). + """ + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported reward signal configuration {d}.") d_final: Dict[RewardSignalType, RewardSignalSettings] = {} for key, val in d.items(): try: @@ -226,6 +200,10 @@ def _set_default_hyperparameters(self): self_play: Optional[SelfPlaySettings] = None behavioral_cloning: Optional[BehavioralCloningSettings] = None + cattr.register_structure_hook( + Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure + ) + @network_settings.validator def _check_batch_size_seq_length(self, attribute, value): if self.network_settings.memory is not None: @@ -243,6 +221,37 @@ def dict_to_defaultdict(d: Dict, t: type) -> DefaultDict: TrainerSettings, cattr.structure(d, Dict[str, TrainerSettings]) ) + @staticmethod + def structure(d: Mapping, t: type) -> Any: + """ + Helper method to structure a TrainerSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). + """ + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.") + d_copy: Dict[str, Any] = {} + d_copy.update(d) + + for key, val in d_copy.items(): + if attr.has(type(val)): + # Don't convert already-converted attrs classes. + continue + if key == "hyperparameters": + if "trainer_type" not in d_copy: + raise TrainerConfigError( + "Hyperparameters were specified but no trainer_type was given." 
+ ) + else: + d_copy[key] = strict_to_cls( + d_copy[key], TrainerType(d_copy["trainer_type"]).to_settings() + ) + elif key == "max_steps": + d_copy[key] = int(float(val)) + # In some legacy configs, max steps was specified as a float + else: + d_copy[key] = check_and_structure(key, val, t) + return t(**d_copy) + @attr.s(auto_attribs=True) class CurriculumSettings: @@ -309,15 +318,13 @@ class RunOptions(ExportableSettings): cattr.register_structure_hook(EnvironmentSettings, strict_to_cls) cattr.register_structure_hook(EngineSettings, strict_to_cls) cattr.register_structure_hook(CheckpointSettings, strict_to_cls) - cattr.register_structure_hook(TrainerSettings, trainer_settings_to_cls) + cattr.register_structure_hook(CurriculumSettings, strict_to_cls) + cattr.register_structure_hook(TrainerSettings, TrainerSettings.structure) cattr.register_structure_hook( DefaultDict[str, TrainerSettings], TrainerSettings.dict_to_defaultdict ) cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict) - def as_dict(self): - return cattr.unstructure(self) - @staticmethod def from_argparse(args: argparse.Namespace) -> "RunOptions": """ From d1912627893ff50a8a69956dab245f489b971570 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 17:23:41 -0700 Subject: [PATCH 36/54] Fix simple_rl tests --- ml-agents/mlagents/trainers/tests/test_simple_rl.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index 7eca2236bd..0d62a2c614 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -200,13 +200,16 @@ def test_visual_advanced_ppo(vis_encode_type, num_visual): step_size=0.5, vis_obs_size=(36, 36, 3), ) - new_hyperparams = attr.evolve( - PPO_CONFIG.hyperparameters, - learning_rate=3.0e-4, - vis_encode_type=EncoderType(vis_encode_type), + new_networksettings = attr.evolve( + SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type) ) + new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4) config = attr.evolve( - PPO_CONFIG, hyperparameters=new_hyperparams, max_steps=500, summary_freq=100 + PPO_CONFIG, + hyperparameters=new_hyperparams, + network_settings=new_networksettings, + max_steps=500, + summary_freq=100, ) # The number of steps is pretty small for these encoders _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5) From e16e20ebaa6960f09db4c1e9523f23db7624619f Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 18:15:44 -0700 Subject: [PATCH 37/54] Clean up some test files --- .../mlagents/trainers/tests/test_bcmodule.py | 33 ----------------- .../trainers/tests/test_distributions.py | 36 ------------------- 2 files changed, 69 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/test_bcmodule.py index d685080df1..e67318897f 100644 --- a/ml-agents/mlagents/trainers/tests/test_bcmodule.py +++ b/ml-agents/mlagents/trainers/tests/test_bcmodule.py @@ -2,7 +2,6 @@ import mlagents.trainers.tests.mock_brain as mb import numpy as np -import yaml import os from mlagents.trainers.policy.nn_policy import NNPolicy @@ -14,38 +13,6 @@ ) -def ppo_dummy_config(): - return yaml.safe_load( - """ - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - 
normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - memory_size: 8 - behavioral_cloning: - demo_path: ./Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo - strength: 1.0 - steps: 10000000 - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) - - def create_bc_module(mock_brain, bc_settings, use_rnn, tanhresample): # model_path = env.external_brain_names[0] trainer_config = TrainerSettings() diff --git a/ml-agents/mlagents/trainers/tests/test_distributions.py b/ml-agents/mlagents/trainers/tests/test_distributions.py index c27047fd69..30756f12a2 100644 --- a/ml-agents/mlagents/trainers/tests/test_distributions.py +++ b/ml-agents/mlagents/trainers/tests/test_distributions.py @@ -2,48 +2,12 @@ from mlagents.tf_utils import tf -import yaml - from mlagents.trainers.distributions import ( GaussianDistribution, MultiCategoricalDistribution, ) -@pytest.fixture -def dummy_config(): - return yaml.safe_load( - """ - trainer: ppo - batch_size: 32 - beta: 5.0e-3 - buffer_size: 512 - epsilon: 0.2 - hidden_units: 128 - lambd: 0.95 - learning_rate: 3.0e-4 - max_steps: 5.0e4 - normalize: true - num_epoch: 5 - num_layers: 2 - time_horizon: 64 - sequence_length: 64 - summary_freq: 1000 - use_recurrent: false - normalize: true - memory_size: 8 - curiosity_strength: 0.0 - curiosity_enc_size: 1 - summary_path: test - model_path: test - reward_signals: - extrinsic: - strength: 1.0 - gamma: 0.99 - """ - ) - - VECTOR_ACTION_SPACE = [2] VECTOR_OBS_SPACE = 8 DISCRETE_ACTION_SPACE = [3, 3, 3, 2] From 3376551f2f2f782380c65a0426212b488003f032 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 18:36:57 -0700 Subject: [PATCH 38/54] Fix usage of factories in settings classes --- ml-agents/mlagents/trainers/settings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 1c8f602bf8..89cbc661de 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -185,7 +185,7 @@ class TrainerSettings(ExportableSettings): def _set_default_hyperparameters(self): return self.trainer_type.to_settings()() - network_settings: NetworkSettings = attr.ib(default=NetworkSettings()) + network_settings: NetworkSettings = attr.ib(factory=NetworkSettings) reward_signals: Dict[RewardSignalType, RewardSignalSettings] = { RewardSignalType.EXTRINSIC: RewardSignalSettings() } @@ -260,7 +260,7 @@ class MeasureType: REWARD: str = "reward" measure: str = attr.ib(default=MeasureType.REWARD) - thresholds: List[int] = attr.Factory(list) + thresholds: List[int] = attr.ib(factory=list) min_lesson_length: int = 0 signal_smoothing: bool = True parameters: Dict[str, List[float]] = attr.ib(kw_only=True) @@ -303,7 +303,7 @@ class EngineSettings: @attr.s(auto_attribs=True) class RunOptions(ExportableSettings): behaviors: DefaultDict[str, TrainerSettings] = attr.ib( - default=attr.Factory(lambda: collections.defaultdict(TrainerSettings)) + factory=lambda: collections.defaultdict(TrainerSettings) ) env_settings: EnvironmentSettings = EnvironmentSettings() engine_settings: EngineSettings = EngineSettings() From f0cd7136e0d5ad145be61e1f09c1022710191c2b Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 13 May 2020 19:08:06 -0700 Subject: [PATCH 39/54] Add test and fix default mutables --- ml-agents/mlagents/trainers/settings.py | 12 +++++------ .../mlagents/trainers/tests/test_settings.py | 21 
+++++++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 ml-agents/mlagents/trainers/tests/test_settings.py diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 89cbc661de..1062f4c572 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -186,9 +186,9 @@ def _set_default_hyperparameters(self): return self.trainer_type.to_settings()() network_settings: NetworkSettings = attr.ib(factory=NetworkSettings) - reward_signals: Dict[RewardSignalType, RewardSignalSettings] = { - RewardSignalType.EXTRINSIC: RewardSignalSettings() - } + reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib( + factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()} + ) init_path: Optional[str] = None output_path: str = "default" # TODO: Remove parser default and remove from CLI @@ -305,11 +305,11 @@ class RunOptions(ExportableSettings): behaviors: DefaultDict[str, TrainerSettings] = attr.ib( factory=lambda: collections.defaultdict(TrainerSettings) ) - env_settings: EnvironmentSettings = EnvironmentSettings() - engine_settings: EngineSettings = EngineSettings() + env_settings: EnvironmentSettings = attr.ib(factory=EnvironmentSettings) + engine_settings: EngineSettings = attr.ib(factory=EngineSettings) parameter_randomization: Optional[Dict] = None curriculum: Optional[Dict[str, CurriculumSettings]] = None - checkpoint_settings: CheckpointSettings = CheckpointSettings() + checkpoint_settings: CheckpointSettings = attr.ib(factory=CheckpointSettings) # These are options that are relevant to the run itself, and not the engine or environment. # They will be left here. diff --git a/ml-agents/mlagents/trainers/tests/test_settings.py b/ml-agents/mlagents/trainers/tests/test_settings.py new file mode 100644 index 0000000000..c5b62677f6 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_settings.py @@ -0,0 +1,21 @@ +import attr + +from mlagents.trainers.settings import RunOptions, TrainerSettings + + +def check_if_different(testobj1: object, testobj2: object) -> None: + assert testobj1 is not testobj2 + if attr.has(testobj1.__class__) and attr.has(testobj2.__class__): + for key, val in attr.asdict(testobj1, recurse=False).items(): + if isinstance(val, dict) or isinstance(val, list) or attr.has(val): + # Note: this check doesn't check the contents of mutables. + check_if_different(val, attr.asdict(testobj2, recurse=False)[key]) + + +def test_is_new_instance(): + """ + Verify that every instance of RunOptions() and its subclasses + is a new instance (i.e. all factory methods are used properly.) 
+ """ + check_if_different(RunOptions(), RunOptions()) + check_if_different(TrainerSettings(), TrainerSettings()) From e9740e4617e88abf25ae77e741e3ab1bdfae5810 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 14 May 2020 16:15:36 -0700 Subject: [PATCH 40/54] Update training_int_tests --- ml-agents/tests/yamato/training_int_tests.py | 8 ++++++-- ml-agents/tests/yamato/yamato_utils.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/ml-agents/tests/yamato/training_int_tests.py b/ml-agents/tests/yamato/training_int_tests.py index 2ae332fdc8..1f17480399 100644 --- a/ml-agents/tests/yamato/training_int_tests.py +++ b/ml-agents/tests/yamato/training_int_tests.py @@ -64,14 +64,18 @@ def run_training(python_version, csharp_version): # Copy the default training config but override the max_steps parameter, # and reduce the batch_size and buffer_size enough to ensure an update step happens. - overrides = {"max_steps": 100, "batch_size": 10, "buffer_size": 10} yaml_out = "override.yaml" if python_version: + overrides = {"max_steps": 100, "batch_size": 10, "buffer_size": 10} override_legacy_config_file( python_version, "config/trainer_config.yaml", yaml_out, **overrides ) else: - override_config_file("config/ppo/3DBall.yaml", yaml_out, **overrides) + overrides = { + "hyperparameters": {"batch_size": 10, "buffer_size": 10}, + "max_steps": 100, + } + override_config_file("config/ppo/3DBall.yaml", yaml_out, overrides) mla_learn_cmd = ( f"mlagents-learn {yaml_out} --force --env=" diff --git a/ml-agents/tests/yamato/yamato_utils.py b/ml-agents/tests/yamato/yamato_utils.py index 7939836ff1..b8499a4d86 100644 --- a/ml-agents/tests/yamato/yamato_utils.py +++ b/ml-agents/tests/yamato/yamato_utils.py @@ -152,7 +152,7 @@ def undo_git_checkout(): subprocess.check_call(f"rm -rf Project/Library", shell=True) -def override_config_file(src_path, dest_path, **kwargs): +def override_config_file(src_path, dest_path, overrides): """ Override settings in a trainer config file. For example, override_config_file(src_path, dest_path, max_steps=42) @@ -163,12 +163,20 @@ def override_config_file(src_path, dest_path, **kwargs): behavior_configs = configs["behaviors"] for config in behavior_configs.values(): - config.update(**kwargs) + _override_config_dict(config, overrides) with open(dest_path, "w") as f: yaml.dump(configs, f) +def _override_config_dict(config, overrides): + for key, val in overrides.items(): + if isinstance(val, dict): + _override_config_dict(config[key], val) + else: + config[key] = val + + def override_legacy_config_file(python_version, src_path, dest_path, **kwargs): """ Override settings in a trainer config file, using an old version of the src_path. For example, From 2ebb433979ee5dc984d430ec32f36e27c11b1a51 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 14 May 2020 18:06:52 -0700 Subject: [PATCH 41/54] Change docs --- docs/Training-Configuration-File.md | 45 +++++----- docs/Training-ML-Agents.md | 128 +++++++++++++++------------- 2 files changed, 93 insertions(+), 80 deletions(-) diff --git a/docs/Training-Configuration-File.md b/docs/Training-Configuration-File.md index 9efa16ff82..f040d286a7 100644 --- a/docs/Training-Configuration-File.md +++ b/docs/Training-Configuration-File.md @@ -26,18 +26,18 @@ choice of the trainer (which we review on subsequent sections). 
| **Setting** | **Description** | | :----------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `trainer` | The type of trainer to use: `ppo` or `sac` | +| `trainer_type` | The type of trainer to use: `ppo` or `sac` | | `summary_freq` | Number of experiences that needs to be collected before generating and displaying training statistics. This determines the granularity of the graphs in Tensorboard. | -| `batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in order of 10s).

Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. | -| `buffer_size` | Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences.

Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` | -| `hidden_units` | Number of units in the hidden layers of the neural network. Correspond to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger.

Typical range: `32` - `512` | -| `learning_rate` | Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.

Typical range: `1e-5` - `1e-3` | -| `learning_rate_schedule` | (Optional, default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally.

`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. | -| `max_steps` | Total number of experience points that must be collected from the simulation before ending the training process.

Typical range: `5e5` - `1e7` | -| `normalize` | Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | -| `num_layers` | The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems.

Typical range: `1` - `3` | | `time_horizon` | How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions.

Typical range: `32` - `2048` | -| `vis_encoder_type` | (Optional, default = `simple`) Encoder type for encoding visual observations.

`simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. | +| `hyperparameters -> batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in order of 10s).

Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. | +| `hyperparameters -> buffer_size` | Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences.

Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` | +| `hyperparameters -> learning_rate` | Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.

Typical range: `1e-5` - `1e-3` | +| `hyperparameters -> learning_rate_schedule` | (Optional, default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally.

`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. | +| `max_steps` | Total number of experience points that must be collected from the simulation before ending the training process.

Typical range: `5e5` - `1e7` | +| `network_settings -> hidden_units` | Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger.

Typical range: `32` - `512` | +| `network_settings -> num_layers` | The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems.

Typical range: `1` - `3` | +| `network_settings -> normalize` | Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | +| `network_settings -> vis_encoder_type` | (Optional, default = `simple`) Encoder type for encoding visual observations.

`simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. | | `init_path` | (Optional, default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents.

You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. | | `threaded` | (Optional, default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. | @@ -52,20 +52,20 @@ the `trainer` setting above). | **Setting** | **Description** | | :---------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `beta` | Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`.

Typical range: `1e-4` - `1e-2` | -| `epsilon` | Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process.

Typical range: `0.1` - `0.3` | -| `lambd` | Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process.

Typical range: `0.9` - `0.95` | -| `num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization.The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning.

Typical range: `3` - `10` | +| `hyperparameters -> beta` | Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`.

Typical range: `1e-4` - `1e-2` | +| `hyperparameters -> epsilon` | Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process.

Typical range: `0.1` - `0.3` | +| `hyperparameters -> lambd` | Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process.

Typical range: `0.9` - `0.95` | +| `hyperparameters -> num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning.

Typical range: `3` - `10` | ### SAC-specific Configurations | **Setting** | **Description** | | :------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `buffer_init_steps` | Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled.

Typical range: `1000` - `10000` | -| `init_entcoef` | How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster.

Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` | -| `save_replay_buffer` | (Optional, default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. | -| `tau` | How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability.

Typical range: `0.005` - `0.01` | -| `steps_per_update` | Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow.

Typical range: `1` - `20` | +| `hyperparameters -> buffer_init_steps` | Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled.

Typical range: `1000` - `10000` | +| `hyperparameters -> init_entcoef` | How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster.

Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` | +| `hyperparameters -> save_replay_buffer` | (Optional, default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. | +| `hyperparameters -> tau` | How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability.

Typical range: `0.005` - `0.01` | +| `hyperparameters -> steps_per_update` | Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow.

Typical range: `1` - `20` | ## Reward Signals @@ -141,14 +141,13 @@ recorded demonstrations), provide the following configurations under the ## Memory-enhanced Agents using Recurrent Neural Networks -You can enable your agents to use memory, by setting `use_recurrent` to `true` +You can enable your agents to use memory by adding a `memory` section under `network_settings`, and setting `memory_size` and `sequence_length`: | **Setting** | **Description** | | :---------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `use_recurrent` | Whether to enable this option or not. | -| `memory_size` | Size of the memory an agent must keep. In order to use a LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task.

Typical range: `32` - `256` | -| `sequence_length` | Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train.

Typical range: `4` - `128` | +| `network_settings -> memory -> memory_size` | Size of the memory an agent must keep. In order to use a LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task.

Typical range: `32` - `256` | +| `network_settings -> memory -> sequence_length` | Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train.
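For reference, a minimal sketch of how these memory settings nest in the trainer config YAML, reusing the illustrative values from the example configuration elsewhere in this change (the behavior name is hypothetical):

```yaml
behaviors:
  MyBehavior:            # hypothetical behavior name
    network_settings:
      memory:
        sequence_length: 64
        memory_size: 256
```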

Typical range: `4` - `128` | A few considerations when deciding to use memory: diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index 9cd1ecb7c7..d653185647 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -194,32 +194,38 @@ configuration, but their settings live in different sections that we'll cover su ```yaml behaviors: BehaviorPPO: - trainer: ppo + trainer_type: ppo + + hyperparameters: + # Hyperparameters common to PPO and SAC + batch_size: 1024 + buffer_size: 10240 + learning_rate: 3.0e-4 + learning_rate_schedule: linear + + # PPO-specific hyperparameters + # Replaces the "PPO-specific hyperparameters" section above + beta: 5.0e-3 + epsilon: 0.2 + lambd: 0.95 + num_epoch: 3 - # Trainer configs common to PPO/SAC (excluding reward signals) - batch_size: 1024 - buffer_size: 10240 - hidden_units: 128 - learning_rate: 3.0e-4 - learning_rate_schedule: linear + # Configuration of the neural network (common to PPO/SAC) + network_settings: + vis_encoder_type: simple + normalize: false + hidden_units: 128 + num_layers: 2 + # memory + memory: + sequence_length: 64 + memory_size: 256 + + # Trainer configurations common to all trainers max_steps: 5.0e5 - normalize: false - num_layers: 2 time_horizon: 64 - vis_encoder_type: simple - - # PPO-specific configs - beta: 5.0e-3 - epsilon: 0.2 - lambd: 0.95 - num_epoch: 3 threaded: true - # memory - use_recurrent: true - sequence_length: 64 - memory_size: 256 - # behavior cloning behavioral_cloning: demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo @@ -269,29 +275,37 @@ curiosity and self-play) remain unchanged. ```yaml behaviors: BehaviorSAC: - trainer: sac + trainer_type: sac # Trainer configs common to PPO/SAC (excluding reward signals) # same as PPO config - # SAC-specific configs (replaces the "PPO-specific configs" section above) - buffer_init_steps: 0 - tau: 0.005 - steps_per_update: 1 - train_interval: 1 - init_entcoef: 1.0 - save_replay_buffer: false + # SAC-specific configs (replaces the hyperparameters section above) + hyperparameters: + # Hyperparameters common to PPO and SAC + # Same as PPO config - # memory - # same as PPO config + # SAC-specific hyperparameters + # Replaces the "PPO-specific hyperparameters" section above + buffer_init_steps: 0 + tau: 0.005 + steps_per_update: 10.0 + save_replay_buffer: false + init_entcoef: 0.5 + reward_signal_steps_per_update: 10.0 + + # Configuration of the neural network (common to PPO/SAC) + network_settings: + # Same as PPO config + + # Trainer configurations common to all trainers + # # pre-training using behavior cloning behavioral_cloning: # same as PPO config reward_signals: - reward_signal_num_update: 1 # only applies to SAC - # environment reward extrinsic: # same as PPO config @@ -316,23 +330,24 @@ description of all the configurations listed above. ### Curriculum Learning -To enable curriculum learning, you need to add a sub-section to the corresponding -`behaivors` entry in the trainer config YAML file that defines the curriculum for that -behavior. Here is one example: +To enable curriculum learning, you need to add a `curriculum ` sub-section to the trainer +configuration YAML file. Within this sub-section, add an entry for each behavior that defines +the curriculum for thatbehavior. 
Here is one example: ```yml behaviors: BehaviorY: # < Same as above > - # Add this section - curriculum: - measure: progress - thresholds: [0.1, 0.3, 0.5] - min_lesson_length: 100 - signal_smoothing: true - parameters: - wall_height: [1.5, 2.0, 2.5, 4.0] +# Add this section +curriculum: + BehaviorY: + measure: progress + thresholds: [0.1, 0.3, 0.5] + min_lesson_length: 100 + signal_smoothing: true + parameters: + wall_height: [1.5, 2.0, 2.5, 4.0] ``` Each group of Agents under the same `Behavior Name` in an environment can have a @@ -355,8 +370,11 @@ example config for the curricula for the Wall Jump environment. behaviors: BigWallJump: # < Trainer parameters for BigWallJump > - # Curriculum configuration - curriculum: + SmallWallJump: + # < Trainer parameters for SmallWallJump > + +curriculum: + BigWallJump: measure: progress thresholds: [0.1, 0.3, 0.5] min_lesson_length: 100 @@ -364,17 +382,13 @@ behaviors: parameters: big_wall_min_height: [0.0, 4.0, 6.0, 8.0] big_wall_max_height: [4.0, 7.0, 8.0, 8.0] - SmallWallJump: - # < Trainer parameters for BigWallJump > - # Curriculum configuration - curriculum: - measure: progress - thresholds: [0.1, 0.3, 0.5] - min_lesson_length: 100 - signal_smoothing: true - parameters: - small_wall_height: [1.5, 2.0, 2.5, 4.0] + measure: progress + thresholds: [0.1, 0.3, 0.5] + min_lesson_length: 100 + signal_smoothing: true + parameters: + small_wall_height: [1.5, 2.0, 2.5, 4.0] ``` The curriculum for each Behavior has the following parameters: @@ -390,7 +404,7 @@ The curriculum for each Behavior has the following parameters: #### Training with a Curriculum Once we have specified our metacurriculum and curricula, we can launch -`mlagents-learn` using the `–curriculum` flag to point to the config file for +`mlagents-learn` to point to the config file containing our curricula and PPO will train using Curriculum Learning. For example, to train agents in the Wall Jump environment with curriculum learning, we can run: From 74d523d2195e458897733f2e08bbe7bbd0f2c275 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 14 May 2020 18:21:52 -0700 Subject: [PATCH 42/54] Update with migration --- docs/Migrating.md | 13 +++++++++---- docs/Training-ML-Agents.md | 5 +++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/Migrating.md b/docs/Migrating.md index f4a1448d78..fd6c3cd77a 100644 --- a/docs/Migrating.md +++ b/docs/Migrating.md @@ -21,14 +21,19 @@ double-check that the versions are in the same. The versions can be found in instead of `summaries/` and `models/`. - Trainer configuration, curriculum configuration, and parameter randomization configuration have all been moved to a single YAML file. (#3791) +- Trainer configuration format has changed, and using a "default" behavior name has + been deprecated. (#3936) - `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`. - On the UnityEnvironment API, `get_behavior_names()` and `get_behavior_specs()` methods were combined into the property `behavior_specs` that contains a mapping from behavior names to behavior spec. ### Steps to Migrate -- Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into - a separate trainer configuration file, under a `behaviors` section. You can move the `default` section too - if it's being used. This file should be specific to your environment, and not contain configurations for - multiple environments (unless they have the same Behavior Names). 
+- To upgrade your configuration files, an upgrade script has been provided. Run `python config/update_config.py + -h` to see the script usage. + + To do it manually, copy your `` sections from `trainer_config.yaml` into a separate trainer configuration file, under a `behaviors` section. + The `default` section is no longer needed. This new file should be specific to your environment, and not contain + configurations for multiple environments (unless they have the same Behavior Names). + - You will need to reformat your trainer settings as per the [example](Training-ML-Agents.md). - If your training uses [curriculum](Training-ML-Agents.md#curriculum-learning), move those configurations under the `Behavior Name` section. - If your training uses [parameter randomization](Training-ML-Agents.md#environment-parameter-randomization), move diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index d653185647..3a05b925bd 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -179,6 +179,11 @@ use during training, and the answers to the above questions will dictate its con The rest of this guide breaks down the different sub-sections of the trainer config file and explains the possible settings for each. +**NOTE:** The configuration file format has been changed from 0.17.0 and onwards. To convert +an old set of configuration files (trainer config, curriculum, and sampler files) to the new +format, a script has been provided. Run `python config/upgrade_config.py -h` in your console +to see the script's usage. + ### Behavior Configurations The primary section of the trainer config file is a From 93ab9d35fdc44ce99b65eb3db894db28f6b4bd5f Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 14 May 2020 18:27:44 -0700 Subject: [PATCH 43/54] Fix run_experiment --- ml-agents/mlagents/trainers/run_experiment.py | 6 ++++-- ml-agents/mlagents/trainers/settings.py | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/run_experiment.py b/ml-agents/mlagents/trainers/run_experiment.py index fe57ca889e..a372fca3b7 100644 --- a/ml-agents/mlagents/trainers/run_experiment.py +++ b/ml-agents/mlagents/trainers/run_experiment.py @@ -1,6 +1,8 @@ import argparse from typing import Optional, List -from mlagents.trainers.learn import RunOptions, run_cli, load_config +from mlagents.trainers.learn import run_cli +from mlagents.trainers.settings import RunOptions +from mlagents.trainers.cli_utils import load_config def parse_command_line(argv: Optional[List[str]] = None) -> argparse.Namespace: @@ -19,7 +21,7 @@ def main(): """ args = parse_command_line() expt_config = load_config(args.experiment_config_path) - run_cli(RunOptions(**expt_config)) + run_cli(RunOptions.from_dict(expt_config)) if __name__ == "__main__": diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 9163815700..338acc8d5f 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -368,4 +368,8 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": configured_dict["engine_settings"][key] = val else: # Base options configured_dict[key] = val - return cattr.structure(configured_dict, RunOptions) + return RunOptions.from_dict(configured_dict) + + @staticmethod + def from_dict(options_dict: Dict[str, Any]) -> "RunOptions": + return cattr.structure(options_dict, RunOptions) From 68634fb7230975dc83246fce6bcd7ebed2347cab Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 14 May 2020 
19:08:01 -0700 Subject: [PATCH 44/54] Fix simple_rl test --- ml-agents/mlagents/trainers/tests/test_simple_rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index 65c4a6aeb6..c43dabc5a5 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -458,7 +458,7 @@ def test_gail_visual_ppo(simple_record, use_discrete): use_discrete=use_discrete, step_size=0.2, ) - bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000) + bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500) reward_signals = { RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path) } From 182b7a5f2397ef058a36e44bca423492bfa43c53 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 15 May 2020 10:15:51 -0700 Subject: [PATCH 45/54] Update docs with defaults --- docs/Training-Configuration-File.md | 103 +++++++++++------------- docs/Training-ML-Agents.md | 5 +- ml-agents/mlagents/trainers/settings.py | 2 +- 3 files changed, 52 insertions(+), 58 deletions(-) diff --git a/docs/Training-Configuration-File.md b/docs/Training-Configuration-File.md index f040d286a7..98df68a09a 100644 --- a/docs/Training-Configuration-File.md +++ b/docs/Training-Configuration-File.md @@ -26,20 +26,20 @@ choice of the trainer (which we review on subsequent sections). | **Setting** | **Description** | | :----------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `trainer_type` | The type of trainer to use: `ppo` or `sac` | -| `summary_freq` | Number of experiences that needs to be collected before generating and displaying training statistics. This determines the granularity of the graphs in Tensorboard. | -| `time_horizon` | How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions.

Typical range: `32` - `2048` | +| `trainer_type` | (default = `ppo`) The type of trainer to use: `ppo` or `sac` | +| `summary_freq` | (default = `50000`) Number of experiences that need to be collected before generating and displaying training statistics. This determines the granularity of the graphs in TensorBoard. | +| `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions.

Typical range: `32` - `2048` | | `hyperparameters -> batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in order of 10s).

Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. | -| `hyperparameters -> buffer_size` | Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences.

Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` | -| `hyperparameters -> learning_rate` | Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.

Typical range: `1e-5` - `1e-3` | -| `hyperparameters -> learning_rate_schedule` | (Optional, default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally.

`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. | -| `max_steps` | Total number of experience points that must be collected from the simulation before ending the training process.

Typical range: `5e5` - `1e7` | -| `network_settings -> hidden_units` | Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger.

Typical range: `32` - `512` | -| `network_settings -> num_layers` | The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems.

Typical range: `1` - `3` | -| `network_settings -> normalize` | Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | -| `network_settings -> vis_encoder_type` | (Optional, default = `simple`) Encoder type for encoding visual observations.

`simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. | -| `init_path` | (Optional, default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents.

You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. | -| `threaded` | (Optional, default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. | +| `hyperparameters -> buffer_size` | (default = `10240` for PPO and `50000` for SAC) Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences.

Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` | +| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.

Typical range: `1e-5` - `1e-3` | +| `hyperparameters -> learning_rate_schedule` | (default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally.

`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. | +| `max_steps` | (default = `500000`) Total number of experiences (steps) that must be collected from the simulation before ending the training process.

Typical range: `5e5` - `1e7` | +| `network_settings -> hidden_units` | (default = `128`) Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger.

Typical range: `32` - `512` | +| `network_settings -> num_layers` | (default = `2`) The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems.
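As a rough sketch, these `network_settings` keys nest under a behavior entry as shown below; the values are the illustrative ones from the example configuration elsewhere in this change, and the behavior name is hypothetical:

```yaml
behaviors:
  MyBehavior:            # hypothetical behavior name
    network_settings:
      normalize: false
      hidden_units: 128
      num_layers: 2
      vis_encoder_type: simple
```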

Typical range: `1` - `3` | +| `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | +| `network_settings -> vis_encoder_type` | (default = `simple`) Encoder type for encoding visual observations.

`simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. | +| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents.

You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. | +| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. | ## Trainer-specific Configurations @@ -52,20 +52,22 @@ the `trainer` setting above). | **Setting** | **Description** | | :---------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `hyperparameters -> beta` | Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`.

Typical range: `1e-4` - `1e-2` | -| `hyperparameters -> epsilon` | Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process.

Typical range: `0.1` - `0.3` | -| `hyperparameters -> lambd` | Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process.

Typical range: `0.9` - `0.95` | +| `hyperparameters -> beta` | (default = `5.0e-3`) Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`.

Typical range: `1e-4` - `1e-2` | +| `hyperparameters -> epsilon` | (default = `0.2`) Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process.

Typical range: `0.1` - `0.3` | +| `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process.

Typical range: `0.9` - `0.95` | | `hyperparameters -> num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning.
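For orientation, a minimal sketch of a PPO `hyperparameters` block combining the common and PPO-specific settings above, reusing the illustrative values from the example configuration in this change:

```yaml
hyperparameters:
  # Common to PPO and SAC
  batch_size: 1024
  buffer_size: 10240
  learning_rate: 3.0e-4
  learning_rate_schedule: linear
  # PPO-specific
  beta: 5.0e-3
  epsilon: 0.2
  lambd: 0.95
  num_epoch: 3
```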

Typical range: `3` - `10` | ### SAC-specific Configurations | **Setting** | **Description** | | :------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `hyperparameters -> buffer_init_steps` | Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled.

Typical range: `1000` - `10000` | -| `hyperparameters -> init_entcoef` | How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster.

Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` | -| `hyperparameters -> save_replay_buffer` | (Optional, default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. | -| `hyperparameters -> tau` | How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability.

Typical range: `0.005` - `0.01` | -| `hyperparameters -> steps_per_update` | Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow.

Typical range: `1` - `20` | +| `hyperparameters -> buffer_init_steps` | (default = `0`) Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled.

Typical range: `1000` - `10000` | +| `hyperparameters -> init_entcoef` | (default = `1.0`) How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster.

Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` | +| `hyperparameters -> save_replay_buffer` | (default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. | +| `hyperparameters -> tau` | (default = `0.005`) How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability.

Typical range: `0.005` - `0.01` | +| `hyperparameters -> steps_per_update` | (default = `1`) Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow.
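As a sketch, the SAC-specific settings above live in the same `hyperparameters` block as the common settings; the values below mirror the SAC example configuration elsewhere in this change:

```yaml
hyperparameters:
  # Common settings (batch_size, buffer_size, learning_rate, ...) also go here
  buffer_init_steps: 0
  init_entcoef: 0.5
  save_replay_buffer: false
  tau: 0.005
  steps_per_update: 10.0
```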

Typical range: `1` - `20` | + +| `hyperparameters -> reward_signal_num_update` | (default = `steps_per_update`) Number of steps per mini batch sampled and used for updating the reward signals. By default, we update the reward signals once every time the main policy is updated. However, to imitate the training procedure in certain imitation learning papers (e.g. [Kostrikov et. al](http://arxiv.org/abs/1809.02925), [Blondé et. al](http://arxiv.org/abs/1809.02064)), we may want to update the reward signal (GAIL) M times for every update of the policy. We can change `steps_per_update` of SAC to N, as well as `reward_signal_steps_per_update` under `reward_signals` to N / M to accomplish this. By default, `reward_signal_steps_per_update` is set to `steps_per_update`. | ## Reward Signals @@ -85,8 +87,8 @@ environment-based reward signal: | **Setting** | **Description** | | :---------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `extrinsic -> strength` | Factor by which to multiply the reward given by the environment. Typical ranges will vary depending on the reward signal.

Typical range: `1.00` | -| `extrinsic -> gamma` | Discount factor for future rewards coming from the environment. This can be thought of as how far into the future the agent should care about possible rewards. In situations when the agent should be acting in the present in order to prepare for rewards in the distant future, this value should be large. In cases when rewards are more immediate, it can be smaller. Must be strictly smaller than 1.

Typical range: `0.8` - `0.995` | +| `extrinsic -> strength` | (default = `1.0`) Factor by which to multiply the reward given by the environment. Typical ranges will vary depending on the reward signal.

Typical range: `1.00` | +| `extrinsic -> gamma` | (default = `0.99`) Discount factor for future rewards coming from the environment. This can be thought of as how far into the future the agent should care about possible rewards. In situations when the agent should be acting in the present in order to prepare for rewards in the distant future, this value should be large. In cases when rewards are more immediate, it can be smaller. Must be strictly smaller than 1.
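For intuition, `gamma` is the weight applied to each successively later reward when a return is computed, as in this small sketch:

```python
# Discounted return of a reward sequence: later rewards are scaled by
# gamma, gamma**2, ... so a smaller gamma makes the agent more short-sighted.
def discounted_return(rewards, gamma=0.99):
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
    return g

print(discounted_return([0.0, 0.0, 1.0], gamma=0.9))  # 0.81
```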

Typical range: `0.8` - `0.995` | ### Curiosity Intrinsic Reward @@ -94,10 +96,10 @@ To enable curiosity, provide these settings: | **Setting** | **Description** | | :--------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `curiosity -> strength` | Magnitude of the curiosity reward generated by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough to not be overwhelmed by extrinsic reward signals in the environment. Likewise it should not be too large to overwhelm the extrinsic reward signal.

Typical range: `0.001` - `0.1` | -| `curiosity -> gamma` | Discount factor for future rewards.

Typical range: `0.8` - `0.995` | -| `curiosity -> encoding_size` | (Optional, default = `64`) Size of the encoding used by the intrinsic curiosity model. This value should be small enough to encourage the ICM to compress the original observation, but also not too small to prevent it from learning to differentiate between expected and actual observations.

Typical range: `64` - `256` | -| `curiosity -> learning_rate` | (Optional, default = `3e-4`) Learning rate used to update the intrinsic curiosity module. This should typically be decreased if training is unstable, and the curiosity loss is unstable.

Typical range: `1e-5` - `1e-3` | +| `curiosity -> strength` | (default = `1.0`) Magnitude of the curiosity reward generated by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough to not be overwhelmed by extrinsic reward signals in the environment. Likewise it should not be too large to overwhelm the extrinsic reward signal.

Typical range: `0.001` - `0.1` | +| `curiosity -> gamma` | (default = `0.99`) Discount factor for future rewards.

Typical range: `0.8` - `0.995` | +| `curiosity -> encoding_size` | (default = `64`) Size of the encoding used by the intrinsic curiosity model. This value should be small enough to encourage the ICM to compress the original observation, but also not too small to prevent it from learning to differentiate between expected and actual observations.

Typical range: `64` - `256` | +| `curiosity -> learning_rate` | (default = `3e-4`) Learning rate used to update the intrinsic curiosity module. This should typically be decreased if training is unstable, and the curiosity loss is unstable.

Typical range: `1e-5` - `1e-3` | ### GAIL Intrinsic Reward @@ -106,22 +108,14 @@ settings: | **Setting** | **Description** | | :---------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `gail -> strength` | Factor by which to multiply the raw reward. Note that when using GAIL with an Extrinsic Signal, this value should be set lower if your demonstrations are suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases.

Typical range: `0.01` - `1.0` | -| `gail -> gamma` | Discount factor for future rewards.

Typical range: `0.8` - `0.9` | -| `gail -> demo_path` | The path to your .demo file or directory of .demo files. | -| `gail -> encoding_size` | (Optional, default = `64`) Size of the hidden layer used by the discriminator. This value should be small enough to encourage the discriminator to compress the original observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. Dramatically increasing this size will also negatively affect training times.

Typical range: `64` - `256` | +| `gail -> strength` | (default = `1.0`) Factor by which to multiply the raw reward. Note that when using GAIL with an Extrinsic Signal, this value should be set lower if your demonstrations are suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases.

Typical range: `0.01` - `1.0` | +| `gail -> gamma` | (default = `0.99`) Discount factor for future rewards.

Typical range: `0.8` - `0.9` | +| `gail -> demo_path` | (Required, no default) The path to your .demo file or directory of .demo files. | +| `gail -> encoding_size` | (default = `64`) Size of the hidden layer used by the discriminator. This value should be small enough to encourage the discriminator to compress the original observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. Dramatically increasing this size will also negatively affect training times.

Typical range: `64` - `256` | | `gail -> learning_rate` | (Optional, default = `3e-4`) Learning rate used to update the discriminator. This should typically be decreased if training is unstable, and the GAIL loss is unstable.

Typical range: `1e-5` - `1e-3` | -| `gail -> use_actions` | (Optional, default = `false`) Determines whether the discriminator should discriminate based on both observations and actions, or just observations. Set to True if you want the agent to mimic the actions from the demonstrations, and False if you'd rather have the agent visit the same states as in the demonstrations but with possibly different actions. Setting to False is more likely to be stable, especially with imperfect demonstrations, but may learn slower. | -| `gail -> use_vail` | (Optional, default = `false`) Enables a variational bottleneck within the GAIL discriminator. This forces the discriminator to learn a more general representation and reduces its tendency to be "too good" at discriminating, making learning more stable. However, it does increase training time. Enable this if you notice your imitation learning is unstable, or unable to learn the task at hand. | - -### Reward Signal Settings for SAC - -All of the reward signals configurations described above apply to both PPO and -SAC. There is one configuration for all reward signals that only applies to SAC. +| `gail -> use_actions` | (default = `false`) Determines whether the discriminator should discriminate based on both observations and actions, or just observations. Set to True if you want the agent to mimic the actions from the demonstrations, and False if you'd rather have the agent visit the same states as in the demonstrations but with possibly different actions. Setting to False is more likely to be stable, especially with imperfect demonstrations, but may learn slower. | +| `gail -> use_vail` | (default = `false`) Enables a variational bottleneck within the GAIL discriminator. This forces the discriminator to learn a more general representation and reduces its tendency to be "too good" at discriminating, making learning more stable. However, it does increase training time. Enable this if you notice your imitation learning is unstable, or unable to learn the task at hand. | -| **Setting** | **Description** | -| :------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `reward_signals -> reward_signal_num_update` | (Optional, default = `steps_per_update`) Number of steps per mini batch sampled and used for updating the reward signals. By default, we update the reward signals once every time the main policy is updated. However, to imitate the training procedure in certain imitation learning papers (e.g. [Kostrikov et. al](http://arxiv.org/abs/1809.02925), [Blondé et. al](http://arxiv.org/abs/1809.02064)), we may want to update the reward signal (GAIL) M times for every update of the policy. We can change `steps_per_update` of SAC to N, as well as `reward_signal_steps_per_update` under `reward_signals` to N / M to accomplish this. 
By default, `reward_signal_steps_per_update` is set to `steps_per_update`. | ## Behavioral Cloning @@ -131,13 +125,12 @@ recorded demonstrations), provide the following configurations under the | **Setting** | **Description** | | :------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `demo_path` | The path to your .demo file or directory of .demo files. | -| `strength` | Learning rate of the imitation relative to the learning rate of PPO, and roughly corresponds to how strongly we allow BC to influence the policy.

Typical range: `0.1` - `0.5` | -| `steps` | During BC, it is often desirable to stop using demonstrations after the agent has "seen" rewards, and allow it to optimize past the available demonstrations and/or generalize outside of the provided demonstrations. steps corresponds to the training steps over which BC is active. The learning rate of BC will anneal over the steps. Set the steps to 0 for constant imitation over the entire training run. | -| `batch_size` | Number of demonstration experiences used for one iteration of a gradient descent update. If not specified, it will default to the `batch_size`.

Typical range: (Continuous): `512` - `5120`; (Discrete): `32` - `512` | -| `num_epoch` | Number of passes through the experience buffer during gradient descent. If not specified, it will default to the number of epochs set for PPO.

Typical range: `3` - `10` | -| `samples_per_update` | (Optional, default = `0`) Maximum number of samples to use during each imitation update. You may want to lower this if your demonstration dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 to train over all of the demonstrations at each update step.

Typical range: `buffer_size` | -| `init_path` | Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. | +| `demo_path` | (Required, no default) The path to your .demo file or directory of .demo files. | +| `strength` | (default = `1.0`) Learning rate of the imitation relative to the learning rate of PPO, and roughly corresponds to how strongly we allow BC to influence the policy.

Typical range: `0.1` - `0.5` | +| `steps` | (default = `0`) During BC, it is often desirable to stop using demonstrations after the agent has "seen" rewards, and allow it to optimize past the available demonstrations and/or generalize outside of the provided demonstrations. steps corresponds to the training steps over which BC is active. The learning rate of BC will anneal over the steps. Set the steps to 0 for constant imitation over the entire training run. | +| `batch_size` | (default = `batch_size` of trainer) Number of demonstration experiences used for one iteration of a gradient descent update. If not specified, it will default to the `batch_size` of the trainer.

Typical range: (Continuous): `512` - `5120`; (Discrete): `32` - `512` | +| `num_epoch` | (default = `num_epoch` of trainer) Number of passes through the experience buffer during gradient descent. If not specified, it will default to the number of epochs set for PPO.

Typical range: `3` - `10` | +| `samples_per_update` | (default = `0`) Maximum number of samples to use during each imitation update. You may want to lower this if your demonstration dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 to train over all of the demonstrations at each update step.
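The `batch_size` and `num_epoch` fallbacks described above can be sketched as follows; the helper below simply mirrors the "use the trainer's value when unset" behavior and is illustrative, not the optimizer's actual code:

```python
# Hypothetical helper: resolve optional behavioral_cloning fields against the
# trainer's own hyperparameters when they are left unspecified (None).
def resolve_bc_hyperparams(bc_batch_size, bc_num_epoch, trainer_batch_size, trainer_num_epoch):
    batch_size = bc_batch_size if bc_batch_size is not None else trainer_batch_size
    num_epoch = bc_num_epoch if bc_num_epoch is not None else trainer_num_epoch
    return batch_size, num_epoch

print(resolve_bc_hyperparams(None, 3, 1024, 10))  # (1024, 3)
```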

Typical range: `buffer_size` ## Memory-enhanced Agents using Recurrent Neural Networks @@ -146,8 +139,8 @@ and setting `memory_size` and `sequence_length`: | **Setting** | **Description** | | :---------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `network_settings -> memory -> memory_size` | Size of the memory an agent must keep. In order to use a LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task.

Typical range: `32` - `256` | -| `network_settings -> memory -> sequence_length` | Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train.

Typical range: `4` - `128` | +| `network_settings -> memory -> memory_size` | (default = `128`) Size of the memory an agent must keep. In order to use a LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task.

Typical range: `32` - `256` | +| `network_settings -> memory -> sequence_length` | (default = `64`) Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train.

Typical range: `4` - `128` | A few considerations when deciding to use memory: @@ -175,11 +168,11 @@ each Behavior: | **Setting** | **Description** | | :-------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `save_steps` | Number of _trainer steps_ between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.

A larger value of `save_steps` will yield a set of opponents that cover a wider range of skill levels and possibly play styles since the policy receives more training. As a result, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. This value is also dependent on how intrinsically difficult the environment is for the agent.

Typical range: `10000` - `100000` | -| `team_change` | Number of _trainer_steps_ between switching the learning team. This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents per team switch.

A larger value of `team-change` will allow the agent to train longer against it's opponents. The longer an agent trains against the same set of opponents the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies and so the agent may fail against the next batch of opponents.

The value of `team-change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we recommend setting this value as a function of the `save_steps` parameter discussed previously.

Typical range: 4x-10x where x=`save_steps` | -| `swap_steps` | Number of _ghost steps_ (not trainer steps) between swapping the opponents policy with a different snapshot. A 'ghost step' refers to a step taken by an agent _that is following a fixed policy and not learning_. The reason for this distinction is that in asymmetric games, we may have teams with an unequal number of agents e.g. a 2v1 scenario like our Strikers Vs Goalie example environment. The team with two agents collects twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents` agents during `team-change` total steps is: `(num_agents / num_opponent_agents) * (team_change / x)`

Typical range: `10000` - `100000` | -| `play_against_latest_model_ratio` | Probability an agent will play against the latest opponent policy. With probability 1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its opponent from a past iteration.

A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating it's policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of more increasingly challenging situations which may lead to a stronger final policy.

Typical range: `0.0` - `1.0` | -| `window` | Size of the sliding window of past snapshots from which the agent's opponents are sampled. For example, a `window` size of 5 will save the last 5 snapshots taken. Each time a new snapshot is taken, the oldest is discarded. A larger value of `window` means that an agent's pool of opponents will contain a larger diversity of behaviors since it will contain policies from earlier in the training run. Like in the `save_steps` hyperparameter, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training.

Typical range: `5` - `30` | +| `save_steps` | (default = `20000`) Number of _trainer steps_ between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.

A larger value of `save_steps` will yield a set of opponents that cover a wider range of skill levels and possibly play styles since the policy receives more training. As a result, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. This value is also dependent on how intrinsically difficult the environment is for the agent.

Typical range: `10000` - `100000` | +| `team_change` | (default = `5 * save_steps`) Number of _trainer_steps_ between switching the learning team. This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents per team switch.

A larger value of `team_change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents, the better it will be at defeating them. However, training against them for too long may result in overfitting to the particular opponent strategies, and so the agent may fail against the next batch of opponents.

The value of `team_change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we recommend setting this value as a function of the `save_steps` parameter discussed previously.

Typical range: 4x-10x where x=`save_steps` | +| `swap_steps` | (default = `10000`) Number of _ghost steps_ (not trainer steps) between swapping the opponents policy with a different snapshot. A 'ghost step' refers to a step taken by an agent _that is following a fixed policy and not learning_. The reason for this distinction is that in asymmetric games, we may have teams with an unequal number of agents e.g. a 2v1 scenario like our Strikers Vs Goalie example environment. The team with two agents collects twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents` agents during `team-change` total steps is: `(num_agents / num_opponent_agents) * (team_change / x)`
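A small worked example of that formula, with purely illustrative numbers:

```python
# swap_steps = (num_agents / num_opponent_agents) * (team_change / desired_swaps)
def swap_steps_for(num_agents, num_opponent_agents, team_change, desired_swaps):
    return (num_agents / num_opponent_agents) * (team_change / desired_swaps)

# A 2v1 game with team_change=200000 and 4 desired swaps per team change:
print(swap_steps_for(2, 1, 200_000, 4))  # 100000.0
```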

Typical range: `10000` - `100000` | +| `play_against_latest_model_ratio` | (default = `0.5`) Probability an agent will play against the latest opponent policy. With probability 1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its opponent from a past iteration.

A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curriculum](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations, which may lead to a stronger final policy.
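One way to picture this setting is as a sampling probability, as in the sketch below (illustrative only, not the ghost trainer's actual code):

```python
import random

def sample_opponent(past_snapshots, latest_policy, play_against_latest_model_ratio=0.5):
    # With probability p, face the continuously-updating latest policy;
    # otherwise face a frozen snapshot drawn from the saved window.
    if random.random() < play_against_latest_model_ratio:
        return latest_policy
    return random.choice(past_snapshots)
```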

Typical range: `0.0` - `1.0` | +| `window` | (default = `10`) Size of the sliding window of past snapshots from which the agent's opponents are sampled. For example, a `window` size of 5 will save the last 5 snapshots taken. Each time a new snapshot is taken, the oldest is discarded. A larger value of `window` means that an agent's pool of opponents will contain a larger diversity of behaviors since it will contain policies from earlier in the training run. Like in the `save_steps` hyperparameter, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training.
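A sliding window of snapshots behaves like a bounded queue; the sketch below (not the actual implementation) shows how older snapshots fall out as new ones are saved:

```python
from collections import deque

window = 5
snapshots = deque(maxlen=window)  # keeps only the `window` most recent snapshots
for step in range(0, 120_000, 20_000):  # pretend save_steps = 20000
    snapshots.append(f"policy_at_{step}")
print(list(snapshots))  # the oldest snapshot has been discarded
```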

Typical range: `5` - `30` | ### Note on Reward Signals diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index 3a05b925bd..543e5efb0b 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -242,7 +242,7 @@ behaviors: init_path: reward_signals: - # environment reward + # environment reward (default) extrinsic: strength: 1.0 gamma: 0.99 @@ -331,7 +331,8 @@ behaviors: We now break apart the components of the configuration file and describe what each of these parameters mean and provide guidelines on how to set them. See [Training Configuration File](Training-Configuration-File.md) for a detailed -description of all the configurations listed above. +description of all the configurations listed above, along with their defaults. +Unless otherwise specified, omitting a configuration will revert it to its default. ### Curriculum Learning diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 338acc8d5f..401185abab 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -147,7 +147,7 @@ class GAILSettings(RewardSignalSettings): @attr.s(auto_attribs=True) class CuriositySettings(RewardSignalSettings): - encoding_size: int = 128 + encoding_size: int = 64 learning_rate: float = 3e-4 From 5c392841046f843550a79cd91fb5e0660a49e99e Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 15 May 2020 10:16:47 -0700 Subject: [PATCH 46/54] Add comment about BC --- ml-agents/mlagents/trainers/settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 401185abab..613912aacd 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -61,6 +61,8 @@ class BehavioralCloningSettings: steps: int = 0 strength: float = 1.0 samples_per_update: int = 0 + # Setting either of these to None will allow the Optimizer + # to decide these parameters, based on Trainer hyperparams num_epoch: Optional[int] = None batch_size: Optional[int] = None From 2ff78d7cc21acd708af35ed4e51d992e909be836 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 15 May 2020 12:17:23 -0700 Subject: [PATCH 47/54] Add more tests for settings --- ml-agents/mlagents/trainers/settings.py | 14 +- .../mlagents/trainers/tests/test_settings.py | 125 +++++++++++++++++- 2 files changed, 130 insertions(+), 9 deletions(-) diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 613912aacd..a74d808807 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -122,19 +122,17 @@ class RewardSignalSettings: @staticmethod def structure(d: Mapping, t: type) -> Any: """ - Helper method to structure a TrainerSettings class. Meant to be registered with - cattr.register_structure_hook() and called with cattr.structure(). + Helper method to structure a Dict of RewardSignalSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). This is needed to handle + the special Enum selection of RewardSignalSettings classes. 
""" if not isinstance(d, Mapping): raise TrainerConfigError(f"Unsupported reward signal configuration {d}.") d_final: Dict[RewardSignalType, RewardSignalSettings] = {} for key, val in d.items(): - try: - enum_key = RewardSignalType(key) - t = enum_key.to_settings() - d_final[enum_key] = strict_to_cls(val, t) - except KeyError: - raise TrainerConfigError(f"Unknown reward signal type {key}") + enum_key = RewardSignalType(key) + t = enum_key.to_settings() + d_final[enum_key] = strict_to_cls(val, t) return d_final diff --git a/ml-agents/mlagents/trainers/tests/test_settings.py b/ml-agents/mlagents/trainers/tests/test_settings.py index c5b62677f6..045513601e 100644 --- a/ml-agents/mlagents/trainers/tests/test_settings.py +++ b/ml-agents/mlagents/trainers/tests/test_settings.py @@ -1,6 +1,20 @@ import attr +import pytest -from mlagents.trainers.settings import RunOptions, TrainerSettings +from typing import Dict + +from mlagents.trainers.settings import ( + RunOptions, + TrainerSettings, + PPOSettings, + SACSettings, + RewardSignalType, + RewardSignalSettings, + CuriositySettings, + TrainerType, + strict_to_cls, +) +from mlagents.trainers.exception import TrainerConfigError def check_if_different(testobj1: object, testobj2: object) -> None: @@ -19,3 +33,112 @@ def test_is_new_instance(): """ check_if_different(RunOptions(), RunOptions()) check_if_different(TrainerSettings(), TrainerSettings()) + + +def test_no_configuration(): + """ + Verify that a new config will have a PPO trainer with extrinsic rewards. + """ + blank_runoptions = RunOptions() + assert isinstance(blank_runoptions.behaviors["test"], TrainerSettings) + assert isinstance(blank_runoptions.behaviors["test"].hyperparameters, PPOSettings) + + assert ( + RewardSignalType.EXTRINSIC in blank_runoptions.behaviors["test"].reward_signals + ) + + +def test_strict_to_cls(): + """ + Test strict structuring method. 
+ """ + + @attr.s(auto_attribs=True) + class TestAttrsClass: + field1: int = 0 + field2: str = "test" + + correct_dict = {"field1": 1, "field2": "test2"} + assert strict_to_cls(correct_dict, TestAttrsClass) == TestAttrsClass(**correct_dict) + + incorrect_dict = {"field3": 1, "field2": "test2"} + + with pytest.raises(TrainerConfigError): + strict_to_cls(incorrect_dict, TestAttrsClass) + + with pytest.raises(TrainerConfigError): + strict_to_cls("non_dict_input", TestAttrsClass) + + +def test_trainersettings_structure(): + """ + Test structuring method for TrainerSettings + """ + trainersettings_dict = { + "trainer_type": "sac", + "hyperparameters": {"batch_size": 1024}, + "max_steps": 1.0, + "reward_signals": {"curiosity": {"encoding_size": 64}}, + } + trainer_settings = TrainerSettings.structure(trainersettings_dict, TrainerSettings) + assert isinstance(trainer_settings.hyperparameters, SACSettings) + assert trainer_settings.trainer_type == TrainerType.SAC + assert isinstance(trainer_settings.max_steps, int) + assert RewardSignalType.CURIOSITY in trainer_settings.reward_signals + + # Check invalid trainer type + with pytest.raises(ValueError): + trainersettings_dict = { + "trainer_type": "puppo", + "hyperparameters": {"batch_size": 1024}, + "max_steps": 1.0, + } + TrainerSettings.structure(trainersettings_dict, TrainerSettings) + + # Check invalid hyperparameter + with pytest.raises(TrainerConfigError): + trainersettings_dict = { + "trainer_type": "ppo", + "hyperparameters": {"notahyperparam": 1024}, + "max_steps": 1.0, + } + TrainerSettings.structure(trainersettings_dict, TrainerSettings) + + # Check non-dict + with pytest.raises(TrainerConfigError): + TrainerSettings.structure("notadict", TrainerSettings) + + # Check hyperparameters specified but trainer type left as default. + # This shouldn't work as you could specify non-PPO hyperparameters. + with pytest.raises(TrainerConfigError): + trainersettings_dict = {"hyperparameters": {"batch_size": 1024}} + TrainerSettings.structure(trainersettings_dict, TrainerSettings) + + +def test_reward_signal_structure(): + """ + Tests the RewardSignalSettings structure method. This one is special b/c + it takes in a Dict[RewardSignalType, RewardSignalSettings]. 
+ """ + reward_signals_dict = { + "extrinsic": {"strength": 1.0}, + "curiosity": {"strength": 1.0}, + } + reward_signals = RewardSignalSettings.structure( + reward_signals_dict, Dict[RewardSignalType, RewardSignalSettings] + ) + assert isinstance(reward_signals[RewardSignalType.EXTRINSIC], RewardSignalSettings) + assert isinstance(reward_signals[RewardSignalType.CURIOSITY], CuriositySettings) + + # Check invalid reward signal type + reward_signals_dict = {"puppo": {"strength": 1.0}} + with pytest.raises(ValueError): + RewardSignalSettings.structure( + reward_signals_dict, Dict[RewardSignalType, RewardSignalSettings] + ) + + # Check non-Dict input + with pytest.raises(TrainerConfigError): + RewardSignalSettings.structure( + "notadict", Dict[RewardSignalType, RewardSignalSettings] + ) From c6234b232469a167d2cc3f27a509c38810e32d9e Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 15 May 2020 12:17:56 -0700 Subject: [PATCH 48/54] Update changelog --- com.unity.ml-agents/CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index 06a35685f4..ade92204c4 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -19,6 +19,8 @@ and this project adheres to - Curriculum and Parameter Randomization configurations have been merged into the main training configuration file. Note that this means training configuration files are now environment-specific. (#3791) +- The format for trainer configuration has changed, and the "default" behavior has been deprecated. + See the [Migration Guide](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Migrating.md) for more details. (#3936) - Training artifacts (trained models, summaries) are now found in the `results/` directory. (#3829) - Unity Player logs are now written out to the results directory. 
(#3877) From 32ffc97ef439e118ce14524159049c08b43e928a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 15 May 2020 14:18:08 -0700 Subject: [PATCH 49/54] Test missing demo_path --- ml-agents/mlagents/trainers/tests/test_settings.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ml-agents/mlagents/trainers/tests/test_settings.py b/ml-agents/mlagents/trainers/tests/test_settings.py index 045513601e..6a8b2b9355 100644 --- a/ml-agents/mlagents/trainers/tests/test_settings.py +++ b/ml-agents/mlagents/trainers/tests/test_settings.py @@ -137,6 +137,13 @@ def test_reward_signal_structure(): reward_signals_dict, Dict[RewardSignalType, RewardSignalSettings] ) + # Check missing GAIL demo path + reward_signals_dict = {"gail": {"strength": 1.0}} + with pytest.raises(TypeError): + RewardSignalSettings.structure( + reward_signals_dict, Dict[RewardSignalType, RewardSignalSettings] + ) + # Check non-Dict input with pytest.raises(TrainerConfigError): RewardSignalSettings.structure( From 18142fc037492c9b115edb6dded4c7bb67671266 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 18 May 2020 14:27:10 -0700 Subject: [PATCH 50/54] Improve docs and docstrings --- docs/Training-Configuration-File.md | 2 +- .../trainers/components/reward_signals/__init__.py | 3 +-- .../components/reward_signals/curiosity/signal.py | 7 ++----- .../trainers/components/reward_signals/gail/signal.py | 10 +--------- ml-agents/mlagents/trainers/ghost/trainer.py | 2 +- ml-agents/mlagents/trainers/ppo/trainer.py | 2 +- ml-agents/mlagents/trainers/sac/trainer.py | 2 +- 7 files changed, 8 insertions(+), 20 deletions(-) diff --git a/docs/Training-Configuration-File.md b/docs/Training-Configuration-File.md index 98df68a09a..65cfc7b806 100644 --- a/docs/Training-Configuration-File.md +++ b/docs/Training-Configuration-File.md @@ -33,7 +33,7 @@ choice of the trainer (which we review on subsequent sections). | `hyperparameters -> buffer_size` | (default = `10240` for PPO and `50000` for SAC) Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences.

Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` | | `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.

Typical range: `1e-5` - `1e-3` | | `hyperparameters -> learning_rate_schedule` | (default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally.

`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. | -| `max_steps` | (default = `500000`) Total number of experience points that must be collected from the simulation before ending the training process.

Typical range: `5e5` - `1e7` | +| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count.
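For example, assuming several parallel environments that each contain multiple agents with the same behavior name, the step budget is consumed roughly like this (illustrative arithmetic only):

```python
max_steps = 500_000
num_envs = 4          # parallel environment instances (--num-envs)
agents_per_env = 3    # agents sharing the same behavior name in each instance
steps_per_env_step = num_envs * agents_per_env
print(max_steps // steps_per_env_step)  # ~41666 environment steps before training stops
```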

Typical range: `5e5` - `1e7` |
 | `network_settings -> hidden_units` | (default = `128`) Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger.

Typical range: `32` - `512` |
 | `network_settings -> num_layers` | (default = `2`) The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems.

Typical range: `1` - `3` | | `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | diff --git a/ml-agents/mlagents/trainers/components/reward_signals/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py index a1e46ed5b3..80101a8110 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/__init__.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py @@ -24,8 +24,7 @@ def __init__(self, policy: TFPolicy, settings: RewardSignalSettings): Initializes a reward signal. At minimum, you must pass in the policy it is being applied to, the reward strength, and the gamma (discount factor.) :param policy: The Policy object (e.g. NNPolicy) that this Reward Signal will apply to. - :param strength: The strength of the reward. The reward's raw value will be multiplied by this value. - :param gamma: The time discounting factor used for this reward. + :param settings: Settings parameters for this Reward Signal, including gamma and strength. :return: A RewardSignal object. """ class_name = self.__class__.__name__ diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py index 8408ab2a0d..a18c48a393 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py @@ -14,11 +14,8 @@ def __init__(self, policy: TFPolicy, settings: CuriositySettings): """ Creates the Curiosity reward generator :param policy: The Learning Policy - :param strength: The scaling parameter for the reward. The scaled reward will be the unscaled - reward multiplied by the strength parameter - :param gamma: The time discounting factor used for this reward. - :param encoding_size: The size of the hidden encoding layer for the ICM - :param learning_rate: The learning rate for the ICM. + :param settings: CuriositySettings object that contains the parameters + (including encoding size and learning rate) for this CuriosityRewardSignal. """ super().__init__(policy, settings) self.model = CuriosityModel( diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py index 89a6b8bc64..9d5fcd13db 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py @@ -15,15 +15,7 @@ def __init__(self, policy: TFPolicy, settings: GAILSettings): """ The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476 :param policy: The policy of the learning model - :param strength: The scaling parameter for the reward. The scaled reward will be the unscaled - reward multiplied by the strength parameter - :param gamma: The time discounting factor used for this reward. - :param demo_path: The path to the demonstration file - :param num_epoch: The number of epochs to train over the training buffer for the discriminator. - :param encoding_size: The size of the the hidden layers of the discriminator - :param learning_rate: The Learning Rate used during GAIL updates. 
- :param use_actions: Whether or not to use the actions for the discriminator. - :param use_vail: Whether or not to use a variational bottleneck for the discriminator. + :param settings: The settings for this GAILRewardSignal. See https://arxiv.org/abs/1810.00821. """ super().__init__(policy, settings) diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index 089b9c61a1..1130d16ff7 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -53,7 +53,7 @@ def __init__( :param brain_name: The name of the brain associated with trainer config :param controller: GhostController that coordinates all ghost trainers and calculates ELO :param reward_buff_cap: Max reward history to track in the reward buffer - :param trainer_settings: The parameters for the trainer (dictionary). + :param trainer_settings: The parameters for the trainer. :param training: Whether the trainer is set for training. :param run_id: The identifier of the current run """ diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 4c4c98050d..1508cb435e 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -39,7 +39,7 @@ def __init__( Responsible for collecting experiences and training PPO model. :param brain_name: The name of the brain associated with trainer config :param reward_buff_cap: Max reward history to track in the reward buffer - :param trainer_settings: The parameters for the trainer (dictionary). + :param trainer_settings: The parameters for the trainer. :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index 45e52fff6e..2e1ad89a81 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -47,7 +47,7 @@ def __init__( Responsible for collecting experiences and training SAC model. :param brain_name: The name of the brain associated with trainer config :param reward_buff_cap: Max reward history to track in the reward buffer - :param trainer_settings: The parameters for the trainer (dictionary). + :param trainer_settings: The parameters for the trainer. :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with From 81d11861ffc6bacdc3840e5e371f6d3ccfb68fe9 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 20 May 2020 13:58:59 -0700 Subject: [PATCH 51/54] Move keep_checkpoints to config rather than CLI --- docs/Training-Configuration-File.md | 11 +++++++---- docs/Training-ML-Agents.md | 1 + ml-agents/mlagents/trainers/cli_utils.py | 9 --------- ml-agents/mlagents/trainers/settings.py | 3 +-- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/docs/Training-Configuration-File.md b/docs/Training-Configuration-File.md index 65cfc7b806..6df42c6ade 100644 --- a/docs/Training-Configuration-File.md +++ b/docs/Training-Configuration-File.md @@ -29,17 +29,20 @@ choice of the trainer (which we review on subsequent sections). | `trainer_type` | (default = `ppo`) The type of trainer to use: `ppo` or `sac` | | `summary_freq` | (default = `50000`) Number of experiences that needs to be collected before generating and displaying training statistics. 
This determines the granularity of the graphs in Tensorboard. | | `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions.

Typical range: `32` - `2048` | +| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count.

Typical range: `5e5` - `1e7` | +| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the save-freq option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. | +| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents.

You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. | +| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. | +| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.

Typical range: `1e-5` - `1e-3` | | `hyperparameters -> batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in order of 10s).

Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. | | `hyperparameters -> buffer_size` | (default = `10240` for PPO and `50000` for SAC) Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences.

Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` | -| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.

Typical range: `1e-5` - `1e-3` | | `hyperparameters -> learning_rate_schedule` | (default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally.

`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. | -| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count.

Typical range: `5e5` - `1e7` |
+
 | `network_settings -> hidden_units` | (default = `128`) Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger.

Typical range: `32` - `512` |
 | `network_settings -> num_layers` | (default = `2`) The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems.

Typical range: `1` - `3` | | `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. | | `network_settings -> vis_encoder_type` | (default = `simple`) Encoder type for encoding visual observations.

`simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. | -| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents.

You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. | -| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. | + ## Trainer-specific Configurations diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index 543e5efb0b..ccc90534a1 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -229,6 +229,7 @@ behaviors: # Trainer configurations common to all trainers max_steps: 5.0e5 time_horizon: 64 + keep_checkpoints: 5 threaded: true # behavior cloning diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index c740b83bf2..d0cd462145 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -66,15 +66,6 @@ def _create_parser() -> argparse.ArgumentParser: help="The lesson to start with when performing curriculum training", action=DetectDefault, ) - argparser.add_argument( - "--keep-checkpoints", - default=5, - type=int, - help="The maximum number of model checkpoints to keep. Checkpoints are saved after the" - "number of steps specified by the save-freq option. 
diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md
index 543e5efb0b..ccc90534a1 100644
--- a/docs/Training-ML-Agents.md
+++ b/docs/Training-ML-Agents.md
@@ -229,6 +229,7 @@ behaviors:
     # Trainer configurations common to all trainers
     max_steps: 5.0e5
     time_horizon: 64
+    keep_checkpoints: 5
     threaded: true

     # behavior cloning
diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py
index c740b83bf2..d0cd462145 100644
--- a/ml-agents/mlagents/trainers/cli_utils.py
+++ b/ml-agents/mlagents/trainers/cli_utils.py
@@ -66,15 +66,6 @@ def _create_parser() -> argparse.ArgumentParser:
         help="The lesson to start with when performing curriculum training",
         action=DetectDefault,
     )
-    argparser.add_argument(
-        "--keep-checkpoints",
-        default=5,
-        type=int,
-        help="The maximum number of model checkpoints to keep. Checkpoints are saved after the"
-        "number of steps specified by the save-freq option. Once the maximum number of checkpoints"
-        "has been reached, the oldest checkpoint is deleted when saving a new checkpoint.",
-        action=DetectDefault,
-    )
     argparser.add_argument(
         "--load",
         default=False,
diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py
index a74d808807..4435510abb 100644
--- a/ml-agents/mlagents/trainers/settings.py
+++ b/ml-agents/mlagents/trainers/settings.py
@@ -191,8 +191,7 @@ def _set_default_hyperparameters(self):
     )
     init_path: Optional[str] = None
     output_path: str = "default"
-    # TODO: Remove parser default and remove from CLI
-    keep_checkpoints: int = parser.get_default("keep_checkpoints")
+    keep_checkpoints: int = 5
     max_steps: int = 500000
     time_horizon: int = 64
     summary_freq: int = 50000
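With the change above, `keep_checkpoints` is no longer wired through argparse; it becomes a plain default on the attrs-based trainer settings and can be overridden per behavior in the YAML. A minimal sketch of that pattern, mirroring the attrs usage visible in settings.py (the class below is an illustrative stand-in, not the real `TrainerSettings`):

```python
# Illustrative sketch of the pattern applied in settings.py above: keep_checkpoints
# is a plain attrs default rather than a value read back from the argparse parser.
# TrainerSettingsSketch is a stand-in, not the real mlagents TrainerSettings class.
import attr


@attr.s(auto_attribs=True)
class TrainerSettingsSketch:
    keep_checkpoints: int = 5
    max_steps: int = 500000
    time_horizon: int = 64
    summary_freq: int = 50000


# A per-behavior YAML value such as `keep_checkpoints: 3` would simply populate
# this field when the loaded config is structured into the settings object.
assert TrainerSettingsSketch().keep_checkpoints == 5
assert TrainerSettingsSketch(keep_checkpoints=3).keep_checkpoints == 3
```

Keeping the default on the settings object rather than on the CLI means the trainer code has a single source of truth for the value, which is the point of the removal in cli_utils.py above.
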
From 04a78609947eb66b979c1e7d5ae76f038f952877 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 20 May 2020 14:03:25 -0700
Subject: [PATCH 52/54] Remove unused check param keys

---
 ml-agents/mlagents/trainers/ppo/trainer.py     | 14 --------------
 ml-agents/mlagents/trainers/sac/trainer.py     | 14 --------------
 ml-agents/mlagents/trainers/trainer/trainer.py |  9 ---------
 3 files changed, 37 deletions(-)

diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index 1508cb435e..743091b770 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -14,7 +14,6 @@
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.ppo.optimizer import PPOOptimizer
 from mlagents.trainers.trajectory import Trajectory
-from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.settings import TrainerSettings, PPOSettings

@@ -75,19 +74,6 @@ def __init__(
         self.seed = seed
         self.policy: NNPolicy = None  # type: ignore

-    def _check_param_keys(self):
-        super()._check_param_keys()
-        # Check that batch size is greater than sequence length. Else, throw
-        # an exception.
-        if (
-            self.trainer_settings["sequence_length"]
-            > self.trainer_settings["batch_size"]
-            and self.trainer_settings["use_recurrent"]
-        ):
-            raise UnityTrainerException(
-                "batch_size must be greater than or equal to sequence_length when use_recurrent is True."
-            )
-
     def _process_trajectory(self, trajectory: Trajectory) -> None:
         """
         Takes a trajectory and processes it, putting it into the update buffer.
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index 2e1ad89a81..9b321ff500 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -17,7 +17,6 @@
 from mlagents.trainers.trainer.rl_trainer import RLTrainer
 from mlagents.trainers.trajectory import Trajectory, SplitObservations
 from mlagents.trainers.brain import BrainParameters
-from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.settings import TrainerSettings, SACSettings

@@ -77,19 +76,6 @@ def __init__(
         self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer

-    def _check_param_keys(self):
-        super()._check_param_keys()
-        # Check that batch size is greater than sequence length. Else, throw
-        # an exception.
-        if (
-            self.trainer_settings["sequence_length"]
-            > self.trainer_settings["batch_size"]
-            and self.trainer_settings["use_recurrent"]
-        ):
-            raise UnityTrainerException(
-                "batch_size must be greater than or equal to sequence_length when use_recurrent is True."
-            )
-
     def save_model(self, name_behavior_id: str) -> None:
         """
         Saves the model. Overrides the default save_model since we want to save
diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py
index 975bb77576..fcad7ed5a7 100644
--- a/ml-agents/mlagents/trainers/trainer/trainer.py
+++ b/ml-agents/mlagents/trainers/trainer/trainer.py
@@ -12,7 +12,6 @@
 from mlagents.trainers.agent_processor import AgentManagerQueue
 from mlagents.trainers.brain import BrainParameters
 from mlagents.trainers.policy import Policy
-from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.settings import TrainerSettings

@@ -60,14 +59,6 @@ def stats_reporter(self):
         """
         return self._stats_reporter

-    def _check_param_keys(self):
-        for k in self.param_keys:
-            if k not in self.trainer_settings:
-                raise UnityTrainerException(
-                    "The hyper-parameter {0} could not be found for the {1} trainer of "
-                    "brain {2}.".format(k, self.__class__, self.brain_name)
-                )
-
     @property
     def parameters(self) -> TrainerSettings:
         """
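The patch above deletes the `_check_param_keys` runtime validation, including the batch_size/sequence_length consistency check, from both trainers; with typed settings objects the key-existence lookup is no longer needed, and the patch does not show whether the recurrent-size check is reinstated elsewhere. For illustration only, that constraint could be expressed as a post-init validation on a hypothetical attrs settings class:

```python
# Illustrative only: the constraint that the deleted _check_param_keys methods
# enforced, restated as a post-init check on a hypothetical attrs settings class.
# This is not the real mlagents code; the patch does not show where (or whether)
# the check is re-added.
import attr


@attr.s(auto_attribs=True)
class RecurrentSettingsSketch:
    batch_size: int = 1024
    sequence_length: int = 64
    use_recurrent: bool = False

    def __attrs_post_init__(self):
        if self.use_recurrent and self.sequence_length > self.batch_size:
            raise ValueError(
                "batch_size must be greater than or equal to sequence_length "
                "when use_recurrent is True."
            )


try:
    RecurrentSettingsSketch(batch_size=32, sequence_length=64, use_recurrent=True)
except ValueError as err:
    print(err)  # the invalid combination is rejected at construction time
```
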
From 43e5acd4681bed95ba012e500fa7d366003142eb Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 20 May 2020 14:23:58 -0700
Subject: [PATCH 53/54] Remove keep_checkpoints from learn.py

---
 ml-agents/mlagents/trainers/learn.py                   | 1 -
 ml-agents/mlagents/trainers/settings.py                | 1 -
 ml-agents/mlagents/trainers/tests/test_learn.py        | 7 -------
 ml-agents/mlagents/trainers/tests/test_simple_rl.py    | 1 -
 ml-agents/mlagents/trainers/tests/test_trainer_util.py | 2 --
 ml-agents/mlagents/trainers/trainer_util.py            | 4 ----
 6 files changed, 16 deletions(-)

diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py
index 99802a1e84..121ccf5c9c 100644
--- a/ml-agents/mlagents/trainers/learn.py
+++ b/ml-agents/mlagents/trainers/learn.py
@@ -132,7 +132,6 @@ def run_training(run_seed: int, options: RunOptions) -> None:
         options.behaviors,
         checkpoint_settings.run_id,
         write_path,
-        checkpoint_settings.keep_checkpoints,
         not checkpoint_settings.inference,
         checkpoint_settings.resume,
         run_seed,
diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py
index 4435510abb..4c89073d19 100644
--- a/ml-agents/mlagents/trainers/settings.py
+++ b/ml-agents/mlagents/trainers/settings.py
@@ -268,7 +268,6 @@ class MeasureType:
 @attr.s(auto_attribs=True)
 class CheckpointSettings:
     save_freq: int = parser.get_default("save_freq")
-    keep_checkpoints: int = parser.get_default("keep_checkpoints")
     run_id: str = parser.get_default("run_id")
     initialize_from: str = parser.get_default("initialize_from")
     load_model: bool = parser.get_default("load_model")
diff --git a/ml-agents/mlagents/trainers/tests/test_learn.py b/ml-agents/mlagents/trainers/tests/test_learn.py
index 8914415557..5af2d8bb58 100644
--- a/ml-agents/mlagents/trainers/tests/test_learn.py
+++ b/ml-agents/mlagents/trainers/tests/test_learn.py
@@ -34,7 +34,6 @@ def basic_options(extra_args=None):
     lesson: 2
     run_id: uselessrun
     save_freq: 654321
-    keep_checkpoints: 34
     debug: false
     """

@@ -121,7 +120,6 @@ def test_commandline_args(mock_file):
     assert opt.behaviors == {}
     assert opt.env_settings.env_path is None
     assert opt.parameter_randomization is None
-    assert opt.checkpoint_settings.keep_checkpoints == 5
     assert opt.checkpoint_settings.lesson == 0
     assert opt.checkpoint_settings.resume is False
     assert opt.checkpoint_settings.inference is False
@@ -137,7 +135,6 @@ def test_commandline_args(mock_file):
     full_args = [
         "mytrainerpath",
         "--env=./myenvfile",
-        "--keep-checkpoints=42",
         "--lesson=3",
         "--resume",
         "--inference",
@@ -155,7 +152,6 @@ def test_commandline_args(mock_file):
     assert opt.behaviors == {}
     assert opt.env_settings.env_path == "./myenvfile"
     assert opt.parameter_randomization is None
-    assert opt.checkpoint_settings.keep_checkpoints == 42
     assert opt.checkpoint_settings.lesson == 3
     assert opt.checkpoint_settings.run_id == "myawesomerun"
     assert opt.checkpoint_settings.save_freq == 123456
@@ -176,7 +172,6 @@ def test_yaml_args(mock_file):
     assert opt.behaviors == {}
     assert opt.env_settings.env_path == "./oldenvfile"
     assert opt.parameter_randomization is None
-    assert opt.checkpoint_settings.keep_checkpoints == 34
     assert opt.checkpoint_settings.lesson == 2
     assert opt.checkpoint_settings.run_id == "uselessrun"
     assert opt.checkpoint_settings.save_freq == 654321
@@ -190,7 +185,6 @@ def test_yaml_args(mock_file):
     full_args = [
         "mytrainerpath",
         "--env=./myenvfile",
-        "--keep-checkpoints=42",
         "--lesson=3",
         "--resume",
         "--inference",
@@ -208,7 +202,6 @@ def test_yaml_args(mock_file):
     assert opt.behaviors == {}
     assert opt.env_settings.env_path == "./myenvfile"
     assert opt.parameter_randomization is None
-    assert opt.checkpoint_settings.keep_checkpoints == 42
     assert opt.checkpoint_settings.lesson == 3
     assert opt.checkpoint_settings.run_id == "myawesomerun"
     assert opt.checkpoint_settings.save_freq == 123456
diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
index c43dabc5a5..1f8145478f 100644
--- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py
+++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -126,7 +126,6 @@ def _check_environment_trains(
         trainer_config=trainer_config,
         run_id=run_id,
         output_path=dir,
-        keep_checkpoints=1,
         train_model=True,
         load_model=False,
         seed=seed,
diff --git a/ml-agents/mlagents/trainers/tests/test_trainer_util.py b/ml-agents/mlagents/trainers/tests/test_trainer_util.py
index 79bc8c2f94..ca5242396c 100644
--- a/ml-agents/mlagents/trainers/tests/test_trainer_util.py
+++ b/ml-agents/mlagents/trainers/tests/test_trainer_util.py
@@ -24,7 +24,6 @@ def test_initialize_ppo_trainer(BrainParametersMock, dummy_config):
     external_brains = {"testbrain": BrainParametersMock()}
     run_id = "testrun"
     output_path = "results_dir"
-    keep_checkpoints = 1
     train_model = True
     load_model = False
     seed = 11
@@ -49,7 +48,6 @@ def mock_constructor(
         trainer_config=base_config,
         run_id=run_id,
         output_path=output_path,
-        keep_checkpoints=keep_checkpoints,
         train_model=train_model,
         load_model=load_model,
         seed=seed,
diff --git a/ml-agents/mlagents/trainers/trainer_util.py b/ml-agents/mlagents/trainers/trainer_util.py
index 0648cdbaf7..15e5955662 100644
--- a/ml-agents/mlagents/trainers/trainer_util.py
+++ b/ml-agents/mlagents/trainers/trainer_util.py
@@ -22,7 +22,6 @@ def __init__(
         trainer_config: Dict[str, TrainerSettings],
         run_id: str,
         output_path: str,
-        keep_checkpoints: int,
         train_model: bool,
         load_model: bool,
         seed: int,
@@ -34,7 +33,6 @@ def __init__(
         self.run_id = run_id
         self.output_path = output_path
         self.init_path = init_path
-        self.keep_checkpoints = keep_checkpoints
         self.train_model = train_model
         self.load_model = load_model
         self.seed = seed
@@ -48,7 +46,6 @@ def generate(self, brain_name: str) -> Trainer:
             brain_name,
             self.run_id,
             self.output_path,
-            self.keep_checkpoints,
             self.train_model,
             self.load_model,
             self.ghost_controller,
@@ -64,7 +61,6 @@ def initialize_trainer(
     brain_name: str,
     run_id: str,
     output_path: str,
-    keep_checkpoints: int,
    train_model: bool,
     load_model: bool,
     ghost_controller: GhostController,

From f22bae87228267e3c80a004e0713342bd6090ad1 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Wed, 20 May 2020 14:42:09 -0700
Subject: [PATCH 54/54] Fix last test

---
 ml-agents/mlagents/trainers/tests/test_trainer_util.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ml-agents/mlagents/trainers/tests/test_trainer_util.py b/ml-agents/mlagents/trainers/tests/test_trainer_util.py
index ca5242396c..fe7fa9cb65 100644
--- a/ml-agents/mlagents/trainers/tests/test_trainer_util.py
+++ b/ml-agents/mlagents/trainers/tests/test_trainer_util.py
@@ -79,7 +79,6 @@ def test_handles_no_config_provided(BrainParametersMock):
         trainer_config=no_default_config,
         run_id="testrun",
         output_path="output_path",
-        keep_checkpoints=1,
         train_model=True,
         load_model=False,
         seed=42,