
Initialize normalizer with mean from first trajectory #4299


Merged 7 commits on Aug 12, 2020
2 changes: 1 addition & 1 deletion .yamato/com.unity.ml-agents-performance.yml
@@ -12,7 +12,7 @@ Run_Mac_Perfomance_Tests{{ editor.version }}:
variables:
UNITY_VERSION: {{ editor.version }}
commands:
- python -m pip install unity-downloader-cli --index-url https://artifactory.prd.it.unity3d.com/artifactory/api/pypi/pypi/simple
- python -m pip install unity-downloader-cli --index-url https://artifactory.prd.it.unity3d.com/artifactory/api/pypi/pypi/simple --upgrade
- unity-downloader-cli -u {{ editor.version }} -c editor --wait --fast
- curl -s https://artifactory.internal.unity3d.com/core-automation/tools/utr-standalone/utr --output utr
- chmod +x ./utr
3 changes: 3 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -32,6 +32,9 @@ recursively (for example, by an Agent's CollectObservations method).
Previously, this would result in an infinite loop and cause the editor to hang.
(#4226)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- The algorithm used to normalize observations could introduce NaNs when the initial observations were very large, due to incorrect initialization. The normalizer is now initialized with the observation mean and variance of the first trajectory processed. (#4299)
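
For context on why the initialization matters, here is a minimal NumPy sketch (editorial illustration, not part of the changelog or the diff; the values and variable names are made up) comparing the old mean-0/variance-1 start against statistics taken from the first trajectory:

```python
import numpy as np

# Observations far from zero, similar in scale to the Walker values used in the tests below.
obs = np.array([1800.0, 1799.9, 1800.1], dtype=np.float32)

# Old-style start: running mean 0, running variance 1.
old_normalized = (obs - 0.0) / np.sqrt(1.0)
print(old_normalized)  # ~[1800, 1799.9, 1800.1] -- enormous inputs to the network

# New-style start: mean and variance taken from the first trajectory itself.
init_mean, init_var = obs.mean(), obs.var()
new_normalized = (obs - init_mean) / np.sqrt(init_var + 1e-7)
print(new_normalized)  # values of order 1 -- well-scaled inputs
```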

## [1.2.0-preview] - 2020-07-15

16 changes: 13 additions & 3 deletions ml-agents/mlagents/trainers/policy/tf_policy.py
@@ -83,6 +83,7 @@ def __init__(
self.assign_ops: List[tf.Operation] = []
self.update_dict: Dict[str, tf.Tensor] = {}
self.inference_dict: Dict[str, tf.Tensor] = {}
self.first_normalization_update: bool = False

self.graph = tf.Graph()
self.sess = tf.Session(
@@ -453,9 +454,15 @@ def update_normalization(self, vector_obs: np.ndarray) -> None:
:param vector_obs: The vector observations to add to the running estimate of the distribution.
"""
if self.use_vec_obs and self.normalize:
self.sess.run(
self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
)
if self.first_normalization_update:
self.sess.run(
self.init_normalization_op, feed_dict={self.vector_in: vector_obs}
)
self.first_normalization_update = False
else:
self.sess.run(
self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
)

@property
def use_vis_obs(self):
@@ -470,6 +477,7 @@ def _initialize_tensorflow_references(self):
self.normalization_steps: Optional[tf.Variable] = None
self.running_mean: Optional[tf.Variable] = None
self.running_variance: Optional[tf.Variable] = None
self.init_normalization_op: Optional[tf.Operation] = None
self.update_normalization_op: Optional[tf.Operation] = None
self.value: Optional[tf.Tensor] = None
self.all_log_probs: tf.Tensor = None
@@ -495,8 +503,10 @@ def create_input_placeholders(self):
self.behavior_spec.observation_shapes
)
if self.normalize:
self.first_normalization_update = True
normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
self.update_normalization_op = normalization_tensors.update_op
self.init_normalization_op = normalization_tensors.init_op
self.normalization_steps = normalization_tensors.steps
self.running_mean = normalization_tensors.running_mean
self.running_variance = normalization_tensors.running_variance
3 changes: 3 additions & 0 deletions ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -123,6 +123,9 @@ def make_fake_trajectory(
memory=memory,
)
steps_list.append(experience)
obs = []
for _shape in observation_shapes:
obs.append(np.ones(_shape, dtype=np.float32))
Contributor Author:
Done so that changing the last obs doesn't overwrite the second-to-last obs.

last_experience = AgentExperience(
obs=obs,
reward=reward,
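The comment above appears to refer to Python list aliasing: if every AgentExperience shares the same obs list object, assigning into the last step's obs also changes the earlier steps. A standalone sketch of that behavior (illustrative names only, not taken from the diff):

```python
import numpy as np

# Shared list: both steps reference the same obs list object.
shared_obs = [np.zeros(1, dtype=np.float32)]
steps = [shared_obs, shared_obs]
steps[-1][0] = np.ones(1, dtype=np.float32)  # "changing the last obs"
print(steps[0][0])  # [1.] -- the earlier step changed too

# Fresh list for the last step: earlier steps are unaffected.
earlier_obs = [np.zeros(1, dtype=np.float32)]
last_obs = [np.zeros(1, dtype=np.float32)]
steps = [earlier_obs, last_obs]
last_obs[0] = np.ones(1, dtype=np.float32)
print(steps[0][0])  # [0.] -- the earlier step is untouched
```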
113 changes: 107 additions & 6 deletions ml-agents/mlagents/trainers/tests/test_nn_policy.py
@@ -22,6 +22,7 @@
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 32
NUM_AGENTS = 12
EPSILON = 1e-7


def create_policy_mock(
@@ -136,11 +137,112 @@ def test_policy_evaluate(rnn, visual, discrete):
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)


def test_normalization():
def test_large_normalization():
behavior_spec = mb.setup_test_behavior_specs(
use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
)
# Taken from Walker seed 3713 which causes NaN without proper initialization
large_obs1 = [
1800.00036621,
1799.96972656,
1800.01245117,
1800.07214355,
1800.02758789,
1799.98303223,
1799.88647461,
1799.89575195,
1800.03479004,
1800.14025879,
1800.17675781,
1800.20581055,
1800.33740234,
1800.36450195,
1800.43457031,
1800.45544434,
1800.44604492,
1800.56713867,
1800.73901367,
]
large_obs2 = [
1799.99975586,
1799.96679688,
1799.92980957,
1799.89550781,
1799.93774414,
1799.95300293,
1799.94067383,
1799.92993164,
1799.84057617,
1799.69873047,
1799.70605469,
1799.82849121,
1799.85095215,
1799.76977539,
1799.78283691,
1799.76708984,
1799.67163086,
1799.59191895,
1799.5135498,
1799.45556641,
1799.3717041,
]
policy = TFPolicy(
0,
behavior_spec,
TrainerSettings(network_settings=NetworkSettings(normalize=True)),
"testdir",
False,
)
time_horizon = len(large_obs1)
trajectory = make_fake_trajectory(
length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
)
for i in range(time_horizon):
trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
trajectory_buffer = trajectory.to_agentbuffer()
policy.update_normalization(trajectory_buffer["vector_obs"])

# Check that the running mean and variance is correct
steps, mean, variance = policy.sess.run(
[policy.normalization_steps, policy.running_mean, policy.running_variance]
)
assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
assert variance[0] / steps == pytest.approx(
np.var(large_obs1, dtype=np.float32), abs=0.01
)

time_horizon = len(large_obs2)
trajectory = make_fake_trajectory(
length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
)
for i in range(time_horizon):
trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)

trajectory_buffer = trajectory.to_agentbuffer()
policy.update_normalization(trajectory_buffer["vector_obs"])

steps, mean, variance = policy.sess.run(
[policy.normalization_steps, policy.running_mean, policy.running_variance]
)

assert mean[0] == pytest.approx(
np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
)
assert variance[0] / steps == pytest.approx(
np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
)


def test_normalization():
behavior_spec = mb.setup_test_behavior_specs(
use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
)
time_horizon = 6
trajectory = make_fake_trajectory(
length=time_horizon,
@@ -169,10 +271,9 @@ def test_normalization():

assert steps == 6
assert mean[0] == 0.5
# Note: variance is divided by number of steps, and initialized to 1 to avoid
# divide by 0. The right answer is 0.25
assert (variance[0] - 1) / steps == 0.25

# Note: variance is initialized to the variance of the initial trajectory + EPSILON
# (to avoid divide by 0) and multiplied by the number of steps. The correct answer is 0.25
assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
# Make another update, this time with all 1's
time_horizon = 10
trajectory = make_fake_trajectory(
Expand All @@ -191,7 +292,7 @@ def test_normalization():

assert steps == 16
assert mean[0] == 0.8125
assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
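
As a sanity check on the expected values above: the combined mean and variance depend only on each trajectory's length, mean, and variance, so any six observations with mean 0.5 and variance 0.25 (for instance three 0's and three 1's) followed by the ten 1's give the asserted numbers. A standalone sketch:

```python
import numpy as np

first = np.array([0, 0, 0, 1, 1, 1], dtype=np.float32)  # mean 0.5, variance 0.25
second = np.ones(10, dtype=np.float32)                   # the all-1's update
combined = np.concatenate([first, second])

print(combined.mean())  # 0.8125, matching the asserted mean
print(combined.var())   # ~0.1523, within pytest.approx(0.152, abs=0.01)
```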


def test_min_visual_size():
26 changes: 21 additions & 5 deletions ml-agents/mlagents/trainers/tf/models.py
@@ -21,6 +21,7 @@ class Tensor3DShape(NamedTuple):


class NormalizerTensors(NamedTuple):
init_op: tf.Operation
update_op: tf.Operation
steps: tf.Tensor
running_mean: tf.Tensor
@@ -187,8 +188,8 @@ def create_normalizer(vector_obs: tf.Tensor) -> NormalizerTensors:
:return: A NormalizerTensors tuple that holds running mean, running variance, number of steps,
and the initialization and update operations.
"""

vec_obs_size = vector_obs.shape[1]

steps = tf.get_variable(
"normalization_steps",
[],
Expand All @@ -210,11 +211,15 @@ def create_normalizer(vector_obs: tf.Tensor) -> NormalizerTensors:
dtype=tf.float32,
initializer=tf.ones_initializer(),
)
update_normalization = ModelUtils.create_normalizer_update(
initialize_normalization, update_normalization = ModelUtils.create_normalizer_update(
vector_obs, steps, running_mean, running_variance
)
return NormalizerTensors(
update_normalization, steps, running_mean, running_variance
initialize_normalization,
update_normalization,
steps,
running_mean,
running_variance,
)

@staticmethod
Expand All @@ -223,7 +228,7 @@ def create_normalizer_update(
steps: tf.Tensor,
running_mean: tf.Tensor,
running_variance: tf.Tensor,
) -> tf.Operation:
) -> Tuple[tf.Operation, tf.Operation]:
"""
Creates the initialization and update operations for the normalizer.
:param vector_input: Vector observation to use for updating the running mean and variance.
@@ -250,7 +255,18 @@ def create_normalizer_update(
update_mean = tf.assign(running_mean, new_mean)
update_variance = tf.assign(running_variance, new_variance)
update_norm_step = tf.assign(steps, total_new_steps)
return tf.group([update_mean, update_variance, update_norm_step])
# First mean and variance are computed directly from the first batch of observations
initial_mean, initial_variance = tf.nn.moments(vector_input, axes=[0])
initialize_mean = tf.assign(running_mean, initial_mean)
# Multiplied by total_new_steps because the variance is divided by the step count during normalization
initialize_variance = tf.assign(
running_variance,
(initial_variance + EPSILON) * tf.cast(total_new_steps, dtype=tf.float32),
)
return (
tf.group([initialize_mean, initialize_variance, update_norm_step]),
tf.group([update_mean, update_variance, update_norm_step]),
)
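
The running "variance" therefore stores a sum of squared deviations that gets divided by the step count when observations are normalized. The incremental update itself sits in the collapsed lines above, so the following NumPy sketch is an assumption based on the standard batched Welford-style scheme that the initialization code and the tests imply; the function names are illustrative:

```python
import numpy as np

EPSILON = 1e-7

def init_normalizer(batch: np.ndarray):
    # First trajectory: mean is the batch mean; "variance" stores
    # (batch variance + EPSILON) * step count, mirroring initialize_variance above.
    steps = batch.shape[0]
    mean = batch.mean(axis=0)
    variance = (batch.var(axis=0) + EPSILON) * steps
    return steps, mean, variance

def update_normalizer(steps, mean, variance, batch: np.ndarray):
    # Batched Welford-style update (assumed form of the collapsed TF code):
    # "variance" accumulates sum((x - new_mean) * (x - old_mean)).
    new_steps = steps + batch.shape[0]
    new_mean = mean + ((batch - mean) / new_steps).sum(axis=0)
    new_variance = variance + ((batch - new_mean) * (batch - mean)).sum(axis=0)
    return new_steps, new_mean, new_variance

def normalize(obs, steps, mean, variance):
    # Matches the assumed normalization: (obs - mean) / sqrt(variance / steps).
    return (obs - mean) / np.sqrt(variance / steps)

# Usage: initialize from the first trajectory, then update with later ones.
first = np.random.uniform(1799.0, 1801.0, size=(20, 1)).astype(np.float32)
second = np.random.uniform(1799.0, 1801.0, size=(20, 1)).astype(np.float32)
steps, mean, var = init_normalizer(first)
steps, mean, var = update_normalizer(steps, mean, var, second)
both = np.concatenate([first, second])
print(mean, both.mean(axis=0))        # nearly identical
print(var / steps, both.var(axis=0))  # nearly identical
```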

@staticmethod
def create_vector_observation_encoder(