22
22
DISCRETE_ACTION_SPACE = [3 , 3 , 3 , 2 ]
23
23
BUFFER_INIT_SAMPLES = 32
24
24
NUM_AGENTS = 12
25
+ EPSILON = 1e-7
25
26
26
27
27
28
def create_policy_mock (
@@ -136,11 +137,112 @@ def test_policy_evaluate(rnn, visual, discrete):
136
137
assert run_out ["action" ].shape == (NUM_AGENTS , VECTOR_ACTION_SPACE )
137
138
138
139
139
- def test_normalization ():
140
def _feed_observations_to_normalizer(policy, observations):
    """Run one fake trajectory carrying `observations` through the normalizer.

    Builds a trajectory with one step per entry of `observations`, overwrites
    the first observation of every step with the matching value (as a
    one-element float32 array), and passes the resulting "vector_obs" buffer
    field to `policy.update_normalization`.

    :param policy: Object exposing `update_normalization`.
    :param observations: Sequence of scalar observation values, one per step.
    """
    trajectory = make_fake_trajectory(
        length=len(observations),
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for index, value in enumerate(observations):
        trajectory.steps[index].obs[0] = np.array([value], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])


def test_large_normalization():
    """Check the running mean/variance when observations are large (~1800).

    Two batches of observations are fed one after the other. After each
    update, the policy's running mean and `running_variance / steps` must
    match NumPy's mean and variance of everything seen so far, within 0.01.
    """
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713 which causes NaN without proper initialization
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )

    # First update: statistics must match the first batch alone.
    _feed_observations_to_normalizer(policy, large_obs1)

    # Check that the running mean and variance are correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    # Second update: statistics must now match both batches combined.
    _feed_observations_to_normalizer(policy, large_obs2)

    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    # `large_obs1 + large_obs2` is list concatenation, i.e. every value seen so far.
    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
240
+
241
+
242
+ def test_normalization ():
243
+ behavior_spec = mb .setup_test_behavior_specs (
244
+ use_discrete = True , use_visual = False , vector_action_space = [2 ], vector_obs_space = 1
245
+ )
144
246
time_horizon = 6
145
247
trajectory = make_fake_trajectory (
146
248
length = time_horizon ,
@@ -169,10 +271,9 @@ def test_normalization():
169
271
170
272
assert steps == 6
171
273
assert mean [0 ] == 0.5
172
- # Note: variance is divided by number of steps, and initialized to 1 to avoid
173
- # divide by 0. The right answer is 0.25
174
- assert (variance [0 ] - 1 ) / steps == 0.25
175
-
274
+ # Note: variance is initialized to the variance of the initial trajectory + EPSILON
275
+ # (to avoid divide by 0) and multiplied by the number of steps. The correct answer is 0.25
276
+ assert variance [0 ] / steps == pytest .approx (0.25 , abs = 0.01 )
176
277
# Make another update, this time with all 1's
177
278
time_horizon = 10
178
279
trajectory = make_fake_trajectory (
@@ -191,7 +292,7 @@ def test_normalization():
191
292
192
293
assert steps == 16
193
294
assert mean [0 ] == 0.8125
194
- assert ( variance [0 ] - 1 ) / steps == pytest .approx (0.152 , abs = 0.01 )
295
+ assert variance [0 ] / steps == pytest .approx (0.152 , abs = 0.01 )
195
296
196
297
197
298
def test_min_visual_size ():
0 commit comments