@@ -276,7 +276,7 @@ def apply_momentum(updates, params=None, momentum=0.9):
276
276
for param in params :
277
277
value = param .get_value (borrow = True )
278
278
velocity = aesara .shared (
279
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
279
+ np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable
280
280
)
281
281
x = momentum * velocity + updates [param ]
282
282
updates [velocity ] = x - param
@@ -391,7 +391,7 @@ def apply_nesterov_momentum(updates, params=None, momentum=0.9):
391
391
for param in params :
392
392
value = param .get_value (borrow = True )
393
393
velocity = aesara .shared (
394
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
394
+ np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable
395
395
)
396
396
x = momentum * velocity + updates [param ] - param
397
397
updates [velocity ] = x
@@ -534,9 +534,7 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6):
534
534
535
535
for param , grad in zip (params , grads ):
536
536
value = param .get_value (borrow = True )
537
- accu = aesara .shared (
538
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
539
- )
537
+ accu = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
540
538
accu_new = accu + grad ** 2
541
539
updates [accu ] = accu_new
542
540
updates [param ] = param - (learning_rate * grad / at .sqrt (accu_new + epsilon ))
@@ -662,9 +660,7 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon
662
660
663
661
for param , grad in zip (params , grads ):
664
662
value = param .get_value (borrow = True )
665
- accu = aesara .shared (
666
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
667
- )
663
+ accu = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
668
664
accu_new = rho * accu + (one - rho ) * grad ** 2
669
665
updates [accu ] = accu_new
670
666
updates [param ] = param - (learning_rate * grad / at .sqrt (accu_new + epsilon ))
@@ -755,12 +751,10 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil
755
751
for param , grad in zip (params , grads ):
756
752
value = param .get_value (borrow = True )
757
753
# accu: accumulate gradient magnitudes
758
- accu = aesara .shared (
759
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
760
- )
754
+ accu = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
761
755
# delta_accu: accumulate update magnitudes (recursively!)
762
756
delta_accu = aesara .shared (
763
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
757
+ np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable
764
758
)
765
759
766
760
# update accu (as in rmsprop)
@@ -850,12 +844,8 @@ def adam(
850
844
851
845
for param , g_t in zip (params , all_grads ):
852
846
value = param .get_value (borrow = True )
853
- m_prev = aesara .shared (
854
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
855
- )
856
- v_prev = aesara .shared (
857
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
858
- )
847
+ m_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
848
+ v_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
859
849
860
850
m_t = beta1 * m_prev + (one - beta1 ) * g_t
861
851
v_t = beta2 * v_prev + (one - beta2 ) * g_t ** 2
@@ -938,12 +928,8 @@ def adamax(
938
928
939
929
for param , g_t in zip (params , all_grads ):
940
930
value = param .get_value (borrow = True )
941
- m_prev = aesara .shared (
942
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
943
- )
944
- u_prev = aesara .shared (
945
- np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
946
- )
931
+ m_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
932
+ u_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
947
933
948
934
m_t = beta1 * m_prev + (one - beta1 ) * g_t
949
935
u_t = at .maximum (beta2 * u_prev , abs (g_t ))
0 commit comments