@@ -10,12 +10,11 @@
 import theano.tensor as tt
 import theano
 from theano.scalar import UnaryScalarOp, upgrade_to_float
+from theano.tensor.slinalg import Cholesky
 
 from .special import gammaln
 from pymc3.theanof import floatX
 
-from six.moves import xrange
-from functools import partial
 
 f = floatX
 c = - .5 * np.log(2. * np.pi)
@@ -166,7 +165,7 @@ def MvNormalLogp():
 
     solve_lower = tt.slinalg.Solve(A_structure='lower_triangular')
    solve_upper = tt.slinalg.Solve(A_structure='upper_triangular')
-    cholesky = Cholesky(nofail=True, lower=True)
+    cholesky = Cholesky(lower=True, on_error='nan')
 
     n, k = delta.shape
     n, k = f(n), f(k)
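For context (an editorial aside, not part of the commit): the upstream `theano.tensor.slinalg.Cholesky` op imported above exposes an `on_error` flag, and `on_error='nan'` reproduces what the old `nofail=True` copy did, returning NaNs instead of raising when the input is not positive definite. A minimal sketch, assuming a theano release that includes the flag:

```python
import numpy as np
import theano
import theano.tensor as tt
from theano.tensor.slinalg import Cholesky

cov = tt.dmatrix('cov')
chol = Cholesky(lower=True, on_error='nan')(cov)
fn = theano.function([cov], chol)

print(fn(np.eye(3)))                        # positive definite: the lower factor
print(fn(np.array([[1., 2.], [2., 1.]])))   # indefinite: NaN entries, no exception
```

Downstream, the logp graph can then switch on the NaN diagonal and return `-inf` rather than aborting the sampler.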
@@ -213,88 +212,6 @@ def dlogp(inputs, gradients):
         [cov, delta], [logp], grad_overrides=dlogp, inline=True)
 
 
-class Cholesky(theano.Op):
-    """
-    Return a triangular matrix square root of positive semi-definite `x`.
-
-    This is a copy of the cholesky op in theano, that doesn't throw an
-    error if the matrix is not positive definite, but instead returns
-    nan.
-
-    This has been merged upstream and we should switch to that
-    version after the next theano release.
-
-    L = cholesky(X, lower=True) implies dot(L, L.T) == X.
-    """
-    __props__ = ('lower', 'destructive', 'nofail')
-
-    def __init__(self, lower=True, nofail=False):
-        self.lower = lower
-        self.destructive = False
-        self.nofail = nofail
-
-    def make_node(self, x):
-        x = tt.as_tensor_variable(x)
-        if x.ndim != 2:
-            raise ValueError('Matrix must me two dimensional.')
-        return tt.Apply(self, [x], [x.type()])
-
-    def perform(self, node, inputs, outputs):
-        x = inputs[0]
-        z = outputs[0]
-        try:
-            z[0] = scipy.linalg.cholesky(x, lower=self.lower).astype(x.dtype)
-        except (ValueError, scipy.linalg.LinAlgError):
-            if self.nofail:
-                z[0] = np.eye(x.shape[-1])
-                z[0][0, 0] = np.nan
-            else:
-                raise
-
-    def grad(self, inputs, gradients):
-        """
-        Cholesky decomposition reverse-mode gradient update.
-
-        Symbolic expression for reverse-mode Cholesky gradient taken from [0]_
-
-        References
-        ----------
-        .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
-           http://arxiv.org/abs/1602.07527
-
-        """
-
-        x = inputs[0]
-        dz = gradients[0]
-        chol_x = self(x)
-        ok = tt.all(tt.nlinalg.diag(chol_x) > 0)
-        chol_x = tt.switch(ok, chol_x, tt.fill_diagonal(chol_x, 1))
-        dz = tt.switch(ok, dz, floatX(1))
-
-        # deal with upper triangular by converting to lower triangular
-        if not self.lower:
-            chol_x = chol_x.T
-            dz = dz.T
-
-        def tril_and_halve_diagonal(mtx):
-            """Extracts lower triangle of square matrix and halves diagonal."""
-            return tt.tril(mtx) - tt.diag(tt.diagonal(mtx) / 2.)
-
-        def conjugate_solve_triangular(outer, inner):
-            """Computes L^{-T} P L^{-1} for lower-triangular L."""
-            solve = tt.slinalg.Solve(A_structure="upper_triangular")
-            return solve(outer.T, solve(outer.T, inner.T).T)
-
-        s = conjugate_solve_triangular(
-            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))
-
-        if self.lower:
-            grad = tt.tril(s + s.T) - tt.diag(tt.diagonal(s))
-        else:
-            grad = tt.triu(s + s.T) - tt.diag(tt.diagonal(s))
-        return [tt.switch(ok, grad, floatX(np.nan))]
-
-
 class SplineWrapper(theano.Op):
     """
     Creates a theano operation from scipy.interpolate.UnivariateSpline
@@ -332,137 +249,6 @@ def grad(self, inputs, grads):
         return [x_grad * self.grad_op(x)]
 
 
-# Custom Eigh, EighGrad, and eigh are required until
-# https://github.com/Theano/Theano/pull/6557 is handled, since lambda's
-# cannot be used with pickling.
-class Eigh(tt.nlinalg.Eig):
-    """
-    Return the eigenvalues and eigenvectors of a Hermitian or symmetric matrix.
-
-    This is a copy of Eigh from theano that calls an EighGrad which uses
-    partial instead of lambda. Once this has been merged with theano this
-    should be removed.
-    """
-
-    _numop = staticmethod(np.linalg.eigh)
-    __props__ = ('UPLO',)
-
-    def __init__(self, UPLO='L'):
-        assert UPLO in ['L', 'U']
-        self.UPLO = UPLO
-
-    def make_node(self, x):
-        x = tt.as_tensor_variable(x)
-        assert x.ndim == 2
-        # Numpy's linalg.eigh may return either double or single
-        # presision eigenvalues depending on installed version of
-        # LAPACK. Rather than trying to reproduce the (rather
-        # involved) logic, we just probe linalg.eigh with a trivial
-        # input.
-        w_dtype = self._numop([[np.dtype(x.dtype).type()]])[0].dtype.name
-        w = theano.tensor.vector(dtype=w_dtype)
-        v = theano.tensor.matrix(dtype=x.dtype)
-        return theano.gof.Apply(self, [x], [w, v])
-
-    def perform(self, node, inputs, outputs):
-        (x,) = inputs
-        (w, v) = outputs
-        w[0], v[0] = self._numop(x, self.UPLO)
-
-    def grad(self, inputs, g_outputs):
-        r"""The gradient function should return
-        .. math:: \sum_n\left(W_n\frac{\partial\,w_n}
-                        {\partial a_{ij}} +
-                  \sum_k V_{nk}\frac{\partial\,v_{nk}}
-                        {\partial a_{ij}}\right),
-        where [:math:`W`, :math:`V`] corresponds to ``g_outputs``,
-        :math:`a` to ``inputs``, and :math:`(w, v)=\mbox{eig}(a)`.
-        Analytic formulae for eigensystem gradients are well-known in
-        perturbation theory:
-        .. math:: \frac{\partial\,w_n}
-                       {\partial a_{ij}} = v_{in}\,v_{jn}
-        .. math:: \frac{\partial\,v_{kn}}
-                       {\partial a_{ij}} =
-                  \sum_{m\ne n}\frac{v_{km}v_{jn}}{w_n-w_m}
-        """
-        x, = inputs
-        w, v = self(x)
-        # Replace gradients wrt disconnected variables with
-        # zeros. This is a work-around for issue #1063.
-        gw, gv = tt.nlinalg._zero_disconnected([w, v], g_outputs)
-        return [EighGrad(self.UPLO)(x, w, v, gw, gv)]
-
-
-class EighGrad(theano.Op):
-    """
-    Gradient of an eigensystem of a Hermitian matrix.
-
-    This is a copy of EighGrad from theano that uses partial instead of lambda.
-    Once this has been merged with theano this should be removed.
-    """
-
-    __props__ = ('UPLO',)
-
-    def __init__(self, UPLO='L'):
-        assert UPLO in ['L', 'U']
-        self.UPLO = UPLO
-        if UPLO == 'L':
-            self.tri0 = np.tril
-            self.tri1 = partial(np.triu, k=1)
-        else:
-            self.tri0 = np.triu
-            self.tri1 = partial(np.tril, k=-1)
-
-    def make_node(self, x, w, v, gw, gv):
-        x, w, v, gw, gv = map(tt.as_tensor_variable, (x, w, v, gw, gv))
-        assert x.ndim == 2
-        assert w.ndim == 1
-        assert v.ndim == 2
-        assert gw.ndim == 1
-        assert gv.ndim == 2
-        out_dtype = theano.scalar.upcast(x.dtype, w.dtype, v.dtype,
-                                         gw.dtype, gv.dtype)
-        out = theano.tensor.matrix(dtype=out_dtype)
-        return theano.gof.Apply(self, [x, w, v, gw, gv], [out])
-
-    def perform(self, node, inputs, outputs):
-        """
-        Implements the "reverse-mode" gradient for the eigensystem of
-        a square matrix.
-        """
-        x, w, v, W, V = inputs
-        N = x.shape[0]
-        outer = np.outer
-
-        def G(n):
-            return sum(v[:, m] * V.T[n].dot(v[:, m]) / (w[n] - w[m])
-                       for m in xrange(N) if m != n)
-
-        g = sum(outer(v[:, n], v[:, n] * W[n] + G(n))
-                for n in xrange(N))
-
-        # Numpy's eigh(a, 'L') (eigh(a, 'U')) is a function of tril(a)
-        # (triu(a)) only. This means that partial derivative of
-        # eigh(a, 'L') (eigh(a, 'U')) with respect to a[i,j] is zero
-        # for i < j (i > j). At the same time, non-zero components of
-        # the gradient must account for the fact that variation of the
-        # opposite triangle contributes to variation of two elements
-        # of Hermitian (symmetric) matrix. The following line
-        # implements the necessary logic.
-        out = self.tri0(g) + self.tri1(g).T
-
-        # Make sure we return the right dtype even if NumPy performed
-        # upcasting in self.tri0.
-        outputs[0][0] = np.asarray(out, dtype=node.outputs[0].dtype)
-
-    def infer_shape(self, node, shapes):
-        return [shapes[0]]
-
-
-def eigh(a, UPLO='L'):
-    """A copy, remove with Eigh and EighGrad when possible"""
-    return Eigh(UPLO)(a)
-
 
 class I0e(UnaryScalarOp):
     """
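A last editorial aside, on why the deleted `Eigh`/`EighGrad` copies existed at all: per the removed comment, theano's originals stored `lambda`s on the instance, and `pickle` refuses to serialize lambdas, while `functools.partial` objects built from module-level functions pickle fine. A minimal sketch of the difference:

```python
import pickle
from functools import partial

import numpy as np

# A partial over a module-level function serializes without trouble.
pickle.dumps(partial(np.triu, k=1))

# The equivalent lambda fails, because pickle serializes functions by
# name lookup and a lambda has no importable name.
try:
    pickle.dumps(lambda m: np.triu(m, k=1))
except (pickle.PicklingError, AttributeError):
    print('lambdas cannot be pickled')
```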