From 5852a5b90e1e63498223bd8c6a2bd1e3cbe58545 Mon Sep 17 00:00:00 2001
From: Luciano Paz
Date: Mon, 25 Feb 2019 12:41:13 +0100
Subject: [PATCH 1/9] Changed Categorical to work with multidim p at the logp level.

---
 RELEASE-NOTES.md                          |  3 +++
 pymc3/distributions/discrete.py           | 33 ++++++++++++++++++-----
 pymc3/tests/test_distribution_defaults.py |  9 ++++++-
 pymc3/tests/test_distributions.py         | 18 ++++++++++++-
 pymc3/tests/test_distributions_random.py  |  3 +++
 5 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index 73cb1dfefb..a6624c34e5 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -23,6 +23,9 @@
 - Fixed incorrect usage of `broadcast_distribution_samples` in `DiscreteWeibull`.
 - `Mixture`'s default dtype is now determined by `theano.config.floatX`.
 - `dist_math.random_choice` now handles nd-arrays of category probabilities, and also handles sizes that are not `None`. Also removed unused `k` kwarg from `dist_math.random_choice`.
+- Changed `Categorical.mode` to preserve all the dimensions of `p` except the last one, which encodes each category's probability.
+- Changed initialization of `Categorical.p`. `p` is now normalized to sum to `1` inside `logp` and `random`, but not during initialization. This could hide negative values supplied to `p` as mentioned in #2082.
+- To be able to test for negative `p` values supplied to `Categorical`, `Categorical.logp` was changed to check for `sum(self.p, axis=-1) == 1` only if `self.p` is not a `Number`, `np.ndarray`, `TensorConstant` or `SharedVariable`. These cases are automatically normalized to sum to `1`. The remaining case may originate from a `step_method` proposal, where the `self.p` tensor's value may be set, but must nevertheless sum to 1. This may break old code which initialized `p` with a theano expression and relied on the default normalization to get it to sum to 1. `Categorical.logp` now also checks that the used `p` has no values greater than 1.

 ### Deprecations

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index e74293b068..4047fab7a2 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -1,3 +1,4 @@
+import numbers
 import numpy as np
 import theano
 import theano.tensor as tt
@@ -710,11 +711,15 @@ def __init__(self, p, *args, **kwargs):
         except AttributeError:
             self.k = tt.shape(p)[-1]
         p = tt.as_tensor_variable(floatX(p))
-        self.p = (p.T / tt.sum(p, -1)).T
-        self.mode = tt.argmax(p)
+
+        # From #2082, it may be dangerous to automatically rescale p at this
+        # point without checking for positivity
+        self.p = p
+        self.mode = tt.argmax(p, axis=-1)

     def random(self, point=None, size=None):
         p, k = draw_values([self.p, self.k], point=point, size=size)
+        p = p / np.sum(p, axis=-1, keepdims=True)
         return generate_samples(random_choice,
                                 p=p,
@@ -723,21 +728,35 @@ def random(self, point=None, size=None):
                                 size=size)

     def logp(self, value):
-        p = self.p
+        p_ = self.p
         k = self.k

         # Clip values before using them for indexing
         value_clip = tt.clip(value, 0, k - 1)

-        sumto1 = theano.gradient.zero_grad(
-            tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5))
+        # We must only check that the values sum to 1 if p comes from a
+        # tensor variable, i.e. when p is a step_method proposal. In the other
+        # cases we normalize ourselves
+        if not isinstance(p_, (numbers.Number,
+                               np.ndarray,
+                               tt.TensorConstant,
+                               tt.sharedvar.SharedVariable)):
+            sumto1 = theano.gradient.zero_grad(
+                tt.le(abs(tt.sum(p_, axis=-1) - 1), 1e-5))
+            p = p_
+        else:
+            p = p_ / tt.sum(p_, axis=-1, keepdims=True)
+            sumto1 = True
+
         if p.ndim > 1:
-            a = tt.log(p[tt.arange(p.shape[0]), value_clip])
+            # Fancy trick to index the last axis without touching the others
+            a = tt.log((p.T)[value_clip].T)
         else:
             a = tt.log(p[value_clip])
-        return bound(a, value >= 0, value <= (k - 1), sumto1)
+
+        return bound(a, value >= 0, value <= (k - 1), sumto1,
+                     tt.all(p_ >= 0, axis=-1), tt.all(p <= 1, axis=-1))

     def _repr_latex_(self, name=None, dist=None):
         if dist is None:
diff --git a/pymc3/tests/test_distribution_defaults.py b/pymc3/tests/test_distribution_defaults.py
index 510d3998cd..9fb8b69c76 100644
--- a/pymc3/tests/test_distribution_defaults.py
+++ b/pymc3/tests/test_distribution_defaults.py
@@ -1,5 +1,5 @@
 from ..model import Model
-from ..distributions import DiscreteUniform, Continuous
+from ..distributions import DiscreteUniform, Continuous, Categorical
 import numpy as np
 import pytest
@@ -67,3 +67,10 @@ def test_discrete_uniform_negative():
     with model:
         x = DiscreteUniform('x', lower=-10, upper=0)
     assert model.test_point['x'] == -5
+
+
+def test_categorical_mode():
+    model = Model()
+    with model:
+        x = Categorical('x', p=np.eye(4), shape=4)
+    assert np.allclose(model.test_point['x'], np.arange(4))
diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
index e10a34c6d5..bed96b0448 100644
--- a/pymc3/tests/test_distributions.py
+++ b/pymc3/tests/test_distributions.py
@@ -321,7 +321,7 @@ def dirichlet_logpdf(value, a):

 def categorical_logpdf(value, p):
     if value >= 0 and value <= len(p):
-        return floatX(np.log(p[value]))
+        return floatX(np.log((p.T)[value]).T)
     else:
         return -inf

@@ -1079,6 +1079,22 @@ def test_categorical_bounds(self):
             assert np.isinf(x.logp({'x': -1}))
             assert np.isinf(x.logp({'x': 3}))

+    def test_categorical_valid_p(self):
+        with Model():
+            x = Categorical('x', p=np.array([-0.2, 0.3, 0.5]))
+            assert np.isinf(x.logp({'x': 0}))
+            assert np.isinf(x.logp({'x': 1}))
+        with Model():
+            # Hard edge case from #2082
+            # Early automatic normalization of p's sum would hide the negative
+            # entries if there is a single negative value or a pair of
+            # negative values and the rest are zero
+            x = Categorical('x', p=np.array([-1, -1, 0, 0]))
+            assert np.isinf(x.logp({'x': 0}))
+            assert np.isinf(x.logp({'x': 1}))
+            assert np.isinf(x.logp({'x': 2}))
+            assert np.isinf(x.logp({'x': 3}))
+
     @pytest.mark.parametrize('n', [2, 3, 4])
     def test_categorical(self, n):
         self.pymc3_matches_scipy(Categorical, Domain(range(n), 'int64'), {'p': Simplex(n)},
diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py
index 26876ee04c..6fa49ca843 100644
--- a/pymc3/tests/test_distributions_random.py
+++ b/pymc3/tests/test_distributions_random.py
@@ -444,6 +444,9 @@ def test_probability_vector_shape(self):
         p = np.ones((10, 5))
         assert pm.Categorical.dist(p=p).random().shape == (10,)
         assert pm.Categorical.dist(p=p).random(size=4).shape == (4, 10)
+        p = np.ones((3, 7, 5))
+        assert pm.Categorical.dist(p=p).random().shape == (3, 7)
+        assert pm.Categorical.dist(p=p).random(size=4).shape == (4, 3, 7)


 class TestScalarParameterSamples(SeededTest):

From ec7714138a3398473af91e94a4a7b7aceaf08de0 Mon Sep 17 00:00:00 2001
From: Luciano Paz
Date: Mon, 25 Feb 2019 15:49:18 +0100
Subject: [PATCH 2/9] Fixed problems with OrderedLogistic.

---
 pymc3/distributions/discrete.py   | 7 ++++---
 pymc3/tests/test_distributions.py | 6 ++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index 4047fab7a2..4ec67a01ae 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -716,6 +716,8 @@ def __init__(self, p, *args, **kwargs):
         # point without checking for positivity
         self.p = p
         self.mode = tt.argmax(p, axis=-1)
+        if self.mode.ndim == 1:
+            self.mode = tt.squeeze(self.mode)

     def random(self, point=None, size=None):
         p, k = draw_values([self.p, self.k], point=point, size=size)
@@ -748,7 +750,6 @@ def logp(self, value):
             p = p_ / tt.sum(p_, axis=-1, keepdims=True)
             sumto1 = True
-
         if p.ndim > 1:
             # Fancy trick to index the last axis without touching the others
             a = tt.log((p.T)[value_clip].T)
@@ -756,7 +757,7 @@ def logp(self, value):
             a = tt.log(p[value_clip])

         return bound(a, value >= 0, value <= (k - 1), sumto1,
-                     tt.all(p_ >= 0, axis=-1), tt.all(p <= 1, axis=-1))
+                     tt.all(p_ > 0, axis=-1), tt.all(p <= 1, axis=-1))

     def _repr_latex_(self, name=None, dist=None):
         if dist is None:
@@ -1196,7 +1197,7 @@ def __init__(self, eta, cutpoints, *args, **kwargs):
             tt.zeros_like(tt.shape_padright(pa[:, 0])),
             pa,
             tt.ones_like(tt.shape_padright(pa[:, 0]))
-        ], axis=1)
+        ], axis=-1)
         p = p_cum[:, 1:] - p_cum[:, :-1]

         super().__init__(p=p, *args, **kwargs)
diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
index bed96b0448..8890e23799 100644
--- a/pymc3/tests/test_distributions.py
+++ b/pymc3/tests/test_distributions.py
@@ -346,8 +346,10 @@ def invlogit(x, eps=sys.float_info.epsilon):

 def orderedlogistic_logpdf(value, eta, cutpoints):
     c = np.concatenate(([-np.inf], cutpoints, [np.inf]))
-    p = invlogit(eta - c[value]) - invlogit(eta - c[value + 1])
-    return np.log(p)
+    ps = np.array([invlogit(eta - cc) - invlogit(eta - cc1)
+                   for cc, cc1 in zip(c[:-1], c[1:])])
+    p = ps[value]
+    return np.where(np.all(ps > 0), np.log(p), -np.inf)


 class Simplex:
     def __init__(self, n):

From 0369ffac9beeb12e06b312e4c5eab95650e4c9f2 Mon Sep 17 00:00:00 2001
From: lucianopaz
Date: Mon, 25 Feb 2019 21:19:20 +0100
Subject: [PATCH 3/9] Use np.moveaxis instead of transposing. Also added some more tests.
---
 pymc3/distributions/discrete.py   | 3 +--
 pymc3/tests/test_distributions.py | 9 ++++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index 4ec67a01ae..952f987773 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -751,8 +751,7 @@ def logp(self, value):
             sumto1 = True
         if p.ndim > 1:
-            # Fancy trick to index the last axis without touching the others
-            a = tt.log((p.T)[value_clip].T)
+            a = tt.log(np.moveaxis(p, -1, 0)[value_clip])
         else:
             a = tt.log(p[value_clip])

diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
index 8890e23799..05eb14ea82 100644
--- a/pymc3/tests/test_distributions.py
+++ b/pymc3/tests/test_distributions.py
@@ -321,7 +321,7 @@ def dirichlet_logpdf(value, a):

 def categorical_logpdf(value, p):
     if value >= 0 and value <= len(p):
-        return floatX(np.log((p.T)[value]).T)
+        return floatX(np.log(np.moveaxis(p, -1, 0)[value]))
     else:
         return -inf

@@ -1086,6 +1086,13 @@ def test_categorical_valid_p(self):
             x = Categorical('x', p=np.array([-0.2, 0.3, 0.5]))
             assert np.isinf(x.logp({'x': 0}))
             assert np.isinf(x.logp({'x': 1}))
+            assert np.isinf(x.logp({'x': 2}))
+        with Model():
+            # A model where p sums to 1 but contains negative values
+            x = Categorical('x', p=np.array([-0.2, 0.7, 0.5]))
+            assert np.isinf(x.logp({'x': 0}))
+            assert np.isinf(x.logp({'x': 1}))
+            assert np.isinf(x.logp({'x': 2}))
         with Model():
             # Hard edge case from #2082
             # Early automatic normalization of p's sum would hide the negative

From 7e899c6cdb3a3c6d84910c85e195f826db958342 Mon Sep 17 00:00:00 2001
From: lucianopaz
Date: Tue, 26 Feb 2019 21:53:37 +0100
Subject: [PATCH 4/9] Changed np.moveaxis to dimshuffle. Removed isinstance conditions that were impossible.

---
 pymc3/distributions/discrete.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index 952f987773..e3f8bd33cb 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -739,9 +739,7 @@ def logp(self, value):
         # We must only check that the values sum to 1 if p comes from a
         # tensor variable, i.e. when p is a step_method proposal. In the other
         # cases we normalize ourselves
-        if not isinstance(p_, (numbers.Number,
-                               np.ndarray,
-                               tt.TensorConstant,
+        if not isinstance(p_, (tt.TensorConstant,
                                tt.sharedvar.SharedVariable)):
             sumto1 = theano.gradient.zero_grad(
                 tt.le(abs(tt.sum(p_, axis=-1) - 1), 1e-5))
             p = p_
@@ -751,7 +749,8 @@ def logp(self, value):
             sumto1 = True
         if p.ndim > 1:
-            a = tt.log(np.moveaxis(p, -1, 0)[value_clip])
+            pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1))
+            a = tt.log(p.dimshuffle(pattern)[value_clip])
         else:
             a = tt.log(p[value_clip])

From beb9cef99ee04a3d3cd15e9b31d48d8740512a61 Mon Sep 17 00:00:00 2001
From: lucianopaz
Date: Wed, 27 Feb 2019 07:43:42 +0100
Subject: [PATCH 5/9] Fixed lint error

---
 pymc3/distributions/discrete.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index e3f8bd33cb..1a1dddf9fa 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -1,4 +1,3 @@
-import numbers
 import numpy as np
 import theano
 import theano.tensor as tt

From 8e03412b3524d31716086ff2ffc7bf8dbdca875c Mon Sep 17 00:00:00 2001
From: Luciano Paz
Date: Wed, 27 Feb 2019 14:51:01 +0100
Subject: [PATCH 6/9] Force a normalized p in logp

---
 pymc3/distributions/discrete.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index 1a1dddf9fa..7d940bf295 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -735,17 +735,7 @@ def logp(self, value):
         # Clip values before using them for indexing
         value_clip = tt.clip(value, 0, k - 1)

-        # We must only check that the values sum to 1 if p comes from a
-        # tensor variable, i.e. when p is a step_method proposal. In the other
-        # cases we normalize ourselves
-        if not isinstance(p_, (tt.TensorConstant,
-                               tt.sharedvar.SharedVariable)):
-            sumto1 = theano.gradient.zero_grad(
-                tt.le(abs(tt.sum(p_, axis=-1) - 1), 1e-5))
-            p = p_
-        else:
-            p = p_ / tt.sum(p_, axis=-1, keepdims=True)
-            sumto1 = True
+        p = p_ / tt.sum(p_, axis=-1, keepdims=True)
         if p.ndim > 1:
             pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1))
@@ -753,7 +743,7 @@ def logp(self, value):
             a = tt.log(p[value_clip])

-        return bound(a, value >= 0, value <= (k - 1), sumto1,
+        return bound(a, value >= 0, value <= (k - 1),
                      tt.all(p_ > 0, axis=-1), tt.all(p <= 1, axis=-1))

     def _repr_latex_(self, name=None, dist=None):
         if dist is None:

From 2dfdd70626342d2c13282219639ef350712c3207 Mon Sep 17 00:00:00 2001
From: Luciano Paz
Date: Thu, 28 Feb 2019 09:37:58 +0100
Subject: [PATCH 7/9] Fixed lint error

---
 pymc3/distributions/discrete.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index 7d940bf295..1dbe399363 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -1,5 +1,4 @@
 import numpy as np
-import theano
 import theano.tensor as tt
 from scipy import stats
 import warnings
@@ -744,7 +743,7 @@ def logp(self, value):
             a = tt.log(p[value_clip])

         return bound(a, value >= 0, value <= (k - 1),
-                     tt.all(p_ > 0, axis=-1), tt.all(p <= 1, axis=-1))
+                     tt.all(p_ >= 0, axis=-1), tt.all(p <= 1, axis=-1))

     def _repr_latex_(self, name=None, dist=None):
         if dist is None:

From 1953fdeebfd09b711d8a6843b410d630408e49b1 Mon Sep 17 00:00:00 2001
From: Luciano Paz
Date: Thu, 28 Feb 2019 09:56:34 +0100
Subject: [PATCH 8/9] Edited release notes. Categorical.p can have zeros, and the normalization of p is always carried out

---
 RELEASE-NOTES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index d8bd2239ce..4806c7aa94 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -26,7 +26,7 @@
 - `dist_math.random_choice` now handles nd-arrays of category probabilities, and also handles sizes that are not `None`. Also removed unused `k` kwarg from `dist_math.random_choice`.
 - Changed `Categorical.mode` to preserve all the dimensions of `p` except the last one, which encodes each category's probability.
 - Changed initialization of `Categorical.p`. `p` is now normalized to sum to `1` inside `logp` and `random`, but not during initialization. This could hide negative values supplied to `p` as mentioned in #2082.
-- To be able to test for negative `p` values supplied to `Categorical`, `Categorical.logp` was changed to check for `sum(self.p, axis=-1) == 1` only if `self.p` is not a `Number`, `np.ndarray`, `TensorConstant` or `SharedVariable`. These cases are automatically normalized to sum to `1`. The remaining case may originate from a `step_method` proposal, where the `self.p` tensor's value may be set, but must nevertheless sum to 1. This may break old code which initialized `p` with a theano expression and relied on the default normalization to get it to sum to 1. `Categorical.logp` now also checks that the used `p` has no values greater than 1.
+- `Categorical` now accepts elements of `p` equal to `0`. `logp` will return `-inf` if there are `values` that index to the zero-probability categories.

 ### Deprecations

From 88c9e725fb4bd578595a1e9bb2ba52744f8deec4 Mon Sep 17 00:00:00 2001
From: Luciano Paz
Date: Thu, 28 Feb 2019 10:28:26 +0100
Subject: [PATCH 9/9] Fixed orderedlogistic test logpdf to check for ps>=0 and not just ps>0

---
 pymc3/tests/test_distributions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
index 05eb14ea82..dbb02d36d8 100644
--- a/pymc3/tests/test_distributions.py
+++ b/pymc3/tests/test_distributions.py
@@ -349,7 +349,7 @@ def orderedlogistic_logpdf(value, eta, cutpoints):
     ps = np.array([invlogit(eta - cc) - invlogit(eta - cc1)
                    for cc, cc1 in zip(c[:-1], c[1:])])
     p = ps[value]
-    return np.where(np.all(ps > 0), np.log(p), -np.inf)
+    return np.where(np.all(ps >= 0), np.log(p), -np.inf)


 class Simplex:
     def __init__(self, n):
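
================================================================
Illustrative sketches (not part of the patch series)
================================================================

The last-axis indexing that patches 3 and 4 converge on reads naturally in
plain NumPy. A minimal sketch, with toy shapes of our own choosing: for an
integer `value`, `np.moveaxis(p, -1, 0)[value]` (the form used in
`categorical_logpdf`) picks that category's probability for every batch
entry, however many leading batch dimensions `p` carries.

    import numpy as np

    # A (3, 7) batch of probability vectors over 5 categories.
    p = np.random.dirichlet(np.ones(5), size=(3, 7))  # shape (3, 7, 5)
    value = 2                                         # a single category index

    # Move the category (last) axis to the front, then index it...
    picked = np.moveaxis(p, -1, 0)[value]             # shape (3, 7)

    # ...which is the same as indexing the last axis directly.
    assert np.allclose(picked, p[..., value])

The `p.dimshuffle((p.ndim - 1,) + tuple(range(p.ndim - 1)))` call introduced
in patch 4 performs the same "category axis to the front" move as a native
Theano op.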
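The shape contract pinned down by the new `test_probability_vector_shape`
assertions: every dimension of `p` except the last is a batch dimension, and
`random` preserves all of them. A usage sketch, assuming a PyMC3 checkout at
the state of this series:

    import numpy as np
    import pymc3 as pm

    # Unnormalized probabilities are accepted; `random` and `logp`
    # normalize along the last axis internally.
    p = np.ones((3, 7, 5))            # a (3, 7) batch over 5 categories
    dist = pm.Categorical.dist(p=p)

    assert dist.random().shape == (3, 7)
    assert dist.random(size=4).shape == (4, 3, 7)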
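Finally, the #2082 edge case that motivated deferring normalization,
restated as a sketch (values taken from `test_categorical_valid_p`, same
PyMC3 assumption as above): normalizing p = [-1, -1, 0, 0] at initialization
would silently turn it into [0.5, 0.5, 0, 0], whereas the checks now in
`bound` make `logp` return `-inf` for every category.

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        x = pm.Categorical('x', p=np.array([-1, -1, 0, 0]))
        # Negative entries invalidate p, so no value has a finite
        # log-probability.
        for value in range(4):
            assert np.isinf(x.logp({'x': value}))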