boost minibatches #2171


Merged. 36 commits, merged on Jun 12, 2017.

Changes from all commits (36 commits):
1f2913b  draft (ferrine, May 10, 2017)
7c9963c  upgrade subsampling (ferrine, May 11, 2017)
4022841  add broadcastable kwarg (ferrine, May 11, 2017)
7edaba0  fix mutable in defaults (ferrine, May 11, 2017)
3d0ae88  fix mutable in defaults (ferrine, May 11, 2017)
9aa0b1d  draft (ferrine, May 12, 2017)
2be3609  refactor minibatch (ferrine, May 19, 2017)
d3b8ae8  add tests for minibatches (ferrine, May 19, 2017)
4e3b55d  fix broadcastable (ferrine, May 22, 2017)
d961497  fix type errors (ferrine, May 27, 2017)
a9cc5ad  fix old generator issues (ferrine, May 27, 2017)
60354a6  refactor tests related to minibatches (ferrine, May 27, 2017)
47382bc  use strict floatX (ferrine, May 27, 2017)
905f755  Much better api (ferrine, May 30, 2017)
805d723  delete DataSampler (ferrine, May 30, 2017)
f58af23  fix python2 tests (ferrine, May 31, 2017)
08a0e46  add docstring (ferrine, May 31, 2017)
9e31935  typos (ferrine, May 31, 2017)
fb77019  add total size hints (ferrine, May 31, 2017)
e366745  fix test (ferrine, Jun 1, 2017)
7b3c1b6  fix docstrings, thanks @fonnesbeck (ferrine, Jun 1, 2017)
b62f6ce  fix small typo (ferrine, Jun 1, 2017)
ca9b5c0  more decorations (ferrine, Jun 1, 2017)
4a6f40c  update lda-advi-aevb.ipynb (ferrine, Apr 30, 2017)
404ba6e  update GLM-hierarchical-advi-minibatch.ipynb (ferrine, May 1, 2017)
45e452d  update convolutional_vae_keras_advi.ipynb (ferrine, May 1, 2017)
94366d6  update gaussian-mixture-model-advi.ipynb (ferrine, May 2, 2017)
8c83c76  new minibatches in GLM-hierarchical-advi-minibatch.ipynb (Jun 1, 2017)
075fd2b  add temperature (ferrine, Jun 3, 2017)
1d3e173  Unused Import (ferrine, Jun 3, 2017)
89a47d5  update GLM-hierarchical-advi-minibatch.ipynb (Jun 4, 2017)
56c5263  update docs/source/notebooks/lda-advi-aevb.ipynb (Jun 4, 2017)
50c67df  update convolutional_vae_keras_advi.ipynb (ferrine, Jun 5, 2017)
6e8977a  update gaussian-mixture-model-advi.ipynb (ferrine, Jun 6, 2017)
400fd92  some fixes in docstring (ferrine, Jun 7, 2017)
bdf5602  fix notebooks' comments following @taku-y recommendations (ferrine, Jun 10, 2017)
Notebook changes (large diffs are not rendered by default):

167 changes: 60 additions & 107 deletions in docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb
250 changes: 104 additions & 146 deletions in docs/source/notebooks/convolutional_vae_keras_advi.ipynb
254 changes: 107 additions & 147 deletions in docs/source/notebooks/gaussian-mixture-model-advi.ipynb
449 changes: 131 additions & 318 deletions in docs/source/notebooks/lda-advi-aevb.ipynb
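These notebooks were ported from the old generator-based data feeding to the new `Minibatch` API. A minimal sketch of the resulting pattern, with synthetic data standing in for the notebooks' real datasets (the names, sizes, and priors here are illustrative assumptions, not the notebooks' code):

import numpy as np
import pymc3 as pm

# synthetic stand-in for a hierarchical dataset (e.g. radon measurements)
n_obs, n_groups = 919, 85
group = np.random.randint(0, n_groups, size=n_obs)
floor = np.random.randint(0, 2, size=n_obs).astype('float64')
y = np.random.randn(n_obs)

# one Minibatch per column; the shared default random_seed keeps the
# three row selections aligned across tensors
group_m = pm.Minibatch(group, batch_size=100, dtype='int32')
floor_m = pm.Minibatch(floor, batch_size=100)
y_m = pm.Minibatch(y, batch_size=100)

with pm.Model():
    a = pm.Normal('a', mu=0., sd=10., shape=n_groups)
    b = pm.Normal('b', mu=0., sd=10.)
    eps = pm.HalfCauchy('eps', beta=5.)
    est = a[group_m] + b * floor_m
    # total_size rescales the minibatch likelihood to the full dataset
    pm.Normal('obs', mu=est, sd=eps, observed=y_m, total_size=n_obs)
    approx = pm.fit(10000)  # ADVI by default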

311 changes: 262 additions & 49 deletions in pymc3/data.py

@@ -11,7 +11,7 @@
 __all__ = [
     'get_data',
     'GeneratorAdapter',
-    'DataSampler'
+    'Minibatch'
 ]
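`DataSampler` drops out of the public API in favor of `Minibatch`. A sketch of the migration for user code (the model below is illustrative, not taken from this PR):

import numpy as np
import pymc3 as pm

data = np.random.rand(1000, 10)

# before this PR: a picklable generator fed through pm.generator
#   minibatches = pm.DataSampler(data, batchsize=100)
# after this PR: a plain tensor that is already a random slice
x = pm.Minibatch(data, batch_size=100)

with pm.Model():
    mu = pm.Normal('mu', mu=0., sd=10., shape=10)
    # total_size tells PyMC3 how to rescale the minibatch log-likelihood
    pm.Normal('obs', mu=mu, sd=1., observed=x, total_size=data.shape[0])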


@@ -90,59 +90,272 @@ def __hash__(self):
         return hash(id(self))


Removed: the old `DataSampler` class.

class DataSampler(object):
    """
    Convenient picklable data sampler for minibatch inference.

    This generator can be passed to pm.generator to create a
    picklable theano computational graph.

    Parameters
    ----------
    data : array like
    batchsize : sample size over zero axis
    random_seed : int for numpy random generator
    dtype : str representing dtype

    Usage
    -----
    >>> import pickle
    >>> from functools import partial
    >>> np.random.seed(42)  # reproducibility
    >>> pm.set_tt_rng(42)
    >>> data = np.random.normal(size=(1000,)) + 10
    >>> minibatches = DataSampler(data, batchsize=50)
    >>> with pm.Model():
    ...     sd = pm.Uniform('sd', 0, 10)
    ...     mu = pm.Normal('mu', sd=10)
    ...     obs_norm = pm.Normal('obs_norm', mu=mu, sd=sd,
    ...                          observed=minibatches,
    ...                          total_size=data.shape[0])
    ...     adam = partial(pm.adam, learning_rate=.8)  # easy problem
    ...     approx = pm.fit(10000, method='advi', obj_optimizer=adam)
    >>> new = pickle.loads(pickle.dumps(approx))
    >>> new  # doctest: +ELLIPSIS
    <pymc3.variational.approximations.MeanField object at 0x...>
    >>> new.sample_vp(draws=1000)['mu'].mean()
    10.08339999101371
    >>> new.sample_vp(draws=1000)['sd'].mean()
    1.2178044136104513
    """
    def __init__(self, data, batchsize=50, random_seed=42, dtype='floatX'):
        self.dtype = theano.config.floatX if dtype == 'floatX' else dtype
        self.rng = np.random.RandomState(random_seed)
        self.data = data
        self.n = batchsize

    def __iter__(self):
        return self

    def __next__(self):
        idx = (self.rng
               .uniform(size=self.n,
                        low=0.0,
                        high=self.data.shape[0] - 1e-16)
               .astype('int64'))
        return np.asarray(self.data[idx], self.dtype)

    next = __next__


Added: the new `Minibatch` class.

class Minibatch(tt.TensorVariable):
    """Multidimensional minibatch that is a pure TensorVariable

    Parameters
    ----------
    data : :class:`ndarray`
        initial data
    batch_size : `int` or `List[int|tuple(size, random_seed)]`
        batch size for inference; a random seed is needed
        for child random generators
    dtype : `str`
        cast data to a specific type
    broadcastable : tuple[bool]
        change the broadcastable pattern, which defaults to `(False, ) * ndim`
    name : `str`
        name for the tensor, defaults to "Minibatch"
    random_seed : `int`
        random seed that is used by default
    update_shared_f : `callable`
        returns an :class:`ndarray` that will be carefully
        stored in the underlying shared variable;
        you can use it to change the source of
        minibatches programmatically
    in_memory_size : `int` or `List[int|slice|Ellipsis]`
        data size for storing in theano.shared

    Attributes
    ----------
    shared : shared tensor
        Used for storing data
    minibatch : minibatch tensor
        Used for training

    Examples
    --------
    Suppose we have data
    >>> data = np.random.rand(100, 100)

    If we want a 1d slice of size 10, we do
    >>> x = Minibatch(data, batch_size=10)

    Note that your data is cast to `floatX` if it is not of integer type,
    but you can still pass a `dtype` kwarg to :class:`Minibatch`.

    In case we want 10 sampled rows and columns,
    `[(size, seed), (size, seed)]`, it is
    >>> x = Minibatch(data, batch_size=[(10, 42), (10, 42)], dtype='int32')
    >>> assert str(x.dtype) == 'int32'

    or, simpler, with the default random seed of 42,
    `[size, size]`
    >>> x = Minibatch(data, batch_size=[10, 10])

    `x` is a regular :class:`TensorVariable` that supports any math
    >>> assert x.eval().shape == (10, 10)

    You can pass it to your desired model
    >>> with pm.Model() as model:
    ...     mu = pm.Flat('mu')
    ...     sd = pm.HalfNormal('sd')
    ...     lik = pm.Normal('lik', mu, sd, observed=x, total_size=(100, 100))

    Then you can perform regular Variational Inference out of the box
    >>> with model:
    ...     approx = pm.fit()

    Notably, :class:`Minibatch` has `shared` and `minibatch` attributes
    that you can use later
    >>> x.set_value(np.random.laplace(size=(100, 100)))

    and minibatches will then come from the new storage;
    this directly affects `x.shared`.
    The same thing, but less conveniently:
    >>> x.shared.set_value(pm.floatX(np.random.laplace(size=(100, 100))))

    The programmatic way to change storage is as follows
    (I import `partial` for simplicity)
    >>> from functools import partial
    >>> datagen = partial(np.random.laplace, size=(100, 100))
    >>> x = Minibatch(datagen(), batch_size=10, update_shared_f=datagen)
    >>> x.update_shared()

    To be more concrete about how we get a minibatch, here is a demo:
    1) create a shared variable
    >>> shared = theano.shared(data)

    2) create a random slice of size 10
    >>> ridx = pm.tt_rng().uniform(size=(10,), low=0, high=data.shape[0]-1e-10).astype('int64')

    3) take that slice
    >>> minibatch = shared[ridx]

    That's it. Next you can use this minibatch somewhere else.
    You can see that the implementation does not require a fixed shape
    for the shared variable. Feel free to use that if needed.

    Suppose you need some replacements in the graph, e.g. to change the minibatch to test data
    >>> node = x ** 2  # arbitrary expressions on minibatch `x`
    >>> testdata = pm.floatX(np.random.laplace(size=(1000, 10)))

    Then you should create a dict with replacements
    >>> replacements = {x: testdata}
    >>> rnode = theano.clone(node, replacements)
    >>> assert (testdata ** 2 == rnode.eval()).all()

    To replace the minibatch with its shared variable you do the same
    thing. The minibatch variable is accessible as an attribute,
    as is the shared variable associated with the minibatch
    >>> replacements = {x.minibatch: x.shared}
    >>> rnode = theano.clone(node, replacements)

    For more complex slices a bit more code is needed, which may seem less clear at first
    >>> moredata = np.random.rand(10, 20, 30, 40, 50)

    The default `total_size` that can be passed to a `PyMC3` random node
    is then `(10, 20, 30, 40, 50)`, but it can be less verbose in some cases

    1) Advanced indexing, `total_size = (10, Ellipsis, 50)`
    >>> x = Minibatch(moredata, [2, Ellipsis, 10])

    We take a slice only over the first and last dimensions
    >>> assert x.eval().shape == (2, 20, 30, 40, 10)

    2) Skipping a particular dimension, `total_size = (10, None, 30)`
    >>> x = Minibatch(moredata, [2, None, 20])
    >>> assert x.eval().shape == (2, 20, 20, 40, 50)

    3) Mixing all of the above, `total_size = (10, None, 30, Ellipsis, 50)`
    >>> x = Minibatch(moredata, [2, None, 20, Ellipsis, 10])
    >>> assert x.eval().shape == (2, 20, 20, 40, 10)
    """
    @theano.configparser.change_flags(compute_test_value='raise')
    def __init__(self, data, batch_size=128, dtype=None, broadcastable=None, name='Minibatch',
                 random_seed=42, update_shared_f=None, in_memory_size=None):
        if dtype is None:
            data = pm.smartfloatX(np.asarray(data))
        else:
            data = np.asarray(data, dtype)
        in_memory_slc = self.make_static_slices(in_memory_size)
        self.shared = theano.shared(data[in_memory_slc])
        self.update_shared_f = update_shared_f
        self.random_slc = self.make_random_slices(self.shared.shape, batch_size, random_seed)
        minibatch = self.shared[self.random_slc]
        if broadcastable is None:
            broadcastable = (False, ) * minibatch.ndim
        minibatch = tt.patternbroadcast(minibatch, broadcastable)
        self.minibatch = minibatch
        super(Minibatch, self).__init__(
            self.minibatch.type, None, None, name=name)
        theano.Apply(
            theano.compile.view_op,
            inputs=[self.minibatch], outputs=[self])
        self.tag.test_value = copy(self.minibatch.tag.test_value)

    @staticmethod
    def rslice(total, size, seed):
        if size is None:
            return slice(None)
        elif isinstance(size, int):
            return (pm.tt_rng(seed)
                    .uniform(size=(size, ), low=0.0, high=pm.floatX(total) - 1e-16)
                    .astype('int64'))
        else:
            raise TypeError('Unrecognized size type, %r' % size)

    @staticmethod
    def make_static_slices(user_size):
        if user_size is None:
            return [Ellipsis]
        elif isinstance(user_size, int):
            return slice(None, user_size)
        elif isinstance(user_size, (list, tuple)):
            slc = list()
            for i in user_size:
                if isinstance(i, int):
                    slc.append(i)
                elif i is None:
                    slc.append(slice(None))
                elif i is Ellipsis:
                    slc.append(Ellipsis)
                elif isinstance(i, slice):
                    slc.append(i)
                else:
                    raise TypeError('Unrecognized size type, %r' % user_size)
            return slc
        else:
            raise TypeError('Unrecognized size type, %r' % user_size)

    @classmethod
    def make_random_slices(cls, in_memory_shape, batch_size, default_random_seed):
        if batch_size is None:
            return [Ellipsis]
        elif isinstance(batch_size, int):
            slc = [cls.rslice(in_memory_shape[0], batch_size, default_random_seed)]
        elif isinstance(batch_size, (list, tuple)):
            def check(t):
                if t is Ellipsis or t is None:
                    return True
                elif isinstance(t, (tuple, list)):
                    if not len(t) == 2:
                        return False
                    else:
                        return isinstance(t[0], int) and isinstance(t[1], int)
                elif isinstance(t, int):
                    return True
                else:
                    return False
            # end check definition
            if not all(check(t) for t in batch_size):
                raise TypeError('Unrecognized `batch_size` type, expected '
                                'int or List[int|tuple(size, random_seed)] where '
                                'size and random seed are both ints, got %r' %
                                batch_size)
            batch_size = [
                (i, default_random_seed) if isinstance(i, int) else i
                for i in batch_size
            ]
            shape = in_memory_shape
            if Ellipsis in batch_size:
                sep = batch_size.index(Ellipsis)
                begin = batch_size[:sep]
                end = batch_size[sep + 1:]
                if Ellipsis in end:
                    raise ValueError('Double Ellipsis in `batch_size` is restricted, got %r' %
                                     batch_size)
                if len(end) > 0:
                    shp_mid = shape[sep:-len(end)]
                    mid = [tt.arange(s) for s in shp_mid]
                else:
                    mid = []
            else:
                begin = batch_size
                end = []
                mid = []
            if (len(begin) + len(end)) > len(in_memory_shape.eval()):
                raise ValueError('Length of `batch_size` is too big, '
                                 'number of ints is bigger than ndim, got %r'
                                 % batch_size)
            if len(end) > 0:
                shp_end = shape[-len(end):]
            else:
                shp_end = np.asarray([])
            shp_begin = shape[:len(begin)]
            slc_begin = [cls.rslice(shp_begin[i], t[0], t[1])
                         if t is not None else tt.arange(shp_begin[i])
                         for i, t in enumerate(begin)]
            slc_end = [cls.rslice(shp_end[i], t[0], t[1])
                       if t is not None else tt.arange(shp_end[i])
                       for i, t in enumerate(end)]
            slc = slc_begin + mid + slc_end
        else:
            raise TypeError('Unrecognized size type, %r' % batch_size)
        return pm.theanof.ix_(*slc)

    def update_shared(self):
        if self.update_shared_f is None:
            raise NotImplementedError("No `update_shared_f` was provided to `__init__`")
        self.set_value(np.asarray(self.update_shared_f(), self.dtype))

    def set_value(self, value):
        self.shared.set_value(np.asarray(value, self.dtype))

    def clone(self):
        ret = self.type()
        ret.name = self.name
        ret.tag = copy(self.tag)
        return ret
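`make_random_slices` combines one index vector per sampled axis through `pm.theanof.ix_`, a theano port of numpy's open-mesh `np.ix_`. A numpy-only sketch of the same indexing scheme (illustrative, independent of the class above):

import numpy as np

data = np.random.rand(10, 20, 30)
rng = np.random.RandomState(42)

# batch_size=[2, None, 5]: sample axes 0 and 2, keep axis 1 whole
idx0 = rng.randint(0, data.shape[0], size=2)   # like rslice(total=10, size=2)
mid = np.arange(data.shape[1])                 # like tt.arange(shape[1]) for None
idx2 = rng.randint(0, data.shape[2], size=5)   # like rslice(total=30, size=5)

# np.ix_ turns the three 1-d index vectors into an open mesh so each
# axis is sliced independently, just as pm.theanof.ix_ does in the graph
minibatch = data[np.ix_(idx0, mid, idx2)]
assert minibatch.shape == (2, 20, 5)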
12 changes: 7 additions & 5 deletions in pymc3/distributions/distribution.py
@@ -48,12 +48,12 @@ def dist(cls, *args, **kwargs):
         dist.__init__(*args, **kwargs)
         return dist

-    def __init__(self, shape, dtype, testval=None, defaults=(), transform=None):
+    def __init__(self, shape, dtype, testval=None, defaults=(), transform=None, broadcastable=None):
         self.shape = np.atleast_1d(shape)
         if False in (np.floor(self.shape) == self.shape):
             raise TypeError("Expected int elements in shape")
         self.dtype = dtype
-        self.type = TensorType(self.dtype, self.shape)
+        self.type = TensorType(self.dtype, self.shape, broadcastable)
         self.testval = testval
         self.defaults = defaults
         self.transform = transform
@@ -89,8 +89,10 @@ def _repr_latex_(self, name=None, dist=None):
         return None


-def TensorType(dtype, shape):
-    return tt.TensorType(str(dtype), np.atleast_1d(shape) == 1)
+def TensorType(dtype, shape, broadcastable=None):
+    if broadcastable is None:
+        broadcastable = np.atleast_1d(shape) == 1
+    return tt.TensorType(str(dtype), broadcastable)
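A self-contained sketch of what this hunk changes; the function body mirrors the diff above, and the asserts are illustrative:

import numpy as np
import theano.tensor as tt

def TensorType(dtype, shape, broadcastable=None):
    if broadcastable is None:
        # old behavior: length-1 axes are inferred to be broadcastable
        broadcastable = np.atleast_1d(shape) == 1
    return tt.TensorType(str(dtype), broadcastable)

# inferred pattern: the length-1 axis broadcasts
assert TensorType('float64', (1, 5)).broadcastable == (True, False)
# new: callers such as Minibatch can override the inference
assert TensorType('float64', (1, 5), (False, False)).broadcastable == (False, False)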


class NoDistribution(Distribution):
@@ -114,7 +116,7 @@ def logp(self, x):
 class Discrete(Distribution):
     """Base class for discrete distributions"""

-    def __init__(self, shape=(), dtype=None, defaults=('mode', ),
+    def __init__(self, shape=(), dtype=None, defaults=('mode',),
                  *args, **kwargs):
         if dtype is None:
             if theano.config.floatX == 'float32':