diff --git a/pymc3/model.py b/pymc3/model.py
index 5f079b5686..1220c5529d 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -1,3 +1,4 @@
+import six
 import numpy as np
 import theano
 import theano.tensor as tt
@@ -168,17 +169,215 @@ def logpt(self):
         return tt.sum(self.logp_elemwiset)
 
 
-class Model(Context, Factor):
-    """Encapsulates the variables and likelihood factors of a model."""
+class InitContextMeta(type):
+    """Metaclass that executes `__init__` of an instance in its context"""
+    def __call__(cls, *args, **kwargs):
+        instance = cls.__new__(cls, *args, **kwargs)
+        with instance:  # appends context
+            instance.__init__(*args, **kwargs)
+        return instance
+
+
+def withparent(meth):
+    """Helper wrapper that forwards calls to the parent instance"""
+    def wrapped(self, *args, **kwargs):
+        res = meth(self, *args, **kwargs)
+        if getattr(self, 'parent', None) is not None:
+            getattr(self.parent, meth.__name__)(*args, **kwargs)
+        return res
+    # functools.wraps fails when decorating built-in
+    # methods, so copy the name over manually
+    wrapped.__name__ = meth.__name__
+    return wrapped
+
+
+class treelist(list):
+    """A list that mirrors the mutating operations used in Model
+    on its parent list instance.
+    Extending a treelist also extends its parent.
+    """
+    def __init__(self, iterable=(), parent=None):
+        super(treelist, self).__init__(iterable)
+        assert isinstance(parent, list) or parent is None
+        self.parent = parent
+        if self.parent is not None:
+            self.parent.extend(self)
+    # type checking inside the wrapper is unreliable,
+    # so these methods forward calls unconditionally
+    append = withparent(list.append)
+    __iadd__ = withparent(list.__iadd__)
+    extend = withparent(list.extend)
+
+    def tree_contains(self, item):
+        if isinstance(self.parent, treelist):
+            return (list.__contains__(self, item) or
+                    self.parent.tree_contains(item))
+        elif isinstance(self.parent, list):
+            return (list.__contains__(self, item) or
+                    self.parent.__contains__(item))
+        else:
+            return list.__contains__(self, item)
+
+    def __setitem__(self, key, value):
+        raise NotImplementedError('Method is removed as we are not'
+                                  ' able to determine '
+                                  'appropriate logic for it')
+
+    def __imul__(self, other):
+        t0 = len(self)
+        list.__imul__(self, other)
+        if self.parent is not None:
+            self.parent.extend(self[t0:])
+        return self  # in-place operators must return self
+
+
+class treedict(dict):
+    """A dict that mirrors the mutating operations used in Model
+    on its parent dict instance.
+    Updating a treedict also updates its parent.
+    """
+    def __init__(self, iterable=(), parent=None, **kwargs):
+        super(treedict, self).__init__(iterable, **kwargs)
+        assert isinstance(parent, dict) or parent is None
+        self.parent = parent
+        if self.parent is not None:
+            self.parent.update(self)
+    # type checking inside the wrapper is unreliable,
+    # so these methods forward calls unconditionally
+    __setitem__ = withparent(dict.__setitem__)
+    update = withparent(dict.update)
+
+    def tree_contains(self, item):
+        # needed for the `add_random_variable` method
+        if isinstance(self.parent, treedict):
+            return (dict.__contains__(self, item) or
+                    self.parent.tree_contains(item))
+        elif isinstance(self.parent, dict):
+            return (dict.__contains__(self, item) or
+                    self.parent.__contains__(item))
+        else:
+            return dict.__contains__(self, item)
+
+
+class Model(six.with_metaclass(InitContextMeta, Context, Factor)):
+    """Encapsulates the variables and likelihood factors of a model.
 
-    def __init__(self):
-        self.named_vars = {}
-        self.free_RVs = []
-        self.observed_RVs = []
-        self.deterministics = []
-        self.potentials = []
-        self.missing_values = []
-        self.model = self
+    The Model class can be used for creating class-based models. To create
+    a class-based model you should inherit from `Model` and
+    override `__init__` with arbitrary definitions
+    (do not forget to call the base class `__init__` first).
+
+    Parameters
+    ----------
+    name : str, default '' - name that will be used as a prefix for
+        the names of all random variables defined within the model
+    model : Model, default None - instance of Model that is
+        supposed to be a parent for the new instance. If None,
+        the context will be used. All variables defined within the
+        instance will be passed to the parent instance, so that a
+        'nested' model contributes to the variables and likelihood
+        factors of its parent model.
+
+    Examples
+    --------
+    # How to define a custom model
+    class CustomModel(Model):
+        # 1) override init
+        def __init__(self, mean=0, sd=1, name='', model=None):
+            # 2) call super's init first, passing model and name to it
+            # name will be the prefix for all variables here;
+            # if no name is specified for the model there will be no prefix
+            super(CustomModel, self).__init__(name, model)
+            # now you are in the context of the instance,
+            # `modelcontext` will return self
+            # you can define variables in several ways;
+            # note that all variables will get the model's name prefix
+
+            # 3) you can create variables with the Var method
+            self.Var('v1', Normal.dist(mu=mean, sd=sd))
+            # this will create a variable named like '{prefix_}v1'
+            # and assign attribute 'v1' to the instance;
+            # the created variable can be accessed with self.v1 or self['v1']
+
+            # 4) this syntax will also work as we are in the context
+            # of the instance itself; names are given as usual
+            Normal('v2', mu=mean, sd=sd)
+
+            # something more complex is allowed, too
+            Normal('v3', mu=mean, sd=HalfCauchy('sd', beta=10, testval=1.))
+
+            # Deterministic variables can be used in the usual way
+            Deterministic('v3_sq', self.v3 ** 2)
+            # Potentials, too
+            Potential('p1', tt.constant(1))
+
+    # After defining a class CustomModel you can use it in several ways
+
+    # I:
+    #   state the model within a context
+    with Model() as model:
+        CustomModel()
+        # arbitrary actions
+
+    # II:
+    #   use the new class as an entry point into a context
+    with CustomModel() as model:
+        Normal('new_normal_var', mu=1, sd=1)
+
+    # III:
+    #   just get a model instance with all that was defined in it
+    model = CustomModel()
+
+    # IV:
+    #   use many custom models within one context
+    with Model() as model:
+        CustomModel(mean=1, name='first')
+        CustomModel(mean=2, name='second')
+    """
+    def __new__(cls, *args, **kwargs):
+        # resolve the parent instance
+        instance = object.__new__(cls)
+        if kwargs.get('model') is not None:
+            instance._parent = kwargs.get('model')
+        elif cls.get_contexts():
+            instance._parent = cls.get_contexts()[-1]
+        else:
+            instance._parent = None
+        return instance
+
+    def __init__(self, name='', model=None):
+        self.name = name
+        if self.parent is not None:
+            self.named_vars = treedict(parent=self.parent.named_vars)
+            self.free_RVs = treelist(parent=self.parent.free_RVs)
+            self.observed_RVs = treelist(parent=self.parent.observed_RVs)
+            self.deterministics = treelist(parent=self.parent.deterministics)
+            self.potentials = treelist(parent=self.parent.potentials)
+            self.missing_values = treelist(parent=self.parent.missing_values)
+        else:
+            self.named_vars = treedict()
+            self.free_RVs = treelist()
+            self.observed_RVs = treelist()
+            self.deterministics = treelist()
+            self.potentials = treelist()
+            self.missing_values = treelist()
+
+    @property
+    def model(self):
+        return self
+
+    @property
+    def parent(self):
+        return self._parent
+
+    @property
+    def root(self):
+        model = self
+        while not model.isroot:
+            model = model.parent
+        return model
+
+    @property
+    def isroot(self):
+        return self.parent is None
 
     @property
     @memoize
@@ -271,6 +470,7 @@ def Var(self, name, dist, data=None):
         -------
         FreeRV or ObservedRV
         """
+        name = self.name_for(name)
         if data is None:
             if getattr(dist, "transform", None) is None:
                 var = FreeRV(name=name, distribution=dist, model=self)
@@ -308,15 +508,46 @@ def Var(self, name, dist, data=None):
 
     def add_random_variable(self, var):
         """Add a random variable to the named variables of the model."""
-        if var.name in self.named_vars:
+        if self.named_vars.tree_contains(var.name):
             raise ValueError(
                 "Variable name {} already exists.".format(var.name))
         self.named_vars[var.name] = var
-        if not hasattr(self, var.name):
-            setattr(self, var.name, var)
+        if not hasattr(self, self.name_of(var.name)):
+            setattr(self, self.name_of(var.name), var)
+
+    @property
+    def prefix(self):
+        return '%s_' % self.name if self.name else ''
+
+    def name_for(self, name):
+        """Checks if name has the model prefix and adds it if needed"""
+        if self.prefix:
+            if not name.startswith(self.prefix):
+                return '{}{}'.format(self.prefix, name)
+            else:
+                return name
+        else:
+            return name
+
+    def name_of(self, name):
+        """Checks if name has the model prefix and strips it if needed"""
+        if not self.prefix or not name:
+            return name
+        elif name.startswith(self.prefix):
+            return name[len(self.prefix):]
+        else:
+            return name
 
     def __getitem__(self, key):
-        return self.named_vars[key]
+        try:
+            return self.named_vars[key]
+        except KeyError as e:
+            try:
+                return self.named_vars[self.name_for(key)]
+            except KeyError:
+                raise e
 
     @memoize
     def makefn(self, outs, mode=None, *args, **kwargs):
@@ -633,9 +864,10 @@ def Deterministic(name, var, model=None):
     -------
     n : var but with name name
     """
-    var.name = name
-    modelcontext(model).deterministics.append(var)
-    modelcontext(model).add_random_variable(var)
+    model = modelcontext(model)
+    var.name = model.name_for(name)
+    model.deterministics.append(var)
+    model.add_random_variable(var)
     return var
 
 
@@ -651,8 +883,9 @@ def Potential(name, var, model=None):
     -------
     var : var, with name attribute
     """
-    var.name = name
-    modelcontext(model).potentials.append(var)
+    model = modelcontext(model)
+    var.name = model.name_for(name)
+    model.potentials.append(var)
     return var
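The prefixing and parent propagation introduced above can be exercised end to end. A minimal sketch against the new API (variable names and values here are illustrative only):

import pymc3 as pm

with pm.Model() as root:
    pm.Normal('x', mu=0, sd=1)            # registered as 'x' in root
    with pm.Model(name='sub') as sub:     # parent resolved from the context stack
        pm.Normal('y', mu=0, sd=1)        # registered as 'sub_y'

# treedict/treelist propagate registrations upwards:
assert 'sub_y' in root.named_vars         # child variable visible in the parent
assert sub['y'] is root['sub_y']          # __getitem__ falls back to name_for
assert sub.root is root                   # root walks the parent chain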
diff --git a/pymc3/models/__init__.py b/pymc3/models/__init__.py
new file mode 100644
index 0000000000..17e6772687
--- /dev/null
+++ b/pymc3/models/__init__.py
@@ -0,0 +1,6 @@
+from .linear import LinearComponent, Glm
+
+__all__ = [
+    'LinearComponent',
+    'Glm'
+]
diff --git a/pymc3/models/linear.py b/pymc3/models/linear.py
new file mode 100644
index 0000000000..b6a224f2b4
--- /dev/null
+++ b/pymc3/models/linear.py
@@ -0,0 +1,134 @@
+import theano.tensor as tt
+import pandas as pd
+import numpy as np
+from ..distributions import Normal, Flat
+from ..glm import families
+from ..model import Model, Deterministic
+from .utils import any_to_tensor_and_labels
+
+__all__ = [
+    'LinearComponent',
+    'Glm'
+]
+
+
+class LinearComponent(Model):
+    """Creates a linear component; y_est is accessible as an attribute
+
+    Parameters
+    ----------
+    name : str - name associated with the linear component
+    x : pd.DataFrame or np.ndarray
+    y : pd.Series or np.ndarray
+    intercept : bool - fit with intercept or not
+    labels : list - replace variable names with these labels
+    priors : dict - priors for coefficients;
+        use the `Intercept` key to define the intercept prior
+        (defaults to Flat.dist());
+        use the `Regressor` key to define the default prior for all
+        regressors (defaults to Normal.dist(mu=0, tau=1.0E-6))
+    vars : dict - random variables to reuse instead of creating new ones
+    """
+    default_regressor_prior = Normal.dist(mu=0, tau=1.0E-6)
+    default_intercept_prior = Flat.dist()
+
+    def __init__(self, x, y, intercept=True, labels=None,
+                 priors=None, vars=None, name='', model=None):
+        super(LinearComponent, self).__init__(name, model)
+        if priors is None:
+            priors = {}
+        if vars is None:
+            vars = {}
+        x, labels = any_to_tensor_and_labels(x, labels)
+        # now we have x, its shape and labels
+        if intercept:
+            x = tt.concatenate(
+                [tt.ones((x.shape[0], 1), x.dtype), x],
+                axis=1
+            )
+            labels = ['Intercept'] + labels
+        coeffs = list()
+        for name in labels:
+            if name == 'Intercept':
+                if name in vars:
+                    v = Deterministic(name, vars[name])
+                else:
+                    v = self.Var(
+                        name=name,
+                        dist=priors.get(
+                            name,
+                            self.default_intercept_prior
+                        )
+                    )
+                coeffs.append(v)
+            else:
+                if name in vars:
+                    v = Deterministic(name, vars[name])
+                else:
+                    v = self.Var(
+                        name=name,
+                        dist=priors.get(
+                            name,
+                            priors.get(
+                                'Regressor',
+                                self.default_regressor_prior
+                            )
+                        )
+                    )
+                coeffs.append(v)
+        self.coeffs = tt.stack(coeffs, axis=0)
+        self.y_est = x.dot(self.coeffs)
+
+    @classmethod
+    def from_formula(cls, formula, data, priors=None, vars=None,
+                     name='', model=None):
+        import patsy
+        y, x = patsy.dmatrices(formula, data)
+        labels = x.design_info.column_names
+        return cls(np.asarray(x), np.asarray(y)[:, 0], intercept=False,
+                   labels=labels, priors=priors, vars=vars,
+                   name=name, model=model)
+
+
+class Glm(LinearComponent):
+    """Creates a GLM; y_est is accessible as an attribute
+
+    Parameters
+    ----------
+    name : str - name associated with the linear component
+    x : pd.DataFrame or np.ndarray
+    y : pd.Series or np.ndarray
+    intercept : bool - fit with intercept or not
+    labels : list - replace variable names with these labels
+    priors : dict - priors for coefficients;
+        use the `Intercept` key to define the intercept prior
+        (defaults to Flat.dist());
+        use the `Regressor` key to define the default prior for all
+        regressors (defaults to Normal.dist(mu=0, tau=1.0E-6))
+    vars : dict - random variables to reuse instead of creating new ones
+    family : str or pymc3.glm.families object
+    """
+    def __init__(self, x, y, intercept=True, labels=None,
+                 priors=None, vars=None, family='normal', name='', model=None):
+        super(Glm, self).__init__(
+            x, y, intercept=intercept, labels=labels,
+            priors=priors, vars=vars, name=name, model=model
+        )
+
+        _families = dict(
+            normal=families.Normal,
+            student=families.StudentT,
+            binomial=families.Binomial,
+            poisson=families.Poisson
+        )
+        if isinstance(family, str):
+            family = _families[family]()
+        self.y_est = family.create_likelihood(
+            name='', y_est=self.y_est,
+            y_data=y, model=self)
+
+    @classmethod
+    def from_formula(cls, formula, data, priors=None,
+                     vars=None, family='normal', name='', model=None):
+        import patsy
+        y, x = patsy.dmatrices(formula, data)
+        labels = x.design_info.column_names
+        return cls(np.asarray(x), np.asarray(y)[:, 0], intercept=False,
+                   labels=labels, priors=priors, vars=vars, family=family,
+                   name=name, model=model)
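A minimal usage sketch, mirroring the tests further down (the synthetic data is illustrative; the prefixed names in the comment come from the naming rules in model.py and the `vars_to_create` sets in the tests):

import numpy as np
import pymc3 as pm
from pymc3.models import Glm

x = np.linspace(-1, 1, 1000)
y = 1 + 3 * x + np.random.normal(scale=0.05, size=1000)

with pm.Model() as model:
    # creates glm_Intercept, glm_x0 plus the family's sd and
    # likelihood variables ('glm_sd_log_', 'glm_y')
    Glm(x, y, name='glm')
    trace = pm.sample(500)

# with patsy installed, the formula interface builds the same model
# from a dict or DataFrame
with pm.Model() as model2:
    Glm.from_formula('y ~ x', dict(x=x, y=y), name='glm')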
diff --git a/pymc3/models/utils.py b/pymc3/models/utils.py
new file mode 100644
index 0000000000..ea073fcd1b
--- /dev/null
+++ b/pymc3/models/utils.py
@@ -0,0 +1,126 @@
+import six
+import pandas as pd
+from pandas.core.common import PandasError
+import numpy as np
+import theano.tensor as tt
+
+
+def any_to_tensor_and_labels(x, labels=None):
+    """Utility for converting input x to a tensor, trying to
+    create labels for its columns when they are not provided.
+
+    Default names for columns are ['x0', 'x1', ...]; for mappable
+    arrays (e.g. pd.DataFrame) their names are treated as labels.
+    You can override them with the `labels` argument.
+
+    If the input is a tensor you must provide labels, as we
+    cannot infer its shape directly.
+
+    If the input is a dict we cannot rely on label order, so the
+    dict keys are used as labels in any case.
+
+    Parameters
+    ----------
+    x : np.ndarray | pd.DataFrame | tt.Variable | dict | list
+    labels : list - names for the columns of the output tensor
+
+    Returns
+    -------
+    (x, labels) - tensor and labels for its columns
+    """
+    if isinstance(labels, six.string_types):
+        labels = [labels]
+    # pandas.DataFrame
+    # labels can come from here;
+    # we can override them
+    if isinstance(x, pd.DataFrame):
+        if not labels:
+            labels = x.columns
+        x = x.as_matrix()
+
+    # pandas.Series
+    # there can still be a label;
+    # we can override it
+    elif isinstance(x, pd.Series):
+        if not labels:
+            labels = [x.name]
+        x = x.as_matrix()[:, None]
+
+    # dict
+    # labels are the keys,
+    # we cannot override them
+    elif isinstance(x, dict):
+        # try to do it via pandas
+        try:
+            x = pd.DataFrame.from_dict(x)
+            labels = x.columns
+            x = x.as_matrix()
+        # some value types make pandas fail, so
+        # as a fallback construct the variable by hand
+        except (PandasError, TypeError):
+            res = []
+            labels = []
+            for k, v in x.items():
+                res.append(v)
+                labels.append(k)
+            x = tt.stack(res, axis=1)
+            if x.ndim == 1:
+                x = x[:, None]
+    # the input may still be some array-like value,
+    # e.g. a list of lists; numpy deals with that
+    elif not isinstance(x, tt.Variable):
+        x = np.asarray(x)
+        if x.ndim == 0:
+            raise ValueError('Cannot use scalars')
+        elif x.ndim == 1:
+            x = x[:, None]
+    # unrecognized input type; since the user passed
+    # labels, trusting the input seems to be a good option
+    elif labels is not None:
+        x = tt.as_tensor_variable(x)
+        if x.ndim == 0:
+            raise ValueError('Cannot use scalars')
+        elif x.ndim == 1:
+            x = x[:, None]
+    else:  # trust input
+        pass
+    # check whether we can extract labels
+    if labels is None and not isinstance(x, tt.Variable):
+        labels = ['x%d' % i for i in range(x.shape[1])]
+    # for theano variables we need labels from the user
+    elif labels is None:
+        raise ValueError('Please provide labels as '
+                         'we cannot infer the shape of the input')
+    else:  # trust the labels; the user knows what they are doing
+        pass
+    # now check shapes if we can
+    if not isinstance(x, tt.Variable):
+        if not len(labels) == x.shape[1]:
+            raise ValueError(
+                'Please provide a full list '
+                'of labels for the coefficients, '
+                'got len(labels)=%d instead of %d'
+                % (len(labels), x.shape[1])
+            )
+    else:
+        # trust the labels; we raised an error
+        # in the bad case, so we have labels
+        pass
+    # convert labels to a list
+    if isinstance(labels, pd.RangeIndex):
+        labels = ['x%d' % i for i in labels]
+    # it may be a tuple or another iterable
+    elif not isinstance(labels, list):
+        labels = list(labels)
+    # as output we need a tensor
+    if not isinstance(x, tt.Variable):
+        x = tt.as_tensor_variable(x)
+    # finally check dimensions
+    if x.ndim == 0:
+        raise ValueError('Cannot use scalars')
+    elif x.ndim == 1:
+        x = x[:, None]
+    return x, labels
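A few concrete calls make the branching above easier to follow (a small sketch; the expected labels in the comments follow the docstring's rules):

import numpy as np
import pandas as pd
import theano.tensor as tt
from pymc3.models.utils import any_to_tensor_and_labels

# plain arrays get generated labels
m, labels = any_to_tensor_and_labels(np.ones((3, 2)))
print(labels)  # ['x0', 'x1']

# DataFrame columns become labels
m, labels = any_to_tensor_and_labels(pd.DataFrame(dict(a=[1, 2], b=[3, 4])))
print(labels)  # ['a', 'b']

# tensors need explicit labels, otherwise ValueError is raised
m, labels = any_to_tensor_and_labels(tt.ones((3, 2)), labels=['u', 'v'])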
diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py
new file mode 100644
index 0000000000..4eb96e6789
--- /dev/null
+++ b/pymc3/tests/test_model.py
@@ -0,0 +1,120 @@
+import unittest
+import theano.tensor as tt
+import pymc3 as pm
+from pymc3.distributions import HalfCauchy, Normal
+from pymc3 import Potential, Deterministic
+
+
+class NewModel(pm.Model):
+    def __init__(self, name='', model=None):
+        super(NewModel, self).__init__(name, model)
+        assert pm.modelcontext(None) is self
+        # 1) init variables with the Var method
+        self.Var('v1', pm.Normal.dist())
+        self.v2 = pm.Normal('v2', mu=0, sd=1)
+        # 2) Potentials and Deterministic variables work with this method
+        # too; the model prefix ensures their names will not overlap with
+        # those of other instances of the same model
+        pm.Deterministic('d', tt.constant(1))
+        pm.Potential('p', tt.constant(1))
+
+
+class DocstringModel(pm.Model):
+    def __init__(self, mean=0, sd=1, name='', model=None):
+        super(DocstringModel, self).__init__(name, model)
+        self.Var('v1', Normal.dist(mu=mean, sd=sd))
+        Normal('v2', mu=mean, sd=sd)
+        Normal('v3', mu=mean, sd=HalfCauchy('sd', beta=10, testval=1.))
+        Deterministic('v3_sq', self.v3 ** 2)
+        Potential('p1', tt.constant(1))
+
+
+class TestBaseModel(unittest.TestCase):
+    def test_setattr_properly_works(self):
+        with pm.Model() as model:
+            pm.Normal('v1')
+            self.assertEqual(len(model.vars), 1)
+            with pm.Model('sub') as submodel:
+                submodel.Var('v1', pm.Normal.dist())
+                self.assertTrue(hasattr(submodel, 'v1'))
+                self.assertEqual(len(submodel.vars), 1)
+            self.assertEqual(len(model.vars), 2)
+            with submodel:
+                submodel.Var('v2', pm.Normal.dist())
+                self.assertTrue(hasattr(submodel, 'v2'))
+                self.assertEqual(len(submodel.vars), 2)
+            self.assertEqual(len(model.vars), 3)
+
+    def test_context_passes_vars_to_parent_model(self):
+        with pm.Model() as model:
+            # a set of variables is created
+            NewModel()
+            # another set of variables is created, but with prefix 'another'
+            usermodel2 = NewModel(name='another')
+            # you can enter a context with the submodel
+            with usermodel2:
+                usermodel2.Var('v3', pm.Normal.dist())
+                pm.Normal('v4')
+                # this variable is created in the parent model too
+        self.assertIn('another_v2', model.named_vars)
+        self.assertIn('another_v3', model.named_vars)
+        self.assertIn('another_v3', usermodel2.named_vars)
+        self.assertIn('another_v4', model.named_vars)
+        self.assertIn('another_v4', usermodel2.named_vars)
+        self.assertTrue(hasattr(usermodel2, 'v3'))
+        self.assertTrue(hasattr(usermodel2, 'v2'))
+        self.assertTrue(hasattr(usermodel2, 'v4'))
+        # When you create a class-based model you should follow some rules
+        with model:
+            m = NewModel('one_more')
+        self.assertTrue(m.d is model['one_more_d'])
+        self.assertTrue(m['d'] is model['one_more_d'])
+        self.assertTrue(m['one_more_d'] is model['one_more_d'])
+
+
+class TestNested(unittest.TestCase):
+    def test_nest_context_works(self):
+        with pm.Model() as m:
+            new = NewModel()
+            with new:
+                self.assertTrue(
+                    pm.modelcontext(None) is new
+                )
+            self.assertTrue(
+                pm.modelcontext(None) is m
+            )
+        self.assertIn('v1', m.named_vars)
+        self.assertIn('v2', m.named_vars)
+
+    def test_named_context(self):
+        with pm.Model() as m:
+            NewModel(name='new')
+        self.assertIn('new_v1', m.named_vars)
+        self.assertIn('new_v2', m.named_vars)
+
+    def test_docstring_example1(self):
+        usage1 = DocstringModel()
+        self.assertIn('v1', usage1.named_vars)
+        self.assertIn('v2', usage1.named_vars)
+        self.assertIn('v3', usage1.named_vars)
+        self.assertIn('v3_sq', usage1.named_vars)
+        self.assertEqual(len(usage1.potentials), 1)
+
+    def test_docstring_example2(self):
+        with pm.Model() as model:
+            DocstringModel(name='prefix')
+        self.assertIn('prefix_v1', model.named_vars)
+        self.assertIn('prefix_v2', model.named_vars)
+        self.assertIn('prefix_v3', model.named_vars)
+        self.assertIn('prefix_v3_sq', model.named_vars)
+        self.assertEqual(len(model.potentials), 1)
+
+    def test_duplicates_detection(self):
+        with pm.Model():
+            DocstringModel(name='prefix')
+            self.assertRaises(ValueError, DocstringModel, name='prefix')
+
+    def test_model_root(self):
+        with pm.Model() as model:
+            self.assertTrue(model is model.root)
+            with pm.Model() as sub:
+                self.assertTrue(model is sub.root)
diff --git a/pymc3/tests/test_models_linear.py b/pymc3/tests/test_models_linear.py
new file mode 100644
index 0000000000..fac66fc0de
--- /dev/null
+++ b/pymc3/tests/test_models_linear.py
@@ -0,0 +1,108 @@
+import numpy as np
+from .helpers import SeededTest
+from pymc3 import Model, Uniform, Normal, find_MAP, Slice, sample
+from pymc3.models.linear import LinearComponent, Glm
+
+
+# Generate data
+def generate_data(intercept, slope, size=700):
+    x = np.linspace(-1, 1, size)
+    y = intercept + x * slope
+    return x, y
+
+
+class TestGLM(SeededTest):
+    @classmethod
+    def setUpClass(cls):
+        super(TestGLM, cls).setUpClass()
+        cls.intercept = 1
+        cls.slope = 3
+        cls.sd = .05
+        x_linear, cls.y_linear = generate_data(cls.intercept, cls.slope, size=1000)
+        cls.y_linear += np.random.normal(size=1000, scale=cls.sd)
+        cls.data_linear = dict(x=x_linear, y=cls.y_linear)
+
+        x_logistic, y_logistic = generate_data(cls.intercept, cls.slope, size=3000)
+        y_logistic = 1 / (1 + np.exp(-y_logistic))
+        bern_trials = [np.random.binomial(1, i) for i in y_logistic]
+        cls.data_logistic = dict(x=x_logistic, y=bern_trials)
+
+    def test_linear_component(self):
+        vars_to_create = {
+            'sigma_interval_',
+            'y_obs',
+            'lm_x0',
+            'lm_Intercept'
+        }
+        with Model() as model:
+            lm = LinearComponent(
+                self.data_linear['x'],
+                self.data_linear['y'],
+                name='lm'
+            )  # yields lm_x0, lm_Intercept
+            sigma = Uniform('sigma', 0, 20)  # yields sigma_interval_
+            Normal('y_obs', mu=lm.y_est, sd=sigma, observed=self.y_linear)  # yields y_obs
+            start = find_MAP(vars=[sigma])
+            step = Slice(model.vars)
+            trace = sample(500, step, start, progressbar=False, random_seed=self.random_seed)
+
+        self.assertAlmostEqual(np.mean(trace['lm_Intercept']), self.intercept, 1)
+        self.assertAlmostEqual(np.mean(trace['lm_x0']), self.slope, 1)
+        self.assertAlmostEqual(np.mean(trace['sigma']), self.sd, 1)
+        self.assertSetEqual(vars_to_create, set(model.named_vars.keys()))
+
+    def test_linear_component_from_formula(self):
+        with Model() as model:
+            lm = LinearComponent.from_formula('y ~ x', self.data_linear)
+            sigma = Uniform('sigma', 0, 20)
+            Normal('y_obs', mu=lm.y_est, sd=sigma, observed=self.y_linear)
+            start = find_MAP(vars=[sigma])
+            step = Slice(model.vars)
+            trace = sample(500, step, start, progressbar=False, random_seed=self.random_seed)
+
+        self.assertAlmostEqual(np.mean(trace['Intercept']), self.intercept, 1)
+        self.assertAlmostEqual(np.mean(trace['x']), self.slope, 1)
+        self.assertAlmostEqual(np.mean(trace['sigma']), self.sd, 1)
+
+    def test_glm(self):
+        with Model() as model:
+            vars_to_create = {
+                'glm_sd_log_',
+                'glm_y',
+                'glm_x0',
+                'glm_Intercept'
+            }
+            Glm(
+                self.data_linear['x'],
+                self.data_linear['y'],
+                name='glm'
+            )
+            start = find_MAP()
+            step = Slice(model.vars)
+            trace = sample(500, step, start, progressbar=False, random_seed=self.random_seed)
+        self.assertAlmostEqual(np.mean(trace['glm_Intercept']), self.intercept, 1)
+        self.assertAlmostEqual(np.mean(trace['glm_x0']), self.slope, 1)
+        self.assertAlmostEqual(np.mean(trace['glm_sd']), self.sd, 1)
+        self.assertSetEqual(vars_to_create, set(model.named_vars.keys()))
+
+    def test_glm_from_formula(self):
+        with Model() as model:
+            NAME = 'glm'
+            Glm.from_formula('y ~ x', self.data_linear, name=NAME)
+            start = find_MAP()
+            step = Slice(model.vars)
+            trace = sample(500, step, start, progressbar=False, random_seed=self.random_seed)
+
+        self.assertAlmostEqual(np.mean(trace['%s_Intercept' % NAME]), self.intercept, 1)
        self.assertAlmostEqual(np.mean(trace['%s_x' % NAME]), self.slope, 1)
+        self.assertAlmostEqual(np.mean(trace['%s_sd' % NAME]), self.sd, 1)
+
+    def test_strange_types(self):
+        with Model():
+            self.assertRaises(
+                ValueError,
+                Glm,
+                1,
+                self.data_linear['y'],
+                name='lm'
+            )
diff --git a/pymc3/tests/test_models_utils.py b/pymc3/tests/test_models_utils.py
new file mode 100644
index 0000000000..160846bcbc
--- /dev/null
+++ b/pymc3/tests/test_models_utils.py
@@ -0,0 +1,79 @@
+import unittest
+import numpy as np
+import pandas as pd
+import theano.tensor as tt
+import pymc3.models.utils as utils
+
+
+class TestUtils(unittest.TestCase):
+    def setUp(self):
+        self.data = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
+
+    def assertMatrixLabels(self, m, l, mt=None, lt=None):
+        self.assertTrue(
+            np.all(
+                np.equal(
+                    m.eval(),
+                    mt if mt is not None else self.data.as_matrix()
+                )
+            )
+        )
+        self.assertEqual(l, list(lt or self.data.columns))
+
+    def test_numpy_init(self):
+        m, l = utils.any_to_tensor_and_labels(self.data.as_matrix())
+        self.assertMatrixLabels(m, l, lt=['x0', 'x1'])
+        m, l = utils.any_to_tensor_and_labels(self.data.as_matrix(), labels=['x2', 'x3'])
+        self.assertMatrixLabels(m, l, lt=['x2', 'x3'])
+
+    def test_pandas_init(self):
+        m, l = utils.any_to_tensor_and_labels(self.data)
+        self.assertMatrixLabels(m, l)
+        m, l = utils.any_to_tensor_and_labels(self.data, labels=['x2', 'x3'])
+        self.assertMatrixLabels(m, l, lt=['x2', 'x3'])
+
+    def test_dict_input(self):
+        m, l = utils.any_to_tensor_and_labels(self.data.to_dict('dict'))
+        self.assertMatrixLabels(m, l, mt=self.data.as_matrix(l), lt=l)
+
+        m, l = utils.any_to_tensor_and_labels(self.data.to_dict('series'))
+        self.assertMatrixLabels(m, l, mt=self.data.as_matrix(l), lt=l)
+
+        m, l = utils.any_to_tensor_and_labels(self.data.to_dict('list'))
+        self.assertMatrixLabels(m, l, mt=self.data.as_matrix(l), lt=l)
+
+        inp = {k: tt.as_tensor_variable(v) for k, v in self.data.to_dict('series').items()}
+        m, l = utils.any_to_tensor_and_labels(inp)
+        self.assertMatrixLabels(m, l, mt=self.data.as_matrix(l), lt=l)
+
+    def test_list_input(self):
+        m, l = utils.any_to_tensor_and_labels(self.data.as_matrix().tolist())
+        self.assertMatrixLabels(m, l, lt=['x0', 'x1'])
+        m, l = utils.any_to_tensor_and_labels(self.data.as_matrix().tolist(), labels=['x2', 'x3'])
+        self.assertMatrixLabels(m, l, lt=['x2', 'x3'])
+
+    def test_tensor_input(self):
+        m, l = utils.any_to_tensor_and_labels(
+            tt.as_tensor_variable(self.data.as_matrix().tolist()),
+            labels=['x0', 'x1']
+        )
+        self.assertMatrixLabels(m, l, lt=['x0', 'x1'])
+        m, l = utils.any_to_tensor_and_labels(
+            tt.as_tensor_variable(self.data.as_matrix().tolist()),
+            labels=['x2', 'x3'])
+        self.assertMatrixLabels(m, l, lt=['x2', 'x3'])
+
+    def test_user_mistakes(self):
+        # no labels for a tensor variable
+        self.assertRaises(
+            ValueError,
+            utils.any_to_tensor_and_labels,
+            tt.as_tensor_variable(self.data.as_matrix().tolist())
+        )
+        # wrong number of labels
+        self.assertRaises(
+            ValueError,
+            utils.any_to_tensor_and_labels,
+            self.data.as_matrix().tolist(),
+            labels=['x']
+        )
diff --git a/requirements.txt b/requirements.txt
index fc6625c5d9..2e53afc286 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ recommonmark
 sphinx
 nbsphinx
 numpydoc
+six
diff --git a/setup.py b/setup.py
index 708cfb1006..017c28898a 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 DISTNAME = 'pymc3'
 DESCRIPTION = "PyMC3"
-LONG_DESCRIPTION = """Bayesian estimation, particularly using Markov chain
-Monte Carlo (MCMC), is an increasingly relevant approach to statistical estimation. However, few statistical software packages implement MCMC samplers, and they are non-trivial to code by hand. ``pymc3`` is a python package that implements the Metropolis-Hastings algorithm as a python class, and is extremely flexible and applicable to a large suite of problems. ``pymc3`` includes methods for summarizing output, plotting, goodness-of-fit and convergence diagnostics."""
+LONG_DESCRIPTION = """Bayesian estimation, particularly using Markov chain Monte Carlo (MCMC), is an increasingly relevant approach to statistical estimation. However, few statistical software packages implement MCMC samplers, and they are non-trivial to code by hand. ``pymc3`` is a python package that implements the Metropolis-Hastings algorithm as a python class, and is extremely flexible and applicable to a large suite of problems. ``pymc3`` includes methods for summarizing output, plotting, goodness-of-fit and convergence diagnostics."""
 MAINTAINER = 'Thomas Wiecki'
 MAINTAINER_EMAIL = 'thomas.wiecki@gmail.com'
 AUTHOR = 'John Salvatier and Christopher Fonnesbeck'
@@ -50,7 +50,8 @@
     packages=['pymc3', 'pymc3.distributions',
               'pymc3.step_methods', 'pymc3.tuning',
               'pymc3.tests', 'pymc3.glm', 'pymc3.examples',
-              'pymc3.backends', 'pymc3.variational', 'docs', '.'],
+              'pymc3.models', 'pymc3.backends',
+              'pymc3.variational', 'docs', '.'],
     classifiers=classifiers,
     install_requires=install_reqs,
     tests_require=test_reqs,
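One hook the tests above do not exercise is the `priors` mapping on LinearComponent. A sketch of how it is meant to be used per the docstring (the data and prior choices here are illustrative only; the `Intercept` and `Regressor` keys come from the docstring):

import pymc3 as pm
from pymc3.models import LinearComponent

x = [[1.], [2.], [3.]]
y = [1., 2., 3.]

with pm.Model() as model:
    # override the default Flat/Normal priors via the `priors` dict
    lm = LinearComponent(
        x, y, name='lm',
        priors={'Intercept': pm.Normal.dist(mu=0, sd=10),
                'Regressor': pm.Normal.dist(mu=0, sd=1)}
    )
    pm.Normal('y_obs', mu=lm.y_est, sd=1, observed=y)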