Set data when building Linearmodel #249

Merged: 10 commits, Nov 13, 2023

pymc_experimental/linearmodel.py (4 changes: 2 additions & 2 deletions)

@@ -67,8 +67,6 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
"""
cfg = self.model_config

# The model is built with placeholder data.
# Actual data will be set by _data_setter when fitting or evaluating the model.
# Data array size can change but number of dimensions must stay the same.
with pm.Model() as self.model:
x = pm.MutableData("x", np.zeros((1,)), dims="observation")
Expand All @@ -94,6 +92,8 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
dims="observation",
)

self._data_setter(X, y)

def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
with self.model:
pm.set_data({"x": X.squeeze()})

pymc_experimental/model_builder.py (29 changes: 14 additions & 15 deletions)

@@ -299,8 +299,8 @@ def sample_model(self, **kwargs):
         with self.model:
             sampler_args = {**self.sampler_config, **kwargs}
             idata = pm.sample(**sampler_args)
-            idata.extend(pm.sample_prior_predictive())
-            idata.extend(pm.sample_posterior_predictive(idata))
+            idata.extend(pm.sample_prior_predictive(), join="right")
+            idata.extend(pm.sample_posterior_predictive(idata), join="right")

         idata = self.set_idata_attrs(idata)
         return idata
@@ -595,23 +595,22 @@ def sample_prior_predictive(
             Prior predictive samples for each input X_pred
         """
         if y_pred is None:
-            y_pred = np.zeros(len(X_pred))
+            y_pred = pd.Series(np.zeros(len(X_pred)), name=self.output_var)
         if samples is None:
             samples = self.sampler_config.get("draws", 500)

         if self.model is None:
             self.build_model(X_pred, y_pred)
-
-        self._data_setter(X_pred, y_pred)
-        if self.model is not None:
-            with self.model:  # sample with new input data
-                prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
-                self.set_idata_attrs(prior_pred)
-                if extend_idata:
-                    if self.idata is not None:
-                        self.idata.extend(prior_pred)
-                    else:
-                        self.idata = prior_pred
+        else:
+            self._data_setter(X_pred, y_pred)
+        with self.model:  # sample with new input data
+            prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
+            self.set_idata_attrs(prior_pred)
+        if extend_idata:
+            if self.idata is not None:
+                self.idata.extend(prior_pred, join="right")
+            else:
+                self.idata = prior_pred

         prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)

Member, commenting on the line self.idata.extend(prior_pred, join="right"):

Can we test that all these join="right" calls are doing the right thing (i.e., discarding the old values and replacing them with the new ones), and that extend_idata=False is being respected?

Contributor (author):

It should be possible. I'll work on it and update the PR accordingly.

@@ -641,7 +640,7 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs):
         with self.model:  # sample with new input data
             post_pred = pm.sample_posterior_predictive(self.idata, **kwargs)
         if extend_idata:
-            self.idata.extend(post_pred)
+            self.idata.extend(post_pred, join="right")

         posterior_predictive_samples = az.extract(
             post_pred, "posterior_predictive", combined=combined
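
To make the join="right" behavior discussed in the review thread above concrete, here is a minimal sketch against ArviZ's InferenceData.extend (illustrative values, not part of the PR):

import numpy as np
import arviz as az

# An InferenceData whose "prior" group holds ones.
idata = az.from_dict(posterior={"mu": np.zeros((2, 50))})
idata.extend(az.from_dict(prior={"mu": np.ones((2, 50))}))

# Extend with a clashing "prior" group holding fives.
incoming = az.from_dict(prior={"mu": np.full((2, 50), 5.0)})

# join="right" keeps the incoming group on a name clash;
# join="left" (the default) would keep the existing one.
idata.extend(incoming, join="right")
assert float(idata.prior["mu"].mean()) == 5.0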

pymc_experimental/tests/test_linearmodel.py (39 changes: 33 additions & 6 deletions)

@@ -36,6 +36,15 @@
     sklearn_available = False


+@pytest.fixture(scope="module")
+def toy_actual_params():
+    return {
+        "intercept": 3,
+        "slope": 5,
+        "obs_error": 0.5,
+    }
+
+
 @pytest.fixture(scope="module")
 def toy_X():
     x = np.linspace(start=0, stop=1, num=100)
@@ -44,23 +53,24 @@ def toy_X():


@pytest.fixture(scope="module")
def toy_y(toy_X):
y = 5 * toy_X["input"] + 3
y = y + np.random.normal(0, 1, size=len(toy_X))
def toy_y(toy_X, toy_actual_params):
y = toy_actual_params["slope"] * toy_X["input"] + toy_actual_params["intercept"]
rng = np.random.default_rng(427)
y = y + rng.normal(0, toy_actual_params["obs_error"], size=len(toy_X))
y = pd.Series(y, name="output")
return y


@pytest.fixture(scope="module")
def fitted_linear_model_instance(toy_X, toy_y):
sampler_config = {
"draws": 500,
"tune": 300,
"draws": 50,
"tune": 30,
"chains": 2,
"target_accept": 0.95,
}
model = LinearModel(sampler_config=sampler_config)
model.fit(toy_X, toy_y)
model.fit(toy_X, toy_y, random_seed=312)
return model


@@ -101,6 +111,23 @@ def test_fit(fitted_linear_model_instance):
     assert isinstance(post_pred, xr.DataArray)


+def test_parameter_fit(toy_X, toy_y, toy_actual_params):
+    """Check that the fit model recovered the data-generating parameters."""
+    # Fit the model with a sufficient number of samples
+    sampler_config = {
+        "draws": 500,
+        "tune": 300,
+        "chains": 2,
+        "target_accept": 0.95,
+    }
+    model = LinearModel(sampler_config=sampler_config)
+    model.fit(toy_X, toy_y, random_seed=312)
+    fit_params = model.idata.posterior.mean()
+    np.testing.assert_allclose(fit_params["intercept"], toy_actual_params["intercept"], rtol=0.1)
+    np.testing.assert_allclose(fit_params["slope"], toy_actual_params["slope"], rtol=0.1)
+    np.testing.assert_allclose(fit_params["σ_model_fmc"], toy_actual_params["obs_error"], rtol=0.1)
+
+
 def test_predict(fitted_linear_model_instance):
     model = fitted_linear_model_instance
     X_pred = pd.DataFrame({"input": np.random.uniform(low=0, high=1, size=100)})

pymc_experimental/tests/test_model_builder.py (80 changes: 70 additions & 10 deletions)

@@ -41,11 +41,13 @@ def toy_y(toy_X):
     return y


-@pytest.fixture(scope="module")
-def fitted_model_instance(toy_X, toy_y):
+def get_unfitted_model_instance(X, y):
+    """Creates an unfitted model instance to which idata can be copied in
+    and then used as a fitted model instance. That way a fitted model
+    can be used multiple times without having to run `fit` multiple times."""
     sampler_config = {
-        "draws": 100,
-        "tune": 100,
+        "draws": 20,
+        "tune": 10,
         "chains": 2,
         "target_accept": 0.95,
     }
@@ -57,7 +59,30 @@ def fitted_model_instance(toy_X, toy_y):
     model = test_ModelBuilder(
         model_config=model_config, sampler_config=sampler_config, test_parameter="test_paramter"
     )
-    model.fit(toy_X)
+    # Do the things that `model.fit` does except sample to create idata.
+    model._generate_and_preprocess_model_data(X, y.values.flatten())
+    model.build_model(X, y)
     return model
+
+
+@pytest.fixture(scope="module")
+def fitted_model_instance_base(toy_X, toy_y):
+    """Because fitting takes a relatively long time, this is intended to
+    be used only once and then have new instances created and fit data patched in
+    for tests that use a fitted model instance. Tests should use
+    `fitted_model_instance` instead of this."""
+    model = get_unfitted_model_instance(toy_X, toy_y)
+    model.fit(toy_X, toy_y)
+    return model
+
+
+@pytest.fixture
+def fitted_model_instance(toy_X, toy_y, fitted_model_instance_base):
+    """Get a fitted model instance. A new instance is created and fit data is
+    patched in, so tests using this fixture can modify the model object without
+    affecting other tests."""
+    model = get_unfitted_model_instance(toy_X, toy_y)
+    model.idata = fitted_model_instance_base.idata.copy()
+    return model

@@ -131,8 +156,8 @@ def _generate_and_preprocess_model_data(
     @staticmethod
     def get_default_sampler_config() -> Dict:
         return {
-            "draws": 1_000,
-            "tune": 1_000,
+            "draws": 10,
+            "tune": 10,
             "chains": 3,
             "target_accept": 0.95,
         }
@@ -142,6 +167,9 @@ def test_save_input_params(fitted_model_instance):
     assert fitted_model_instance.idata.attrs["test_paramter"] == '"test_paramter"'


+@pytest.mark.skipif(
+    sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
+)
 def test_save_load(fitted_model_instance):
     temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
     fitted_model_instance.save(temp.name)
@@ -193,9 +221,6 @@ def test_fit_no_y(toy_X):
assert "posterior" in model_builder.idata.groups()


@pytest.mark.skipif(
sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
)
def test_predict(fitted_model_instance):
x_pred = np.random.uniform(low=0, high=1, size=100)
prediction_data = pd.DataFrame({"input": x_pred})
@@ -220,6 +245,41 @@ def test_sample_posterior_predictive(fitted_model_instance, combined):
     assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)


+@pytest.mark.parametrize("group", ["prior_predictive", "posterior_predictive"])
+@pytest.mark.parametrize("extend_idata", [True, False])
+def test_sample_xxx_extend_idata_param(fitted_model_instance, group, extend_idata):
+    output_var = fitted_model_instance.output_var
+    idata_prev = fitted_model_instance.idata[group][output_var]
+
+    # Since coordinates are provided, the dimension must match
+    n_pred = 100  # Must match toy_X
+    x_pred = np.random.uniform(0, 1, n_pred)
+
+    prediction_data = pd.DataFrame({"input": x_pred})
+    if group == "prior_predictive":
+        prediction_method = fitted_model_instance.sample_prior_predictive
+    else:  # group == "posterior_predictive":
+        prediction_method = fitted_model_instance.sample_posterior_predictive
+
+    pred = prediction_method(prediction_data["input"], combined=False, extend_idata=extend_idata)
+
+    pred_unstacked = pred[output_var].values
+    idata_now = fitted_model_instance.idata[group][output_var].values
+
+    if extend_idata:
+        # After sampling, data in the model should be the same as the predictions
+        np.testing.assert_array_equal(idata_now, pred_unstacked)
+        # Data in the model should NOT be the same as before
+        if idata_now.shape == idata_prev.values.shape:
+            assert np.sum(np.abs(idata_now - idata_prev.values) < 1e-5) <= 2
+    else:
+        # After sampling, data in the model should be the same as it was before
+        np.testing.assert_array_equal(idata_now, idata_prev.values)
+        # Data in the model should NOT be the same as the predictions
+        if idata_now.shape == pred_unstacked.shape:
+            assert np.sum(np.abs(idata_now - pred_unstacked) < 1e-5) <= 2
+
+
 def test_model_config_formatting():
     model_config = {
         "a": {