From b31e89e235b14a7ca237b360ba9b0dd4cce3b04a Mon Sep 17 00:00:00 2001 From: devforfu Date: Tue, 14 Aug 2018 19:49:27 +0500 Subject: [PATCH 1/3] Get/set parameters for a single transformer --- sklearn_pandas/dataframe_mapper.py | 22 ++++++++++++ tests/test_dataframe_mapper.py | 58 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 57b27f4..0c1b512 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -1,5 +1,6 @@ import sys import contextlib +from collections import defaultdict import pandas as pd import numpy as np @@ -348,3 +349,24 @@ def transform(self, X): return df_out else: return stacked + + def get_params(self, deep=True): + out = super().get_params(deep=False) + if not deep: + return out + for name, transformer in out['features']: + if transformer is None: + continue + for key, value in transformer.get_params(deep=True).items(): + out['%s__%s' % (name, key)] = value + return out + + def set_params(self, **params): + transformers = dict(self.features) + assignment = defaultdict(dict) + for key, value in params.items(): + transformer, parameter = key.split('__') + assignment[transformer][parameter] = value + for name, parameters in assignment.items(): + if name in transformers: + transformers[name].set_params(**parameters) diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 004e809..50692f1 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -96,6 +96,20 @@ def transform(self, X): return X - self.min +class NoOpTransformer(BaseEstimator, TransformerMixin): + + def __init__(self, string='', number=0, flag=False): + self.string = string + self.number = number + self.flag = flag + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X + + @pytest.fixture def simple_dataframe(): return pd.DataFrame({'a': [1, 2, 3]}) @@ -857,3 +871,47 @@ def test_heterogeneous_output_types_input_df(): dft = M.fit_transform(df) assert dft['feat1'].dtype == np.dtype('int64') assert dft['feat2'].dtype == np.dtype('float64') + + +def test_getting_single_transformer_parameters(): + """ + Tests that a data frame mapper with a single transformer exposes its + parameters via get_params() method. + """ + noop = NoOpTransformer() + nested_keys = list(noop.get_params().keys()) + step_name = 'data_frame_mapper' + transformer_name = 'nested_transformer' + expected_keys = [ + '{step_name}__{transformer_name}__{key}'.format( + step_name=step_name, + transformer_name=transformer_name, + key=nested_key) + for nested_key in nested_keys] + + mapper = DataFrameMapper([(transformer_name, noop)], df_out=False) + pipeline = Pipeline([(step_name, mapper)]) + params = pipeline.get_params() + + assert all([key in params for key in expected_keys]) + + +def test_setting_single_transformer_parameters(): + """ + Tests that a data frame mapper with a single transformer correctly assigns + parameters to the transformer when the set_params() method is called. + """ + noop = NoOpTransformer() + old_parameters = noop.get_params() + mapper = DataFrameMapper([('noop', noop)], df_out=False) + pipeline = Pipeline([('mapper', mapper)]) + + pipeline.set_params( + mapper__noop__string='string', + mapper__noop__number=1, + mapper__noop__flag=True) + + assert old_parameters != noop.get_params() + assert noop.string == 'string' + assert noop.number == 1 + assert noop.flag From 45992e1d64bdde02a28fe468da9a19a1be5bb923 Mon Sep 17 00:00:00 2001 From: devforfu Date: Wed, 15 Aug 2018 13:05:42 +0500 Subject: [PATCH 2/3] Parameters getters and setters --- sklearn_pandas/dataframe_mapper.py | 58 +++++++++++++++++++++++------ tests/test_dataframe_mapper.py | 60 ++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 12 deletions(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 0c1b512..49e4a89 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -1,5 +1,6 @@ import sys import contextlib +from itertools import chain from collections import defaultdict import pandas as pd @@ -49,6 +50,10 @@ def _get_feature_names(estimator): return None +def _get_lowercased_class_name(inst): + return type(inst).__name__.lower() + + @contextlib.contextmanager def add_column_names_to_exception(column_names): # Stolen from https://stackoverflow.com/a/17677938/356729 @@ -351,22 +356,51 @@ def transform(self, X): return stacked def get_params(self, deep=True): - out = super().get_params(deep=False) + out = super(DataFrameMapper, self).get_params(deep=False) if not deep: return out - for name, transformer in out['features']: - if transformer is None: - continue - for key, value in transformer.get_params(deep=True).items(): - out['%s__%s' % (name, key)] = value + for feature_name, transformers in out['features']: + if isinstance(transformers, list): + for transformer in transformers: + if transformer is None: + continue + transformer_name = _get_lowercased_class_name(transformer) + parameters = transformer.get_params(deep=True) + for key, value in parameters.items(): + param_path = '{column}__{transformer}__{key}'.format( + column=feature_name, + transformer=transformer_name, + key=key + ) + out[param_path] = value + else: + transformer = transformers + if transformer is None: + continue + for key, value in transformer.get_params(deep=True).items(): + out['%s__%s' % (feature_name, key)] = value return out def set_params(self, **params): - transformers = dict(self.features) + features = dict(self.features) assignment = defaultdict(dict) + for key, value in params.items(): - transformer, parameter = key.split('__') - assignment[transformer][parameter] = value - for name, parameters in assignment.items(): - if name in transformers: - transformers[name].set_params(**parameters) + feature_name, _, parameter = key.partition('__') + if '__' in parameter: + transformer_name, _, parameter = parameter.partition('__') + transformers = features[feature_name] + for transformer in transformers: + class_name = _get_lowercased_class_name(transformer) + if class_name == transformer_name: + assignment[id(transformer)][parameter] = value + else: + transformer = features[feature_name] + assignment[id(transformer)][parameter] = value + + transformers_instances = chain(*[ + x if isinstance(x, list) else [x] + for name, x in self.features]) + + for instance in transformers_instances: + instance.set_params(**assignment[id(instance)]) diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 50692f1..0608afe 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -1,6 +1,7 @@ # -*- coding: utf8 -*- import pytest +from itertools import product from pkg_resources import parse_version # In py3, mock is included with the unittest standard library @@ -110,6 +111,30 @@ def transform(self, X): return X +class Adder(BaseEstimator, TransformerMixin): + + def __init__(self, num_to_add): + self.num_to_add = num_to_add + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X + self.num_to_add + + +class Divider(BaseEstimator, TransformerMixin): + + def __init__(self, denominator): + self.denominator = denominator + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X / self.denominator + + @pytest.fixture def simple_dataframe(): return pd.DataFrame({'a': [1, 2, 3]}) @@ -915,3 +940,38 @@ def test_setting_single_transformer_parameters(): assert noop.string == 'string' assert noop.number == 1 assert noop.flag + + +def test_getting_parameters_from_a_list_of_transformers(): + expected_keys = [ + 'mapper__{column}__{name}__{value}'.format( + column=column, name=name, value=value) + for column, (name, value) in product( + ('colA', 'colB'), + (('adder', 'num_to_add'), ('divider', 'denominator')) + ) + ] + mapper = Pipeline([ + ('mapper', DataFrameMapper([ + ('colA', [Adder(1), Divider(2)]), + ('colB', [Divider(1), Adder(2)]) + ])) + ]) + + params = mapper.get_params() + + assert all([key in params for key in expected_keys]) + + +def test_setting_parameters_to_a_list_of_transformers(): + transformers = adder, divider = Adder(1), Divider(2) + mapper = DataFrameMapper([('colA', list(transformers))], df_out=False) + pipeline = Pipeline([('mapper', mapper)]) + + pipeline.set_params( + mapper__colA__adder__num_to_add=0, + mapper__colA__divider__denominator=1 + ) + + assert adder.num_to_add == 0 + assert divider.denominator == 1 From 89e6d1163dd8ebd8eb52888ab6dcb63b4b53a0e9 Mon Sep 17 00:00:00 2001 From: devforfu Date: Wed, 5 Sep 2018 12:40:25 +0500 Subject: [PATCH 3/3] Testing pipeline with GridSearchCV --- sklearn_pandas/dataframe_mapper.py | 14 +++++++++++--- tests/test_dataframe_mapper.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 2c1bf3b..5c3bd75 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -39,6 +39,12 @@ def _build_feature(columns, transformers, options={}): return (columns, _build_transformer(transformers), options) +def _build_feature_name(values): + if isinstance(values, list): + values = '-'.join([str(value) for value in values]) + return values + + def _get_feature_names(estimator): """ Attempt to extract feature names based on a given estimator @@ -420,7 +426,11 @@ def get_params(self, deep=True): return out def set_params(self, **params): - features = dict(self.features) + features = {} + for column_names, transformers in self.features: + key = _build_feature_name(column_names) + features[key] = transformers + assignment = defaultdict(dict) for key, value in params.items(): @@ -442,5 +452,3 @@ def set_params(self, **params): for instance in transformers_instances: instance.set_params(**assignment[id(instance)]) - - diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 7d19de2..d170a72 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -16,6 +16,7 @@ from scipy import sparse from sklearn import __version__ as sklearn_version from sklearn.cross_validation import cross_val_score as sklearn_cv_score +from sklearn.grid_search import GridSearchCV as sklearn_grid_search from sklearn.datasets import load_iris from sklearn.pipeline import Pipeline from sklearn.svm import SVC @@ -113,7 +114,7 @@ def transform(self, X): class Adder(BaseEstimator, TransformerMixin): - def __init__(self, num_to_add): + def __init__(self, num_to_add=0): self.num_to_add = num_to_add def fit(self, X, y=None): @@ -125,7 +126,7 @@ def transform(self, X): class Divider(BaseEstimator, TransformerMixin): - def __init__(self, denominator): + def __init__(self, denominator=1): self.denominator = denominator def fit(self, X, y=None): @@ -1068,3 +1069,28 @@ def test_setting_parameters_to_a_list_of_transformers(): assert adder.num_to_add == 0 assert divider.denominator == 1 + + +def test_compliant_with_grid_search(iris_dataframe): + pipeline = Pipeline([ + ('mapper', DataFrameMapper([ + (['petal length (cm)'], StandardScaler()), + (['petal width (cm)'], StandardScaler()), + (['sepal length (cm)'], StandardScaler()), + (['sepal width (cm)'], StandardScaler()), + ])), + ('classifier', SVC(kernel='linear')) + ]) + param_grid = { + 'mapper__petal length (cm)__with_mean': [True, False], + 'mapper__petal width (cm)__with_mean': [True, False], + 'mapper__sepal length (cm)__with_mean': [True, False], + 'mapper__sepal width (cm)__with_mean': [True, False] + } + data = iris_dataframe.drop("species", axis=1) + labels = iris_dataframe["species"] + + grid_search = sklearn_grid_search(pipeline, param_grid=param_grid) + grid_search.fit(data, labels) + + assert len(grid_search.grid_scores_) == 2**len(param_grid)