diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 690499d..9106867 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -158,12 +158,24 @@ def fit(self, X, y=None): def get_names(self, c, t, x): - if type(c)==list: + """ + Return verbose names for the transformed columns. + + c name (or list of names) of the original column(s) + t transformer + x transformed columns (numpy.ndarray) + """ + if isinstance(c, list): c = '_'.join(c) - if hasattr(t, 'classes_') and (len(t.classes_)>2): - return [c + '_' + o for o in t.classes_] - elif len(x.shape)>1 and x.shape[1]>1: - return [c + '_' + str(o) for o in range(x.shape[1])] + num_cols = x.shape[1] if len(x.shape) > 1 else 1 + if num_cols > 1: + # If there are as many columns as classes, + # infer column names from the class names. + if hasattr(t, 'classes_') and (len(t.classes_) == num_cols): + return [c + '_' + str(o) for o in t.classes_] + # otherwise, return name concatenated with '_0', '_1', etc. + else: + return [c + '_' + str(o) for o in range(num_cols)] else: return [c] diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 0f9dd75..c59dfe6 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -67,6 +67,23 @@ def transform(self, X): return sparse.csr_matrix(X) +class CustomTransformer(BaseEstimator, TransformerMixin): + """ + Example of a transformer in which the number of classes + is not equal to the number of output columns. 
+ """ + def fit(self, X, y=None): + self.min = X.min() + self.classes_ = np.unique(X) + return self + + def transform(self, X): + classes = np.unique(X) + if len(np.setdiff1d(classes, self.classes_)) > 0: + raise ValueError('Unknown values found.') + return X - self.min + + @pytest.fixture def simple_dataframe(): return pd.DataFrame({'a': [1, 2, 3]}) @@ -118,6 +135,20 @@ def test_binarizer_df(): assert cols[2] == 'target_c' +def test_binarizer_int_df(): + """ + Check level names from LabelBinarizer for a numeric array. + """ + df = pd.DataFrame({'target': [5, 5, 6, 6, 7, 5]}) + mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True) + transformed = mapper.fit_transform(df) + cols = transformed.columns + assert len(cols) == 3 + assert cols[0] == 'target_5' + assert cols[1] == 'target_6' + assert cols[2] == 'target_7' + + def test_binarizer2_df(): """ Check level names from LabelBinarizer with just one output column @@ -143,6 +174,20 @@ def test_onehot_df(): assert cols[3] == 'target_3' +def test_customtransform_df(): + """ + Check level ids from a transformer in which + the number of classes is not equal to the number of output columns. + """ + df = pd.DataFrame({'target': [6, 5, 7, 5, 4, 8, 8]}) + mapper = DataFrameMapper([(['target'], CustomTransformer())], df_out=True) + transformed = mapper.fit_transform(df) + cols = transformed.columns + assert len(mapper.features[0][1].classes_) == 5 + assert len(cols) == 1 + assert cols[0] == 'target' + + def test_pca(complex_dataframe): """ Check multi in and out with PCA