
Commit 416898b

thomasjpfan and ogrisel authored
ENH Adds Column name consistency (#18010)
Co-authored-by: Olivier Grisel <[email protected]>
1 parent c592361 commit 416898b

16 files changed, +462 -7 lines changed

doc/whats_new/v1.0.rst

Lines changed: 6 additions & 0 deletions

@@ -134,6 +134,12 @@ Changelog
 - |API| `np.matrix` usage is deprecated in 1.0 and will raise a `TypeError` in
   1.2. :pr:`20165` by `Thomas Fan`_.
 
+- |API| All estimators store `feature_names_in_` when fitted on pandas DataFrames.
+  These feature names are compared to names seen in non-`fit` methods, e.g.
+  `transform`, and will raise a `FutureWarning` if they are not consistent.
+  These `FutureWarning`s will become `ValueError`s in 1.2.
+  :pr:`18010` by `Thomas Fan`_.
+
 :mod:`sklearn.base`
 ...................

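To illustrate the behavior this changelog entry describes, a minimal sketch (not part of the commit; `StandardScaler` and the column names are only an example):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})

    scaler = StandardScaler().fit(X)
    print(scaler.feature_names_in_)  # ['a' 'b'], recorded at fit time

    # Renaming or reordering the columns at transform time now triggers a
    # FutureWarning; per the changelog, this becomes a ValueError in 1.2.
    scaler.transform(X[["b", "a"]])
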
sklearn/base.py

Lines changed: 89 additions & 0 deletions

@@ -24,6 +24,7 @@
 from .utils.validation import _check_y
 from .utils.validation import _num_features
 from .utils._estimator_html_repr import estimator_html_repr
+from .utils.validation import _get_feature_names
 
 
 def clone(estimator, *, safe=True):
@@ -395,6 +396,92 @@ def _check_n_features(self, X, reset):
                 f"is expecting {self.n_features_in_} features as input."
             )
 
+    def _check_feature_names(self, X, *, reset):
+        """Set or check the `feature_names_in_` attribute.
+
+        .. versionadded:: 1.0
+
+        Parameters
+        ----------
+        X : {ndarray, dataframe} of shape (n_samples, n_features)
+            The input samples.
+
+        reset : bool
+            Whether to reset the `feature_names_in_` attribute.
+            If False, the input will be checked for consistency with
+            feature names of data provided when reset was last True.
+            .. note::
+               It is recommended to call `reset=True` in `fit` and in the first
+               call to `partial_fit`. All other methods that validate `X`
+               should set `reset=False`.
+        """
+
+        if reset:
+            feature_names_in = _get_feature_names(X)
+            if feature_names_in is not None:
+                self.feature_names_in_ = feature_names_in
+            return
+
+        fitted_feature_names = getattr(self, "feature_names_in_", None)
+        X_feature_names = _get_feature_names(X)
+
+        if fitted_feature_names is None and X_feature_names is None:
+            # no feature names seen in fit and in X
+            return
+
+        if X_feature_names is not None and fitted_feature_names is None:
+            warnings.warn(
+                f"X has feature names, but {self.__class__.__name__} was fitted without"
+                " feature names"
+            )
+            return
+
+        if X_feature_names is None and fitted_feature_names is not None:
+            warnings.warn(
+                "X does not have valid feature names, but"
+                f" {self.__class__.__name__} was fitted with feature names"
+            )
+            return
+
+        # validate the feature names against the `feature_names_in_` attribute
+        if len(fitted_feature_names) != len(X_feature_names) or np.any(
+            fitted_feature_names != X_feature_names
+        ):
+            message = (
+                "The feature names should match those that were "
+                "passed during fit. Starting version 1.2, an error will be raised.\n"
+            )
+            fitted_feature_names_set = set(fitted_feature_names)
+            X_feature_names_set = set(X_feature_names)
+
+            unexpected_names = sorted(X_feature_names_set - fitted_feature_names_set)
+            missing_names = sorted(fitted_feature_names_set - X_feature_names_set)
+
+            def add_names(names):
+                output = ""
+                max_n_names = 5
+                for i, name in enumerate(names):
+                    if i >= max_n_names:
+                        output += "- ...\n"
+                        break
+                    output += f"- {name}\n"
+                return output
+
+            if unexpected_names:
+                message += "Feature names unseen at fit time:\n"
+                message += add_names(unexpected_names)
+
+            if missing_names:
+                message += "Feature names seen at fit time, yet now missing:\n"
+                message += add_names(missing_names)
+
+            if not missing_names and not unexpected_names:
+                message += (
+                    "Feature names must be in the same order as they were in fit.\n"
+                )
+
+            warnings.warn(message, FutureWarning)
+
     def _validate_data(
         self,
         X="no_validation",
@@ -452,6 +539,8 @@ def _validate_data(
             The validated input. A tuple is returned if both `X` and `y` are
             validated.
         """
+        self._check_feature_names(X, reset=reset)
+
         if y is None and self._get_tags()["requires_y"]:
             raise ValueError(
                 f"This {self.__class__.__name__} estimator "

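Since `_validate_data` now calls `_check_feature_names`, any estimator that routes its input through `_validate_data` picks up the check automatically. A minimal sketch (illustrative only, not part of the commit):

    import numpy as np
    import pandas as pd
    from sklearn.base import BaseEstimator, TransformerMixin

    class PassthroughTransformer(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            # reset=True (the default) records feature_names_in_ for DataFrame input
            self._validate_data(X)
            return self

        def transform(self, X):
            # reset=False compares X's column names against feature_names_in_
            return self._validate_data(X, reset=False)

    df = pd.DataFrame(np.ones((3, 2)), columns=["f1", "f2"])
    t = PassthroughTransformer().fit(df)
    t.transform(df)                                  # silent: names match
    t.transform(df.rename(columns={"f2": "oops"}))   # FutureWarning: names differ
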
sklearn/calibration.py

Lines changed: 2 additions & 0 deletions

@@ -368,6 +368,8 @@ def fit(self, X, y, sample_weight=None):
         first_clf = self.calibrated_classifiers_[0].base_estimator
         if hasattr(first_clf, "n_features_in_"):
             self.n_features_in_ = first_clf.n_features_in_
+        if hasattr(first_clf, "feature_names_in_"):
+            self.feature_names_in_ = first_clf.feature_names_in_
         return self
 
     def predict_proba(self, X):

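CalibratedClassifierCV (like several other meta-estimators touched below) copies `feature_names_in_` from its fitted base estimator rather than validating `X` itself. A sketch of the intended effect (illustrative only; the dataset and base estimator are not part of the commit):

    import pandas as pd
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    iris = load_iris(as_frame=True)
    X, y = iris.data, iris.target

    clf = CalibratedClassifierCV(LogisticRegression(max_iter=1000), cv=3).fit(X, y)
    # Column names are copied from the first fitted base classifier.
    print(clf.feature_names_in_)
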
sklearn/feature_selection/_from_model.py

Lines changed: 2 additions & 0 deletions

@@ -257,6 +257,8 @@ def fit(self, X, y=None, **fit_params):
             raise NotFittedError("Since 'prefit=True', call transform directly")
         self.estimator_ = clone(self.estimator)
         self.estimator_.fit(X, y, **fit_params)
+        if hasattr(self.estimator_, "feature_names_in_"):
+            self.feature_names_in_ = self.estimator_.feature_names_in_
         return self
 
     @property

sklearn/kernel_approximation.py

Lines changed: 4 additions & 4 deletions

@@ -21,7 +21,7 @@
 
 from .base import BaseEstimator
 from .base import TransformerMixin
-from .utils import check_random_state, as_float_array
+from .utils import check_random_state
 from .utils.extmath import safe_sparse_dot
 from .utils.validation import check_is_fitted
 from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS
@@ -469,9 +469,9 @@ def transform(self, X):
             Returns the instance itself.
         """
         check_is_fitted(self)
-
-        X = as_float_array(X, copy=True)
-        X = self._validate_data(X, copy=False, reset=False)
+        X = self._validate_data(
+            X, copy=True, dtype=[np.float64, np.float32], reset=False
+        )
         if (X <= -self.skewedness).any():
             raise ValueError("X may not contain entries smaller than -skewedness.")

sklearn/linear_model/_ransac.py

Lines changed: 2 additions & 0 deletions

@@ -556,6 +556,7 @@ def predict(self, X):
             Returns predicted values.
         """
         check_is_fitted(self)
+        self._check_feature_names(X, reset=False)
 
         return self.estimator_.predict(X)
 
@@ -578,6 +579,7 @@ def score(self, X, y):
             Score of the prediction.
         """
         check_is_fitted(self)
+        self._check_feature_names(X, reset=False)
 
         return self.estimator_.score(X, y)

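RANSACRegressor.predict and .score delegate straight to the fitted sub-estimator without calling `_validate_data`, so the commit invokes `_check_feature_names` explicitly to keep the name check. A sketch (illustrative data, not part of the commit):

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import RANSACRegressor

    X = pd.DataFrame({"x1": np.arange(20.0), "x2": 2 * np.arange(20.0)})
    y = 3 * X["x1"] + 1

    reg = RANSACRegressor(random_state=0).fit(X, y)
    reg.predict(X)                # silent: column names match those seen in fit
    reg.predict(X[["x2", "x1"]])  # FutureWarning: column order differs from fit
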
sklearn/linear_model/_ridge.py

Lines changed: 2 additions & 0 deletions

@@ -1983,6 +1983,8 @@ def fit(self, X, y, sample_weight=None):
         self.coef_ = estimator.coef_
         self.intercept_ = estimator.intercept_
         self.n_features_in_ = estimator.n_features_in_
+        if hasattr(estimator, "feature_names_in_"):
+            self.feature_names_in_ = estimator.feature_names_in_
 
         return self

sklearn/manifold/_isomap.py

Lines changed: 2 additions & 0 deletions

@@ -172,6 +172,8 @@ def _fit_transform(self, X):
         )
         self.nbrs_.fit(X)
         self.n_features_in_ = self.nbrs_.n_features_in_
+        if hasattr(self.nbrs_, "feature_names_in_"):
+            self.feature_names_in_ = self.nbrs_.feature_names_in_
 
         self.kernel_pca_ = KernelPCA(
             n_components=self.n_components,

sklearn/manifold/_locally_linear.py

Lines changed: 1 addition & 1 deletion

@@ -768,7 +768,7 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X = check_array(X)
+        X = self._validate_data(X, reset=False)
         ind = self.nbrs_.kneighbors(
             X, n_neighbors=self.n_neighbors, return_distance=False
         )

sklearn/neural_network/_rbm.py

Lines changed: 1 addition & 2 deletions

@@ -15,7 +15,6 @@
 
 from ..base import BaseEstimator
 from ..base import TransformerMixin
-from ..utils import check_array
 from ..utils import check_random_state
 from ..utils import gen_even_slices
 from ..utils.extmath import safe_sparse_dot
@@ -333,7 +332,7 @@ def score_samples(self, X):
         """
         check_is_fitted(self)
 
-        v = check_array(X, accept_sparse="csr")
+        v = self._validate_data(X, accept_sparse="csr", reset=False)
         rng = check_random_state(self.random_state)
 
         # Randomly corrupt one feature in each sample in v.

sklearn/tests/test_base.py

Lines changed: 71 additions & 0 deletions

@@ -1,6 +1,7 @@
 # Author: Gael Varoquaux
 # License: BSD 3 clause
 
+import re
 import numpy as np
 import scipy.sparse as sp
 import pytest
@@ -615,3 +616,73 @@ def test_n_features_in_no_validation():
 
     # does not raise
     est._check_n_features("invalid X", reset=False)
+
+
+def test_feature_names_in():
+    """Check that feature_names_in_ is recorded by `_validate_data`."""
+    pd = pytest.importorskip("pandas")
+    iris = datasets.load_iris()
+    X_np = iris.data
+    df = pd.DataFrame(X_np, columns=iris.feature_names)
+
+    class NoOpTransformer(TransformerMixin, BaseEstimator):
+        def fit(self, X, y=None):
+            self._validate_data(X)
+            return self
+
+        def transform(self, X):
+            self._validate_data(X, reset=False)
+            return X
+
+    # fit on dataframe saves the feature names
+    trans = NoOpTransformer().fit(df)
+    assert_array_equal(trans.feature_names_in_, df.columns)
+
+    msg = "The feature names should match those that were passed"
+    df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
+    with pytest.warns(FutureWarning, match=msg):
+        trans.transform(df_bad)
+
+    # warns when fitted on dataframe and transforming a ndarray
+    msg = (
+        "X does not have valid feature names, but NoOpTransformer was "
+        "fitted with feature names"
+    )
+    with pytest.warns(UserWarning, match=msg):
+        trans.transform(X_np)
+
+    # warns when fitted on a ndarray and transforming dataframe
+    msg = "X has feature names, but NoOpTransformer was fitted without feature names"
+    trans = NoOpTransformer().fit(X_np)
+    with pytest.warns(UserWarning, match=msg):
+        trans.transform(df)
+
+    # fit on dataframe with all integer feature names works without warning
+    df_int_names = pd.DataFrame(X_np)
+    trans = NoOpTransformer()
+    with pytest.warns(None) as record:
+        trans.fit(df_int_names)
+    assert not record
+
+    # fit on dataframe with no feature names or all integer feature names
+    # -> do not warn on transform
+    Xs = [X_np, df_int_names]
+    for X in Xs:
+        with pytest.warns(None) as record:
+            trans.transform(X)
+        assert not record
+
+    # TODO: Convert to an error in 1.2
+    # fit on dataframe with mixed feature names warns:
+    df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2])
+    trans = NoOpTransformer()
+    msg = re.escape(
+        "Feature names only support names that are all strings. "
+        "Got feature names with dtypes: ['int', 'str']"
+    )
+    with pytest.warns(FutureWarning, match=msg) as record:
+        trans.fit(df_mixed)
+
+    # transform on mixed feature names also warns:
+    with pytest.warns(FutureWarning, match=msg) as record:
+        trans.transform(df_mixed)

sklearn/tests/test_common.py

Lines changed: 39 additions & 0 deletions

@@ -47,6 +47,7 @@
     _get_check_estimator_ids,
     check_class_weight_balanced_linear_classifier,
     parametrize_with_checks,
+    check_dataframe_column_names_consistency,
    check_n_features_in_after_fitting,
 )
 
@@ -313,3 +314,41 @@ def test_search_cv(estimator, check, request):
 def test_check_n_features_in_after_fitting(estimator):
     _set_checking_parameters(estimator)
     check_n_features_in_after_fitting(estimator.__class__.__name__, estimator)
+
+
+# TODO: As more modules gain support, remove them from this list to make
+# sure they get tested. After we finish each module we can move the checks
+# into check_estimator.
+# NOTE: When running `check_dataframe_column_names_consistency` on a
+# meta-estimator that delegates validation to a base estimator, the check
+# tests that the base estimator is checking for column name consistency.
+
+COLUMN_NAME_MODULES_TO_IGNORE = {
+    "compose",
+    "ensemble",
+    "feature_extraction",
+    "kernel_approximation",
+    "model_selection",
+    "multiclass",
+    "multioutput",
+    "pipeline",
+    "semi_supervised",
+}
+
+
+column_name_estimators = [
+    est
+    for est in _tested_estimators()
+    if est.__module__.split(".")[1] not in COLUMN_NAME_MODULES_TO_IGNORE
+]
+
+
+@pytest.mark.parametrize(
+    "estimator", column_name_estimators, ids=_get_check_estimator_ids
+)
+def test_pandas_column_name_consistency(estimator):
+    _set_checking_parameters(estimator)
+    with ignore_warnings(category=(FutureWarning)):
+        check_dataframe_column_names_consistency(
+            estimator.__class__.__name__, estimator
+        )

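The new common check can also be run on a single estimator. A hypothetical standalone invocation, assuming `check_dataframe_column_names_consistency` is importable from `sklearn.utils.estimator_checks` as the import block above suggests:

    from sklearn.linear_model import LogisticRegression
    from sklearn.utils.estimator_checks import check_dataframe_column_names_consistency

    # Exercises the column-name consistency scenarios against a fresh estimator.
    check_dataframe_column_names_consistency("LogisticRegression", LogisticRegression())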