automl
diff --git a/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py‎
Lines changed: 31 additions & 18 deletions b/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py‎
Lines changed: 31 additions & 18 deletions
diff --git a/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py‎
Lines changed: 30 additions & 18 deletions b/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py‎
Lines changed: 30 additions & 18 deletions
diff --git a/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py‎
Lines changed: 20 additions & 10 deletions b/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py‎
Lines changed: 20 additions & 10 deletions
diff --git a/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py‎
Lines changed: 17 additions & 22 deletions b/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py‎
Lines changed: 17 additions & 22 deletions
diff --git a/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py‎
Lines changed: 12 additions & 28 deletions b/‎autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py‎
Lines changed: 12 additions & 28 deletions
@@ -19,6 +19,9 @@
 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none
 
 
+CRITERION_CHOICES = ("gini", "entropy")
+
+
 class ExtraTreesPreprocessorClassification(autoPyTorchFeaturePreprocessingComponent):
     """
     Selects features based on importance weights calculated using extra trees
@@ -33,9 +36,9 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
                  random_state: Optional[np.random.RandomState] = None):
         self.bootstrap = bootstrap
         self.n_estimators = n_estimators
-        if criterion not in ("gini", "entropy"):
-            raise ValueError("'criterion' is not in ('gini', 'entropy'): "
-                             "%s" % criterion)
+        if criterion not in CRITERION_CHOICES:
+            raise ValueError(f"`criterion` of {self.__class__.__name__} "
+                             f"must be in {CRITERION_CHOICES}, but got: {criterion}")
         self.criterion = criterion
         self.max_features = max_features
         self.min_impurity_decrease = min_impurity_decrease
@@ -49,6 +52,29 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
 
         super().__init__(random_state=random_state)
 
+    def get_components_kwargs(self) -> Dict[str, Any]:
+        """
+        Gather the keyword arguments that `fit` unpacks into `ExtraTreesClassifier`.
+
+        Returns:
+            Dict[str, Any]: estimator argument names mapped to the values stored on this component
+        """
+        return dict(
+            bootstrap=self.bootstrap,
+            n_estimators=self.n_estimators,
+            criterion=self.criterion,
+            max_features=self.max_features,
+            min_impurity_decrease=self.min_impurity_decrease,
+            max_depth=self.max_depth,
+            min_samples_split=self.min_samples_split,
+            min_samples_leaf=self.min_samples_leaf,
+            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
+            max_leaf_nodes=self.max_leaf_nodes,
+            oob_score=self.oob_score,
+            verbose=self.verbose,
+            random_state=self.random_state,
+        )
+
     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
 
         if check_none(self.max_leaf_nodes):
@@ -68,20 +94,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
                              f"in ('None', 'none', None) or an integer, got {self.max_depth}")
 
         # TODO: add class_weights
-        estimator = ExtraTreesClassifier(
-            n_estimators=self.n_estimators,
-            criterion=self.criterion,
-            max_depth=self.max_depth,
-            min_samples_split=self.min_samples_split,
-            min_samples_leaf=self.min_samples_leaf,
-            bootstrap=self.bootstrap,
-            max_features=self.max_features,
-            max_leaf_nodes=self.max_leaf_nodes,
-            min_impurity_decrease=self.min_impurity_decrease,
-            oob_score=self.oob_score,
-            verbose=self.verbose,
-            random_state=self.random_state,
-        )
+        estimator = ExtraTreesClassifier(**self.get_components_kwargs())
 
         self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
                                                          threshold='mean',
@@ -112,7 +125,7 @@ def get_hyperparameter_search_space(
             value_range=(0,),
             default_value=0),
         criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
-                                                                         value_range=("gini", "entropy"),
+                                                                         value_range=CRITERION_CHOICES,
                                                                          default_value="gini",
                                                                          ),
         min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
 
@@ -19,6 +19,9 @@
 from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, check_none
 
 
+CRITERION_CHOICES = ('mse', 'friedman_mse', 'mae')
+
+
 class ExtraTreesPreprocessorRegression(autoPyTorchFeaturePreprocessingComponent):
     """
     Selects features based on importance weights using extra trees
@@ -32,9 +35,9 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
                  random_state: Optional[np.random.RandomState] = None):
         self.bootstrap = bootstrap
         self.n_estimators = n_estimators
-        if criterion not in ('mse', 'friedman_mse', 'mae'):
-            raise ValueError("'criterion' is not in ('mse', 'friedman_mse', 'mae'): "
-                             "%s" % criterion)
+        if criterion not in CRITERION_CHOICES:
+            raise ValueError(f"`criterion` of {self.__class__.__name__} "
+                             f"must be in {CRITERION_CHOICES}, but got: {criterion}")
         self.criterion = criterion
         self.max_features = max_features
         self.max_depth = max_depth
@@ -50,6 +53,28 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
         self.add_fit_requirements([
             FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
 
+    def get_components_kwargs(self) -> Dict[str, Any]:
+        """
+        Gather the keyword arguments that `fit` unpacks into `ExtraTreesRegressor`.
+
+        Returns:
+            Dict[str, Any]: estimator argument names mapped to the values stored on this component
+        """
+        return dict(
+            bootstrap=self.bootstrap,
+            n_estimators=self.n_estimators,
+            criterion=self.criterion,
+            max_features=self.max_features,
+            max_depth=self.max_depth,
+            min_samples_split=self.min_samples_split,
+            min_samples_leaf=self.min_samples_leaf,
+            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
+            max_leaf_nodes=self.max_leaf_nodes,
+            oob_score=self.oob_score,
+            verbose=self.verbose,
+            random_state=self.random_state,
+        )
+
     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
 
         self.check_requirements(X, y)
@@ -77,20 +102,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
         max_features = max(1, min(int(num_features / 2), max_features))
 
         # TODO: add class_weights
-        estimator = ExtraTreesRegressor(
-            n_estimators=self.n_estimators,
-            criterion=self.criterion,
-            max_depth=self.max_depth,
-            min_samples_split=self.min_samples_split,
-            min_samples_leaf=self.min_samples_leaf,
-            bootstrap=self.bootstrap,
-            max_features=self.max_features,
-            max_leaf_nodes=self.max_leaf_nodes,
-            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
-            oob_score=self.oob_score,
-            verbose=self.verbose,
-            random_state=self.random_state,
-        )
+        estimator = ExtraTreesRegressor(**self.get_components_kwargs())
 
         self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
                                                          threshold='mean',
@@ -117,7 +129,7 @@ def get_hyperparameter_search_space(
                                                                             default_value=1,
                                                                             ),
         criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
-                                                                         value_range=('mse', 'friedman_mse', 'mae'),
+                                                                         value_range=CRITERION_CHOICES,
                                                                          default_value="mse",
                                                                          ),
         min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
 
@@ -45,19 +45,29 @@ def __init__(self, dual: bool = False, penalty: str = "l1",
 
         super().__init__(random_state=random_state)
 
+    def get_components_kwargs(self) -> Dict[str, Any]:
+        """
+        Gather the keyword arguments that `fit` unpacks into `LinearSVC`.
+
+        Returns:
+            Dict[str, Any]: estimator argument names mapped to the values stored on this component
+        """
+        return dict(
+            dual=self.dual,
+            penalty=self.penalty,
+            loss=self.loss,
+            multi_class=self.multi_class,
+            intercept_scaling=self.intercept_scaling,
+            tol=self.tol,
+            fit_intercept=self.fit_intercept,
+            C=self.C,
+            random_state=self.random_state
+        )
+
     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
 
         # TODO: add class_weights
-        estimator = LinearSVC(penalty=self.penalty,
-                              loss=self.loss,
-                              dual=self.dual,
-                              tol=self.tol,
-                              C=self.C,
-                              fit_intercept=self.fit_intercept,
-                              intercept_scaling=self.intercept_scaling,
-                              multi_class=self.multi_class,
-                              random_state=self.random_state
-                              )
+        estimator = LinearSVC(**self.get_components_kwargs())
 
         self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
                                                          threshold='mean',
 
@@ -2,8 +2,6 @@
 
 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
-    CategoricalHyperparameter,
-    UniformFloatHyperparameter,
     UniformIntegerHyperparameter
 )
 
@@ -19,30 +17,29 @@
 
 
 class RandomTreesEmbedding(autoPyTorchFeaturePreprocessingComponent):
-    def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
+    def __init__(self, n_estimators: int = 10,
                  max_depth: Optional[Union[str, int]] = 5, min_samples_split: int = 2,
-                 min_samples_leaf: int = 1, min_weight_fraction_leaf: bool = True,
+                 min_samples_leaf: int = 1,
                  max_leaf_nodes: Optional[Union[str, int]] = "none",
                  sparse_output: bool = False,
                  random_state: Optional[np.random.RandomState] = None):
-        self.bootstrap = bootstrap
         self.n_estimators = n_estimators
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
-        self.min_weight_fraction_leaf = min_weight_fraction_leaf
         self.max_leaf_nodes = max_leaf_nodes
         self.sparse_output = sparse_output
 
         super().__init__(random_state=random_state)
 
-    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
-        if check_none(self.max_leaf_nodes):
-            self.max_leaf_nodes = None
-        if check_none(self.max_depth):
-            self.max_depth = None
+    def get_components_kwargs(self) -> Dict[str, Any]:
+        """
+        returns keyword arguments required by the feature preprocessor
 
-        self.preprocessor['numerical'] = SklearnRandomTreesEmbedding(
+        Returns:
+            Dict[str, Any]: kwargs
+        """
+        return dict(
             n_estimators=self.n_estimators,
             max_depth=self.max_depth,
             min_samples_split=self.min_samples_split,
@@ -51,6 +48,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
             sparse_output=self.sparse_output,
             random_state=self.random_state
         )
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+        if check_none(self.max_leaf_nodes):  # presumably true for None / 'none'-style placeholders; confirm
+            self.max_leaf_nodes = None  # rewritten on the instance before the kwargs are collected below
+        if check_none(self.max_depth):
+            self.max_depth = None  # get_components_kwargs() reads self.max_depth, so this must come first
+
+        self.preprocessor['numerical'] = SklearnRandomTreesEmbedding(**self.get_components_kwargs())
         return self
 
     @staticmethod
@@ -65,10 +70,6 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
-                                                                         value_range=(True, False),
-                                                                         default_value=True,
-                                                                         ),
         n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
                                                                             value_range=(10, 100),
                                                                             default_value=10,
@@ -85,23 +86,17 @@ def get_hyperparameter_search_space(
                                                                                 value_range=(1, 20),
                                                                                 default_value=1,
                                                                                 ),
-        min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
-            hyperparameter='min_weight_fraction_leaf',
-            value_range=(1.0,),
-            default_value=1.0),
         max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
                                                                               value_range=("none",),
                                                                               default_value="none",
                                                                               ),
     ) -> ConfigurationSpace:
 
         cs = ConfigurationSpace()
-        add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
         add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
         add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
         add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
         add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
-        add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
         add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)
 
         return cs
@@ -1,4 +1,3 @@
-import warnings
 from functools import partial
 from typing import Any, Dict, Optional
 
@@ -16,13 +15,18 @@
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
     base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.utils \
+    import filter_score_func_choices
 from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter
 
 
+SCORE_FUNC_CHOICES = ("chi2", "mutual_info_classif", "f_classif")
+
+
 class SelectPercentileClassification(autoPyTorchFeaturePreprocessingComponent):
     """
     Select features according to a percentile of the highest scores.
-    Scores are calculated using one of 'chi2, 'f_classif', 'mutual_info_classif'
+    Scores are calculated using one of SCORE_FUNC_CHOICES
     """
     def __init__(self, score_func: str = "chi2",
                  percentile: int = 50,
@@ -36,8 +40,8 @@ def __init__(self, score_func: str = "chi2",
         elif score_func == "mutual_info_classif":
             self.score_func = partial(mutual_info_classif, random_state=random_state)
         else:
-            raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info_classif'), "
-                             "but is: %s" % score_func)
+            raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, "
+                             f"but is: {score_func}")
 
         super().__init__(random_state=random_state)
         self.add_fit_requirements([
@@ -60,33 +64,13 @@ def get_hyperparameter_search_space(
                                                                           default_value=50,
                                                                           ),
         score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func",
-                                                                          value_range=("chi2",
-                                                                                       "f_classif",
-                                                                                       "mutual_info_classif"),
+                                                                          value_range=SCORE_FUNC_CHOICES,
                                                                           default_value="chi2",
                                                                           ),
     ) -> ConfigurationSpace:
-        value_range = list(score_func.value_range)
-        if dataset_properties is not None:
-            if (
-                dataset_properties.get("issigned") is True
-            ):
-                value_range = [value for value in value_range if value not in ("chi2", "mutual_info_classif")]
-            if dataset_properties.get("issparse") is True:
-                value_range = [value for value in value_range if value != "f_classif"]
-
-        if value_range != list(score_func.value_range):
-            warnings.warn(f"Given choices for `score_func` are not compatible with the dataset. "
-                          f"Updating choices to {value_range}")
-
-        if len(value_range) == 0:
-            raise TypeError("`SelectPercentileClassification` is not compatible with the"
-                            " current dataset as it is both `signed` and `sparse`")
-        default_value = score_func.default_value if score_func.default_value in value_range else value_range[-1]
-        score_func = HyperparameterSearchSpace(hyperparameter="score_func",
-                                               value_range=value_range,
-                                               default_value=default_value,
-                                               )
+        score_func = filter_score_func_choices(class_name="SelectPercentileClassification",
+                                               dataset_properties=dataset_properties,
+                                               score_func=score_func)
         cs = ConfigurationSpace()
 
         add_hyperparameter(cs, score_func, CategoricalHyperparameter)