Skip to content

Commit c1d43c6

Browse files
committed
address comments from shuhei
1 parent 9c846bf commit c1d43c6

File tree

10 files changed

+190
-133
lines changed

10 files changed

+190
-133
lines changed

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none
2020

2121

22+
CRITERION_CHOICES = ("gini", "entropy")
23+
24+
2225
class ExtraTreesPreprocessorClassification(autoPyTorchFeaturePreprocessingComponent):
2326
"""
2427
Selects features based on importance weights calculated using extra trees
@@ -33,9 +36,9 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
3336
random_state: Optional[np.random.RandomState] = None):
3437
self.bootstrap = bootstrap
3538
self.n_estimators = n_estimators
36-
if criterion not in ("gini", "entropy"):
37-
raise ValueError("'criterion' is not in ('gini', 'entropy'): "
38-
"%s" % criterion)
39+
if criterion not in CRITERION_CHOICES:
40+
raise ValueError(f"`criterion` of {self.__class__.__name__} "
41+
f"must be in {CRITERION_CHOICES}, but got: {criterion}")
3942
self.criterion = criterion
4043
self.max_features = max_features
4144
self.min_impurity_decrease = min_impurity_decrease
@@ -49,6 +52,29 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
4952

5053
super().__init__(random_state=random_state)
5154

55+
def get_components_kwargs(self) -> Dict[str, Any]:
56+
"""
57+
returns keyword arguments required by the feature preprocessor
58+
59+
Returns:
60+
Dict[str, Any]: kwargs
61+
"""
62+
return dict(
63+
bootstrap=self.bootstrap,
64+
n_estimators=self.n_estimators,
65+
criterion=self.criterion,
66+
max_features=self.max_features,
67+
min_impurity_decrease=self.min_impurity_decrease,
68+
max_depth=self.max_depth,
69+
min_samples_split=self.min_samples_split,
70+
min_samples_leaf=self.min_samples_leaf,
71+
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
72+
max_leaf_nodes=self.max_leaf_nodes,
73+
oob_score=self.oob_score,
74+
verbose=self.verbose,
75+
random_state=self.random_state,
76+
)
77+
5278
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
5379

5480
if check_none(self.max_leaf_nodes):
@@ -68,20 +94,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
6894
f"in ('None', 'none', None) or an integer, got {self.max_depth}")
6995

7096
# TODO: add class_weights
71-
estimator = ExtraTreesClassifier(
72-
n_estimators=self.n_estimators,
73-
criterion=self.criterion,
74-
max_depth=self.max_depth,
75-
min_samples_split=self.min_samples_split,
76-
min_samples_leaf=self.min_samples_leaf,
77-
bootstrap=self.bootstrap,
78-
max_features=self.max_features,
79-
max_leaf_nodes=self.max_leaf_nodes,
80-
min_impurity_decrease=self.min_impurity_decrease,
81-
oob_score=self.oob_score,
82-
verbose=self.verbose,
83-
random_state=self.random_state,
84-
)
97+
estimator = ExtraTreesClassifier(**self.get_components_kwargs())
8598

8699
self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
87100
threshold='mean',
@@ -112,7 +125,7 @@ def get_hyperparameter_search_space(
112125
value_range=(0,),
113126
default_value=0),
114127
criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
115-
value_range=("gini", "entropy"),
128+
value_range=CRITERION_CHOICES,
116129
default_value="gini",
117130
),
118131
min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, check_none
2020

2121

22+
CRITERION_CHOICES = ('mse', 'friedman_mse', 'mae')
23+
24+
2225
class ExtraTreesPreprocessorRegression(autoPyTorchFeaturePreprocessingComponent):
2326
"""
2427
Selects features based on importance weights using extra trees
@@ -32,9 +35,9 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
3235
random_state: Optional[np.random.RandomState] = None):
3336
self.bootstrap = bootstrap
3437
self.n_estimators = n_estimators
35-
if criterion not in ('mse', 'friedman_mse', 'mae'):
36-
raise ValueError("'criterion' is not in ('mse', 'friedman_mse', 'mae'): "
37-
"%s" % criterion)
38+
if criterion not in CRITERION_CHOICES:
39+
raise ValueError(f"`criterion` of {self.__class__.__name__} "
40+
f"must be in {CRITERION_CHOICES}, but got: {criterion}")
3841
self.criterion = criterion
3942
self.max_features = max_features
4043
self.max_depth = max_depth
@@ -50,6 +53,28 @@ def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
5053
self.add_fit_requirements([
5154
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
5255

56+
def get_components_kwargs(self) -> Dict[str, Any]:
57+
"""
58+
returns keyword arguments required by the feature preprocessor
59+
60+
Returns:
61+
Dict[str, Any]: kwargs
62+
"""
63+
return dict(
64+
bootstrap=self.bootstrap,
65+
n_estimators=self.n_estimators,
66+
criterion=self.criterion,
67+
max_features=self.max_features,
68+
max_depth=self.max_depth,
69+
min_samples_split=self.min_samples_split,
70+
min_samples_leaf=self.min_samples_leaf,
71+
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
72+
max_leaf_nodes=self.max_leaf_nodes,
73+
oob_score=self.oob_score,
74+
verbose=self.verbose,
75+
random_state=self.random_state,
76+
)
77+
5378
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
5479

5580
self.check_requirements(X, y)
@@ -77,20 +102,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
77102
max_features = max(1, min(int(num_features / 2), max_features))
78103

79104
# TODO: add class_weights
80-
estimator = ExtraTreesRegressor(
81-
n_estimators=self.n_estimators,
82-
criterion=self.criterion,
83-
max_depth=self.max_depth,
84-
min_samples_split=self.min_samples_split,
85-
min_samples_leaf=self.min_samples_leaf,
86-
bootstrap=self.bootstrap,
87-
max_features=self.max_features,
88-
max_leaf_nodes=self.max_leaf_nodes,
89-
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
90-
oob_score=self.oob_score,
91-
verbose=self.verbose,
92-
random_state=self.random_state,
93-
)
105+
estimator = ExtraTreesRegressor(**self.get_components_kwargs())
94106

95107
self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
96108
threshold='mean',
@@ -117,7 +129,7 @@ def get_hyperparameter_search_space(
117129
default_value=1,
118130
),
119131
criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
120-
value_range=('mse', 'friedman_mse', 'mae'),
132+
value_range=CRITERION_CHOICES,
121133
default_value="mse",
122134
),
123135
min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,29 @@ def __init__(self, dual: bool = False, penalty: str = "l1",
4545

4646
super().__init__(random_state=random_state)
4747

48+
def get_components_kwargs(self) -> Dict[str, Any]:
49+
"""
50+
returns keyword arguments required by the feature preprocessor
51+
52+
Returns:
53+
Dict[str, Any]: kwargs
54+
"""
55+
return dict(
56+
dual=self.dual,
57+
penalty=self.penalty,
58+
loss=self.loss,
59+
multi_class=self.multi_class,
60+
intercept_scaling=self.intercept_scaling,
61+
tol=self.tol,
62+
fit_intercept=self.fit_intercept,
63+
C=self.C,
64+
random_state=self.random_state
65+
)
66+
4867
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
4968

5069
# TODO: add class_weights
51-
estimator = LinearSVC(penalty=self.penalty,
52-
loss=self.loss,
53-
dual=self.dual,
54-
tol=self.tol,
55-
C=self.C,
56-
fit_intercept=self.fit_intercept,
57-
intercept_scaling=self.intercept_scaling,
58-
multi_class=self.multi_class,
59-
random_state=self.random_state
60-
)
70+
estimator = LinearSVC(**self.get_components_kwargs())
6171

6272
self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
6373
threshold='mean',

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22

33
from ConfigSpace.configuration_space import ConfigurationSpace
44
from ConfigSpace.hyperparameters import (
5-
CategoricalHyperparameter,
6-
UniformFloatHyperparameter,
75
UniformIntegerHyperparameter
86
)
97

@@ -19,30 +17,29 @@
1917

2018

2119
class RandomTreesEmbedding(autoPyTorchFeaturePreprocessingComponent):
22-
def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
20+
def __init__(self, n_estimators: int = 10,
2321
max_depth: Optional[Union[str, int]] = 5, min_samples_split: int = 2,
24-
min_samples_leaf: int = 1, min_weight_fraction_leaf: bool = True,
22+
min_samples_leaf: int = 1,
2523
max_leaf_nodes: Optional[Union[str, int]] = "none",
2624
sparse_output: bool = False,
2725
random_state: Optional[np.random.RandomState] = None):
28-
self.bootstrap = bootstrap
2926
self.n_estimators = n_estimators
3027
self.max_depth = max_depth
3128
self.min_samples_split = min_samples_split
3229
self.min_samples_leaf = min_samples_leaf
33-
self.min_weight_fraction_leaf = min_weight_fraction_leaf
3430
self.max_leaf_nodes = max_leaf_nodes
3531
self.sparse_output = sparse_output
3632

3733
super().__init__(random_state=random_state)
3834

39-
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
40-
if check_none(self.max_leaf_nodes):
41-
self.max_leaf_nodes = None
42-
if check_none(self.max_depth):
43-
self.max_depth = None
35+
def get_components_kwargs(self) -> Dict[str, Any]:
36+
"""
37+
returns keyword arguments required by the feature preprocessor
4438
45-
self.preprocessor['numerical'] = SklearnRandomTreesEmbedding(
39+
Returns:
40+
Dict[str, Any]: kwargs
41+
"""
42+
return dict(
4643
n_estimators=self.n_estimators,
4744
max_depth=self.max_depth,
4845
min_samples_split=self.min_samples_split,
@@ -51,6 +48,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
5148
sparse_output=self.sparse_output,
5249
random_state=self.random_state
5350
)
51+
52+
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
53+
if check_none(self.max_leaf_nodes):
54+
self.max_leaf_nodes = None
55+
if check_none(self.max_depth):
56+
self.max_depth = None
57+
58+
self.preprocessor['numerical'] = SklearnRandomTreesEmbedding(**self.get_components_kwargs())
5459
return self
5560

5661
@staticmethod
@@ -65,10 +70,6 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT
6570
@staticmethod
6671
def get_hyperparameter_search_space(
6772
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
68-
bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
69-
value_range=(True, False),
70-
default_value=True,
71-
),
7273
n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
7374
value_range=(10, 100),
7475
default_value=10,
@@ -85,23 +86,17 @@ def get_hyperparameter_search_space(
8586
value_range=(1, 20),
8687
default_value=1,
8788
),
88-
min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
89-
hyperparameter='min_weight_fraction_leaf',
90-
value_range=(1.0,),
91-
default_value=1.0),
9289
max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
9390
value_range=("none",),
9491
default_value="none",
9592
),
9693
) -> ConfigurationSpace:
9794

9895
cs = ConfigurationSpace()
99-
add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
10096
add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
10197
add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
10298
add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
10399
add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
104-
add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
105100
add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)
106101

107102
return cs

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import warnings
21
from functools import partial
32
from typing import Any, Dict, Optional
43

@@ -16,13 +15,18 @@
1615
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
1716
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
1817
base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
18+
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.utils \
19+
import filter_score_func_choices
1920
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter
2021

2122

23+
SCORE_FUNC_CHOICES = ("chi2", "mutual_info_classif", "f_classif")
24+
25+
2226
class SelectPercentileClassification(autoPyTorchFeaturePreprocessingComponent):
2327
"""
2428
Select features according to a percentile of the highest scores.
25-
Scores are calculated using one of 'chi2, 'f_classif', 'mutual_info_classif'
29+
Scores are calculated using one of SCORE_FUNC_CHOICES
2630
"""
2731
def __init__(self, score_func: str = "chi2",
2832
percentile: int = 50,
@@ -36,8 +40,8 @@ def __init__(self, score_func: str = "chi2",
3640
elif score_func == "mutual_info_classif":
3741
self.score_func = partial(mutual_info_classif, random_state=random_state)
3842
else:
39-
raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info_classif'), "
40-
"but is: %s" % score_func)
43+
raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, "
44+
f"but is: {score_func}")
4145

4246
super().__init__(random_state=random_state)
4347
self.add_fit_requirements([
@@ -60,33 +64,13 @@ def get_hyperparameter_search_space(
6064
default_value=50,
6165
),
6266
score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func",
63-
value_range=("chi2",
64-
"f_classif",
65-
"mutual_info_classif"),
67+
value_range=SCORE_FUNC_CHOICES,
6668
default_value="chi2",
6769
),
6870
) -> ConfigurationSpace:
69-
value_range = list(score_func.value_range)
70-
if dataset_properties is not None:
71-
if (
72-
dataset_properties.get("issigned") is True
73-
):
74-
value_range = [value for value in value_range if value not in ("chi2", "mutual_info_classif")]
75-
if dataset_properties.get("issparse") is True:
76-
value_range = [value for value in value_range if value != "f_classif"]
77-
78-
if value_range != list(score_func.value_range):
79-
warnings.warn(f"Given choices for `score_func` are not compatible with the dataset. "
80-
f"Updating choices to {value_range}")
81-
82-
if len(value_range) == 0:
83-
raise TypeError("`SelectPercentileClassification` is not compatible with the"
84-
" current dataset as it is both `signed` and `sparse`")
85-
default_value = score_func.default_value if score_func.default_value in value_range else value_range[-1]
86-
score_func = HyperparameterSearchSpace(hyperparameter="score_func",
87-
value_range=value_range,
88-
default_value=default_value,
89-
)
71+
score_func = filter_score_func_choices(class_name="SelectPercentileClassification",
72+
dataset_properties=dataset_properties,
73+
score_func=score_func)
9074
cs = ConfigurationSpace()
9175

9276
add_hyperparameter(cs, score_func, CategoricalHyperparameter)

0 commit comments

Comments
 (0)