diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 4c4773700e..525004269e 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -26,9 +26,8 @@ jobs: run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" id: extract_branch - # Updating this to @v2 requires a github auth token - name: Push to GitHub Packages - uses: docker/build-push-action@v1 + uses: docker/build-push-action@v3 with: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} @@ -37,12 +36,11 @@ jobs: tag_with_ref: true tags: ${{ steps.extract_branch.outputs.branch }} - # Updating this to @v2 requires a github auth token - name: Push to Docker Hub - uses: docker/build-push-action@v1 + uses: docker/build-push-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} + password: ${{ secrets.DOCKER_TOKEN }} repository: mfeurer/auto-sklearn tags: ${{ steps.extract_branch.outputs.branch }} diff --git a/autosklearn/askl_typing.py b/autosklearn/askl_typing.py new file mode 100644 index 0000000000..61d01bef30 --- /dev/null +++ b/autosklearn/askl_typing.py @@ -0,0 +1,3 @@ +from typing import Dict, Union + +FEAT_TYPE_TYPE = Dict[Union[str, int], str] diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 278cd5c146..2a62e8a97c 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -21,7 +21,6 @@ import os import platform import sys -import tempfile import time import types import uuid @@ -37,7 +36,7 @@ import sklearn.utils from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from ConfigSpace.read_and_write import json as cs_json -from dask.distributed import Client, LocalCluster +from dask.distributed import Client from scipy.sparse import spmatrix from sklearn.base import BaseEstimator from sklearn.dummy import DummyClassifier, DummyRegressor @@ -105,6 +104,7 @@ from autosklearn.pipeline.components.regression import RegressorChoice from autosklearn.smbo import AutoMLSMBO from autosklearn.util import RE_PATTERN, pipeline +from autosklearn.util.dask import Dask, LocalDask, UserDask from autosklearn.util.data import ( DatasetCompressionSpec, default_dataset_compression_arg, @@ -120,7 +120,6 @@ warnings_to, ) from autosklearn.util.parallel import preload_modules -from autosklearn.util.single_thread_client import SingleThreadedClient from autosklearn.util.smac_wrap import SMACCallback, SmacRunCallback from autosklearn.util.stopwatch import StopWatch @@ -299,21 +298,22 @@ def __init__( self._initial_configurations_via_metalearning = ( initial_configurations_via_metalearning ) + self._n_jobs = n_jobs self._scoring_functions = scoring_functions or [] self._resampling_strategy_arguments = resampling_strategy_arguments or {} + self._multiprocessing_context = "forkserver" # Single core, local runs should use fork to prevent the __main__ requirements # in examples. 
Nevertheless, multi-process runs have spawn as requirement to # reduce the possibility of a deadlock - if n_jobs == 1 and dask_client is None: - self._multiprocessing_context = "fork" - self._dask_client = SingleThreadedClient() - self._n_jobs = 1 + self._dask: Dask + if dask_client is not None: + self._dask = UserDask(client=dask_client) else: - self._multiprocessing_context = "forkserver" - self._dask_client = dask_client - self._n_jobs = n_jobs + self._dask = LocalDask(n_jobs=n_jobs) + if n_jobs == 1: + self._multiprocessing_context = "fork" # Create the backend self._backend: Backend = create( @@ -350,38 +350,6 @@ def __init__( self.num_run = 0 self.fitted = False - def _create_dask_client(self) -> None: - self._is_dask_client_internally_created = True - self._dask_client = Client( - LocalCluster( - n_workers=self._n_jobs, - processes=False, - threads_per_worker=1, - # We use the temporal directory to save the - # dask workers, because deleting workers takes - # more time than deleting backend directories - # This prevent an error saying that the worker - # file was deleted, so the client could not close - # the worker properly - local_directory=tempfile.gettempdir(), - # Memory is handled by the pynisher, not by the dask worker/nanny - memory_limit=0, - ), - # Heartbeat every 10s - heartbeat_interval=10000, - ) - - def _close_dask_client(self, force: bool = False) -> None: - if getattr(self, "_dask_client", None) is not None and ( - force or getattr(self, "_is_dask_client_internally_created", False) - ): - self._dask_client.shutdown() - self._dask_client.close() - del self._dask_client - self._dask_client = None - self._is_dask_client_internally_created = False - del self._is_dask_client_internally_created - def _get_logger(self, name: str) -> PicklableClientLogger: logger_name = "AutoML(%d):%s" % (self._seed, name) @@ -747,17 +715,6 @@ def fit( "autosklearn.metrics.Scorer." 
) - # If no dask client was provided, we create one, so that we can - # start a ensemble process in parallel to smbo optimize - if self._dask_client is None and ( - self._ensemble_class is not None - or self._n_jobs is not None - and self._n_jobs > 1 - ): - self._create_dask_client() - else: - self._is_dask_client_internally_created = False - self._dataset_name = dataset_name self._stopwatch.start(self._dataset_name) @@ -902,70 +859,85 @@ def fit( ) n_meta_configs = self._initial_configurations_via_metalearning - _proc_smac = AutoMLSMBO( - config_space=self.configuration_space, - dataset_name=self._dataset_name, - backend=self._backend, - total_walltime_limit=time_left, - func_eval_time_limit=per_run_time_limit, - memory_limit=self._memory_limit, - data_memory_limit=self._data_memory_limit, - stopwatch=self._stopwatch, - n_jobs=self._n_jobs, - dask_client=self._dask_client, - start_num_run=self.num_run, - num_metalearning_cfgs=n_meta_configs, - config_file=configspace_path, - seed=self._seed, - metadata_directory=self._metadata_directory, - metrics=self._metrics, - resampling_strategy=self._resampling_strategy, - resampling_strategy_args=self._resampling_strategy_arguments, - include=self._include, - exclude=self._exclude, - disable_file_output=self._disable_evaluator_output, - get_smac_object_callback=self._get_smac_object_callback, - smac_scenario_args=self._smac_scenario_args, - scoring_functions=self._scoring_functions, - port=self._logger_port, - pynisher_context=self._multiprocessing_context, - ensemble_callback=proc_ensemble, - trials_callback=self._get_trials_callback, - ) + with self._dask as dask_client: + resamp_args = self._resampling_strategy_arguments + _proc_smac = AutoMLSMBO( + config_space=self.configuration_space, + dataset_name=self._dataset_name, + backend=self._backend, + total_walltime_limit=time_left, + func_eval_time_limit=per_run_time_limit, + memory_limit=self._memory_limit, + data_memory_limit=self._data_memory_limit, + stopwatch=self._stopwatch, + n_jobs=self._n_jobs, + dask_client=dask_client, + start_num_run=self.num_run, + num_metalearning_cfgs=n_meta_configs, + config_file=configspace_path, + seed=self._seed, + metadata_directory=self._metadata_directory, + metrics=self._metrics, + resampling_strategy=self._resampling_strategy, + resampling_strategy_args=resamp_args, + include=self._include, + exclude=self._exclude, + disable_file_output=self._disable_evaluator_output, + get_smac_object_callback=self._get_smac_object_callback, + smac_scenario_args=self._smac_scenario_args, + scoring_functions=self._scoring_functions, + port=self._logger_port, + pynisher_context=self._multiprocessing_context, + ensemble_callback=proc_ensemble, + trials_callback=self._get_trials_callback, + ) - ( - self.runhistory_, - self.trajectory_, - self._budget_type, - ) = _proc_smac.run_smbo() - trajectory_filename = os.path.join( - self._backend.get_smac_output_directory_for_run(self._seed), - "trajectory.json", - ) - saveable_trajectory = [ - list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) - for entry in self.trajectory_ - ] - with open(trajectory_filename, "w") as fh: - json.dump(saveable_trajectory, fh) - - self._logger.info("Starting shutdown...") - # Wait until the ensemble process is finished to avoid shutting down - # while the ensemble builder tries to access the data - if proc_ensemble is not None: - self.ensemble_performance_history = list(proc_ensemble.history) - - if len(proc_ensemble.futures) > 0: - # Now we wait for the future to return as it cannot be 
cancelled - # while it is running: https://stackoverflow.com/a/49203129 - self._logger.info( - "Ensemble script still running, waiting for it to finish." - ) - result = proc_ensemble.futures.pop().result() - if result: - ensemble_history, _ = result - self.ensemble_performance_history.extend(ensemble_history) - self._logger.info("Ensemble script finished, continue shutdown.") + ( + self.runhistory_, + self.trajectory_, + self._budget_type, + ) = _proc_smac.run_smbo() + + trajectory_filename = os.path.join( + self._backend.get_smac_output_directory_for_run(self._seed), + "trajectory.json", + ) + saveable_trajectory = [ + list(entry[:2]) + + [entry[2].get_dictionary()] + + list(entry[3:]) + for entry in self.trajectory_ + ] + with open(trajectory_filename, "w") as fh: + json.dump(saveable_trajectory, fh) + + self._logger.info("Starting shutdown...") + # Wait until the ensemble process is finished to avoid shutting + # down while the ensemble builder tries to access the data + if proc_ensemble is not None: + self.ensemble_performance_history = list( + proc_ensemble.history + ) + + if len(proc_ensemble.futures) > 0: + # Now we wait for the future to return as it cannot be + # cancelled while it is running + # * https://stackoverflow.com/a/49203129 + self._logger.info( + "Ensemble script still running," + " waiting for it to finish." + ) + result = proc_ensemble.futures.pop().result() + + if result: + ensemble_history, _ = result + self.ensemble_performance_history.extend( + ensemble_history + ) + + self._logger.info( + "Ensemble script finished, continue shutdown." + ) # save the ensemble performance history file if len(self.ensemble_performance_history) > 0: @@ -1054,7 +1026,7 @@ def _log_fit_setup(self) -> None: self._logger.debug( " multiprocessing_context: %s", str(self._multiprocessing_context) ) - self._logger.debug(" dask_client: %s", str(self._dask_client)) + self._logger.debug(" dask_client: %s", str(self._dask)) self._logger.debug(" precision: %s", str(self.precision)) self._logger.debug( " disable_evaluator_output: %s", str(self._disable_evaluator_output) @@ -1090,7 +1062,6 @@ def __sklearn_is_fitted__(self) -> bool: def _fit_cleanup(self) -> None: self._logger.info("Closing the dask infrastructure") - self._close_dask_client() self._logger.info("Finished closing the dask infrastructure") # Clean up the logger @@ -1555,12 +1526,6 @@ def fit_ensemble( # Make sure that input is valid y = self.InputValidator.target_validator.transform(y) - # Create a client if needed - if self._dask_client is None: - self._create_dask_client() - else: - self._is_dask_client_internally_created = False - metrics = metrics if metrics is not None else self._metrics if not isinstance(metrics, Sequence): metrics = [metrics] @@ -1568,35 +1533,41 @@ def fit_ensemble( # Use the current thread to start the ensemble builder process # The function ensemble_builder_process will internally create a ensemble # builder in the provide dask client - manager = EnsembleBuilderManager( - start_time=time.time(), - time_left_for_ensembles=self._time_for_task, - backend=copy.deepcopy(self._backend), - dataset_name=dataset_name if dataset_name else self._dataset_name, - task=task if task else self._task, - metrics=metrics if metrics is not None else self._metrics, - ensemble_class=( - ensemble_class if ensemble_class is not None else self._ensemble_class - ), - ensemble_kwargs=( - ensemble_kwargs - if ensemble_kwargs is not None - else self._ensemble_kwargs - ), - ensemble_nbest=ensemble_nbest if ensemble_nbest else 
self._ensemble_nbest, - max_models_on_disc=self._max_models_on_disc, - seed=self._seed, - precision=precision if precision else self.precision, - max_iterations=1, - read_at_most=None, - memory_limit=self._memory_limit, - random_state=self._seed, - logger_port=self._logger_port, - pynisher_context=self._multiprocessing_context, - ) - manager.build_ensemble(self._dask_client) - future = manager.futures.pop() - result = future.result() + with self._dask as dask_client: + manager = EnsembleBuilderManager( + start_time=time.time(), + time_left_for_ensembles=self._time_for_task, + backend=copy.deepcopy(self._backend), + dataset_name=dataset_name if dataset_name else self._dataset_name, + task=task if task else self._task, + metrics=metrics if metrics is not None else self._metrics, + ensemble_class=( + ensemble_class + if ensemble_class is not None + else self._ensemble_class + ), + ensemble_kwargs=( + ensemble_kwargs + if ensemble_kwargs is not None + else self._ensemble_kwargs + ), + ensemble_nbest=ensemble_nbest + if ensemble_nbest + else self._ensemble_nbest, + max_models_on_disc=self._max_models_on_disc, + seed=self._seed, + precision=precision if precision else self.precision, + max_iterations=1, + read_at_most=None, + memory_limit=self._memory_limit, + random_state=self._seed, + logger_port=self._logger_port, + pynisher_context=self._multiprocessing_context, + ) + manager.build_ensemble(dask_client) + future = manager.futures.pop() + result = future.result() + if result is None: raise ValueError( "Error building the ensemble - please check the log file and command " @@ -1606,7 +1577,6 @@ def fit_ensemble( self._ensemble_class = ensemble_class self._load_models() - self._close_dask_client() return self def _load_models(self): @@ -2281,7 +2251,7 @@ def _create_search_space( ) -> Tuple[ConfigurationSpace, str]: configspace_path = os.path.join(tmp_dir, "space.json") configuration_space = pipeline.get_configuration_space( - datamanager.info, + datamanager, include=include, exclude=exclude, ) @@ -2295,7 +2265,7 @@ def _create_search_space( def __getstate__(self) -> dict[str, Any]: # Cannot serialize a client! 
- self._dask_client = None + self._dask = None self.logging_server = None self.stop_logging_server = None return self.__dict__ @@ -2304,8 +2274,6 @@ def __del__(self) -> None: # Clean up the logger self._clean_logger() - self._close_dask_client() - class AutoMLClassifier(AutoML): diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index b97f588a45..1dd77479bc 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -17,6 +17,7 @@ import autosklearn.pipeline.classification import autosklearn.pipeline.regression +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.automl_common.common.utils.backend import Backend from autosklearn.constants import ( CLASSIFICATION_TASKS, @@ -45,6 +46,7 @@ def __init__( self, config: Configuration, random_state: Optional[Union[int, np.random.RandomState]], + feat_type: Optional[FEAT_TYPE_TYPE] = None, init_params: Optional[Dict[str, Any]] = None, dataset_properties: Dict[str, Any] = {}, include: Optional[List[str]] = None, @@ -61,6 +63,7 @@ def __init__( self.dataset_properties = dataset_properties self.include = include self.exclude = exclude + self.feat_type = feat_type def pre_transform( self, @@ -108,6 +111,7 @@ def __init__( self, config: Configuration, random_state: Optional[Union[int, np.random.RandomState]], + feat_type: Optional[FEAT_TYPE_TYPE] = None, init_params: Optional[Dict[str, Any]] = None, dataset_properties: Dict[str, Any] = {}, include: Optional[List[str]] = None, @@ -123,6 +127,7 @@ def __init__( self.dataset_properties = dataset_properties self.include = include self.exclude = exclude + self.feat_type = feat_type def pre_transform( self, @@ -217,6 +222,7 @@ def __init__( self.queue = queue self.datamanager = self.backend.load_datamanager() + self.feat_type = self.datamanager.feat_type self.include = include self.exclude = exclude @@ -294,11 +300,12 @@ def __init__( _addons[key].add_component(component) # Please mypy to prevent not defined attr - self.model = self._get_model() + self.model = self._get_model(feat_type=self.feat_type) - def _get_model(self) -> BaseEstimator: + def _get_model(self, feat_type: Optional[FEAT_TYPE_TYPE]) -> BaseEstimator: if not isinstance(self.configuration, Configuration): model = self.model_class( + feat_type=feat_type, config=self.configuration, random_state=self.seed, init_params=self._init_params, @@ -318,6 +325,7 @@ def _get_model(self) -> BaseEstimator: "multiclass": self.task_type == MULTICLASS_CLASSIFICATION, } model = self.model_class( + feat_type=feat_type, config=self.configuration, dataset_properties=dataset_properties, random_state=self.seed, diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index d624c1a44d..fc1134819d 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -58,7 +58,7 @@ def __init__( self.X_test = self.datamanager.data.get("X_test") self.Y_test = self.datamanager.data.get("Y_test") - self.model = self._get_model() + self.model = self._get_model(self.feat_type) def fit_predict_and_loss(self) -> None: _fit_and_suppress_warnings(self.logger, self.model, self.X_train, self.Y_train) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index f19db473bf..edd3bfb24a 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -247,6 +247,7 @@ def __init__( budget_type=budget_type, ) 
+ self.feat_type = self.backend.load_datamanager().feat_type self.resampling_strategy = resampling_strategy if resampling_strategy_args is None: self.resampling_strategy_args = {} @@ -305,7 +306,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Test if the model allows for an iterative fit, if not, # call this method again without the iterative argument - model = self._get_model() + model = self._get_model(self.feat_type) if not model.estimator_supports_iterative_fit(): self.fit_predict_and_loss(iterative=False) return @@ -319,7 +320,9 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_test_pred = [None] * self.num_cv_folds train_splits = [None] * self.num_cv_folds - self.models = [self._get_model() for i in range(self.num_cv_folds)] + self.models = [ + self._get_model(self.feat_type) for i in range(self.num_cv_folds) + ] iterations = [1] * self.num_cv_folds total_n_iterations = [0] * self.num_cv_folds # model.estimator_supports_iterative_fit -> true @@ -515,7 +518,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: self.Y_optimization = Y_targets self.Y_actual_train = Y_train_targets - self.model = self._get_model() + self.model = self._get_model(self.feat_type) status = StatusType.DONOTADVANCE if any( [ @@ -679,7 +682,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: self.Y_actual_train = Y_train_targets if self.num_cv_folds > 1: - self.model = self._get_model() + self.model = self._get_model(self.feat_type) # Bad style, but necessary for unit testing that self.model is # actually a new model self._added_empty_model = True @@ -798,7 +801,7 @@ def _partial_fit_and_predict_iterative( test_indices: List[int], add_model_to_self: bool, ) -> None: - model = self._get_model() + model = self._get_model(self.feat_type) self.indices[fold] = (train_indices, test_indices) @@ -939,7 +942,7 @@ def _partial_fit_and_predict_standard( PIPELINE_DATA_DTYPE, # test_pred TYPE_ADDITIONAL_INFO, ]: - model = self._get_model() + model = self._get_model(self.feat_type) self.indices[fold] = (train_indices, test_indices) @@ -1005,7 +1008,7 @@ def _partial_fit_and_predict_budget( # Add this statement for mypy assert self.budget is not None - model = self._get_model() + model = self._get_model(self.feat_type) self.indices[fold] = (train_indices, test_indices) self.X_targets[fold] = self.X_train[test_indices] self.Y_targets[fold] = self.Y_train[test_indices] diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py index 4303c953dc..078355dfbb 100644 --- a/autosklearn/experimental/askl2.py +++ b/autosklearn/experimental/askl2.py @@ -51,8 +51,10 @@ def __call__( initial_configurations = [] for member in self.portfolio.values(): try: + hp_names = scenario.cs.get_hyperparameter_names() + _member = {key: member[key] for key in member if key in hp_names} initial_configurations.append( - Configuration(configuration_space=scenario.cs, values=member) + Configuration(configuration_space=scenario.cs, values=_member) ) except ValueError: pass @@ -103,8 +105,10 @@ def __call__( initial_configurations = [] for member in self.portfolio.values(): try: + hp_names = scenario.cs.get_hyperparameter_names() + _member = {key: member[key] for key in member if key in hp_names} initial_configurations.append( - Configuration(configuration_space=scenario.cs, values=member) + Configuration(configuration_space=scenario.cs, values=_member) ) except ValueError: pass diff --git a/autosklearn/metalearning/input/aslib_simple.py 
b/autosklearn/metalearning/input/aslib_simple.py index 833242729d..871cccd2c1 100644 --- a/autosklearn/metalearning/input/aslib_simple.py +++ b/autosklearn/metalearning/input/aslib_simple.py @@ -5,13 +5,15 @@ import arff import pandas as pd +from ConfigSpace.configuration_space import ConfigurationSpace class AlgorithmSelectionProblem(object): - def __init__(self, directory): + def __init__(self, directory: str, cs: ConfigurationSpace): self.logger = logging.getLogger(__name__) # Create data structures + self.cs = cs self.dir_ = directory self.algorithm_runs = None self.configurations = None @@ -143,13 +145,17 @@ def _read_configurations(self, filename): csv_reader = csv.DictReader(fh) configurations = dict() + hp_names = self.cs.get_hyperparameter_names() for line in csv_reader: configuration = dict() algorithm_id = line["idx"] for hp_name, value in line.items(): if not value or hp_name == "idx": continue - + if hp_name not in hp_names: + # skip hyperparameter + # if it is not existing in the current search space + continue try: value = int(value) except Exception: diff --git a/autosklearn/metalearning/metalearning/meta_base.py b/autosklearn/metalearning/metalearning/meta_base.py index 826bfaeda3..4cb116ef21 100644 --- a/autosklearn/metalearning/metalearning/meta_base.py +++ b/autosklearn/metalearning/metalearning/meta_base.py @@ -42,7 +42,9 @@ def __init__(self, configuration_space, aslib_directory, logger): self.configuration_space = configuration_space self.aslib_directory = aslib_directory - aslib_reader = aslib_simple.AlgorithmSelectionProblem(self.aslib_directory) + aslib_reader = aslib_simple.AlgorithmSelectionProblem( + self.aslib_directory, self.configuration_space + ) self.metafeatures = aslib_reader.metafeatures self.algorithm_runs: OrderedDict[ str, pd.DataFrame diff --git a/autosklearn/pipeline/base.py b/autosklearn/pipeline/base.py index 3a13364ea6..b4647215c6 100644 --- a/autosklearn/pipeline/base.py +++ b/autosklearn/pipeline/base.py @@ -1,5 +1,5 @@ from abc import ABCMeta -from typing import Dict, Union +from typing import Dict, Optional, Union import numpy as np import scipy.sparse @@ -7,6 +7,7 @@ from sklearn.pipeline import Pipeline import autosklearn.pipeline.create_searchspace_util +from autosklearn.askl_typing import FEAT_TYPE_TYPE from .components.base import AutoSklearnChoice, AutoSklearnComponent @@ -35,6 +36,7 @@ class BasePipeline(Pipeline): def __init__( self, config=None, + feat_type: Optional[FEAT_TYPE_TYPE] = None, steps=None, dataset_properties=None, include=None, @@ -50,15 +52,18 @@ def __init__( dataset_properties if dataset_properties is not None else {} ) self.random_state = random_state + self.feat_type = feat_type if steps is None: - self.steps = self._get_pipeline_steps(dataset_properties=dataset_properties) + self.steps = self._get_pipeline_steps( + feat_type=feat_type, dataset_properties=dataset_properties + ) else: self.steps = steps self._validate_include_exclude_params() - self.config_space = self.get_hyperparameter_search_space() + self.config_space = self.get_hyperparameter_search_space(feat_type=feat_type) if config is None: self.config = self.config_space.get_default_configuration() @@ -82,7 +87,9 @@ def __init__( ) self.config = config - self.set_hyperparameters(self.config, init_params=init_params) + self.set_hyperparameters( + self.config, feat_type=feat_type, init_params=init_params + ) super().__init__(steps=self.steps) @@ -202,14 +209,19 @@ def predict(self, X, batch_size=None): return y - def set_hyperparameters(self, configuration, 
init_params=None): + def set_hyperparameters( + self, + configuration, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + init_params=None, + ): self.config = configuration for node_idx, n_ in enumerate(self.steps): node_name, node = n_ sub_configuration_space = node.get_hyperparameter_search_space( - dataset_properties=self.dataset_properties + feat_type=feat_type, dataset_properties=self.dataset_properties ) sub_config_dict = {} for param in configuration: @@ -236,7 +248,9 @@ def set_hyperparameters(self, configuration, init_params=None): node, (AutoSklearnChoice, AutoSklearnComponent, BasePipeline) ): node.set_hyperparameters( - configuration=sub_configuration, init_params=sub_init_params_dict + feat_type=feat_type, + configuration=sub_configuration, + init_params=sub_init_params_dict, ) else: raise NotImplementedError("Not supported yet!") @@ -247,7 +261,9 @@ def set_hyperparameters(self, configuration, init_params=None): return self - def get_hyperparameter_search_space(self, dataset_properties=None): + def get_hyperparameter_search_space( + self, feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): """Return the configuration space for the CASH problem. Returns @@ -258,6 +274,7 @@ def get_hyperparameter_search_space(self, dataset_properties=None): """ if not hasattr(self, "config_space") or self.config_space is None: self.config_space = self._get_hyperparameter_search_space( + feat_type=feat_type, include=self.include, exclude=self.exclude, dataset_properties=self.dataset_properties, @@ -265,7 +282,11 @@ def get_hyperparameter_search_space(self, dataset_properties=None): return self.config_space def _get_hyperparameter_search_space( - self, include=None, exclude=None, dataset_properties=None + self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + include=None, + exclude=None, + dataset_properties=None, ): """Return the configuration space for the CASH problem. @@ -278,6 +299,9 @@ def _get_hyperparameter_search_space( Parameters ---------- + feat_type: dict + python dictionary which maps the columns of the dataset to the data types + estimator_name : str Name of the estimator hyperparameter which will be used in the configuration space. 
For a classification task, this would be @@ -307,7 +331,13 @@ def _get_hyperparameter_search_space( raise NotImplementedError() def _get_base_search_space( - self, cs, dataset_properties, exclude, include, pipeline + self, + cs, + dataset_properties, + include, + exclude, + pipeline, + feat_type: Optional[FEAT_TYPE_TYPE] = None, ): if include is None: if self.include is None: @@ -343,7 +373,10 @@ def _get_base_search_space( dataset_properties["signed"] = False matches = autosklearn.pipeline.create_searchspace_util.get_match_array( - pipeline, dataset_properties, include=include, exclude=exclude + pipeline=pipeline, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, ) # Now we have only legal combinations at this step of the pipeline @@ -370,7 +403,9 @@ def _get_base_search_space( if not is_choice: cs.add_configuration_space( node_name, - node.get_hyperparameter_search_space(dataset_properties), + node.get_hyperparameter_search_space( + dataset_properties=dataset_properties, feat_type=feat_type + ), ) # If the node is a choice, we have to figure out which of its # choices are actually legal choices @@ -386,7 +421,9 @@ def _get_base_search_space( ) ) sub_config_space = node.get_hyperparameter_search_space( - dataset_properties, include=choices_list + feat_type=feat_type, + dataset_properties=dataset_properties, + include=choices_list, ) cs.add_configuration_space(node_name, sub_config_space) @@ -505,7 +542,9 @@ def __repr__(self): return return_value - def _get_pipeline_steps(self, dataset_properties): + def _get_pipeline_steps( + self, dataset_properties, feat_type: Optional[FEAT_TYPE_TYPE] = None + ): raise NotImplementedError() def _get_estimator_hyperparameter_name(self): diff --git a/autosklearn/pipeline/classification.py b/autosklearn/pipeline/classification.py index 1686e02809..332c076b9b 100644 --- a/autosklearn/pipeline/classification.py +++ b/autosklearn/pipeline/classification.py @@ -8,6 +8,7 @@ from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause from sklearn.base import ClassifierMixin +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import BasePipeline from autosklearn.pipeline.components.classification import ClassifierChoice from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice @@ -71,6 +72,7 @@ class SimpleClassificationPipeline(BasePipeline, ClassifierMixin): def __init__( self, config: Optional[Configuration] = None, + feat_type: Optional[FEAT_TYPE_TYPE] = None, steps=None, dataset_properties=None, include=None, @@ -84,6 +86,7 @@ def __init__( if "target_type" not in dataset_properties: dataset_properties["target_type"] = "classification" super().__init__( + feat_type=feat_type, config=config, steps=steps, dataset_properties=dataset_properties, @@ -109,7 +112,9 @@ def fit_transformer(self, X, y, fit_params=None): ) _init_params.update(self.init_params) self.set_hyperparameters( - configuration=self.config, init_params=_init_params + feat_type=self.feat_type, + configuration=self.config, + init_params=_init_params, ) if _fit_params is not None: @@ -166,12 +171,18 @@ def predict_proba(self, X, batch_size=None): return y def _get_hyperparameter_search_space( - self, include=None, exclude=None, dataset_properties=None + self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + include=None, + exclude=None, + dataset_properties=None, ): """Create the hyperparameter configuration space. 
Parameters ---------- + feat_type : dict, maps columns to there datatypes + include : dict (optional, default=None) Returns @@ -194,6 +205,7 @@ def _get_hyperparameter_search_space( cs = self._get_base_search_space( cs=cs, + feat_type=feat_type, dataset_properties=dataset_properties, exclude=exclude, include=include, @@ -344,7 +356,9 @@ def _get_hyperparameter_search_space( self.dataset_properties = dataset_properties return cs - def _get_pipeline_steps(self, dataset_properties): + def _get_pipeline_steps( + self, dataset_properties, feat_type: Optional[FEAT_TYPE_TYPE] = None + ): steps = [] default_dataset_properties = {"target_type": "classification"} @@ -356,6 +370,7 @@ def _get_pipeline_steps(self, dataset_properties): [ "data_preprocessor", DataPreprocessorChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), @@ -364,6 +379,7 @@ def _get_pipeline_steps(self, dataset_properties): [ "feature_preprocessor", FeaturePreprocessorChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), @@ -371,6 +387,7 @@ def _get_pipeline_steps(self, dataset_properties): [ "classifier", ClassifierChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py index c4a95df08c..7b496842b2 100644 --- a/autosklearn/pipeline/components/base.py +++ b/autosklearn/pipeline/components/base.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, Optional import importlib import inspect @@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator, TransformerMixin +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.constants import SPARSE _addons = dict() # type: Dict[str, 'ThirdPartyComponents'] @@ -98,12 +99,14 @@ def get_properties(dataset_properties=None): raise NotImplementedError() @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): """Return the configuration space of this classification algorithm. 
Parameters ---------- - + feat_type : FEAT_TYPE_TYPE (default=None) dataset_properties : dict, optional (default=None) Returns @@ -136,7 +139,12 @@ def fit(self, X, y): for further information.""" raise NotImplementedError() - def set_hyperparameters(self, configuration, init_params=None): + def set_hyperparameters( + self, + configuration, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + init_params=None, + ): params = configuration.get_dictionary() for param, value in params.items(): @@ -339,7 +347,12 @@ def get_estimator(self): class AutoSklearnChoice(object): - def __init__(self, dataset_properties, random_state=None): + def __init__( + self, + dataset_properties, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + random_state=None, + ): """ Parameters ---------- @@ -414,7 +427,12 @@ def get_available_components( return components_dict - def set_hyperparameters(self, configuration, init_params=None): + def set_hyperparameters( + self, + configuration, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + init_params=None, + ): new_params = {} params = configuration.get_dictionary() @@ -438,7 +456,12 @@ def set_hyperparameters(self, configuration, init_params=None): return self def get_hyperparameter_search_space( - self, dataset_properties=None, default=None, include=None, exclude=None + self, + feat_type: FEAT_TYPE_TYPE, + dataset_properties=None, + default=None, + include=None, + exclude=None, ): raise NotImplementedError() diff --git a/autosklearn/pipeline/components/classification/__init__.py b/autosklearn/pipeline/components/classification/__init__.py index c95334273a..31fa2ea9ca 100644 --- a/autosklearn/pipeline/components/classification/__init__.py +++ b/autosklearn/pipeline/components/classification/__init__.py @@ -8,6 +8,8 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from autosklearn.askl_typing import FEAT_TYPE_TYPE + from ..base import ( AutoSklearnChoice, AutoSklearnClassificationAlgorithm, @@ -86,7 +88,12 @@ def get_available_components( return components_dict def get_hyperparameter_search_space( - self, dataset_properties=None, default=None, include=None, exclude=None + self, + feat_type: FEAT_TYPE_TYPE, + dataset_properties=None, + default=None, + include=None, + exclude=None, ): if dataset_properties is None: dataset_properties = {} @@ -126,7 +133,9 @@ def get_hyperparameter_search_space( for estimator_name in available_estimators.keys(): estimator_configuration_space = available_estimators[ estimator_name - ].get_hyperparameter_search_space(dataset_properties) + ].get_hyperparameter_search_space( + feat_type=feat_type, dataset_properties=dataset_properties + ) parent_hyperparameter = {"parent": estimator, "value": estimator_name} cs.add_configuration_space( estimator_name, diff --git a/autosklearn/pipeline/components/classification/adaboost.py b/autosklearn/pipeline/components/classification/adaboost.py index 3634f53956..08a9bc06bd 100644 --- a/autosklearn/pipeline/components/classification/adaboost.py +++ b/autosklearn/pipeline/components/classification/adaboost.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -5,6 +7,7 @@ UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, 
UNSIGNED_DATA @@ -68,7 +71,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_estimators = UniformIntegerHyperparameter( diff --git a/autosklearn/pipeline/components/classification/bernoulli_nb.py b/autosklearn/pipeline/components/classification/bernoulli_nb.py index 8271c5f602..de52bc939c 100644 --- a/autosklearn/pipeline/components/classification/bernoulli_nb.py +++ b/autosklearn/pipeline/components/classification/bernoulli_nb.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -5,6 +7,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -64,7 +67,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() # the smoothing parameter is a non-negative float diff --git a/autosklearn/pipeline/components/classification/decision_tree.py b/autosklearn/pipeline/components/classification/decision_tree.py index fbfc6b7c6a..1369ecf906 100644 --- a/autosklearn/pipeline/components/classification/decision_tree.py +++ b/autosklearn/pipeline/components/classification/decision_tree.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -8,6 +10,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import ( @@ -106,7 +109,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/classification/extra_trees.py b/autosklearn/pipeline/components/classification/extra_trees.py index 5c7ce1879a..36edd82584 100644 --- a/autosklearn/pipeline/components/classification/extra_trees.py +++ b/autosklearn/pipeline/components/classification/extra_trees.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, @@ -156,7 +159,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] 
= None, dataset_properties=None + ): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/classification/gaussian_nb.py b/autosklearn/pipeline/components/classification/gaussian_nb.py index 8e978e9631..bf43f4e4a5 100644 --- a/autosklearn/pipeline/components/classification/gaussian_nb.py +++ b/autosklearn/pipeline/components/classification/gaussian_nb.py @@ -1,6 +1,9 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA @@ -55,6 +58,8 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/classification/gradient_boosting.py b/autosklearn/pipeline/components/classification/gradient_boosting.py index 50b0b284bd..618028dff7 100644 --- a/autosklearn/pipeline/components/classification/gradient_boosting.py +++ b/autosklearn/pipeline/components/classification/gradient_boosting.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace @@ -9,6 +11,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, @@ -182,7 +185,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() loss = Constant("loss", "auto") learning_rate = UniformFloatHyperparameter( diff --git a/autosklearn/pipeline/components/classification/k_nearest_neighbors.py b/autosklearn/pipeline/components/classification/k_nearest_neighbors.py index fe55e0783d..d524bd42d9 100644 --- a/autosklearn/pipeline/components/classification/k_nearest_neighbors.py +++ b/autosklearn/pipeline/components/classification/k_nearest_neighbors.py @@ -1,9 +1,12 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA @@ -57,7 +60,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_neighbors = UniformIntegerHyperparameter( diff --git a/autosklearn/pipeline/components/classification/lda.py b/autosklearn/pipeline/components/classification/lda.py index 29a08f80b5..e7ebec290b 100644 --- a/autosklearn/pipeline/components/classification/lda.py +++ b/autosklearn/pipeline/components/classification/lda.py @@ -1,3 
+1,5 @@ +from typing import Optional + from ConfigSpace.conditions import EqualsCondition from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -5,6 +7,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax @@ -76,7 +79,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() shrinkage = CategoricalHyperparameter( "shrinkage", ["None", "auto", "manual"], default_value="None" diff --git a/autosklearn/pipeline/components/classification/liblinear_svc.py b/autosklearn/pipeline/components/classification/liblinear_svc.py index 3f57ef8f94..d1beb08837 100644 --- a/autosklearn/pipeline/components/classification/liblinear_svc.py +++ b/autosklearn/pipeline/components/classification/liblinear_svc.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause from ConfigSpace.hyperparameters import ( @@ -6,6 +8,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax @@ -104,7 +107,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() penalty = CategoricalHyperparameter("penalty", ["l1", "l2"], default_value="l2") diff --git a/autosklearn/pipeline/components/classification/libsvm_svc.py b/autosklearn/pipeline/components/classification/libsvm_svc.py index ba423161c1..43bd017c5c 100644 --- a/autosklearn/pipeline/components/classification/libsvm_svc.py +++ b/autosklearn/pipeline/components/classification/libsvm_svc.py @@ -1,3 +1,5 @@ +from typing import Optional + import resource import sys @@ -10,6 +12,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax @@ -138,7 +141,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True, default_value=1.0) # No linear kernel here, because we have liblinear kernel = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/classification/mlp.py b/autosklearn/pipeline/components/classification/mlp.py index f7001d7bc1..d8c95fa9d0 100644 --- a/autosklearn/pipeline/components/classification/mlp.py +++ 
b/autosklearn/pipeline/components/classification/mlp.py @@ -1,3 +1,5 @@ +from typing import Optional + import copy import numpy as np @@ -11,6 +13,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponent, @@ -203,7 +206,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() hidden_layer_depth = UniformIntegerHyperparameter( name="hidden_layer_depth", lower=1, upper=3, default_value=1 diff --git a/autosklearn/pipeline/components/classification/multinomial_nb.py b/autosklearn/pipeline/components/classification/multinomial_nb.py index 7b65be8a5c..dee1507f01 100644 --- a/autosklearn/pipeline/components/classification/multinomial_nb.py +++ b/autosklearn/pipeline/components/classification/multinomial_nb.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -5,6 +7,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SIGNED_DATA, SPARSE from autosklearn.util.common import check_for_bool @@ -76,7 +79,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() # the smoothing parameter is a non-negative float diff --git a/autosklearn/pipeline/components/classification/passive_aggressive.py b/autosklearn/pipeline/components/classification/passive_aggressive.py index 494ea7db06..97a11a0283 100644 --- a/autosklearn/pipeline/components/classification/passive_aggressive.py +++ b/autosklearn/pipeline/components/classification/passive_aggressive.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, @@ -152,7 +155,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True) fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True") loss = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/classification/qda.py b/autosklearn/pipeline/components/classification/qda.py index 7b25858392..0b6f6f7653 100644 --- a/autosklearn/pipeline/components/classification/qda.py +++ b/autosklearn/pipeline/components/classification/qda.py @@ -1,7 +1,10 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter +from autosklearn.askl_typing import 
FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.pipeline.implementations.util import softmax @@ -72,7 +75,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): reg_param = UniformFloatHyperparameter("reg_param", 0.0, 1.0, default_value=0.0) cs = ConfigurationSpace() cs.add_hyperparameter(reg_param) diff --git a/autosklearn/pipeline/components/classification/random_forest.py b/autosklearn/pipeline/components/classification/random_forest.py index 6ccd720b3a..892d8611d5 100644 --- a/autosklearn/pipeline/components/classification/random_forest.py +++ b/autosklearn/pipeline/components/classification/random_forest.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, @@ -149,7 +152,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( "criterion", ["gini", "entropy"], default_value="gini" diff --git a/autosklearn/pipeline/components/classification/sgd.py b/autosklearn/pipeline/components/classification/sgd.py index 469c2605dd..5073f8ec20 100644 --- a/autosklearn/pipeline/components/classification/sgd.py +++ b/autosklearn/pipeline/components/classification/sgd.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnClassificationAlgorithm, IterativeComponentWithSampleWeight, @@ -169,7 +172,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() loss = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/data_preprocessing/__init__.py b/autosklearn/pipeline/components/data_preprocessing/__init__.py index 5693efd441..3cc968f7d1 100644 --- a/autosklearn/pipeline/components/data_preprocessing/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/__init__.py @@ -6,6 +6,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import PIPELINE_DATA_DTYPE from ..base import ( @@ -105,6 +106,7 @@ def get_available_components( def get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[Dict] = None, default: 
str = None, include: Optional[Dict] = None, @@ -136,8 +138,8 @@ def get_hyperparameter_search_space( cs.add_hyperparameter(preprocessor) for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[name]( - dataset_properties=dataset_properties - ).get_hyperparameter_search_space(dataset_properties) + feat_type=feat_type, dataset_properties=dataset_properties + ).get_hyperparameter_search_space(dataset_properties=dataset_properties) parent_hyperparameter = {"parent": preprocessor, "value": name} cs.add_configuration_space( name, @@ -150,7 +152,10 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.choice.transform(X) def set_hyperparameters( - self, configuration: ConfigurationSpace, init_params: Optional[Dict] = None + self, + configuration: ConfigurationSpace, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + init_params: Optional[Dict] = None, ) -> "DataPreprocessorChoice": config = {} params = configuration.get_dictionary() @@ -162,7 +167,6 @@ def set_hyperparameters( config[param] = value new_params = {} - feat_type = None if init_params is not None: for param, value in init_params.items(): param = param.replace(choice, "").split(":", 1)[-1] diff --git a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py index 721fe63fc5..106eb377f7 100644 --- a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py +++ b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py @@ -5,6 +5,7 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import ( @@ -139,6 +140,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: # TODO add replace by zero! 
diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py index 5d1647b24a..5b1cf075b3 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py @@ -8,6 +8,7 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from ...base import ( @@ -38,6 +39,7 @@ def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: def get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, default: Optional[str] = None, include: Optional[Dict[str, str]] = None, @@ -73,7 +75,7 @@ def get_hyperparameter_search_space( for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[ name - ].get_hyperparameter_search_space(dataset_properties) + ].get_hyperparameter_search_space(dataset_properties=dataset_properties) parent_hyperparameter = {"parent": preprocessor, "value": name} cs.add_configuration_space( name, @@ -86,7 +88,10 @@ def get_hyperparameter_search_space( return cs def set_hyperparameters( - self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + self, + feat_type: FEAT_TYPE_TYPE, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None, ) -> "OHEChoice": new_params = {} diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py index 43d578219f..7c904635f8 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py @@ -5,6 +5,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.preprocessing import OrdinalEncoder +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -69,6 +70,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py index 028a4fb9c1..cead9331d4 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py @@ -3,6 +3,7 @@ import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, 
INPUT, SPARSE, UNSIGNED_DATA @@ -44,6 +45,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py index 9b9ee87c81..989cf86680 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py @@ -5,6 +5,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.preprocessing import OneHotEncoder as DenseOneHotEncoder +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -55,6 +56,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py b/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py index f2dc2bf304..65ec36f7e7 100644 --- a/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py +++ b/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py @@ -4,6 +4,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace import autosklearn.pipeline.implementations.CategoryShift +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -63,6 +64,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type.py b/autosklearn/pipeline/components/data_preprocessing/feature_type.py index bd42d8a67a..11085a1f74 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type.py @@ -7,6 +7,7 @@ from scipy import sparse from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.data.validation import SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES from autosklearn.pipeline.base import ( DATASET_PROPERTIES_TYPE, @@ -46,7 +47,7 @@ def __init__( exclude: Optional[Dict[str, str]] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, init_params: Optional[Dict[str, Any]] = None, - feat_type: Optional[Dict[Union[str, int], str]] = None, + feat_type: Optional[FEAT_TYPE_TYPE] = None, force_sparse_output: bool = False, column_transformer: Optional[sklearn.compose.ColumnTransformer] = None, ): @@ -72,6 +73,7 @@ def __init__( # TODO: Extract the 
child configuration space from the FeatTypeSplit to the # pipeline if needed self.categ_ppl = CategoricalPreprocessingPipeline( + feat_type=self.feat_type, config=None, steps=pipeline, dataset_properties=dataset_properties, @@ -88,6 +90,7 @@ def __init__( # TODO: Extract the child configuration space from the FeatTypeSplit to the # pipeline if needed self.numer_ppl = NumericalPreprocessingPipeline( + feat_type=self.feat_type, config=None, steps=pipeline, dataset_properties=dataset_properties, @@ -105,6 +108,7 @@ def __init__( # TODO: Extract the child configuration space from the FeatTypeSplit to the # pipeline if needed self.txt_ppl = TextPreprocessingPipeline( + feat_type=self.feat_type, config=None, steps=pipeline, dataset_properties=dataset_properties, @@ -114,13 +118,28 @@ def __init__( init_params=init_params, ) - self._transformers: List[Tuple[str, AutoSklearnComponent]] = [ - ("categorical_transformer", self.categ_ppl), - ("numerical_transformer", self.numer_ppl), - ("text_transformer", self.txt_ppl), - ] + if self.feat_type is None: + self._transformers: List[Tuple[str, AutoSklearnComponent]] = [ + ("categorical_transformer", self.categ_ppl), + ("numerical_transformer", self.numer_ppl), + ("text_transformer", self.txt_ppl), + ] + else: + self._transformers: List[Tuple[str, AutoSklearnComponent]] = [] + if "categorical" in self.feat_type.values(): + self._transformers.append(("categorical_transformer", self.categ_ppl)) + if "numerical" in self.feat_type.values(): + self._transformers.append(("numerical_transformer", self.numer_ppl)) + if "string" in self.feat_type.values(): + self._transformers.append(("text_transformer", self.txt_ppl)) + if self.config: - self.set_hyperparameters(self.config, init_params=init_params) + self.set_hyperparameters( + feat_type=self.feat_type, + configuration=self.config, + init_params=init_params, + ) + self.column_transformer = column_transformer def fit( @@ -128,9 +147,6 @@ def fit( ) -> "FeatTypeSplit": n_feats = X.shape[1] - categorical_features = [] - numerical_features = [] - text_features = [] if self.feat_type is not None: # Make sure that we are not missing any column! 
expected = set(self.feat_type.keys()) @@ -143,31 +159,37 @@ def fit( f"Train data has columns={expected} yet the" f" feat_types are feat={columns}" ) + sklearn_transf_spec = [] + categorical_features = [ key for key, value in self.feat_type.items() if value.lower() == "categorical" ] + if len(categorical_features) > 0: + sklearn_transf_spec.append( + ("categorical_transformer", self.categ_ppl, categorical_features) + ) + numerical_features = [ key for key, value in self.feat_type.items() if value.lower() == "numerical" ] + if len(numerical_features) > 0: + sklearn_transf_spec.append( + ("numerical_transformer", self.numer_ppl, numerical_features) + ) + text_features = [ key for key, value in self.feat_type.items() if value.lower() == "string" ] - - sklearn_transf_spec = [ - (name, transformer, feature_columns) - for name, transformer, feature_columns in [ - ("categorical_transformer", self.categ_ppl, categorical_features), - ("numerical_transformer", self.numer_ppl, numerical_features), - ("text_transformer", self.txt_ppl, text_features), - ] - if len(feature_columns) > 0 - ] + if len(text_features) > 0: + sklearn_transf_spec.append( + ("text_transformer", self.txt_ppl, text_features) + ) else: # self.feature_type == None assumes numerical case sklearn_transf_spec = [ @@ -223,7 +245,10 @@ def get_properties( } def set_hyperparameters( - self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + self, + feat_type: FEAT_TYPE_TYPE, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None, ) -> "FeatTypeSplit": if init_params is not None and "feat_type" in init_params.keys(): self.feat_type = init_params["feat_type"] @@ -232,7 +257,7 @@ def set_hyperparameters( for transf_name, transf_op in self._transformers: sub_configuration_space = transf_op.get_hyperparameter_search_space( - dataset_properties=self.dataset_properties + dataset_properties=self.dataset_properties, feat_type=feat_type ) sub_config_dict = {} for param in configuration: @@ -258,7 +283,9 @@ def set_hyperparameters( transf_op, (AutoSklearnChoice, AutoSklearnComponent, BasePipeline) ): transf_op.set_hyperparameters( - configuration=sub_configuration, init_params=sub_init_params_dict + feat_type=feat_type, + configuration=sub_configuration, + init_params=sub_init_params_dict, ) else: raise NotImplementedError("Not supported yet!") @@ -267,12 +294,16 @@ def set_hyperparameters( def get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: self.dataset_properties = dataset_properties cs = ConfigurationSpace() cs = FeatTypeSplit._get_hyperparameter_search_space_recursevely( - dataset_properties, cs, self._transformers + feat_type=feat_type, + dataset_properties=dataset_properties, + cs=cs, + transformer=self._transformers, ) return cs @@ -281,12 +312,15 @@ def _get_hyperparameter_search_space_recursevely( dataset_properties: DATASET_PROPERTIES_TYPE, cs: ConfigurationSpace, transformer: BaseEstimator, + feat_type: Optional[FEAT_TYPE_TYPE] = None, ) -> ConfigurationSpace: for st_name, st_operation in transformer: if hasattr(st_operation, "get_hyperparameter_search_space"): cs.add_configuration_space( st_name, - st_operation.get_hyperparameter_search_space(dataset_properties), + st_operation.get_hyperparameter_search_space( + dataset_properties=dataset_properties + ), ) else: return FeatTypeSplit._get_hyperparameter_search_space_recursevely( diff --git 
a/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py b/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py index dfdaf7af62..07cfeb7fa5 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type_categorical.py @@ -4,6 +4,7 @@ from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, BasePipeline from autosklearn.pipeline.components.data_preprocessing.categorical_encoding import ( # noqa: E501 OHEChoice, @@ -46,6 +47,7 @@ class CategoricalPreprocessingPipeline(BasePipeline): def __init__( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, config: Optional[Configuration] = None, steps: Optional[List[Tuple[str, BaseEstimator]]] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, @@ -56,13 +58,14 @@ def __init__( ) -> None: self._output_dtype = np.int32 super().__init__( - config, - steps, - dataset_properties, - include, - exclude, - random_state, - init_params, + config=config, + steps=steps, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, + random_state=random_state, + init_params=init_params, + feat_type=feat_type, ) @staticmethod @@ -92,6 +95,7 @@ def get_properties( def _get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, include: Optional[Dict[str, str]] = None, exclude: Optional[Dict[str, str]] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, @@ -110,6 +114,7 @@ def _get_hyperparameter_search_space( cs = self._get_base_search_space( cs=cs, + feat_type=feat_type, dataset_properties=dataset_properties, exclude=exclude, include=include, @@ -120,6 +125,7 @@ def _get_hyperparameter_search_space( def _get_pipeline_steps( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[Dict[str, str]] = None, ) -> List[Tuple[str, BaseEstimator]]: steps = [] @@ -135,6 +141,7 @@ def _get_pipeline_steps( ( "category_coalescence", CoalescenseChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), @@ -142,6 +149,7 @@ def _get_pipeline_steps( ( "categorical_encoding", OHEChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py b/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py index b50bf0d357..5cc3f19561 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type_numerical.py @@ -4,6 +4,7 @@ from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, BasePipeline from autosklearn.pipeline.components.data_preprocessing import ( rescaling as rescaling_components, @@ -39,6 +40,7 @@ class NumericalPreprocessingPipeline(BasePipeline): def __init__( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, config: Optional[Configuration] = None, steps: Optional[List[Tuple[str, BaseEstimator]]] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, @@ -49,13 
+51,14 @@ def __init__( ) -> None: self._output_dtype = np.int32 super().__init__( - config, - steps, - dataset_properties, - include, - exclude, - random_state, - init_params, + config=config, + steps=steps, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, + random_state=random_state, + init_params=init_params, + feat_type=feat_type, ) @staticmethod @@ -85,6 +88,7 @@ def get_properties( def _get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, include: Optional[Dict[str, str]] = None, exclude: Optional[Dict[str, str]] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, @@ -110,12 +114,14 @@ def _get_hyperparameter_search_space( exclude=exclude, include=include, pipeline=self.steps, + feat_type=feat_type, ) return cs def _get_pipeline_steps( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[Dict[str, str]] = None, ) -> List[Tuple[str, BaseEstimator]]: steps = [] @@ -134,6 +140,7 @@ def _get_pipeline_steps( ( "rescaling", rescaling_components.RescalingChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py b/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py index 8924d568a6..e92ef09c03 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py @@ -4,6 +4,7 @@ from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, BasePipeline from autosklearn.pipeline.components.data_preprocessing.text_encoding import ( BagOfWordChoice, @@ -34,6 +35,7 @@ class TextPreprocessingPipeline(BasePipeline): def __init__( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, config: Optional[Configuration] = None, steps: Optional[List[Tuple[str, BaseEstimator]]] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, @@ -44,13 +46,14 @@ def __init__( ) -> None: self._output_dtype = np.int32 super().__init__( - config, - steps, - dataset_properties, - include, - exclude, - random_state, - init_params, + config=config, + steps=steps, + dataset_properties=dataset_properties, + include=include, + exclude=exclude, + random_state=random_state, + init_params=init_params, + feat_type=feat_type, ) @staticmethod @@ -79,6 +82,7 @@ def get_properties( def _get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, include: Optional[Dict[str, str]] = None, exclude: Optional[Dict[str, str]] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, @@ -104,12 +108,14 @@ def _get_hyperparameter_search_space( exclude=exclude, include=include, pipeline=self.steps, + feat_type=feat_type, ) return cs def _get_pipeline_steps( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[Dict[str, str]] = None, ) -> List[Tuple[str, BaseEstimator]]: steps = [] @@ -123,7 +129,9 @@ def _get_pipeline_steps( ( "text_encoding", BagOfWordChoice( - default_dataset_properties, random_state=self.random_state + feat_type=feat_type, + dataset_properties=default_dataset_properties, + random_state=self.random_state, ), ), ( diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py 
b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py index 00b627daed..31b762eb60 100644 --- a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py +++ b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py @@ -4,6 +4,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from scipy.sparse import spmatrix +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -91,6 +92,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py b/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py index d7d6a645ab..0d09b7bf11 100644 --- a/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py +++ b/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py @@ -4,6 +4,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -62,6 +63,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: # TODO add replace by zero! 
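The FeatTypeSplit changes earlier in this diff replace the fixed three-transformer list with one built from the declared feature types, so sub-pipelines for absent types are never instantiated. A sketch of the selection rule from the __init__ hunk, using placeholder pipelines instead of the real preprocessing pipelines:

def select_transformers(feat_type, categ_ppl, numer_ppl, txt_ppl):
    # With no feat_type, keep the previous behaviour: all three sub-pipelines.
    if feat_type is None:
        return [
            ("categorical_transformer", categ_ppl),
            ("numerical_transformer", numer_ppl),
            ("text_transformer", txt_ppl),
        ]
    # Otherwise register only the sub-pipelines whose type actually occurs.
    transformers = []
    if "categorical" in feat_type.values():
        transformers.append(("categorical_transformer", categ_ppl))
    if "numerical" in feat_type.values():
        transformers.append(("numerical_transformer", numer_ppl))
    if "string" in feat_type.values():
        transformers.append(("text_transformer", txt_ppl))
    return transformers

# A purely numerical dataset yields a single entry:
assert select_transformers({"x1": "numerical", "x2": "numerical"}, "C", "N", "T") == [
    ("numerical_transformer", "N")
]

The rewritten fit() hunk applies the same filtering (case-insensitively) when assembling sklearn_transf_spec for the column transformer.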
diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py index fbf999761c..85002ec349 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py @@ -8,6 +8,7 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from ...base import ( @@ -38,6 +39,7 @@ def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: def get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, default: Optional[str] = None, include: Optional[Dict[str, str]] = None, @@ -73,7 +75,7 @@ def get_hyperparameter_search_space( for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[ name - ].get_hyperparameter_search_space(dataset_properties) + ].get_hyperparameter_search_space(dataset_properties=dataset_properties) parent_hyperparameter = {"parent": preprocessor, "value": name} cs.add_configuration_space( name, @@ -86,7 +88,10 @@ def get_hyperparameter_search_space( return cs def set_hyperparameters( - self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None, + feat_type: Optional[FEAT_TYPE_TYPE] = None, ) -> "CoalescenseChoice": new_params = {} @@ -111,6 +116,7 @@ def set_hyperparameters( new_params["random_state"] = self.random_state self.new_params = new_params + new_params["feat_type"] = feat_type self.choice = self.get_components()[choice](**new_params) return self diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py index 278cf0bfb9..2533e92e8d 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py @@ -5,6 +5,7 @@ from ConfigSpace.hyperparameters import UniformFloatHyperparameter import autosklearn.pipeline.implementations.MinorityCoalescer +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -15,6 +16,7 @@ class MinorityCoalescer(AutoSklearnPreprocessingAlgorithm): def __init__( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, minimum_fraction: float = 0.01, random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: @@ -59,6 +61,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py index 
d05c146d98..2732795649 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py @@ -3,6 +3,7 @@ import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -10,7 +11,9 @@ class NoCoalescence(AutoSklearnPreprocessingAlgorithm): def __init__( - self, random_state: Optional[Union[int, np.random.RandomState]] = None + self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> None: pass @@ -43,6 +46,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py index 2a9fbdb842..9f83881472 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py @@ -7,6 +7,7 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 Rescaling, @@ -42,6 +43,7 @@ def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: def get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, default: Optional[str] = None, include: Optional[Dict[str, str]] = None, @@ -74,7 +76,9 @@ def get_hyperparameter_search_space( for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[ name - ].get_hyperparameter_search_space(dataset_properties) + ].get_hyperparameter_search_space( + feat_type=feat_type, dataset_properties=dataset_properties + ) parent_hyperparameter = {"parent": preprocessor, "value": name} cs.add_configuration_space( name, diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py index 05e1a4e898..0b3244cc62 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py @@ -5,6 +5,7 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm @@ -38,6 +39,7 @@ def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: 
Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py index 2611c0650d..51beabcc7a 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py @@ -7,6 +7,7 @@ UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 @@ -62,6 +63,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py index af3b4c0558..8762c1be96 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py @@ -6,6 +6,7 @@ from scipy import sparse from sklearn.exceptions import NotFittedError +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( # noqa: E501 @@ -59,6 +60,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py index 990ad579ca..75c173e181 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/__init__.py @@ -8,6 +8,7 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from sklearn.base import BaseEstimator +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from ...base import ( @@ -39,6 +40,7 @@ def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: def get_hyperparameter_search_space( self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, default: Optional[str] = None, include: Optional[Dict[str, str]] = None, @@ -75,7 +77,9 @@ def get_hyperparameter_search_space( for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[ name - ].get_hyperparameter_search_space(dataset_properties) + ].get_hyperparameter_search_space( + feat_type=feat_type, dataset_properties=dataset_properties + ) parent_hyperparameter = {"parent": preprocessor, "value": name} cs.add_configuration_space( name, @@ -88,7 +92,10 @@ def 
get_hyperparameter_search_space( return cs def set_hyperparameters( - self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None + self, + configuration: Configuration, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + init_params: Optional[Dict[str, Any]] = None, ) -> "BagOfWordChoice": new_params = {} diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py index b8a62ccd89..a90b1c1fa4 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py @@ -9,6 +9,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.feature_extraction.text import CountVectorizer +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -95,6 +96,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py index 90a43b0f48..de852b5d6b 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py @@ -8,6 +8,7 @@ from scipy.sparse import hstack from sklearn.feature_extraction.text import CountVectorizer +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -101,6 +102,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py index f20d24f769..3956ec9eff 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py @@ -9,6 +9,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.feature_extraction.text import TfidfVectorizer +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -100,6 +101,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: 
Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py b/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py index beecefb028..d6380e03dd 100644 --- a/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py +++ b/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py @@ -5,6 +5,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from sklearn.decomposition import TruncatedSVD +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -74,6 +75,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py b/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py index 365ae405a0..eb917d6915 100644 --- a/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py +++ b/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py @@ -4,6 +4,7 @@ import sklearn.feature_selection from ConfigSpace.configuration_space import ConfigurationSpace +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -49,6 +50,7 @@ def get_properties( @staticmethod def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autosklearn/pipeline/components/feature_preprocessing/__init__.py b/autosklearn/pipeline/components/feature_preprocessing/__init__.py index cd52d6ad34..9a0bf69a30 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/__init__.py +++ b/autosklearn/pipeline/components/feature_preprocessing/__init__.py @@ -1,4 +1,4 @@ -from typing import Type +from typing import Optional, Type import os from collections import OrderedDict @@ -6,6 +6,8 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from autosklearn.askl_typing import FEAT_TYPE_TYPE + from ..base import ( AutoSklearnChoice, AutoSklearnPreprocessingAlgorithm, @@ -101,7 +103,12 @@ def get_available_components( return components_dict def get_hyperparameter_search_space( - self, dataset_properties=None, default=None, include=None, exclude=None + self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + dataset_properties=None, + default=None, + include=None, + exclude=None, ): cs = ConfigurationSpace() @@ -130,7 +137,7 @@ def get_hyperparameter_search_space( for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[ name - 
].get_hyperparameter_search_space(dataset_properties) + ].get_hyperparameter_search_space(dataset_properties=dataset_properties) parent_hyperparameter = {"parent": preprocessor, "value": name} cs.add_configuration_space( name, diff --git a/autosklearn/pipeline/components/feature_preprocessing/densifier.py b/autosklearn/pipeline/components/feature_preprocessing/densifier.py index f5c88ecadf..f571d6abee 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/densifier.py +++ b/autosklearn/pipeline/components/feature_preprocessing/densifier.py @@ -1,5 +1,8 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -36,6 +39,8 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py index dad45795b8..904004b201 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -7,6 +9,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -123,7 +126,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_estimators = Constant("n_estimators", 100) diff --git a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py index 3287b837c5..10e741a44e 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -8,6 +10,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -125,7 +128,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def 
get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_estimators = Constant("n_estimators", 100) diff --git a/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py b/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py index 695ff3c2cc..fe23177fc9 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py +++ b/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py @@ -1,3 +1,5 @@ +from typing import Optional + import warnings from ConfigSpace.conditions import EqualsCondition @@ -7,6 +9,7 @@ UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -74,7 +77,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_components = UniformIntegerHyperparameter( diff --git a/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py b/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py index d51242de21..2a8db4eaad 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py +++ b/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.forbidden import ( @@ -10,6 +12,7 @@ UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, UNSIGNED_DATA @@ -63,7 +66,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400, 25) affinity = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py b/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py index 4e96bfb1c2..08c72efb6f 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py +++ b/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py @@ -1,3 +1,5 @@ +from typing import Optional + import warnings import numpy as np @@ -9,6 +11,7 @@ UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA @@ -82,7 +85,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): n_components = UniformIntegerHyperparameter( "n_components", 10, 2000, default_value=100 ) diff --git 
a/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py b/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py index a81e9ddd78..4e6a348f17 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py +++ b/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py @@ -7,6 +7,7 @@ ) from numpy.random import RandomState +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -69,7 +70,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): gamma = UniformFloatHyperparameter( "gamma", 3.0517578125e-05, 8, default_value=1.0, log=True ) diff --git a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py index 546c8742ad..7031089e91 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py +++ b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause from ConfigSpace.hyperparameters import ( @@ -6,6 +8,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -91,7 +94,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() penalty = Constant("penalty", "l1") diff --git a/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py b/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py index 550872d551..38c11bdb58 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py +++ b/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py @@ -1,5 +1,8 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -34,6 +37,8 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py b/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py index 097f59e0f1..a7dc227056 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py +++ 
b/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py @@ -1,4 +1,5 @@ -import numpy as np +from typing import Optional + from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -7,6 +8,7 @@ UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import ( DENSE, @@ -54,7 +56,7 @@ def fit(self, X, Y=None): else: X[X < 0] = 0.0 - self.preprocessor.fit(X.astype(np.float64)) + self.preprocessor.fit(X) return self def transform(self, X): @@ -94,7 +96,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): if dataset_properties is not None and ( dataset_properties.get("sparse") is True or dataset_properties.get("signed") is False diff --git a/autosklearn/pipeline/components/feature_preprocessing/pca.py b/autosklearn/pipeline/components/feature_preprocessing/pca.py index a1ad9f3981..7c69f8eb80 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/pca.py +++ b/autosklearn/pipeline/components/feature_preprocessing/pca.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -5,6 +7,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -55,7 +58,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): keep_variance = UniformFloatHyperparameter( "keep_variance", 0.5, 0.9999, default_value=0.9999 ) diff --git a/autosklearn/pipeline/components/feature_preprocessing/polynomial.py b/autosklearn/pipeline/components/feature_preprocessing/polynomial.py index bd5312bba0..78e3ff2676 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/polynomial.py +++ b/autosklearn/pipeline/components/feature_preprocessing/polynomial.py @@ -1,9 +1,12 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -54,7 +57,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): # More than degree 3 is too expensive! 
degree = UniformIntegerHyperparameter("degree", 2, 3, 2) interaction_only = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py b/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py index 9daed1ae97..2b5aa340a9 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py +++ b/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -94,7 +97,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): n_estimators = UniformIntegerHyperparameter( name="n_estimators", lower=10, upper=100, default_value=10 ) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py index 3caa50b46d..98495eaedb 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py @@ -1,3 +1,5 @@ +from typing import Optional + from functools import partial from ConfigSpace.configuration_space import ConfigurationSpace @@ -7,6 +9,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.components.feature_preprocessing.select_percentile import ( SelectPercentileBase, @@ -110,7 +113,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): percentile = UniformFloatHyperparameter( name="percentile", lower=1, upper=99, default_value=50 ) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py index e9343fead4..a653dc4a7e 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py @@ -1,3 +1,5 @@ +from typing import Optional + from functools import partial from ConfigSpace.configuration_space import ConfigurationSpace @@ -6,6 +8,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.components.feature_preprocessing.select_percentile import ( SelectPercentileBase, @@ -53,7 +56,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def 
get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): percentile = UniformFloatHyperparameter( "percentile", lower=1, upper=99, default_value=50 ) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py index 0c4768d000..3a728d753e 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py @@ -1,3 +1,5 @@ +from typing import Optional + from functools import partial from ConfigSpace import NotEqualsCondition @@ -7,6 +9,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import ( DENSE, @@ -116,7 +119,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): alpha = UniformFloatHyperparameter( name="alpha", lower=0.01, upper=0.5, default_value=0.1 ) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py index ffec19e6ec..89c84905b2 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py @@ -1,3 +1,5 @@ +from typing import Optional + from functools import partial from ConfigSpace import NotEqualsCondition @@ -7,6 +9,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -84,7 +87,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): alpha = UniformFloatHyperparameter( name="alpha", lower=0.01, upper=0.5, default_value=0.1 ) diff --git a/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py b/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py index 4d6f6b7ca9..0c61e72c1c 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py +++ b/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py @@ -1,6 +1,9 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformIntegerHyperparameter +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, INPUT, SPARSE, UNSIGNED_DATA @@ -48,7 +51,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): target_dim = UniformIntegerHyperparameter( "target_dim", 10, 256, default_value=128 ) diff --git 
a/autosklearn/pipeline/components/regression/__init__.py b/autosklearn/pipeline/components/regression/__init__.py index 73033467a7..9d1ef58650 100644 --- a/autosklearn/pipeline/components/regression/__init__.py +++ b/autosklearn/pipeline/components/regression/__init__.py @@ -6,6 +6,8 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from autosklearn.askl_typing import FEAT_TYPE_TYPE + from ..base import ( AutoSklearnChoice, AutoSklearnRegressionAlgorithm, @@ -79,7 +81,12 @@ def get_available_components( return components_dict def get_hyperparameter_search_space( - self, dataset_properties=None, default=None, include=None, exclude=None + self, + feat_type: FEAT_TYPE_TYPE, + dataset_properties=None, + default=None, + include=None, + exclude=None, ): if include is not None and exclude is not None: raise ValueError( @@ -116,7 +123,9 @@ def get_hyperparameter_search_space( for estimator_name in available_estimators.keys(): estimator_configuration_space = available_estimators[ estimator_name - ].get_hyperparameter_search_space(dataset_properties) + ].get_hyperparameter_search_space( + feat_type=feat_type, dataset_properties=dataset_properties + ) parent_hyperparameter = {"parent": estimator, "value": estimator_name} cs.add_configuration_space( estimator_name, diff --git a/autosklearn/pipeline/components/regression/adaboost.py b/autosklearn/pipeline/components/regression/adaboost.py index e78a57e6a2..8faae821c3 100644 --- a/autosklearn/pipeline/components/regression/adaboost.py +++ b/autosklearn/pipeline/components/regression/adaboost.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -5,6 +7,7 @@ UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA @@ -62,7 +65,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() # base_estimator = Constant(name="base_estimator", value="None") diff --git a/autosklearn/pipeline/components/regression/ard_regression.py b/autosklearn/pipeline/components/regression/ard_regression.py index 219cb775af..758c4b04d7 100644 --- a/autosklearn/pipeline/components/regression/ard_regression.py +++ b/autosklearn/pipeline/components/regression/ard_regression.py @@ -1,9 +1,12 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( UniformFloatHyperparameter, UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -89,7 +92,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_iter = UnParametrizedHyperparameter("n_iter", value=300) 
tol = UniformFloatHyperparameter( diff --git a/autosklearn/pipeline/components/regression/decision_tree.py b/autosklearn/pipeline/components/regression/decision_tree.py index db59767587..80890889f9 100644 --- a/autosklearn/pipeline/components/regression/decision_tree.py +++ b/autosklearn/pipeline/components/regression/decision_tree.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -8,6 +10,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_none @@ -96,7 +99,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/regression/extra_trees.py b/autosklearn/pipeline/components/regression/extra_trees.py index c4646a2709..b1d8eeb00a 100644 --- a/autosklearn/pipeline/components/regression/extra_trees.py +++ b/autosklearn/pipeline/components/regression/extra_trees.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, @@ -148,7 +151,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/regression/gaussian_process.py b/autosklearn/pipeline/components/regression/gaussian_process.py index 1acf238cd1..d08a3b0239 100644 --- a/autosklearn/pipeline/components/regression/gaussian_process.py +++ b/autosklearn/pipeline/components/regression/gaussian_process.py @@ -1,6 +1,9 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, UNSIGNED_DATA @@ -65,7 +68,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): alpha = UniformFloatHyperparameter( name="alpha", lower=1e-14, upper=1.0, default_value=1e-8, log=True ) diff --git a/autosklearn/pipeline/components/regression/gradient_boosting.py b/autosklearn/pipeline/components/regression/gradient_boosting.py index b7503f5fd0..16b7df965d 100644 --- a/autosklearn/pipeline/components/regression/gradient_boosting.py +++ b/autosklearn/pipeline/components/regression/gradient_boosting.py @@ -1,3 +1,5 @@ +from typing 
import Optional + import numpy as np from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace @@ -9,6 +11,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, @@ -166,7 +169,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() loss = CategoricalHyperparameter( "loss", ["least_squares"], default_value="least_squares" diff --git a/autosklearn/pipeline/components/regression/k_nearest_neighbors.py b/autosklearn/pipeline/components/regression/k_nearest_neighbors.py index 83c13cd191..c16e8a6404 100644 --- a/autosklearn/pipeline/components/regression/k_nearest_neighbors.py +++ b/autosklearn/pipeline/components/regression/k_nearest_neighbors.py @@ -1,9 +1,12 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, UniformIntegerHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA @@ -52,7 +55,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() n_neighbors = UniformIntegerHyperparameter( diff --git a/autosklearn/pipeline/components/regression/liblinear_svr.py b/autosklearn/pipeline/components/regression/liblinear_svr.py index e129331298..62e38c1551 100644 --- a/autosklearn/pipeline/components/regression/liblinear_svr.py +++ b/autosklearn/pipeline/components/regression/liblinear_svr.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause from ConfigSpace.hyperparameters import ( @@ -6,6 +8,7 @@ UniformFloatHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool @@ -83,7 +86,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True, default_value=1.0) loss = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/components/regression/libsvm_svr.py b/autosklearn/pipeline/components/regression/libsvm_svr.py index d4173d7f01..c3ac98b1f9 100644 --- a/autosklearn/pipeline/components/regression/libsvm_svr.py +++ b/autosklearn/pipeline/components/regression/libsvm_svr.py @@ -1,3 +1,5 @@ +from typing import Optional + import resource import sys @@ -10,6 +12,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from 
autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA from autosklearn.util.common import check_for_bool, check_none @@ -149,7 +152,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): C = UniformFloatHyperparameter( name="C", lower=0.03125, upper=32768, log=True, default_value=1.0 ) diff --git a/autosklearn/pipeline/components/regression/mlp.py b/autosklearn/pipeline/components/regression/mlp.py index 645c29403a..42ceff4556 100644 --- a/autosklearn/pipeline/components/regression/mlp.py +++ b/autosklearn/pipeline/components/regression/mlp.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from ConfigSpace.conditions import InCondition from ConfigSpace.configuration_space import ConfigurationSpace @@ -9,6 +11,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, @@ -225,7 +228,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() hidden_layer_depth = UniformIntegerHyperparameter( name="hidden_layer_depth", lower=1, upper=3, default_value=1 diff --git a/autosklearn/pipeline/components/regression/random_forest.py b/autosklearn/pipeline/components/regression/random_forest.py index 128113fc43..043d62e16b 100644 --- a/autosklearn/pipeline/components/regression/random_forest.py +++ b/autosklearn/pipeline/components/regression/random_forest.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, @@ -135,7 +138,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( "criterion", ["mse", "friedman_mse", "mae"] diff --git a/autosklearn/pipeline/components/regression/sgd.py b/autosklearn/pipeline/components/regression/sgd.py index 3b3f939fa8..915e45169f 100644 --- a/autosklearn/pipeline/components/regression/sgd.py +++ b/autosklearn/pipeline/components/regression/sgd.py @@ -1,3 +1,5 @@ +from typing import Optional + from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -6,6 +8,7 @@ UnParametrizedHyperparameter, ) +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import ( AutoSklearnRegressionAlgorithm, IterativeComponent, @@ -185,7 +188,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: 
Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() loss = CategoricalHyperparameter( diff --git a/autosklearn/pipeline/regression.py b/autosklearn/pipeline/regression.py index 638f8ae3cb..dcc2fa3fcf 100644 --- a/autosklearn/pipeline/regression.py +++ b/autosklearn/pipeline/regression.py @@ -8,6 +8,7 @@ from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause from sklearn.base import RegressorMixin +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.base import BasePipeline from autosklearn.pipeline.components import ( feature_preprocessing as feature_preprocessing_components, @@ -68,6 +69,7 @@ class SimpleRegressionPipeline(RegressorMixin, BasePipeline): def __init__( self, config: Optional[Configuration] = None, + feat_type: Optional[FEAT_TYPE_TYPE] = None, steps=None, dataset_properties=None, include=None, @@ -81,6 +83,7 @@ def __init__( if "target_type" not in dataset_properties: dataset_properties["target_type"] = "regression" super().__init__( + feat_type=feat_type, config=config, steps=steps, dataset_properties=dataset_properties, @@ -112,7 +115,11 @@ def predict(self, X, batch_size=None): return y def _get_hyperparameter_search_space( - self, include=None, exclude=None, dataset_properties=None + self, + feat_type: Optional[FEAT_TYPE_TYPE] = None, + include=None, + exclude=None, + dataset_properties=None, ): """Return the configuration space for the CASH problem. @@ -149,6 +156,7 @@ def _get_hyperparameter_search_space( cs = self._get_base_search_space( cs=cs, + feat_type=feat_type, dataset_properties=dataset_properties, exclude=exclude, include=include, @@ -259,7 +267,9 @@ def _get_hyperparameter_search_space( def _get_estimator_components(self): return regression_components._regressors - def _get_pipeline_steps(self, dataset_properties, init_params=None): + def _get_pipeline_steps( + self, dataset_properties, feat_type: Optional[FEAT_TYPE_TYPE] = None + ): steps = [] default_dataset_properties = {"target_type": "regression"} @@ -271,6 +281,7 @@ def _get_pipeline_steps(self, dataset_properties, init_params=None): [ "data_preprocessor", DataPreprocessorChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), @@ -278,6 +289,7 @@ def _get_pipeline_steps(self, dataset_properties, init_params=None): [ "feature_preprocessor", feature_preprocessing_components.FeaturePreprocessorChoice( + feat_type=feat_type, dataset_properties=default_dataset_properties, random_state=self.random_state, ), @@ -285,7 +297,9 @@ def _get_pipeline_steps(self, dataset_properties, init_params=None): [ "regressor", regression_components.RegressorChoice( - default_dataset_properties, random_state=self.random_state + feat_type=feat_type, + dataset_properties=default_dataset_properties, + random_state=self.random_state, ), ], ] diff --git a/autosklearn/util/dask.py b/autosklearn/util/dask.py new file mode 100644 index 0000000000..624fecfae9 --- /dev/null +++ b/autosklearn/util/dask.py @@ -0,0 +1,142 @@ +"""Provides the two simplified use cases of dask that we consider + +1. A UserDask is used when a user supplies a dask client, in which case +we don't close it down and leave it up to the user to control its lifetime. +2. A LocalDask is used when no user dask client is supplied. In this case +we make sure to spin up and close down clients as needed. + +Both of these can be uniformly accessed as a context manager. + +..
code:: python + + # Locally controlled dask client + local_dask = LocalDask(n_jobs=2) + with local_dask as client: + # Do stuff with client + ... + + # `client` is shut down properly + + # ---------------- + + # User controlled dask client + user_dask = UserDask(user_client) + + with user_dask as client: + # Do stuff with (client == user_client) + ... + + # `user_client` is still open and up to the user to close +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + +import tempfile + +from dask.distributed import Client, LocalCluster + +from autosklearn.util.single_thread_client import SingleThreadedClient + + +class Dask(ABC): + @abstractmethod + def client(self) -> Client: + """Should return a dask client""" + ... + + @abstractmethod + def close(self) -> None: + """Should close any resources used by the dask client""" + ... + + def __enter__(self) -> Client: + return self.client() + + def __exit__(self, *args: Any, **kwargs: Any) -> None: + self.close() + + @abstractmethod + def __repr__(self) -> str: + ... + + +class UserDask(Dask): + """A dask instance created by a user""" + + def __init__(self, client: Client): + """ + Parameters + ---------- + client : Client + The client they passed in + """ + self._client = client + + def client(self) -> Client: + """The dask client""" + return self._client + + def close(self) -> None: + """Close the dask client""" + # We do nothing, it's user-provided + pass + + def __repr__(self) -> str: + return "UserDask(...)" + + +class LocalDask(Dask): + def __init__(self, n_jobs: int | None = None) -> None: + self.n_jobs = n_jobs + self._client: Client | None = None + self._cluster: LocalCluster | None = None + + def client(self) -> Client: + """Creates a usable dask client or returns an existing one + + If there is no current client, because it has been closed, create + a new one. + * If ``n_jobs == 1``, create a ``SingleThreadedClient`` + * Else create a ``Client`` with a ``LocalCluster`` + """ + if self._client is not None: + return self._client + + if self.n_jobs == 1: + cluster = None + client = SingleThreadedClient() + else: + cluster = LocalCluster( + n_workers=self.n_jobs, + processes=False, + threads_per_worker=1, + # We use tmpdir to save the workers as deleting workers takes + # more time than deleting backend directories.
+ # This prevents an error saying that the worker file was deleted, + # so the client could not close the worker properly + local_directory=tempfile.gettempdir(), + # Memory is handled by the pynisher, not by the dask worker/nanny + memory_limit=0, + ) + client = Client(cluster, heartbeat_interval=10000) # 10s + + self._client = client + self._cluster = cluster + return self._client + + def close(self) -> None: + """Closes any open dask client""" + if self._client is None: + return + + self._client.close() + if self._cluster is not None: + self._cluster.close() + + self._client = None + self._cluster = None + + def __repr__(self) -> str: + return f"LocalDask(n_jobs = {self.n_jobs})" diff --git a/autosklearn/util/pipeline.py b/autosklearn/util/pipeline.py index d3291069f5..f0a66a2a86 100755 --- a/autosklearn/util/pipeline.py +++ b/autosklearn/util/pipeline.py @@ -1,5 +1,5 @@ # -*- encoding: utf-8 -*- -from typing import Any, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace @@ -11,6 +11,7 @@ MULTIOUTPUT_REGRESSION, REGRESSION_TASKS, ) +from autosklearn.data.abstract_data_manager import AbstractDataManager from autosklearn.pipeline.classification import SimpleClassificationPipeline from autosklearn.pipeline.regression import SimpleRegressionPipeline @@ -18,7 +19,7 @@ def get_configuration_space( - info: Dict[str, Any], + datamanager: AbstractDataManager, include: Optional[Dict[str, List[str]]] = None, exclude: Optional[Dict[str, List[str]]] = None, random_state: Optional[Union[int, np.random.RandomState]] = None, @@ -27,8 +28,8 @@ def get_configuration_space( Parameters ---------- - info: Dict[str, Any] - Information about the dataset + datamanager: AbstractDataManager + AbstractDataManager object storing all important information about the dataset include: Optional[Dict[str, List[str]]] = None A dictionary of what components to include for each pipeline step @@ -44,16 +45,18 @@ def get_configuration_space( ConfigurationSpace The configuration space for the pipeline """ - if info["task"] in REGRESSION_TASKS: - return _get_regression_configuration_space(info, include, exclude, random_state) + if datamanager.info["task"] in REGRESSION_TASKS: + return _get_regression_configuration_space( + datamanager, include, exclude, random_state + ) else: return _get_classification_configuration_space( - info, include, exclude, random_state + datamanager, include, exclude, random_state ) def _get_regression_configuration_space( - info: Dict[str, Any], + datamanager: AbstractDataManager, include: Optional[Dict[str, List[str]]], exclude: Optional[Dict[str, List[str]]], random_state: Optional[Union[int, np.random.RandomState]] = None, @@ -62,8 +65,8 @@ def _get_regression_configuration_space( Parameters ---------- - info: Dict[str, Any] - Information about the dataset + datamanager: AbstractDataManager + AbstractDataManager object storing all important information about the dataset include: Optional[Dict[str, List[str]]] = None A dictionary of what components to include for each pipeline step @@ -79,28 +82,29 @@ def _get_regression_configuration_space( ConfigurationSpace The configuration space for the regression pipeline """ - task_type = info["task"] + task_type = datamanager.info["task"] sparse = False multioutput = False if task_type == MULTIOUTPUT_REGRESSION: multioutput = True - if info["is_sparse"] == 1: + if datamanager.info["is_sparse"] == 1: sparse = True dataset_properties = {"multioutput":
multioutput, "sparse": sparse} configuration_space = SimpleRegressionPipeline( + feat_type=datamanager.feat_type, dataset_properties=dataset_properties, include=include, exclude=exclude, random_state=random_state, - ).get_hyperparameter_search_space() + ).get_hyperparameter_search_space(feat_type=datamanager.feat_type) return configuration_space def _get_classification_configuration_space( - info: Dict[str, Any], + datamanager: AbstractDataManager, include: Optional[Dict[str, List[str]]], exclude: Optional[Dict[str, List[str]]], random_state: Optional[Union[int, np.random.RandomState]] = None, @@ -109,8 +113,8 @@ def _get_classification_configuration_space( Parameters ---------- - info: Dict[str, Any] - Information about the dataset + datamanager: AbstractDataManager + AbstractDataManager object storing all important information about the dataset include: Optional[Dict[str, List[str]]] = None A dictionary of what components to include for each pipeline step @@ -126,7 +130,7 @@ def _get_classification_configuration_space( ConfigurationSpace The configuration space for the classification pipeline """ - task_type = info["task"] + task_type = datamanager.info["task"] multilabel = False multiclass = False @@ -139,7 +143,7 @@ def _get_classification_configuration_space( if task_type == BINARY_CLASSIFICATION: pass - if info["is_sparse"] == 1: + if datamanager.info["is_sparse"] == 1: sparse = True dataset_properties = { @@ -149,8 +153,9 @@ def _get_classification_configuration_space( } return SimpleClassificationPipeline( + feat_type=datamanager.feat_type, dataset_properties=dataset_properties, include=include, exclude=exclude, random_state=random_state, - ).get_hyperparameter_search_space() + ).get_hyperparameter_search_space(feat_type=datamanager.feat_type) diff --git a/examples/40_advanced/example_text_preprocessing.py b/examples/40_advanced/example_text_preprocessing.py index 7c65825b7b..ba7deffe03 100644 --- a/examples/40_advanced/example_text_preprocessing.py +++ b/examples/40_advanced/example_text_preprocessing.py @@ -59,7 +59,6 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, per_run_time_limit=30, - tmp_folder="/tmp/autosklearn_text_example_tmp", ) automl.fit(X_train, y_train, dataset_name="20_Newsgroups") # fit the automl model diff --git a/examples/80_extending/example_extending_classification.py b/examples/80_extending/example_extending_classification.py index b5112c022b..9e46b9e8cd 100644 --- a/examples/80_extending/example_extending_classification.py +++ b/examples/80_extending/example_extending_classification.py @@ -6,6 +6,7 @@ The following example demonstrates how to create a new classification component for using in auto-sklearn. 
""" +from typing import Optional from pprint import pprint from ConfigSpace.configuration_space import ConfigurationSpace @@ -16,6 +17,8 @@ ) import sklearn.metrics + +from autosklearn.askl_typing import FEAT_TYPE_TYPE import autosklearn.classification import autosklearn.pipeline.components.classification from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm @@ -100,7 +103,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() hidden_layer_depth = UniformIntegerHyperparameter( name="hidden_layer_depth", lower=1, upper=3, default_value=1 diff --git a/examples/80_extending/example_extending_data_preprocessor.py b/examples/80_extending/example_extending_data_preprocessor.py index aa5c443255..eb0325d9df 100644 --- a/examples/80_extending/example_extending_data_preprocessor.py +++ b/examples/80_extending/example_extending_data_preprocessor.py @@ -5,12 +5,15 @@ The following example demonstrates how to turn off data preprocessing step in auto-skearn. """ +from typing import Optional from pprint import pprint import autosklearn.classification import autosklearn.pipeline.components.data_preprocessing import sklearn.metrics from ConfigSpace.configuration_space import ConfigurationSpace + +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT from sklearn.datasets import load_breast_cancer @@ -49,7 +52,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): return ConfigurationSpace() # Return an empty configuration as there is None diff --git a/examples/80_extending/example_extending_preprocessor.py b/examples/80_extending/example_extending_preprocessor.py index 1eb3fc1daf..8516931780 100644 --- a/examples/80_extending/example_extending_preprocessor.py +++ b/examples/80_extending/example_extending_preprocessor.py @@ -7,6 +7,7 @@ discriminant analysis (LDA) algorithm from sklearn and use it as a preprocessor in auto-sklearn. 
""" +from typing import Optional from pprint import pprint from ConfigSpace.configuration_space import ConfigurationSpace @@ -17,6 +18,8 @@ from ConfigSpace.conditions import InCondition import sklearn.metrics + +from autosklearn.askl_typing import FEAT_TYPE_TYPE import autosklearn.classification import autosklearn.pipeline.components.feature_preprocessing from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm @@ -76,7 +79,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() solver = CategoricalHyperparameter( name="solver", choices=["svd", "lsqr", "eigen"], default_value="svd" diff --git a/examples/80_extending/example_extending_regression.py b/examples/80_extending/example_extending_regression.py index 4d6987a9db..ad2fb8850f 100644 --- a/examples/80_extending/example_extending_regression.py +++ b/examples/80_extending/example_extending_regression.py @@ -6,6 +6,7 @@ The following example demonstrates how to create a new regression component for using in auto-sklearn. """ +from typing import Optional from pprint import pprint from ConfigSpace.configuration_space import ConfigurationSpace @@ -17,6 +18,8 @@ from ConfigSpace.conditions import EqualsCondition import sklearn.metrics + +from autosklearn.askl_typing import FEAT_TYPE_TYPE import autosklearn.regression import autosklearn.pipeline.components.regression from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm @@ -86,7 +89,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() alpha = UniformFloatHyperparameter( name="alpha", lower=10**-5, upper=1, log=True, default_value=1.0 diff --git a/examples/80_extending/example_restrict_number_of_hyperparameters.py b/examples/80_extending/example_restrict_number_of_hyperparameters.py index d8bd2f4a98..a17aa128aa 100644 --- a/examples/80_extending/example_restrict_number_of_hyperparameters.py +++ b/examples/80_extending/example_restrict_number_of_hyperparameters.py @@ -7,6 +7,7 @@ component with a new component, implementing the same classifier, but with different hyperparameters . 
""" +from typing import Optional from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -17,6 +18,7 @@ from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split +from autosklearn.askl_typing import FEAT_TYPE_TYPE import autosklearn.classification import autosklearn.pipeline.components.classification from autosklearn.pipeline.components.classification import ( @@ -84,7 +86,9 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space( + feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None + ): cs = ConfigurationSpace() # The maximum number of features used in the forest is calculated as m^max_features, where diff --git a/scripts/02_retrieve_metadata.py b/scripts/02_retrieve_metadata.py index f87f65ecc4..56a0395b9e 100644 --- a/scripts/02_retrieve_metadata.py +++ b/scripts/02_retrieve_metadata.py @@ -1,5 +1,4 @@ from argparse import ArgumentParser -from collections import defaultdict import csv import glob import itertools @@ -10,6 +9,7 @@ import numpy as np from ConfigSpace.configuration_space import Configuration +from ConfigSpace.util import deactivate_inactive_hyperparameters from autosklearn.constants import * from autosklearn.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS @@ -66,8 +66,18 @@ def retrieve_matadata( n_better += 1 try: + for hp in configuration_space.get_hyperparameters(): + if hp.name not in config: + config[hp.name] = hp.default_value + best_configuration = Configuration( - configuration_space=configuration_space, values=config + configuration_space=configuration_space, + values=config, + allow_inactive_with_values=True, + ) + best_configuration = deactivate_inactive_hyperparameters( + configuration=best_configuration, + configuration_space=configuration_space, ) best_value = score best_configuration_dir = validation_trajectory_file @@ -178,6 +188,11 @@ def write_output(outputs, configurations, output_dir, configuration_space, metri fh.write("%s: %s\n" % (key, description[key])) +class DummyDatamanager(): + def __init__(self, info, feat_type=None): + self.info = info + self.feat_type = feat_type + def main(): parser = ArgumentParser() @@ -220,7 +235,10 @@ def main(): ) configuration_space = pipeline.get_configuration_space( - {"is_sparse": sparse, "task": task} + DummyDatamanager( + info={"is_sparse": sparse, "task": task}, + feat_type={"A": "numerical", "B": "categorical"} + ) ) outputs, configurations = retrieve_matadata( diff --git a/test/fixtures/ensemble_building.py b/test/fixtures/ensemble_building.py index 548d1c5d72..20e771fd1a 100644 --- a/test/fixtures/ensemble_building.py +++ b/test/fixtures/ensemble_building.py @@ -231,7 +231,10 @@ def _make( # Hence, we take the y_train of the datamanager and use that as the # the targets if "Y_train" in datamanager.data: - backend.save_targets_ensemble(datamanager.data["Y_train"]) + backend.save_additional_data( + datamanager.data["Y_train"], + what="targets_ensemble", + ) return EnsembleBuilderManager( backend=backend, diff --git a/test/fixtures/ensembles.py b/test/fixtures/ensembles.py index 467c53822f..250841511a 100644 --- a/test/fixtures/ensembles.py +++ b/test/fixtures/ensembles.py @@ -5,7 +5,11 @@ import numpy as np from sklearn.ensemble import VotingClassifier, VotingRegressor -from autosklearn.data.validation import SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES +from 
autosklearn.data.validation import ( + SUPPORTED_FEAT_TYPES, + SUPPORTED_TARGET_TYPES, + InputValidator, +) from autosklearn.evaluation.abstract_evaluator import ( MyDummyClassifier, MyDummyRegressor, @@ -42,9 +46,16 @@ def _make( seed: Union[int, None, np.random.RandomState] = DEFAULT_SEED, ) -> VotingClassifier: assert not (X is None) ^ (y is None) - if not models: - models = [MyDummyClassifier(config=1, random_state=seed) for _ in range(5)] + validator = InputValidator(is_classification=True).fit(X, y) + models = [ + MyDummyClassifier( + feat_type=validator.feature_validator.feat_type, + config=1, + random_state=seed, + ) + for _ in range(5) + ] if X is not None: for model in models: @@ -81,7 +92,15 @@ def _make( assert not (X is None) ^ (y is None) if not models: - models = [MyDummyRegressor(config=1, random_state=seed) for _ in range(5)] + validator = InputValidator(is_classification=False).fit(X, y) + models = [ + MyDummyRegressor( + feat_type=validator.feature_validator.feat_type, + config=1, + random_state=seed, + ) + for _ in range(5) + ] if X is not None: for model in models: diff --git a/test/test_automl/test_construction.py b/test/test_automl/test_construction.py index 5b68d35118..be6fe0e39b 100644 --- a/test/test_automl/test_construction.py +++ b/test/test_automl/test_construction.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union from autosklearn.automl import AutoML +from autosklearn.util.dask import LocalDask from autosklearn.util.data import default_dataset_compression_arg from autosklearn.util.single_thread_client import SingleThreadedClient @@ -87,4 +88,7 @@ def test_single_job_and_no_dask_client_sets_correct_multiprocessing_context() -> assert automl._multiprocessing_context == "fork" assert automl._n_jobs == 1 - assert isinstance(automl._dask_client, SingleThreadedClient) + assert isinstance(automl._dask, LocalDask) + + with automl._dask as client: + assert isinstance(client, SingleThreadedClient) diff --git a/test/test_estimators/test_estimators.py b/test/test_estimators/test_estimators.py index d0d3f28bdb..e1e33d684a 100644 --- a/test/test_estimators/test_estimators.py +++ b/test/test_estimators/test_estimators.py @@ -140,8 +140,6 @@ def __call__(self, *args, **kwargs): assert count_succeses(automl.cv_results_) > 0 assert includes_train_scores(automl.performance_over_time_.columns) is True assert performance_over_time_is_plausible(automl.performance_over_time_) is True - # For travis-ci it is important that the client no longer exists - assert automl.automl_._dask_client is None def test_feat_type_wrong_arguments(): diff --git a/test/test_evaluation/test_dummy_pipelines.py b/test/test_evaluation/test_dummy_pipelines.py index 8d1005e178..c69578420f 100644 --- a/test/test_evaluation/test_dummy_pipelines.py +++ b/test/test_evaluation/test_dummy_pipelines.py @@ -23,8 +23,10 @@ def test_dummy_pipeline(task_type: str) -> None: pytest.fail(task_type) return - estimator = estimator_class(config=1, random_state=0) X, y = data_maker(random_state=0) + estimator = estimator_class( + feat_type={i: "numerical" for i in range(X.shape[1])}, config=1, random_state=0 + ) estimator.fit(X, y) check_is_fitted(estimator) diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 02eedcca91..52eccf489b 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -86,11 +86,22 @@ def test_datasets(self): self.assertTrue(np.isfinite(return_value[0]["loss"])) +class 
DummyDatamanager: + def __init__(self): + self.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False} + self.feat_type = { + 0: "numerical", + 1: "Numerical", + 2: "numerical", + 3: "numerical", + } + + class FunctionsTest(unittest.TestCase): def setUp(self): self.queue = multiprocessing.Queue() self.configuration = get_configuration_space( - {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False} + DummyDatamanager() ).get_default_configuration() self.data = get_multiclass_classification_datamanager() self.tmp_dir = os.path.join(os.path.dirname(__file__), ".test_cv_functions") diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index c8fe1c5f87..e23c706847 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -2833,11 +2833,22 @@ def test_holdout_split_size(self, te_mock): self.assertEqual(len(test_samples), 3) +class DummyDatamanager: + def __init__(self): + self.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False} + self.feat_type = { + 0: "numerical", + 1: "Numerical", + 2: "numerical", + 3: "numerical", + } + + class FunctionsTest(unittest.TestCase): def setUp(self): self.queue = multiprocessing.Queue() self.configuration = get_configuration_space( - {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False} + DummyDatamanager() ).get_default_configuration() self.data = get_multiclass_classification_datamanager() self.tmp_dir = os.path.join( @@ -3096,7 +3107,7 @@ def test_eval_holdout_budget_iterations_multi_objective(self): def test_eval_holdout_budget_iterations_converged_multi_objective(self): configuration = get_configuration_space( exclude={"classifier": ["random_forest", "liblinear_svc"]}, - info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, + datamanager=DummyDatamanager(), ).get_default_configuration() eval_holdout( queue=self.queue, @@ -3131,7 +3142,7 @@ def test_eval_holdout_budget_iterations_converged(self): } configuration = get_configuration_space( exclude={"classifier": ["random_forest", "liblinear_svc"]}, - info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, + datamanager=DummyDatamanager(), ).get_default_configuration() eval_holdout( queue=self.queue, @@ -3248,7 +3259,7 @@ def test_eval_holdout_budget_mixed_iterations(self): def test_eval_holdout_budget_mixed_subsample(self): configuration = get_configuration_space( exclude={"classifier": ["random_forest"]}, - info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, + datamanager=DummyDatamanager(), ).get_default_configuration() self.assertEqual(configuration["classifier:__choice__"], "liblinear_svc") eval_holdout( diff --git a/test/test_metalearning/pyMetaLearn/test_metalearner.py b/test/test_metalearning/pyMetaLearn/test_metalearner.py index 42d27d49da..33ce3f9f88 100644 --- a/test/test_metalearning/pyMetaLearn/test_metalearner.py +++ b/test/test_metalearning/pyMetaLearn/test_metalearner.py @@ -25,6 +25,7 @@ def setUp(self): pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline() self.cs = pipeline.get_hyperparameter_search_space() + # print(self.cs.get_default_configuration()) self.logger = logging.getLogger() meta_base = MetaBase(self.cs, data_dir, logger=self.logger) diff --git a/test/test_metalearning/pyMetaLearn/test_metalearning_configuration.py b/test/test_metalearning/pyMetaLearn/test_metalearning_configuration.py new file mode 100644 index 0000000000..1e08805d87 --- /dev/null +++ 
b/test/test_metalearning/pyMetaLearn/test_metalearning_configuration.py @@ -0,0 +1,40 @@ +import logging +import os + +import autosklearn.metalearning.optimizers.metalearn_optimizer.metalearner as metalearner # noqa: E501 +import autosklearn.pipeline.classification +from autosklearn.metalearning.metalearning.meta_base import MetaBase + +import unittest + +logging.basicConfig() + + +class MetalearningConfiguration(unittest.TestCase): + def test_metalearning_cs_size(self): + self.cwd = os.getcwd() + data_dir = os.path.dirname(__file__) + data_dir = os.path.join(data_dir, "test_meta_base_data") + os.chdir(data_dir) + + for feat_type, cs_size in [ + ({"A": "numerical"}, 165), + ({"A": "categorical"}, 162), + ({"A": "string"}, 174), + ({"A": "numerical", "B": "categorical"}, 168), + ({"A": "numerical", "B": "string"}, 180), + ({"A": "categorical", "B": "string"}, 177), + ({"A": "categorical", "B": "string", "C": "numerical"}, 183), + ]: + pipeline = autosklearn.pipeline.classification.SimpleClassificationPipeline( + feat_type=feat_type + ) + self.cs = pipeline.get_hyperparameter_search_space(feat_type=feat_type) + # print(self.cs.get_default_configuration()) + + self.logger = logging.getLogger() + meta_base = MetaBase(self.cs, data_dir, logger=self.logger) + self.meta_optimizer = metalearner.MetaLearningOptimizer( + "233", self.cs, meta_base, logger=self.logger + ) + self.assertEqual(len(self.meta_optimizer.configuration_space), cs_size) diff --git a/test/test_pipeline/components/data_preprocessing/test_balancing.py b/test/test_pipeline/components/data_preprocessing/test_balancing.py index 6a76ce419c..a128559833 100644 --- a/test/test_pipeline/components/data_preprocessing/test_balancing.py +++ b/test/test_pipeline/components/data_preprocessing/test_balancing.py @@ -215,7 +215,7 @@ def test_weighting_effect(self): default._values["balancing:strategy"] = strategy classifier = SimpleClassificationPipeline( - default, random_state=1, include=include + config=default, random_state=1, include=include ) Xt, fit_params = classifier.fit_transformer(X_train, Y_train) classifier.fit_estimator(Xt, Y_train, **fit_params) diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_feat_type.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_feat_type.py new file mode 100644 index 0000000000..0cf5ee6bd2 --- /dev/null +++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_feat_type.py @@ -0,0 +1,135 @@ +from autosklearn.pipeline.components.data_preprocessing.feature_type import ( + FeatTypeSplit, +) + +import unittest + + +class PreprocessingPipelineFeatTypeTest(unittest.TestCase): + def test_single_type(self): + DPP = FeatTypeSplit(feat_type={"A": "numerical"}) + cs = DPP.get_hyperparameter_search_space( + feat_type={"A": "numerical"}, + dataset_properties={ + "task": 1, + "sparse": False, + "multilabel": False, + "multiclass": False, + "target_type": "classification", + "signed": False, + }, + ) + for key in cs.get_hyperparameters_dict().keys(): + self.assertNotIn("text", key.split(":")[0]) + self.assertNotIn("categorical", key.split(":")[0]) + self.assertEqual(len(cs), 6) + + DPP = FeatTypeSplit(feat_type={"A": "categorical"}) + cs = DPP.get_hyperparameter_search_space( + feat_type={"A": "categorical"}, + dataset_properties={ + "task": 1, + "sparse": False, + "multilabel": False, + "multiclass": False, + "target_type": "classification", + "signed": False, + }, + ) + for key in cs.get_hyperparameters_dict().keys(): + 
self.assertNotIn("text", key.split(":")[0]) + self.assertNotIn("numerical", key.split(":")[0]) + self.assertEqual(len(cs), 3) + + DPP = FeatTypeSplit(feat_type={"A": "string"}) + cs = DPP.get_hyperparameter_search_space( + feat_type={"A": "string"}, + dataset_properties={ + "task": 1, + "sparse": False, + "multilabel": False, + "multiclass": False, + "target_type": "classification", + "signed": False, + }, + ) + for key in cs.get_hyperparameters_dict().keys(): + self.assertNotIn("numerical", key.split(":")[0]) + self.assertNotIn("categorical", key.split(":")[0]) + self.assertEqual(len(cs), 15) + + def test_dual_type(self): + DPP = FeatTypeSplit(feat_type={"A": "numerical", "B": "categorical"}) + cs = DPP.get_hyperparameter_search_space( + feat_type={"A": "numerical", "B": "categorical"}, + dataset_properties={ + "task": 1, + "sparse": False, + "multilabel": False, + "multiclass": False, + "target_type": "classification", + "signed": False, + }, + ) + for key in cs.get_hyperparameters_dict().keys(): + self.assertNotIn("text", key.split(":")[0]) + self.assertEqual(len(cs), 9) + + DPP = FeatTypeSplit(feat_type={"A": "categorical", "B": "string"}) + cs = DPP.get_hyperparameter_search_space( + feat_type={"A": "categorical", "B": "string"}, + dataset_properties={ + "task": 1, + "sparse": False, + "multilabel": False, + "multiclass": False, + "target_type": "classification", + "signed": False, + }, + ) + for key in cs.get_hyperparameters_dict().keys(): + self.assertNotIn("numerical", key.split(":")[0]) + self.assertEqual(len(cs), 18) + + DPP = FeatTypeSplit(feat_type={"A": "string", "B": "numerical"}) + cs = DPP.get_hyperparameter_search_space( + feat_type={"A": "string", "B": "numerical"}, + dataset_properties={ + "task": 1, + "sparse": False, + "multilabel": False, + "multiclass": False, + "target_type": "classification", + "signed": False, + }, + ) + for key in cs.get_hyperparameters_dict().keys(): + self.assertNotIn("categorical", key.split(":")[0]) + self.assertEqual(len(cs), 21) + + def test_triple_type(self): + DPP = FeatTypeSplit( + feat_type={"A": "numerical", "B": "categorical", "C": "string"} + ) + cs = DPP.get_hyperparameter_search_space( + feat_type={"A": "numerical", "B": "categorical", "C": "string"}, + dataset_properties={ + "task": 1, + "sparse": False, + "multilabel": False, + "multiclass": False, + "target_type": "classification", + "signed": False, + }, + ) + truth_table = [False] * 3 + for key in cs.get_hyperparameters_dict().keys(): + if "text" in key.split(":")[0]: + truth_table[0] = True + elif "categorical" in key.split(":")[0]: + truth_table[1] = True + elif "numerical" in key.split(":")[0]: + truth_table[2] = True + + self.assertEqual(sum(truth_table), 3) + self.assertEqual(len(cs), 24) diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py index d25cef2a2b..6110793c8c 100644 --- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py +++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_numerical.py @@ -11,13 +11,17 @@ class NumericalPreprocessingPipelineTest(unittest.TestCase): def test_data_type_consistency(self): X = np.random.rand(3, 4) - Y = NumericalPreprocessingPipeline().fit_transform(X) + Y = NumericalPreprocessingPipeline( + feat_type={0: "numerical", 1: "numerical", 2: "numerical"} + ).fit_transform(X) self.assertFalse(sparse.issparse(Y)) X = sparse.csc_matrix( ([3.0, 6.0, 
4.0, 5.0], ([0, 1, 2, 1], [3, 2, 1, 0])), shape=(3, 4) ) - Y = NumericalPreprocessingPipeline().fit_transform(X) + Y = NumericalPreprocessingPipeline( + feat_type={0: "numerical", 1: "numerical", 2: "numerical"} + ).fit_transform(X) self.assertTrue(sparse.issparse(Y)) def test_fit_transform(self): @@ -37,12 +41,16 @@ def test_fit_transform(self): ] ) # noqa : matrix legibility # dense input - Yt = NumericalPreprocessingPipeline().fit_transform(X) + Yt = NumericalPreprocessingPipeline( + feat_type={0: "numerical", 1: "numerical", 2: "numerical"} + ).fit_transform(X) np.testing.assert_array_almost_equal(Yt, Y1) # sparse input (uses with_mean=False) Y2 = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]) / sdev X_sparse = sparse.csc_matrix(X) - Yt = NumericalPreprocessingPipeline().fit_transform(X_sparse) + Yt = NumericalPreprocessingPipeline( + feat_type={0: "numerical", 1: "numerical", 2: "numerical"} + ).fit_transform(X_sparse) np.testing.assert_array_almost_equal(Yt.todense(), Y2) def test_transform(self): @@ -51,7 +59,9 @@ def test_transform(self): ) # noqa : matrix legibility sdev = np.sqrt(2 / 3) # fit - NPP = NumericalPreprocessingPipeline() + NPP = NumericalPreprocessingPipeline( + feat_type={0: "numerical", 1: "numerical", 2: "numerical"} + ) NPP.fit_transform(X1) # transform X2 = np.array([[1.0, 5.0, 8.0], [2.0, 6.0, 9.0], [3.0, 7.0, np.nan]]) diff --git a/test/test_pipeline/components/data_preprocessing/test_scaling.py b/test/test_pipeline/components/data_preprocessing/test_scaling.py index b87223d14d..faa5b3f1e1 100644 --- a/test/test_pipeline/components/data_preprocessing/test_scaling.py +++ b/test/test_pipeline/components/data_preprocessing/test_scaling.py @@ -19,7 +19,7 @@ def _test_helper(self, Preprocessor, dataset=None, make_sparse=False): original_X_train = X_train.copy() configuration_space = Preprocessor( dataset_properties - ).get_hyperparameter_search_space(dataset_properties) + ).get_hyperparameter_search_space(dataset_properties=dataset_properties) default = configuration_space.get_default_configuration() preprocessor = Preprocessor(dataset_properties, random_state=1) diff --git a/test/test_pipeline/test_base.py b/test/test_pipeline/test_base.py index f8cfe26912..1b604caf58 100644 --- a/test/test_pipeline/test_base.py +++ b/test/test_pipeline/test_base.py @@ -1,17 +1,20 @@ +from typing import Optional + import ConfigSpace.configuration_space import autosklearn.pipeline.base import autosklearn.pipeline.components.base import autosklearn.pipeline.components.classification as classification import autosklearn.pipeline.components.feature_preprocessing as feature_preprocessing +from autosklearn.askl_typing import FEAT_TYPE_TYPE import unittest import unittest.mock class BasePipelineMock(autosklearn.pipeline.base.BasePipeline): - def __init__(self): - pass + def __init__(self, feat_type: Optional[FEAT_TYPE_TYPE] = None): + self.feat_type = feat_type class BaseTest(unittest.TestCase): @@ -37,7 +40,11 @@ def test_get_hyperparameter_configuration_space_3choices(self): base = BasePipelineMock() cs = base._get_base_search_space( - cs, dataset_properties, exclude, include, pipeline + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=pipeline, ) self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13) @@ -51,7 +58,11 @@ def test_get_hyperparameter_configuration_space_3choices(self): dataset_properties = {"target_type": "classification", "signed": True} include = {"c": ["multinomial_nb"]} cs = base._get_base_search_space( 
- cs, dataset_properties, exclude, include, pipeline + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=pipeline, ) self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13) self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 10) @@ -66,7 +77,11 @@ def test_get_hyperparameter_configuration_space_3choices(self): dataset_properties = {"target_type": "classification", "signed": True} include = {} cs = base._get_base_search_space( - cs, dataset_properties, exclude, include, pipeline + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=pipeline, ) self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 13) self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15) @@ -78,7 +93,11 @@ def test_get_hyperparameter_configuration_space_3choices(self): cs = ConfigSpace.configuration_space.ConfigurationSpace() dataset_properties = {"target_type": "classification", "sparse": True} cs = base._get_base_search_space( - cs, dataset_properties, exclude, include, pipeline + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=pipeline, ) self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 12) self.assertEqual(len(cs.get_hyperparameter("p1:__choice__").choices), 15) @@ -93,7 +112,11 @@ def test_get_hyperparameter_configuration_space_3choices(self): "signed": True, } cs = base._get_base_search_space( - cs, dataset_properties, exclude, include, pipeline + cs=cs, + dataset_properties=dataset_properties, + exclude=exclude, + include=include, + pipeline=pipeline, ) self.assertEqual(len(cs.get_hyperparameter("p0:__choice__").choices), 12) diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 7be8038119..94fd8c7a65 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Union +from typing import Any, Dict, Optional, Union import copy import itertools @@ -20,6 +20,7 @@ import autosklearn.pipeline.components.classification as classification_components import autosklearn.pipeline.components.feature_preprocessing as preprocessing_components +from autosklearn.askl_typing import FEAT_TYPE_TYPE from autosklearn.pipeline.classification import SimpleClassificationPipeline from autosklearn.pipeline.components.base import ( AutoSklearnChoice, @@ -61,7 +62,7 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(feat_type=None, dataset_properties=None): cs = ConfigurationSpace() return cs @@ -352,16 +353,6 @@ def test_configurations_categorical_data(self): ------- * All configurations should fit, predict and predict_proba successfully """ - pipeline = SimpleClassificationPipeline( - dataset_properties={"sparse": False}, - include={ - "feature_preprocessor": ["no_preprocessing"], - "classifier": ["sgd", "adaboost"], - }, - ) - - cs = pipeline.get_hyperparameter_search_space() - categorical_columns = [ True, True, @@ -407,6 +398,17 @@ def test_configurations_categorical_data(self): for i, is_categorical in enumerate(categorical_columns) } + pipeline = SimpleClassificationPipeline( + feat_type=categorical, + dataset_properties={"sparse": False}, + include={ + "feature_preprocessor": ["no_preprocessing"], + "classifier": ["sgd", "adaboost"], + }, + ) + + cs = 
pipeline.get_hyperparameter_search_space(feat_type=categorical) + here = os.path.dirname(__file__) dataset_path = os.path.join( here, "components", "data_preprocessing", "dataset.pkl" @@ -429,7 +431,10 @@ def test_configurations_categorical_data(self): init_params = {"data_preprocessor:feat_type": categorical} self._test_configurations( - configurations_space=cs, dataset=data, init_params=init_params + configurations_space=cs, + dataset=data, + init_params=init_params, + feat_type=categorical, ) @unittest.mock.patch( @@ -459,7 +464,8 @@ def test_categorical_passed_to_one_hot_encoder(self, ohe_mock): feat_types = {0: "categorical", 1: "numerical"} cls = SimpleClassificationPipeline( - init_params={"data_preprocessor:feat_type": feat_types} + feat_type=feat_types, + init_params={"data_preprocessor:feat_type": feat_types}, ) init_args = ohe_mock.call_args[1]["init_params"] @@ -468,8 +474,11 @@ def test_categorical_passed_to_one_hot_encoder(self, ohe_mock): # Check through `set_hyperparameters` feat_types = {0: "categorical", 1: "categorical", 2: "numerical"} - default = cls.get_hyperparameter_search_space().get_default_configuration() + default = cls.get_hyperparameter_search_space( + feat_type=feat_types + ).get_default_configuration() cls.set_hyperparameters( + feat_type=feat_types, configuration=default, init_params={"data_preprocessor:feat_type": feat_types}, ) @@ -485,6 +494,7 @@ def _test_configurations( init_params: Dict[str, Any] = None, dataset_properties: Dict[str, Any] = None, n_samples: int = 10, + feat_type: Optional[FEAT_TYPE_TYPE] = None, ): """Tests a configuration space by taking multiple samples and fiting each before calling predict and predict_proba. @@ -560,9 +570,13 @@ def _test_configurations( init_params_ = copy.deepcopy(init_params) cls = SimpleClassificationPipeline( - dataset_properties=dataset_properties, init_params=init_params_ + feat_type=feat_type, + dataset_properties=dataset_properties, + init_params=init_params_, + ) + cls.set_hyperparameters( + config, init_params=init_params_, feat_type=feat_type ) - cls.set_hyperparameters(config, init_params=init_params_) # First make sure that for this configuration, setting the parameters # does not mistakenly set the estimator as fitted @@ -659,7 +673,9 @@ def test_get_hyperparameter_search_space(self): * (n_hyperparameters - 4) different conditionals for the pipeline * 53 forbidden combinations """ - pipeline = SimpleClassificationPipeline() + pipeline = SimpleClassificationPipeline( + feat_type={"A": "numerical", "B": "categorical", "C": "string"} + ) cs = pipeline.get_hyperparameter_search_space() self.assertIsInstance(cs, ConfigurationSpace) @@ -897,7 +913,10 @@ def test_predict_proba_batched(self): perform near identically """ # Multiclass - cls = SimpleClassificationPipeline(include={"classifier": ["sgd"]}) + cls = SimpleClassificationPipeline( + feat_type={i: "numerical" for i in range(0, 64)}, + include={"classifier": ["sgd"]}, + ) X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits") with ignore_warnings(classifier_warnings): @@ -951,6 +970,7 @@ def test_predict_proba_batched_sparse(self): perform near identically """ cls = SimpleClassificationPipeline( + feat_type={i: "numerical" for i in range(0, 64)}, dataset_properties={"sparse": True, "multiclass": True}, include={"classifier": ["sgd"]}, ) @@ -1313,3 +1333,43 @@ def test_fit_instantiates_component(self): del preprocessing_components.additional_components.components[ "CrashPreprocessor" ] + + def 
test_get_hyperparameter_search_space_feat_type(self): + cs_mc = SimpleClassificationPipeline( + feat_type={"1": "numerical"} + ).get_hyperparameter_search_space(dataset_properties={"multiclass": True}) + self.assertNotIn("data_preprocessor:feature_type:categorical", str(cs_mc)) + self.assertNotIn("data_preprocessor:feature_type:text", str(cs_mc)) + + cs_mc = SimpleClassificationPipeline( + feat_type={"1": "categorical"} + ).get_hyperparameter_search_space(dataset_properties={"multilabel": True}) + self.assertNotIn("data_preprocessor:feature_type:numerical", str(cs_mc)) + self.assertNotIn("data_preprocessor:feature_type:text", str(cs_mc)) + + cs_mc = SimpleClassificationPipeline( + feat_type={"1": "string"} + ).get_hyperparameter_search_space(dataset_properties={"sparse": True}) + self.assertNotIn("data_preprocessor:feature_type:numerical", str(cs_mc)) + self.assertNotIn("data_preprocessor:feature_type:categorical", str(cs_mc)) + + cs_mc = SimpleClassificationPipeline( + feat_type={"1": "numerical", "2": "categorical"} + ).get_hyperparameter_search_space( + dataset_properties={"multilabel": True, "multiclass": True} + ) + self.assertNotIn("data_preprocessor:feature_type:text", str(cs_mc)) + + cs_mc = SimpleClassificationPipeline( + feat_type={"1": "numerical", "2": "string"} + ).get_hyperparameter_search_space( + dataset_properties={"multilabel": True, "multiclass": True} + ) + self.assertNotIn("data_preprocessor:feature_type:categorical", str(cs_mc)) + + cs_mc = SimpleClassificationPipeline( + feat_type={"1": "categorical", "2": "string"} + ).get_hyperparameter_search_space( + dataset_properties={"multilabel": True, "multiclass": True} + ) + self.assertNotIn("data_preprocessor:feature_type:numerical", str(cs_mc)) diff --git a/test/test_util/test_dask.py b/test/test_util/test_dask.py new file mode 100644 index 0000000000..1dbc290500 --- /dev/null +++ b/test/test_util/test_dask.py @@ -0,0 +1,75 @@ +from pathlib import Path + +from dask.distributed import Client, LocalCluster + +from autosklearn.util.dask import LocalDask, UserDask + +import pytest + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +def test_user_dask(tmp_path: Path, n_jobs: int) -> None: + """ + Expects + ------- + * A UserDask should not close the client after exiting context + """ + cluster = LocalCluster( + n_workers=n_jobs, + processes=False, + threads_per_worker=1, + local_directory=tmp_path, + ) + client = Client(cluster, heartbeat_interval=10000) + + # Active at creation + dask = UserDask(client) + + client_1 = None + with dask as user_client: + client_1 = user_client + assert user_client.status == "running" + + client_2 = None + with dask as user_client: + assert user_client.status == "running" + client_2 = user_client + + # Make sure they are the same client + assert id(client_1) == id(client_2) + + # Remains running after context + assert client_1.status == "running" + + cluster.close() + client.close() + + assert client.status == "closed" + + +def test_local_dask_creates_new_clients(tmp_path: Path) -> None: + """ + Expects + ------- + * A LocalDask should create new dask clusters at each context usage + """ + # We need 2 to use an actual dask client and not a SingleThreadedClient + local_dask = LocalDask(n_jobs=2) + + client_1 = None + with local_dask as client: + client_1 = client + assert client_1.status == "running" + + assert client_1.status == "closed" + + client_2 = None + with local_dask as client: + client_2 = client + assert client_2.status == "running" + + # Make sure they were different clients + assert 
id(client_1) != id(client_2) + + assert client_2.status == "closed" + assert client_1.status == "closed"
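
A note on the feat_type plumbing exercised above: the mapping is now threaded through pipeline construction, search-space creation, and configuration setting. Below is a minimal sketch of that calling pattern, using only the keyword signatures that appear in this diff; the two-column mapping is illustrative, and no data is fitted.

    from autosklearn.askl_typing import FEAT_TYPE_TYPE
    from autosklearn.pipeline.classification import SimpleClassificationPipeline

    # Illustrative mapping from column index/name to feature type; the
    # tests above use the values "numerical", "categorical", and "string".
    feat_type: FEAT_TYPE_TYPE = {0: "categorical", 1: "numerical"}

    # Passed at construction, so the data preprocessor only builds
    # sub-pipelines for the feature types actually present ...
    pipeline = SimpleClassificationPipeline(
        feat_type=feat_type,
        init_params={"data_preprocessor:feat_type": feat_type},
    )

    # ... and again when sampling and applying configurations, mirroring
    # _test_configurations above.
    cs = pipeline.get_hyperparameter_search_space(feat_type=feat_type)
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(
        configuration=config,
        init_params={"data_preprocessor:feat_type": feat_type},
        feat_type=feat_type,
    )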
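
The new test/test_util/test_dask.py pins down the wrappers' contract: UserDask yields the user-supplied client on every context entry and never closes it, while LocalDask builds a fresh cluster per context, closes it on exit, and (per the comment in test_local_dask_creates_new_clients) substitutes a single-threaded stand-in when n_jobs is 1. The following is a hedged sketch of classes that would satisfy exactly those assertions; the real autosklearn.util.dask module may differ in its internals, and the private attribute names here are assumptions.

    import tempfile
    from typing import Optional

    from dask.distributed import Client, LocalCluster


    class UserDask:
        """Wraps a user-supplied client; the user owns its lifecycle."""

        def __init__(self, client: Client) -> None:
            self._client = client

        def __enter__(self) -> Client:
            return self._client

        def __exit__(self, *exc) -> None:
            # Deliberately not closed: test_user_dask asserts the client
            # is still "running" after the context exits.
            pass


    class LocalDask:
        """Creates a fresh local cluster per context, closed on exit."""

        def __init__(self, n_jobs: Optional[int] = None) -> None:
            self.n_jobs = n_jobs
            self._client: Optional[Client] = None
            self._cluster: Optional[LocalCluster] = None

        def __enter__(self) -> Client:
            # auto-sklearn hands back a SingleThreadedClient when
            # n_jobs == 1 (see the comment in
            # test_local_dask_creates_new_clients); that branch is
            # omitted from this sketch.
            self._cluster = LocalCluster(
                n_workers=self.n_jobs,
                processes=False,
                threads_per_worker=1,
                local_directory=tempfile.gettempdir(),
            )
            self._client = Client(self._cluster, heartbeat_interval=10000)
            return self._client

        def __exit__(self, *exc) -> None:
            # A new cluster per __enter__ makes client_1 and client_2
            # distinct objects, and closing both client and cluster here
            # turns their status to "closed", matching the assertions
            # above.
            if self._client is not None:
                self._client.close()
                self._client = None
            if self._cluster is not None:
                self._cluster.close()
                self._cluster = None

Usage mirrors the test: inside `with LocalDask(n_jobs=2) as client:` work is submitted against a throwaway cluster that is gone once the block exits.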