diff --git a/.github/workflows/regressions.yml b/.github/workflows/regressions.yml index 46414dadc9..8bb0addcf4 100644 --- a/.github/workflows/regressions.yml +++ b/.github/workflows/regressions.yml @@ -327,7 +327,7 @@ jobs: && github.event.action == 'labeled' && github.event.label.name == 'regression-tests' ) - uses: peter-evans/find-comment@v1 + uses: peter-evans/find-comment@v2 id: comment_finder with: issue-number: ${{ github.event.pull_request.number }} diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index b4bb87fafd..5d24ae0627 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -9,7 +9,7 @@ jobs: stale: runs-on: ubuntu-latest steps: - - uses: actions/stale@v4 + - uses: actions/stale@v5 with: days-before-stale: 60 days-before-close: 7 diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 3902aded6e..8726ab8c6d 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -244,7 +244,7 @@ def __init__( if isinstance(disable_evaluator_output, Iterable): disable_evaluator_output = list(disable_evaluator_output) # Incase iterator - allowed = set(["model", "cv_model", "y_optimization", "y_test", "y_valid"]) + allowed = set(["model", "cv_model", "y_optimization", "y_test"]) unknown = allowed - set(disable_evaluator_output) if any(unknown): raise ValueError( @@ -656,273 +656,282 @@ def fit( # By default try to use the TCP logging port or get a new port self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT - self._logger = self._get_logger(dataset_name) - # The first thing we have to do is create the logger to update the backend - self._backend.setup_logger(self._logger_port) + # Once we start the logging server, it starts in a new process + # If an error occurs then we want to make sure that we exit cleanly + # and shut it down, else it might hang + # https://github.com/automl/auto-sklearn/issues/1480 + try: + self._logger = self._get_logger(dataset_name) - if not only_return_configuration_space: - # If only querying the configuration space, we do not save the start time - # The start time internally checks for the fit() method to execute only once - # But this does not apply when only querying the configuration space - self._backend.save_start_time(self._seed) + # The first thing we have to do is create the logger to update the backend + self._backend.setup_logger(self._logger_port) - self._stopwatch = StopWatch() + if not only_return_configuration_space: + # If only querying the configuration space, we do not save the start + # time The start time internally checks for the fit() method to execute + # only once but this does not apply when only querying the configuration + # space + self._backend.save_start_time(self._seed) - # Make sure that input is valid - # Performs Ordinal one hot encoding to the target - # both for train and test data - self.InputValidator = InputValidator( - is_classification=is_classification, - feat_type=feat_type, - logger_port=self._logger_port, - allow_string_features=self.allow_string_features, - ) - self.InputValidator.fit(X_train=X, y_train=y, X_test=X_test, y_test=y_test) - X, y = self.InputValidator.transform(X, y) + self._stopwatch = StopWatch() - if X_test is not None and y_test is not None: - X_test, y_test = self.InputValidator.transform(X_test, y_test) + # Make sure that input is valid + # Performs Ordinal one hot encoding to the target + # both for train and test data + self.InputValidator = InputValidator( + is_classification=is_classification, + feat_type=feat_type, + 
logger_port=self._logger_port, + allow_string_features=self.allow_string_features, + ) + self.InputValidator.fit(X_train=X, y_train=y, X_test=X_test, y_test=y_test) + X, y = self.InputValidator.transform(X, y) - # We don't support size reduction on pandas type object yet - if ( - self._dataset_compression is not None - and not isinstance(X, pd.DataFrame) - and not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame)) - ): - methods = self._dataset_compression["methods"] - memory_allocation = self._dataset_compression["memory_allocation"] - - # Remove precision reduction if we can't perform it - if "precision" in methods and X.dtype not in supported_precision_reductions: - methods = [method for method in methods if method != "precision"] - - with warnings_to(self._logger): - X, y = reduce_dataset_size_if_too_large( - X=X, - y=y, - memory_limit=self._memory_limit, - is_classification=is_classification, - random_state=self._seed, - operations=methods, - memory_allocation=memory_allocation, - ) + if X_test is not None and y_test is not None: + X_test, y_test = self.InputValidator.transform(X_test, y_test) - # Check the re-sampling strategy - try: + # We don't support size reduction on pandas type object yet + if ( + self._dataset_compression is not None + and not isinstance(X, pd.DataFrame) + and not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame)) + ): + methods = self._dataset_compression["methods"] + memory_allocation = self._dataset_compression["memory_allocation"] + + # Remove precision reduction if we can't perform it + if ( + "precision" in methods + and X.dtype not in supported_precision_reductions + ): + methods = [method for method in methods if method != "precision"] + + with warnings_to(self._logger): + X, y = reduce_dataset_size_if_too_large( + X=X, + y=y, + memory_limit=self._memory_limit, + is_classification=is_classification, + random_state=self._seed, + operations=methods, + memory_allocation=memory_allocation, + ) + + # Check the re-sampling strategy self._check_resampling_strategy( X=X, y=y, task=self._task, ) - except Exception as e: - self._fit_cleanup() - raise e - - # Reset learnt stuff - self.models_ = None - self.cv_models_ = None - self.ensemble_ = None - # The metric must exist as of this point - # It can be provided in the constructor, or automatically - # defined in the estimator fit call - if isinstance(self._metrics, Sequence): - for entry in self._metrics: - if not isinstance(entry, Scorer): - raise ValueError( - "Metric {entry} must be instance of autosklearn.metrics.Scorer." - ) - else: - raise ValueError( - "Metric must be a sequence of instances of " - "autosklearn.metrics.Scorer." - ) + # Reset learnt stuff + self.models_ = None + self.cv_models_ = None + self.ensemble_ = None - # If no dask client was provided, we create one, so that we can - # start a ensemble process in parallel to smbo optimize - if self._dask_client is None and ( - self._ensemble_class is not None - or self._n_jobs is not None - and self._n_jobs > 1 - ): - self._create_dask_client() - else: - self._is_dask_client_internally_created = False + # The metric must exist as of this point + # It can be provided in the constructor, or automatically + # defined in the estimator fit call + if isinstance(self._metrics, Sequence): + for entry in self._metrics: + if not isinstance(entry, Scorer): + raise ValueError( + f"Metric {entry} must be instance of" + " autosklearn.metrics.Scorer." 
+ ) + else: + raise ValueError( + "Metric must be a sequence of instances of " + "autosklearn.metrics.Scorer." + ) - self._dataset_name = dataset_name - self._stopwatch.start(self._dataset_name) + # If no dask client was provided, we create one, so that we can + # start a ensemble process in parallel to smbo optimize + if self._dask_client is None and ( + self._ensemble_class is not None + or self._n_jobs is not None + and self._n_jobs > 1 + ): + self._create_dask_client() + else: + self._is_dask_client_internally_created = False - # Take the feature types from the validator - self._feat_type = self.InputValidator.feature_validator.feat_type + self._dataset_name = dataset_name + self._stopwatch.start(self._dataset_name) - self._log_fit_setup() + # Take the feature types from the validator + self._feat_type = self.InputValidator.feature_validator.feat_type - # == Pickle the data manager to speed up loading - with self._stopwatch.time("Save Datamanager"): - datamanager = XYDataManager( - X, - y, - X_test=X_test, - y_test=y_test, - task=self._task, - feat_type=self._feat_type, - dataset_name=dataset_name, - ) + self._log_fit_setup() - self._backend._make_internals_directory() - self._label_num = datamanager.info["label_num"] - - self._backend.save_datamanager(datamanager) - - # = Create a searchspace - # Do this before One Hot Encoding to make sure that it creates a - # search space for a dense classifier even if one hot encoding would - # make it sparse (tradeoff; if one hot encoding would make it sparse, - # densifier and truncatedSVD would probably lead to a MemoryError, - # like this we can't use some of the preprocessing methods in case - # the data became sparse) - with self._stopwatch.time("Create Search space"): - self.configuration_space, configspace_path = self._create_search_space( - self._backend.temporary_directory, - self._backend, - datamanager, - include=self._include, - exclude=self._exclude, - ) - - if only_return_configuration_space: - self._fit_cleanup() - return self.configuration_space - - # == Perform dummy predictions - with self._stopwatch.time("Dummy predictions"): - self.num_run += 1 - self._do_dummy_prediction() - - # == RUN ensemble builder - # Do this before calculating the meta-features to make sure that the - # dummy predictions are actually included in the ensemble even if - # calculating the meta-features takes very long - with self._stopwatch.time("Run Ensemble Builder"): - - elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") - - time_left_for_ensembles = max(0, self._time_for_task - elapsed_time) - proc_ensemble = None - if time_left_for_ensembles <= 0: - # Fit only raises error when an ensemble class is given but - # time_left_for_ensembles is zero. - if self._ensemble_class is not None: - raise ValueError( - "Not starting ensemble builder because there " - "is no time left. Try increasing the value " - "of time_left_for_this_task." - ) - elif self._ensemble_class is None: - self._logger.info( - "Not starting ensemble builder because no ensemble class is given." 
- ) - else: - self._logger.info( - "Start Ensemble with %5.2fsec time left" % time_left_for_ensembles - ) - - proc_ensemble = EnsembleBuilderManager( - start_time=time.time(), - time_left_for_ensembles=time_left_for_ensembles, - backend=copy.deepcopy(self._backend), - dataset_name=dataset_name, + # == Pickle the data manager to speed up loading + with self._stopwatch.time("Save Datamanager"): + datamanager = XYDataManager( + X, + y, + X_test=X_test, + y_test=y_test, task=self._task, - metrics=self._metrics, - ensemble_class=self._ensemble_class, - ensemble_kwargs=self._ensemble_kwargs, - ensemble_nbest=self._ensemble_nbest, - max_models_on_disc=self._max_models_on_disc, - seed=self._seed, - precision=self.precision, - max_iterations=self._max_ensemble_build_iterations, - read_at_most=self._read_at_most, - memory_limit=self._memory_limit, - random_state=self._seed, - logger_port=self._logger_port, - pynisher_context=self._multiprocessing_context, + feat_type=self._feat_type, + dataset_name=dataset_name, ) - # kill the datamanager as it will be re-loaded anyways from sub processes - try: - del self._datamanager - except Exception: - pass + self._backend._make_internals_directory() + self._label_num = datamanager.info["label_num"] + + self._backend.save_datamanager(datamanager) + + # = Create a searchspace + # Do this before One Hot Encoding to make sure that it creates a + # search space for a dense classifier even if one hot encoding would + # make it sparse (tradeoff; if one hot encoding would make it sparse, + # densifier and truncatedSVD would probably lead to a MemoryError, + # like this we can't use some of the preprocessing methods in case + # the data became sparse) + with self._stopwatch.time("Create Search space"): + self.configuration_space, configspace_path = self._create_search_space( + self._backend.temporary_directory, + self._backend, + datamanager, + include=self._include, + exclude=self._exclude, + ) - # => RUN SMAC - with self._stopwatch.time("Run SMAC"): - elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") - time_left = self._time_for_task - elapsed_time - - if self._logger: - self._logger.info("Start SMAC with %5.2fsec time left" % time_left) - if time_left <= 0: - self._logger.warning("Not starting SMAC because there is no time left.") - _proc_smac = None - self._budget_type = None - else: - if ( - self._per_run_time_limit is None - or self._per_run_time_limit > time_left - ): - self._logger.warning( - "Time limit for a single run is higher than total time " - "limit. Capping the limit for a single run to the total " - "time given to SMAC (%f)" % time_left + if only_return_configuration_space: + return self.configuration_space + + # == Perform dummy predictions + with self._stopwatch.time("Dummy predictions"): + self.num_run += 1 + self._do_dummy_prediction() + + # == RUN ensemble builder + # Do this before calculating the meta-features to make sure that the + # dummy predictions are actually included in the ensemble even if + # calculating the meta-features takes very long + with self._stopwatch.time("Run Ensemble Builder"): + + elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") + + time_left_for_ensembles = max(0, self._time_for_task - elapsed_time) + proc_ensemble = None + if time_left_for_ensembles <= 0: + # Fit only raises error when an ensemble class is given but + # time_left_for_ensembles is zero. + if self._ensemble_class is not None: + raise ValueError( + "Not starting ensemble builder because there " + "is no time left. 
Try increasing the value " + "of time_left_for_this_task." + ) + elif self._ensemble_class is None: + self._logger.info( + "No ensemble buildin because no ensemble class was given." ) - per_run_time_limit = time_left else: - per_run_time_limit = self._per_run_time_limit + self._logger.info( + "Start Ensemble with %5.2fsec time left" + % time_left_for_ensembles + ) + + proc_ensemble = EnsembleBuilderManager( + start_time=time.time(), + time_left_for_ensembles=time_left_for_ensembles, + backend=copy.deepcopy(self._backend), + dataset_name=dataset_name, + task=self._task, + metrics=self._metrics, + ensemble_class=self._ensemble_class, + ensemble_kwargs=self._ensemble_kwargs, + ensemble_nbest=self._ensemble_nbest, + max_models_on_disc=self._max_models_on_disc, + seed=self._seed, + precision=self.precision, + max_iterations=self._max_ensemble_build_iterations, + read_at_most=self._read_at_most, + memory_limit=self._memory_limit, + random_state=self._seed, + logger_port=self._logger_port, + pynisher_context=self._multiprocessing_context, + ) - # Make sure that at least 2 models are created for the ensemble process - num_models = time_left // per_run_time_limit - if num_models < 2: - per_run_time_limit = time_left // 2 + # kill the datamanager as it will be re-loaded anyways from sub processes + try: + del self._datamanager + except Exception: + pass + + # => RUN SMAC + with self._stopwatch.time("Run SMAC"): + elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") + time_left = self._time_for_task - elapsed_time + + if self._logger: + self._logger.info("Start SMAC with %5.2fsec time left" % time_left) + if time_left <= 0: self._logger.warning( - "Capping the per_run_time_limit to {} to have " - "time for a least 2 models in each process.".format( - per_run_time_limit - ) + "Not starting SMAC because there is no time left." ) + _proc_smac = None + self._budget_type = None + else: + if ( + self._per_run_time_limit is None + or self._per_run_time_limit > time_left + ): + self._logger.warning( + "Time limit for a single run is higher than total time " + "limit. 
Capping the limit for a single run to the total " + "time given to SMAC (%f)" % time_left + ) + per_run_time_limit = time_left + else: + per_run_time_limit = self._per_run_time_limit + + # At least 2 models are created for the ensemble process + num_models = time_left // per_run_time_limit + if num_models < 2: + per_run_time_limit = time_left // 2 + self._logger.warning( + "Capping the per_run_time_limit to {} to have " + "time for a least 2 models in each process.".format( + per_run_time_limit + ) + ) - _proc_smac = AutoMLSMBO( - config_space=self.configuration_space, - dataset_name=self._dataset_name, - backend=self._backend, - total_walltime_limit=time_left, - func_eval_time_limit=per_run_time_limit, - memory_limit=self._memory_limit, - data_memory_limit=self._data_memory_limit, - stopwatch=self._stopwatch, - n_jobs=self._n_jobs, - dask_client=self._dask_client, - start_num_run=self.num_run, - num_metalearning_cfgs=self._initial_configurations_via_metalearning, - config_file=configspace_path, - seed=self._seed, - metadata_directory=self._metadata_directory, - metrics=self._metrics, - resampling_strategy=self._resampling_strategy, - resampling_strategy_args=self._resampling_strategy_arguments, - include=self._include, - exclude=self._exclude, - disable_file_output=self._disable_evaluator_output, - get_smac_object_callback=self._get_smac_object_callback, - smac_scenario_args=self._smac_scenario_args, - scoring_functions=self._scoring_functions, - port=self._logger_port, - pynisher_context=self._multiprocessing_context, - ensemble_callback=proc_ensemble, - trials_callback=self._get_trials_callback, - ) + n_meta_configs = self._initial_configurations_via_metalearning + _proc_smac = AutoMLSMBO( + config_space=self.configuration_space, + dataset_name=self._dataset_name, + backend=self._backend, + total_walltime_limit=time_left, + func_eval_time_limit=per_run_time_limit, + memory_limit=self._memory_limit, + data_memory_limit=self._data_memory_limit, + stopwatch=self._stopwatch, + n_jobs=self._n_jobs, + dask_client=self._dask_client, + start_num_run=self.num_run, + num_metalearning_cfgs=n_meta_configs, + config_file=configspace_path, + seed=self._seed, + metadata_directory=self._metadata_directory, + metrics=self._metrics, + resampling_strategy=self._resampling_strategy, + resampling_strategy_args=self._resampling_strategy_arguments, + include=self._include, + exclude=self._exclude, + disable_file_output=self._disable_evaluator_output, + get_smac_object_callback=self._get_smac_object_callback, + smac_scenario_args=self._smac_scenario_args, + scoring_functions=self._scoring_functions, + port=self._logger_port, + pynisher_context=self._multiprocessing_context, + ensemble_callback=proc_ensemble, + trials_callback=self._get_trials_callback, + ) - try: ( self.runhistory_, self.trajectory_, @@ -938,42 +947,49 @@ def fit( ] with open(trajectory_filename, "w") as fh: json.dump(saveable_trajectory, fh) - except Exception as e: - self._logger.exception(e) - raise - - self._logger.info("Starting shutdown...") - # Wait until the ensemble process is finished to avoid shutting down - # while the ensemble builder tries to access the data - if proc_ensemble is not None: - self.ensemble_performance_history = list(proc_ensemble.history) - - if len(proc_ensemble.futures) > 0: - # Now we need to wait for the future to return as it cannot be cancelled - # while it is running: https://stackoverflow.com/a/49203129 - self._logger.info( - "Ensemble script still running, waiting for it to finish." 
- ) - result = proc_ensemble.futures.pop().result() - if result: - ensemble_history, _ = result - self.ensemble_performance_history.extend(ensemble_history) - self._logger.info("Ensemble script finished, continue shutdown.") - - # save the ensemble performance history file - if len(self.ensemble_performance_history) > 0: - pd.DataFrame(self.ensemble_performance_history).to_json( - os.path.join( - self._backend.internals_directory, "ensemble_history.json" + + self._logger.info("Starting shutdown...") + # Wait until the ensemble process is finished to avoid shutting down + # while the ensemble builder tries to access the data + if proc_ensemble is not None: + self.ensemble_performance_history = list(proc_ensemble.history) + + if len(proc_ensemble.futures) > 0: + # Now we wait for the future to return as it cannot be cancelled + # while it is running: https://stackoverflow.com/a/49203129 + self._logger.info( + "Ensemble script still running, waiting for it to finish." + ) + result = proc_ensemble.futures.pop().result() + if result: + ensemble_history, _ = result + self.ensemble_performance_history.extend(ensemble_history) + self._logger.info("Ensemble script finished, continue shutdown.") + + # save the ensemble performance history file + if len(self.ensemble_performance_history) > 0: + pd.DataFrame(self.ensemble_performance_history).to_json( + os.path.join( + self._backend.internals_directory, "ensemble_history.json" + ) ) - ) - if load_models: - self._logger.info("Loading models...") - self._load_models() - self._logger.info("Finished loading models...") + if load_models: + self._logger.info("Loading models...") + self._load_models() + self._logger.info("Finished loading models...") + + # The whole logic above from where we begin the logging server is capture + # in a try: finally: so that if something goes wrong, we at least close + # down the logging server, preventing it from hanging and not closing + # until ctrl+c is pressed + except Exception as e: + # This will be called before the _fit_cleanup + self._logger.exception(e) + raise e + finally: + self._fit_cleanup() - self._fit_cleanup() self.fitted = True return self diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 96e7f07d26..5afd8c597c 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -262,8 +262,8 @@ def __init__( list are: * ``'y_optimization'`` : do not save the predictions for the - optimization/validation set, which would later on be used to build - an ensemble. + optimization set, which would later on be used to build an ensemble. 
+ * ``model`` : do not save any model files smac_scenario_args : dict, optional (None) diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 9563f4ef8b..ba17513ae0 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -230,14 +230,7 @@ def __init__( self.memory_limit = memory_limit dm = self.backend.load_datamanager() - if "X_valid" in dm.data and "Y_valid" in dm.data: - self._get_validation_loss = True - else: - self._get_validation_loss = False - if "X_test" in dm.data and "Y_test" in dm.data: - self._get_test_loss = True - else: - self._get_test_loss = False + self._get_test_loss = "X_test" in dm.data and "Y_test" in dm.data self.port = port self.pynisher_context = pynisher_context @@ -533,21 +526,6 @@ def run( additional_run_info["train_learning_curve"] = train_learning_curve additional_run_info["learning_curve_runtime"] = learning_curve_runtime - if self._get_validation_loss: - validation_learning_curve = ( - autosklearn.evaluation.util.extract_learning_curve( - info, - "validation_loss", - ) - ) - if len(validation_learning_curve) > 1: - additional_run_info[ - "validation_learning_curve" - ] = validation_learning_curve - additional_run_info[ - "learning_curve_runtime" - ] = learning_curve_runtime - if self._get_test_loss: test_learning_curve = ( autosklearn.evaluation.util.extract_learning_curve( diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 6a189a86a0..b97f588a45 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -220,8 +220,6 @@ def __init__( self.include = include self.exclude = exclude - self.X_valid = self.datamanager.data.get("X_valid") - self.y_valid = self.datamanager.data.get("Y_valid") self.X_test = self.datamanager.data.get("X_test") self.y_test = self.datamanager.data.get("Y_test") @@ -359,7 +357,6 @@ def finish_up( loss: Union[Dict[str, float], float], train_loss: Optional[Dict[str, float]], opt_pred: np.ndarray, - valid_pred: np.ndarray, test_pred: np.ndarray, additional_run_info: Optional[TYPE_ADDITIONAL_INFO], file_output: bool, @@ -382,19 +379,12 @@ def finish_up( self.duration = time.time() - self.starttime if file_output: - file_out_loss, additional_run_info_ = self.file_output( - opt_pred, - valid_pred, - test_pred, - ) + file_out_loss, additional_run_info_ = self.file_output(opt_pred, test_pred) else: file_out_loss = None additional_run_info_ = {} - validation_loss, test_loss = self.calculate_auxiliary_losses( - valid_pred, - test_pred, - ) + test_loss = self.calculate_auxiliary_losses(test_pred) if file_out_loss is not None: return self.duration, file_out_loss, self.seed, additional_run_info_ @@ -424,8 +414,6 @@ def finish_up( additional_run_info["train_loss"] = [ train_loss[metric.name] for metric in self.metrics ] - if validation_loss is not None: - additional_run_info["validation_loss"] = validation_loss if test_loss is not None: additional_run_info["test_loss"] = test_loss @@ -442,41 +430,22 @@ def finish_up( def calculate_auxiliary_losses( self, - Y_valid_pred: np.ndarray, - Y_test_pred: np.ndarray, - ) -> Tuple[Optional[float | Sequence[float]], Optional[float | Sequence[float]]]: - if Y_valid_pred is not None: - if self.y_valid is not None: - validation_loss: Optional[Union[float, Dict[str, float]]] = self._loss( - self.y_valid, Y_valid_pred - ) - if len(self.metrics) == 1: - validation_loss = validation_loss[self.metrics[0].name] - else: - validation_loss = 
None - else: - validation_loss = None + Y_test_pred: np.ndarray | None, + ) -> float | dict[str, float] | None: + if Y_test_pred is None or self.y_test is None: + return None - if Y_test_pred is not None: - if self.y_test is not None: - test_loss: Optional[Union[float, Dict[str, float]]] = self._loss( - self.y_test, Y_test_pred - ) - if len(self.metrics) == 1: - test_loss = test_loss[self.metrics[0].name] - else: - test_loss = None - else: - test_loss = None + test_loss = self._loss(self.y_test, Y_test_pred) + if len(self.metrics) == 1: + test_loss = test_loss[self.metrics[0].name] - return validation_loss, test_loss + return test_loss def file_output( self, Y_optimization_pred: np.ndarray, - Y_valid_pred: np.ndarray, Y_test_pred: np.ndarray, - ) -> Tuple[Optional[float], Dict[str, Union[str, int, float, List, Dict, Tuple]]]: + ) -> tuple[float | None, dict[str, Any]]: # Abort if self.Y_optimization is None # self.Y_optimization can be None if we use partial-cv, then, # obviously no output should be saved. @@ -496,12 +465,7 @@ def file_output( ) # Abort if predictions contain NaNs - for y, s in [ - # Y_train_pred deleted here. Fix unittest accordingly. - [Y_optimization_pred, "optimization"], - [Y_valid_pred, "validation"], - [Y_test_pred, "test"], - ]: + for y, s in [(Y_optimization_pred, "optimization"), (Y_test_pred, "test")]: if y is not None and not np.all(np.isfinite(y)): return ( 1.0, @@ -553,14 +517,13 @@ def file_output( budget=self.budget, model=self.model if "model" not in self.disable_file_output else None, cv_model=models if "cv_model" not in self.disable_file_output else None, + # TODO: below line needs to be deleted once backend is updated + valid_predictions=None, ensemble_predictions=( Y_optimization_pred if "y_optimization" not in self.disable_file_output else None ), - valid_predictions=( - Y_valid_pred if "y_valid" not in self.disable_file_output else None - ), test_predictions=( Y_test_pred if "y_test" not in self.disable_file_output else None ), diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index e76186aa06..d624c1a44d 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -67,7 +67,6 @@ def fit_predict_and_loss(self) -> None: loss=loss, train_loss=None, opt_pred=Y_pred, - valid_pred=None, test_pred=None, file_output=False, final_call=True, @@ -78,7 +77,6 @@ def fit_predict_and_loss(self) -> None: def predict_and_loss( self, train: bool = False ) -> Tuple[Union[Dict[str, float], float], np.array, Any, Any]: - if train: Y_pred = self.predict_function( self.X_train, self.model, self.task_type, self.Y_train diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index a8433c2136..f19db473bf 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -316,7 +316,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_pred = [None] * self.num_cv_folds Y_optimization_pred = [None] * self.num_cv_folds - Y_valid_pred = [None] * self.num_cv_folds Y_test_pred = [None] * self.num_cv_folds train_splits = [None] * self.num_cv_folds @@ -417,7 +416,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: **fit_params_array[i], ) - (train_pred, opt_pred, valid_pred, test_pred) = self._predict( + (train_pred, opt_pred, test_pred) = self._predict( model, train_indices=train_indices, test_indices=test_indices, @@ -425,7 +424,6 @@ def fit_predict_and_loss(self, 
iterative: bool = False) -> None: Y_train_pred[i] = train_pred Y_optimization_pred[i] = opt_pred - Y_valid_pred[i] = valid_pred Y_test_pred[i] = test_pred train_splits[i] = train_indices @@ -499,20 +497,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds) Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds) - if self.X_valid is not None: - Y_valid_preds = np.array( - [ - Y_valid_pred[i] - for i in range(self.num_cv_folds) - if Y_valid_pred[i] is not None - ] - ) - # Average the predictions of several models - if len(Y_valid_preds.shape) == 3: - Y_valid_preds = np.nanmean(Y_valid_preds, axis=0) - else: - Y_valid_preds = None - if self.X_test is not None: Y_test_preds = np.array( [ @@ -544,7 +528,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: loss=opt_loss, train_loss=train_loss, opt_pred=Y_optimization_pred_concat, - valid_pred=Y_valid_preds, test_pred=Y_test_preds, additional_run_info=additional_run_info, file_output=True, @@ -558,7 +541,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_pred = [None] * self.num_cv_folds Y_optimization_pred = [None] * self.num_cv_folds - Y_valid_pred = [None] * self.num_cv_folds Y_test_pred = [None] * self.num_cv_folds train_splits = [None] * self.num_cv_folds @@ -586,7 +568,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) = self._partial_fit_and_predict_standard( @@ -599,7 +580,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) = self._partial_fit_and_predict_budget( @@ -622,7 +602,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_pred[i] = train_pred Y_optimization_pred[i] = opt_pred - Y_valid_pred[i] = valid_pred Y_test_pred[i] = test_pred train_splits[i] = train_split @@ -683,18 +662,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds) Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds) - if self.X_valid is not None: - Y_valid_pred = np.array( - [ - Y_valid_pred[i] - for i in range(self.num_cv_folds) - if Y_valid_pred[i] is not None - ] - ) - # Average the predictions of several models - if len(np.shape(Y_valid_pred)) == 3: - Y_valid_pred = np.nanmean(Y_valid_pred, axis=0) - if self.X_test is not None: Y_test_pred = np.array( [ @@ -746,7 +713,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: loss=opt_loss, train_loss=train_loss, opt_pred=Y_optimization_pred, - valid_pred=Y_valid_pred if self.X_valid is not None else None, test_pred=Y_test_pred if self.X_test is not None else None, additional_run_info=additional_run_info, file_output=True, @@ -793,7 +759,6 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) = self._partial_fit_and_predict_standard( @@ -819,7 +784,6 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No loss=loss, train_loss=train_loss, opt_pred=opt_pred, - valid_pred=valid_pred, test_pred=test_pred, file_output=False, final_call=True, @@ -883,12 +847,7 @@ def _partial_fit_and_predict_iterative( n_iter=n_iter, **fit_params, ) - ( - Y_train_pred, - Y_optimization_pred, - Y_valid_pred, - Y_test_pred, - ) = self._predict( + (Y_train_pred, 
Y_optimization_pred, Y_test_pred,) = self._predict( model, train_indices=train_indices, test_indices=test_indices, @@ -921,7 +880,6 @@ def _partial_fit_and_predict_iterative( loss=loss, train_loss=train_loss, opt_pred=Y_optimization_pred, - valid_pred=Y_valid_pred, test_pred=Y_test_pred, additional_run_info=additional_run_info, file_output=file_output, @@ -936,7 +894,6 @@ def _partial_fit_and_predict_iterative( ( Y_train_pred, Y_optimization_pred, - Y_valid_pred, Y_test_pred, additional_run_info, ) = self._partial_fit_and_predict_standard( @@ -962,7 +919,6 @@ def _partial_fit_and_predict_iterative( loss=loss, train_loss=train_loss, opt_pred=Y_optimization_pred, - valid_pred=Y_valid_pred, test_pred=Y_test_pred, additional_run_info=additional_run_info, file_output=file_output, @@ -980,7 +936,6 @@ def _partial_fit_and_predict_standard( ) -> Tuple[ PIPELINE_DATA_DTYPE, # train_pred PIPELINE_DATA_DTYPE, # opt_pred - PIPELINE_DATA_DTYPE, # valid_pred PIPELINE_DATA_DTYPE, # test_pred TYPE_ADDITIONAL_INFO, ]: @@ -1020,7 +975,7 @@ def _partial_fit_and_predict_standard( else self.Y_train[train_indices] ) - train_pred, opt_pred, valid_pred, test_pred = self._predict( + train_pred, opt_pred, test_pred = self._predict( model=model, train_indices=train_indices, test_indices=test_indices, @@ -1029,7 +984,6 @@ def _partial_fit_and_predict_standard( return ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) @@ -1043,7 +997,6 @@ def _partial_fit_and_predict_budget( ) -> Tuple[ PIPELINE_DATA_DTYPE, # train_pred PIPELINE_DATA_DTYPE, # opt_pred - PIPELINE_DATA_DTYPE, # valid_pred PIPELINE_DATA_DTYPE, # test_pred TYPE_ADDITIONAL_INFO, ]: @@ -1073,7 +1026,7 @@ def _partial_fit_and_predict_budget( task_type=self.task_type, ) - train_pred, opt_pred, valid_pred, test_pred = self._predict( + train_pred, opt_pred, test_pred = self._predict( model, train_indices=train_indices, test_indices=test_indices, @@ -1088,19 +1041,13 @@ def _partial_fit_and_predict_budget( return ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) def _predict( self, model: BaseEstimator, test_indices: List[int], train_indices: List[int] - ) -> Tuple[ - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - ]: + ) -> Tuple[PIPELINE_DATA_DTYPE, PIPELINE_DATA_DTYPE, PIPELINE_DATA_DTYPE]: train_pred = self.predict_function( self.X_train.iloc[train_indices] if hasattr(self.X_train, "iloc") @@ -1123,14 +1070,6 @@ def _predict( else self.Y_train[train_indices], ) - if self.X_valid is not None: - X_valid = self.X_valid.copy() - valid_pred = self.predict_function( - X_valid, model, self.task_type, self.Y_train[train_indices] - ) - else: - valid_pred = None - if self.X_test is not None: X_test = self.X_test.copy() test_pred = self.predict_function( @@ -1144,7 +1083,7 @@ def _predict( else: test_pred = None - return train_pred, opt_pred, valid_pred, test_pred + return train_pred, opt_pred, test_pred def get_splitter( self, D: AbstractDataManager diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index 62623a50ba..38040f2e4e 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -133,27 +133,14 @@ def get_multiclass_classification_datamanager(): np.random.shuffle(indices) X_train = X_train[indices] Y_train = Y_train[indices] - - X_valid = X_test[ - :25, - ] - Y_valid = Y_test[ - :25, - ] - X_test = X_test[ - 25:, - ] - Y_test = Y_test[ - 25:, - ] + X_test = X_test[25:] + Y_test = 
Y_test[25:] D = Dummy() D.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3} D.data = { "X_train": X_train, "Y_train": Y_train, - "X_valid": X_valid, - "Y_valid": Y_valid, "X_test": X_test, "Y_test": Y_test, } @@ -196,34 +183,16 @@ def get_multilabel_classification_datamanager(): Y_train = Y_train[indices] Y_train = np.array(convert_to_bin(Y_train, 3)) - # for i in range(Y_train_.shape[0]): - # Y_train_[:, Y_train[i]] = 1 - # Y_train = Y_train_ Y_test = np.array(convert_to_bin(Y_test, 3)) - # for i in range(Y_test_.shape[0]): - # Y_test_[:, Y_test[i]] = 1 - # Y_test = Y_test_ - X_valid = X_test[ - :25, - ] - Y_valid = Y_test[ - :25, - ] - X_test = X_test[ - 25:, - ] - Y_test = Y_test[ - 25:, - ] + X_test = X_test[25:] + Y_test = Y_test[25:] D = Dummy() D.info = {"task": MULTILABEL_CLASSIFICATION, "is_sparse": False, "label_num": 3} D.data = { "X_train": X_train, "Y_train": Y_train, - "X_valid": X_valid, - "Y_valid": Y_valid, "X_test": X_test, "Y_test": Y_test, } @@ -247,26 +216,14 @@ def get_binary_classification_datamanager(): X_test = X_test[eliminate_class_two] Y_test = Y_test[eliminate_class_two] - X_valid = X_test[ - :25, - ] - Y_valid = Y_test[ - :25, - ] - X_test = X_test[ - 25:, - ] - Y_test = Y_test[ - 25:, - ] + X_test = X_test[25:] + Y_test = Y_test[25:] D = Dummy() D.info = {"task": BINARY_CLASSIFICATION, "is_sparse": False, "label_num": 2} D.data = { "X_train": X_train, "Y_train": Y_train.reshape((-1, 1)), - "X_valid": X_valid, - "Y_valid": Y_valid.reshape((-1, 1)), "X_test": X_test, "Y_test": Y_test.reshape((-1, 1)), } @@ -282,26 +239,14 @@ def get_regression_datamanager(): X_train = X_train[indices] Y_train = Y_train[indices] - X_valid = X_test[ - :200, - ] - Y_valid = Y_test[ - :200, - ] - X_test = X_test[ - 200:, - ] - Y_test = Y_test[ - 200:, - ] + X_test = X_test[200:] + Y_test = Y_test[200:] D = Dummy() D.info = {"task": REGRESSION, "is_sparse": False, "label_num": 1} D.data = { "X_train": X_train, "Y_train": Y_train.reshape((-1, 1)), - "X_valid": X_valid, - "Y_valid": Y_valid.reshape((-1, 1)), "X_test": X_test, "Y_test": Y_test.reshape((-1, 1)), } @@ -334,8 +279,6 @@ def get_500_classes_datamanager(): D.data = { "X_train": X[:700], "Y_train": Y[:700], - "X_valid": X[700:710], - "Y_valid": Y[700:710], "X_test": X[710:], "Y_test": Y[710:], } diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index 7bd52c0f76..e2473d738b 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -71,7 +71,6 @@ def test_finish_up_model_predicts_NaN(self): ae.Y_optimization = rs.rand(33, 3) predictions_ensemble = rs.rand(33, 3) predictions_test = rs.rand(25, 3) - predictions_valid = rs.rand(25, 3) # NaNs in prediction ensemble predictions_ensemble[5, 2] = np.NaN @@ -79,7 +78,6 @@ def test_finish_up_model_predicts_NaN(self): loss=0.1, train_loss=0.1, opt_pred=predictions_ensemble, - valid_pred=predictions_valid, test_pred=predictions_test, additional_run_info=None, final_call=True, @@ -89,37 +87,15 @@ def test_finish_up_model_predicts_NaN(self): self.assertEqual(loss, 1.0) self.assertEqual( additional_run_info, - {"error": "Model predictions for optimization set " "contains NaNs."}, + {"error": "Model predictions for optimization set contains NaNs."}, ) - # NaNs in prediction validation - predictions_ensemble[5, 2] = 0.5 - predictions_valid[5, 2] = np.NaN - _, loss, _, additional_run_info = ae.finish_up( - loss=0.1, - train_loss=0.1, - 
opt_pred=predictions_ensemble, - valid_pred=predictions_valid, - test_pred=predictions_test, - additional_run_info=None, - final_call=True, - file_output=True, - status=StatusType.SUCCESS, - ) - self.assertEqual(loss, 1.0) - self.assertEqual( - additional_run_info, - {"error": "Model predictions for validation set " "contains NaNs."}, - ) - - # NaNs in prediction test - predictions_valid[5, 2] = 0.5 + predictions_ensemble = rs.rand(33, 3) predictions_test[5, 2] = np.NaN _, loss, _, additional_run_info = ae.finish_up( loss=0.1, train_loss=0.1, opt_pred=predictions_ensemble, - valid_pred=predictions_valid, test_pred=predictions_test, additional_run_info=None, final_call=True, @@ -129,9 +105,8 @@ def test_finish_up_model_predicts_NaN(self): self.assertEqual(loss, 1.0) self.assertEqual( additional_run_info, - {"error": "Model predictions for test set contains " "NaNs."}, + {"error": "Model predictions for test set contains NaNs."}, ) - self.assertEqual(self.backend_mock.save_predictions_as_npy.call_count, 0) def test_disable_file_output(self): @@ -150,11 +125,9 @@ def test_disable_file_output(self): predictions_ensemble = rs.rand(33, 3) predictions_test = rs.rand(25, 3) - predictions_valid = rs.rand(25, 3) loss_, additional_run_info_ = ae.file_output( predictions_ensemble, - predictions_valid, predictions_test, ) @@ -179,7 +152,6 @@ def test_disable_file_output(self): loss_, additional_run_info_ = ae.file_output( predictions_ensemble, - predictions_valid, predictions_test, ) @@ -211,11 +183,6 @@ def test_disable_file_output(self): "ensemble_predictions" ] ) - self.assertIsNotNone( - self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - "valid_predictions" - ] - ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ "test_predictions" @@ -237,7 +204,6 @@ def test_disable_file_output(self): loss_, additional_run_info_ = ae.file_output( predictions_ensemble, - predictions_valid, predictions_test, ) @@ -249,11 +215,6 @@ def test_disable_file_output(self): "ensemble_predictions" ] ) - self.assertIsNotNone( - self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - "valid_predictions" - ] - ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ "test_predictions" @@ -296,11 +257,9 @@ def test_file_output(self): ae.Y_optimization = rs.rand(33, 3) predictions_ensemble = rs.rand(33, 3) predictions_test = rs.rand(25, 3) - predictions_valid = rs.rand(25, 3) ae.file_output( Y_optimization_pred=predictions_ensemble, - Y_valid_pred=predictions_valid, Y_test_pred=predictions_test, ) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 14c36f2afc..c8fe1c5f87 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -7,6 +7,7 @@ import shutil import sys import tempfile +from itertools import chain import numpy as np import sklearn.model_selection @@ -68,6 +69,24 @@ ) +class LossSideEffect(object): + """Some kind of re-used fixture for losses calculated""" + + def __init__(self): + # The 3 below is related to train, test, opt sets + self.losses = [ + {"accuracy": value} + for value in chain.from_iterable( + [i] * 3 for i in [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + ) + ] + self.iteration = 0 + + def side_effect(self, *args, **kwargs): + self.iteration += 1 + return self.losses[self.iteration - 1] + + class Dummy(object): def __init__(self): self.name = "dummy" @@ -159,15 +178,14 @@ def test_holdout(self, pipeline_mock): 
self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(result, 0.45833333333333337) self.assertEqual(pipeline_mock.fit.call_count, 1) - # four calls because of train, holdout, validation and test set - self.assertEqual(pipeline_mock.predict_proba.call_count, 4) + # four calls because of train, holdout and test set + self.assertEqual(pipeline_mock.predict_proba.call_count, 3) self.assertEqual(evaluator.file_output.call_count, 1) + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 24) + self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.model.fit.call_count, 1) @@ -240,46 +258,12 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): + # The 3 below is related to train, test, opt sets self.losses = [ {"accuracy": value} - for value in [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, - ] + for value in chain.from_iterable( + [i] * 3 for i in [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + ) ] self.iteration = 0 @@ -310,16 +294,12 @@ def side_effect(self, *args, **kwargs): [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list], [2, 2, 4, 8, 16, 32, 64, 128, 256], ) - # 20 calls because of train, holdout, validation and test set - # and a total of five calls because of five iterations of fitting - self.assertEqual(evaluator.model.predict_proba.call_count, 36) - # 1/3 of 69 + + # 9 per split type + self.assertEqual(evaluator.model.predict_proba.call_count, 27) self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.file_output.call_count, 9) self.assertEqual(evaluator.model.fit.call_count, 0) @@ -443,15 +423,14 @@ def side_effect(self, *args, **kwargs): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 2) - # eight calls because of train, holdout, the validation and the test set + + # 6 calls because of train, holdout and test set # and a total of two calls each because of two iterations of fitting - self.assertEqual(evaluator.model.predict_proba.call_count, 8) + self.assertEqual(evaluator.model.predict_proba.call_count, 6) + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.file_output.call_count, 2) self.assertEqual(evaluator.model.fit.call_count, 0) @@ -504,14 +483,13 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 0) - # four calls for 
train, opt, valid and test - self.assertEqual(evaluator.model.predict_proba.call_count, 4) + + # 3 calls for train, opt and test + self.assertEqual(evaluator.model.predict_proba.call_count, 3) + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(evaluator.model.fit.call_count, 1) @@ -563,17 +541,14 @@ def test_cv(self, pipeline_mock): self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(result, 0.463768115942029) self.assertEqual(pipeline_mock.fit.call_count, 5) - # Fifteen calls because of the training, holdout, validation and - # test set (4 sets x 5 folds = 20) - self.assertEqual(pipeline_mock.predict_proba.call_count, 20) + + # 15 calls because of the training (5), holdout (5) and test set (5) + self.assertEqual(pipeline_mock.predict_proba.call_count, 15) self.assertEqual( evaluator.file_output.call_args[0][0].shape[0], D.data["Y_train"].shape[0] ) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) # The model prior to fitting is saved, this cannot be directly tested # because of the way the mock module is used. Instead, we test whether @@ -629,7 +604,7 @@ def test_partial_cv(self, pipeline_mock): self.assertEqual(evaluator.file_output.call_count, 0) self.assertEqual(return_value["loss"], 0.5) self.assertEqual(pipeline_mock.fit.call_count, 1) - self.assertEqual(pipeline_mock.predict_proba.call_count, 4) + self.assertEqual(pipeline_mock.predict_proba.call_count, 3) # The model prior to fitting is saved, this cannot be directly tested # because of the way the mock module is used. 
Instead, we test whether # the if block in which model assignment is done is accessed @@ -703,55 +678,6 @@ def configuration_fully_fitted(self): evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) - class LossSideEffect(object): - def __init__(self): - self.losses = [ - {"accuracy": value} - for value in [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, - ] - ] - self.iteration = 0 - - def side_effect(self, *args, **kwargs): - self.iteration += 1 - return self.losses[self.iteration - 1] - evaluator._loss = unittest.mock.Mock() evaluator._loss.side_effect = LossSideEffect().side_effect @@ -773,13 +699,9 @@ def side_effect(self, *args, **kwargs): [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list], [2, 2, 4, 8, 16, 32, 64, 128, 256], ) - # fifteen calls because of the holdout, the validation and the test set - # and a total of five calls because of five iterations of fitting self.assertTrue(hasattr(evaluator, "model")) self.assertEqual(pipeline_mock.iterative_fit.call_count, 9) - # 20 calls because of train, holdout, the validation and the test set - # and a total of five calls because of five iterations of fitting - self.assertEqual(pipeline_mock.predict_proba.call_count, 36) + self.assertEqual(pipeline_mock.predict_proba.call_count, 27) @unittest.mock.patch.object(TrainEvaluator, "_loss") @unittest.mock.patch.object(TrainEvaluator, "_get_model") @@ -809,11 +731,7 @@ def test_file_output(self, loss_mock, model_mock): self.backend_mock.get_model_dir.return_value = True evaluator.model = "model" evaluator.Y_optimization = D.data["Y_train"] - return_value = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) + return_value = evaluator.file_output(D.data["Y_train"], D.data["Y_test"]) self.assertEqual(return_value, (None, {})) self.assertEqual(self.backend_mock.save_additional_data.call_count, 2) @@ -826,8 +744,8 @@ def test_file_output(self, loss_mock, model_mock): "budget", "model", "cv_model", + "valid_predictions", # TODO remove once backend updated "ensemble_predictions", - "valid_predictions", "test_predictions", }, ) @@ -839,11 +757,7 @@ def test_file_output(self, loss_mock, model_mock): ) evaluator.models = ["model2", "model2"] - return_value = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) + return_value = evaluator.file_output(D.data["Y_train"], D.data["Y_test"]) self.assertEqual(return_value, (None, {})) self.assertEqual(self.backend_mock.save_additional_data.call_count, 4) self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 2) @@ -855,8 +769,8 @@ def test_file_output(self, loss_mock, model_mock): "budget", "model", "cv_model", + "valid_predictions", # TODO remove once backend updated "ensemble_predictions", - "valid_predictions", "test_predictions", }, ) @@ -867,27 +781,8 @@ def test_file_output(self, loss_mock, model_mock): self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["cv_model"] ) - # Check for not containing NaNs - that the models don't predict nonsense - # for unseen data - D.data["Y_valid"][0] = np.NaN - return_value = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) - self.assertEqual( - return_value, - ( - 1.0, - {"error": "Model predictions for 
validation set contains NaNs."}, - ), - ) D.data["Y_train"][0] = np.NaN - return_value = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) + return_value = evaluator.file_output(D.data["Y_train"], D.data["Y_test"]) self.assertEqual( return_value, ( @@ -1086,7 +981,6 @@ def test_fit_predict_and_loss_standard_additional_run_info( _partial_fit_and_predict_mock.return_value = ( np.array([[0.1, 0.9]] * 46), np.array([[0.1, 0.9]] * 23), - np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), {"a": 5}, ) @@ -1129,7 +1023,6 @@ def __call__(self, *args, **kwargs): return ( np.array([[0.1, 0.9]] * 34), np.array([[0.1, 0.9]] * 35), - np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), {"a": 5}, ) @@ -1137,7 +1030,6 @@ def __call__(self, *args, **kwargs): return ( np.array([[0.1, 0.9]] * 34), np.array([[0.1, 0.9]] * 34), - np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), {"a": 5}, ) @@ -3070,7 +2962,6 @@ def test_eval_holdout_all_loss_functions(self): "recall_micro": 0.030303030303030276, "recall_weighted": 0.030303030303030276, "num_run": 1, - "validation_loss": 0.0, "test_loss": 0.04, "train_loss": 0.0, } @@ -3447,7 +3338,6 @@ def test_eval_cv_all_loss_functions(self): "recall_micro": 0.04999999999999997, "recall_weighted": 0.04999999999999997, "num_run": 1, - "validation_loss": 0.04, "test_loss": 0.04, "train_loss": 0.0, }
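
The automl.py hunk above moves the body of fit() into a try/except/finally so that _fit_cleanup() always runs and the logging server started for the run cannot be left hanging (see the inline comment pointing to issue #1480). Below is a minimal, self-contained sketch of that shutdown pattern; start_log_server and fit_like are hypothetical names used only for illustration and are not part of auto-sklearn's API.

# Illustrative sketch only: start_log_server/fit_like are hypothetical
# stand-ins for auto-sklearn's logger setup and _fit_cleanup().
import logging
import logging.handlers
import queue


def start_log_server():
    # A listener thread draining log records from a queue, standing in for
    # the TCP logging server that AutoML.fit() starts in a separate process.
    log_queue = queue.Queue()
    listener = logging.handlers.QueueListener(log_queue, logging.StreamHandler())
    listener.start()
    return log_queue, listener


def fit_like(work):
    log_queue, listener = start_log_server()
    try:
        work(log_queue)  # any exception raised here still reaches the caller ...
    except Exception:
        logging.getLogger(__name__).exception("fit failed")
        raise  # ... but only after the finally block has run
    finally:
        # Mirrors self._fit_cleanup(): always stop the logging machinery so the
        # process can exit instead of hanging until Ctrl+C.
        listener.stop()

Calling fit_like(lambda q: 1 / 0) logs the traceback, stops the listener, and re-raises the ZeroDivisionError, which is the behaviour the new try/except/finally in AutoML.fit() is aiming for.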
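
test_train_evaluator.py now shares a single LossSideEffect fixture built with itertools.chain instead of repeating the loss table inline; every loss value is repeated three times per fitting iteration (train, optimization and test predictions) rather than four, because the validation split no longer exists. A small standalone check of what that comprehension yields, using toy values rather than the fixture's full list:

from itertools import chain

# Each value is repeated once per remaining prediction set: train, opt, test.
values = [1.0, 0.9, 0.8]
losses = [
    {"accuracy": value}
    for value in chain.from_iterable([v] * 3 for v in values)
]
assert losses == [
    {"accuracy": 1.0}, {"accuracy": 1.0}, {"accuracy": 1.0},
    {"accuracy": 0.9}, {"accuracy": 0.9}, {"accuracy": 0.9},
    {"accuracy": 0.8}, {"accuracy": 0.8}, {"accuracy": 0.8},
]

This is also why the expected predict_proba call counts in the tests drop from multiples of four to multiples of three (for example 36 to 27, and 4 to 3 in the holdout tests).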