diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index bacf905e7..2084d7138 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -29,7 +29,7 @@ jobs: - name: Run tests run: | if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autoPyTorch --cov-report=xml'; fi - python -m pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test + python -m pytest --durations=20 --timeout=600 --timeout-method=signal -v $codecov test - name: Check for files left behind by test if: ${{ always() }} run: | diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index ecf188911..434849ef1 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -33,7 +33,7 @@ from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.logging_ import get_named_client_logger @@ -512,13 +512,13 @@ def __init__( # objects as attributes. For this reason, we dump to disk the stage of the past # ensemble iterations to kick-start the ensembling process # {"file name": { - # "ens_score": float + # "ens_loss": float # "mtime_ens": str, # "mtime_test": str, # "seed": int, # "num_run": int, # }} - self.read_scores = {} + self.read_losses = {} # {"file_name": { # Y_ENSEMBLE: np.ndarray # Y_TEST: np.ndarray @@ -528,7 +528,7 @@ def __init__( # Depending on the dataset dimensions, # regenerating every iteration, the predictions - # scores for self.read_preds + # losses for self.read_preds # is too computationally expensive # As the ensemble builder is stateless # (every time the ensemble builder gets resources @@ -551,17 +551,17 @@ def __init__( traceback.format_exc(), ) ) - self.ensemble_score_file = os.path.join( + self.ensemble_loss_file = os.path.join( self.backend.internals_directory, - 'ensemble_read_scores.pkl' + 'ensemble_read_losses.pkl' ) - if os.path.exists(self.ensemble_score_file): + if os.path.exists(self.ensemble_loss_file): try: - with (open(self.ensemble_score_file, "rb")) as memory: - self.read_scores = pickle.load(memory) + with (open(self.ensemble_loss_file, "rb")) as memory: + self.read_losses = pickle.load(memory) except Exception as e: self.logger.warning( - "Could not load the previous iterations of ensemble_builder scores." + "Could not load the previous iterations of ensemble_builder losses." "This might impact the quality of the run. 
Exception={} {}".format( e, traceback.format_exc(), @@ -770,8 +770,8 @@ def main( time_left - used_time, ) - # populates self.read_preds and self.read_scores - if not self.score_ensemble_preds(): + # populates self.read_preds and self.read_losses + if not self.compute_loss_per_model(): if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred else: @@ -817,9 +817,9 @@ def main( if self.max_resident_models is not None: self._delete_excess_models(selected_keys=candidate_models) - # Save the read scores status for the next iteration - with open(self.ensemble_score_file, "wb") as memory: - pickle.dump(self.read_scores, memory) + # Save the read losses status for the next iteration + with open(self.ensemble_loss_file, "wb") as memory: + pickle.dump(self.read_losses, memory) if ensemble is not None: train_pred = self.predict(set_="train", @@ -873,10 +873,10 @@ def get_disk_consumption(self, pred_path: str) -> float: # get the megabytes return round(this_model_cost / math.pow(1024, 2), 2) - def score_ensemble_preds(self) -> bool: + def compute_loss_per_model(self) -> bool: """ - score predictions on ensemble building data set; - populates self.read_preds and self.read_scores + Compute the loss of the predictions on ensemble building data set; + populates self.read_preds and self.read_losses """ self.logger.debug("Read ensemble data set predictions") @@ -933,9 +933,9 @@ def score_ensemble_preds(self) -> bool: self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) continue - if not self.read_scores.get(y_ens_fn): - self.read_scores[y_ens_fn] = { - "ens_score": -np.inf, + if not self.read_losses.get(y_ens_fn): + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, "mtime_ens": 0, "mtime_test": 0, "seed": _seed, @@ -955,45 +955,38 @@ def score_ensemble_preds(self) -> bool: Y_TEST: None, } - if self.read_scores[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn): + if self.read_losses[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn): # same time stamp; nothing changed; continue - # actually read the predictions and score them - y_ensemble = self._read_np_fn(y_ens_fn) - scores = calculate_score( - metrics=self.metrics, - target=self.y_true_ensemble, - prediction=y_ensemble, - task_type=self.task_type, - ) + # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - scores = calculate_score( + losses = calculate_loss( metrics=self.metrics, target=self.y_true_ensemble, prediction=y_ensemble, task_type=self.task_type, ) - if np.isfinite(self.read_scores[y_ens_fn]["ens_score"]): + if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): self.logger.debug( - 'Changing ensemble score for file %s from %f to %f ' + 'Changing ensemble loss for file %s from %f to %f ' 'because file modification time changed? %f - %f', y_ens_fn, - self.read_scores[y_ens_fn]["ens_score"], - scores[self.opt_metric], - self.read_scores[y_ens_fn]["mtime_ens"], + self.read_losses[y_ens_fn]["ens_loss"], + losses[self.opt_metric], + self.read_losses[y_ens_fn]["mtime_ens"], os.path.getmtime(y_ens_fn), ) - self.read_scores[y_ens_fn]["ens_score"] = scores[self.opt_metric] + self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] # It is not needed to create the object here - # To save memory, we just score the object. 
- self.read_scores[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) - self.read_scores[y_ens_fn]["loaded"] = 2 - self.read_scores[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( + # To save memory, we just compute the loss. + self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) + self.read_losses[y_ens_fn]["loaded"] = 2 + self.read_losses[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( y_ens_fn ) @@ -1005,21 +998,22 @@ def score_ensemble_preds(self) -> bool: y_ens_fn, traceback.format_exc(), ) - self.read_scores[y_ens_fn]["ens_score"] = -np.inf + self.read_losses[y_ens_fn]["ens_loss"] = np.inf self.logger.debug( 'Done reading %d new prediction files. Loaded %d predictions in ' 'total.', n_read_files, - np.sum([pred["loaded"] > 0 for pred in self.read_scores.values()]) + np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]) ) return True def get_n_best_preds(self) -> List[str]: """ - get best n predictions (i.e., keys of self.read_scores) - according to score on "ensemble set" + get best n predictions (i.e., keys of self.read_losses) + according to the loss on the "ensemble set" n: self.ensemble_nbest + Side effects: ->Define the n-best models to use in ensemble ->Only the best models are loaded @@ -1033,20 +1027,19 @@ def get_n_best_preds(self) -> List[str]: num_keys = len(sorted_keys) # remove all that are at most as good as random # note: dummy model must have run_id=1 (there is no run_id=0) - dummy_scores = list(filter(lambda x: x[2] == 1, sorted_keys)) - + dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) # Leave this here for when we enable dummy classifier/scorer - if dummy_scores: + if len(dummy_losses) > 0: # number of dummy models - num_dummy = len(dummy_scores) - dummy_score = dummy_scores[0] - self.logger.debug("Use %f as dummy score" % dummy_score[1]) - sorted_keys = list(filter(lambda x: x[1] > dummy_score[1], sorted_keys)) + num_dummy = len(dummy_losses) + dummy_loss = dummy_losses[0] + self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) + sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) # remove Dummy Classifier sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) - if not sorted_keys: - # no model left; try to use dummy score (num_run==0) + if len(sorted_keys) == 0: + # no model left; try to use dummy loss (num_run==0) # log warning when there are other models but not better than dummy model if num_keys > num_dummy: self.logger.warning("No models better than random - using Dummy Score!" @@ -1055,10 +1048,10 @@ def get_n_best_preds(self) -> List[str]: num_keys - 1, num_dummy) sorted_keys = [ - (k, v["ens_score"], v["num_run"]) for k, v in self.read_scores.items() + (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() if v["seed"] == self.seed and v["num_run"] == 1 ] - # reload predictions if scores changed over time and a model is + # reload predictions if losses changed over time and a model is # considered to be in the top models again! if not isinstance(self.ensemble_nbest, numbers.Integral): # Transform to number of models to keep. 
Keep at least one @@ -1081,9 +1074,9 @@ def get_n_best_preds(self) -> List[str]: if not isinstance(self.max_models_on_disc, numbers.Integral): consumption = [ [ - v["ens_score"], + v["ens_loss"], v["disc_space_cost_mb"], - ] for v in self.read_scores.values() if v["disc_space_cost_mb"] is not None + ] for v in self.read_losses.values() if v["disc_space_cost_mb"] is not None ] max_consumption = max(c[1] for c in consumption) @@ -1092,10 +1085,10 @@ def get_n_best_preds(self) -> List[str]: # max_consumption megabytes if (sum(c[1] for c in consumption) + max_consumption) > self.max_models_on_disc: - # just leave the best -- higher is better! + # just leave the best -- smaller is better! # This list is in descending order, to preserve the best models sorted_cum_consumption = np.cumsum([ - c[1] for c in list(reversed(sorted(consumption))) + c[1] for c in list(sorted(consumption)) ]) + max_consumption max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) @@ -1125,17 +1118,17 @@ def get_n_best_preds(self) -> List[str]: # consider performance_range_threshold if self.performance_range_threshold > 0: - best_score = sorted_keys[0][1] - min_score = dummy_score[1] - min_score += (best_score - min_score) * self.performance_range_threshold - if sorted_keys[keep_nbest - 1][1] < min_score: + best_loss = sorted_keys[0][1] + worst_loss = dummy_loss[1] + worst_loss -= (worst_loss - best_loss) * self.performance_range_threshold + if sorted_keys[keep_nbest - 1][1] > worst_loss: # We can further reduce number of models # since worst model is worse than thresh for i in range(0, keep_nbest): # Look at most at keep_nbest models, # but always keep at least one model - current_score = sorted_keys[i][1] - if current_score <= min_score: + current_loss = sorted_keys[i][1] + if current_loss >= worst_loss: self.logger.debug("Dynamic Performance range: " "Further reduce from %d to %d models", keep_nbest, max(1, i)) @@ -1151,15 +1144,15 @@ def get_n_best_preds(self) -> List[str]: if k in self.read_preds: self.read_preds[k][Y_ENSEMBLE] = None self.read_preds[k][Y_TEST] = None - if self.read_scores[k]['loaded'] == 1: + if self.read_losses[k]['loaded'] == 1: self.logger.debug( - 'Dropping model %s (%d,%d) with score %f.', + 'Dropping model %s (%d,%d) with loss %f.', k, - self.read_scores[k]['seed'], - self.read_scores[k]['num_run'], - self.read_scores[k]['ens_score'], + self.read_losses[k]['seed'], + self.read_losses[k]['num_run'], + self.read_losses[k]['ens_loss'], ) - self.read_scores[k]['loaded'] = 2 + self.read_losses[k]['loaded'] = 2 # Load the predictions for the winning for k in reduced_sorted_keys[:ensemble_n_best]: @@ -1167,14 +1160,14 @@ def get_n_best_preds(self) -> List[str]: ( k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None ) - and self.read_scores[k]['loaded'] != 3 + and self.read_losses[k]['loaded'] != 3 ): self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) # No need to load test here because they are loaded # only if the model ends up in the ensemble - self.read_scores[k]['loaded'] = 1 + self.read_losses[k]['loaded'] = 1 - # return best scored keys of self.read_scores + # return best scored keys of self.read_losses return reduced_sorted_keys[:ensemble_n_best] def get_test_preds(self, selected_keys: List[str]) -> List[str]: @@ -1198,14 +1191,14 @@ def get_test_preds(self, selected_keys: List[str]) -> List[str]: os.path.join( glob.escape(self.backend.get_runs_directory()), '%d_%d_%s' % ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - 
self.read_scores[k]["budget"], + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], ), 'predictions_test_%d_%d_%s.npy*' % ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - self.read_scores[k]["budget"] + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"] ) ) ) @@ -1218,7 +1211,7 @@ def get_test_preds(self, selected_keys: List[str]) -> List[str]: pass else: if ( - self.read_scores[k]["mtime_test"] == os.path.getmtime(test_fn[0]) + self.read_losses[k]["mtime_test"] == os.path.getmtime(test_fn[0]) and k in self.read_preds and self.read_preds[k][Y_TEST] is not None ): @@ -1228,7 +1221,7 @@ def get_test_preds(self, selected_keys: List[str]) -> List[str]: y_test = self._read_np_fn(test_fn[0]) self.read_preds[k][Y_TEST] = y_test success_keys_test.append(k) - self.read_scores[k]["mtime_test"] = os.path.getmtime(test_fn[0]) + self.read_losses[k]["mtime_test"] = os.path.getmtime(test_fn[0]) except Exception: self.logger.warning('Error loading %s: %s', test_fn, traceback.format_exc()) @@ -1238,10 +1231,12 @@ def get_test_preds(self, selected_keys: List[str]) -> List[str]: def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: """ fit ensemble + Parameters --------- selected_keys: list - list of selected keys of self.read_scores + list of selected keys of self.read_losses + Returns ------- ensemble: EnsembleSelection @@ -1254,9 +1249,9 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] include_num_runs = [ ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - self.read_scores[k]["budget"], + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], ) for k in selected_keys] @@ -1332,7 +1327,7 @@ def predict(self, set_: str, ensemble: EnsembleSelection trained Ensemble selected_keys: list - list of selected keys of self.read_scores + list of selected keys of self.read_losses n_preds: int number of prediction models used for ensemble building same number of predictions on valid and test are necessary @@ -1418,33 +1413,38 @@ def _add_ensemble_trajectory(self, train_pred: np.ndarray, test_pred: np.ndarray def _get_list_of_sorted_preds(self) -> List[Tuple[str, float, int]]: """ - Returns a list of sorted predictions in descending order - Scores are taken from self.read_scores. + Returns a list of sorted predictions in descending performance order. + (We are solving a minimization problem) + Losses are taken from self.read_losses. + Parameters ---------- None + Return ------ - sorted_keys: list + sorted_keys: + given a sequence of pairs of (loss[i], num_run[i]) = (l[i], n[i]), + we will sort s.t. l[0] <= l[1] <= ... <= l[N] and for any pairs of + i, j (i < j, l[i] = l[j]), the resulting sequence satisfies n[i] <= n[j] """ - # Sort by score - higher is better! - # First sort by num_run - sorted_keys = list(reversed(sorted( + # Sort by loss - smaller is better! 
+ sorted_keys = list(sorted( [ - (k, v["ens_score"], v["num_run"]) - for k, v in self.read_scores.items() + (k, v["ens_loss"], v["num_run"]) + for k, v in self.read_losses.items() ], - key=lambda x: x[2], - ))) # type: List[Tuple[str, float, int]] - # Then by score - sorted_keys = list(reversed(sorted(sorted_keys, key=lambda x: x[1]))) + # Sort by loss as priority 1 and then by num_run on a ascending order + # We want small num_run first + key=lambda x: (x[1], x[2]), + )) return sorted_keys def _delete_excess_models(self, selected_keys: List[str]) -> None: """ Deletes models excess models on disc. self.max_models_on_disc defines the upper limit on how many models to keep. - Any additional model with a worst score than the top + Any additional model with a worse loss than the top self.max_models_on_disc is deleted. """ @@ -1492,9 +1492,9 @@ def _delete_excess_models(self, selected_keys: List[str]) -> None: os.rename(numrun_dir, numrun_dir + '.old') shutil.rmtree(numrun_dir + '.old') self.logger.info("Deleted files of non-candidate model %s", pred_path) - self.read_scores[pred_path]["disc_space_cost_mb"] = None - self.read_scores[pred_path]["loaded"] = 3 - self.read_scores[pred_path]["ens_score"] = -np.inf + self.read_losses[pred_path]["disc_space_cost_mb"] = None + self.read_losses[pred_path]["loaded"] = 3 + self.read_losses[pred_path]["ens_loss"] = np.inf except Exception as e: self.logger.error( "Failed to delete files of non-candidate model %s due" diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py index 6f701787a..d61a3555e 100644 --- a/autoPyTorch/ensemble/ensemble_selection.py +++ b/autoPyTorch/ensemble/ensemble_selection.py @@ -6,7 +6,7 @@ from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss class EnsembleSelection(AbstractEnsemble): @@ -39,6 +39,24 @@ def fit( labels: np.ndarray, identifiers: List[Tuple[int, int, float]], ) -> AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.array]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ self.ensemble_size = int(self.ensemble_size) if self.ensemble_size < 1: raise ValueError('Ensemble size cannot be less than one!') @@ -53,7 +71,20 @@ def _fit( predictions: List[np.ndarray], labels: np.ndarray, ) -> None: - """Fast version of Rich Caruana's ensemble selection method.""" + """ + Fast version of Rich Caruana's ensemble selection method. 
+ + For more details, please check the paper + "Ensemble Selection from Library of Models" by R Caruana (2004) + + Args: + predictions (List[np.array]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + identifiers (List[Tuple[int, int, float]]): + A list of model identifiers, each with the form + (seed, number of run, budget) + """ self.num_input_models_ = len(predictions) ensemble = [] # type: List[np.ndarray] @@ -71,60 +102,47 @@ def _fit( dtype=np.float64, ) for i in range(ensemble_size): - scores = np.zeros( + losses = np.zeros( (len(predictions)), dtype=np.float64, ) s = len(ensemble) - if s == 0: - weighted_ensemble_prediction.fill(0.0) - else: - weighted_ensemble_prediction.fill(0.0) - for pred in ensemble: - np.add( - weighted_ensemble_prediction, - pred, - out=weighted_ensemble_prediction, - ) - np.multiply( - weighted_ensemble_prediction, - 1 / s, - out=weighted_ensemble_prediction, - ) - np.multiply( + if s > 0: + np.add( weighted_ensemble_prediction, - (s / float(s + 1)), + ensemble[-1], out=weighted_ensemble_prediction, ) + # Memory-efficient averaging! for j, pred in enumerate(predictions): - # Memory-efficient averaging! - fant_ensemble_prediction.fill(0.0) + # fant_ensemble_prediction is the prediction of the current ensemble + # and should be ([predictions[selected_prev_iterations] + predictions[j])/(s+1) + # We overwrite the contents of fant_ensemble_prediction + # directly with weighted_ensemble_prediction + new_prediction and then scale for avg np.add( - fant_ensemble_prediction, weighted_ensemble_prediction, + pred, out=fant_ensemble_prediction ) - np.add( + np.multiply( fant_ensemble_prediction, - (1. / float(s + 1)) * pred, + (1. / float(s + 1)), out=fant_ensemble_prediction ) - # Calculate score is versatile and can return a dict of score - # when all_scoring_functions=False, we know it will be a float - score = calculate_score( + # Calculate loss is versatile and can return a dict of slosses + losses[j] = calculate_loss( metrics=[self.metric], target=labels, prediction=fant_ensemble_prediction, task_type=self.task_type, - ) - scores[j] = self.metric._optimum - score[self.metric.name] + )[self.metric.name] - all_best = np.argwhere(scores == np.nanmin(scores)).flatten() + all_best = np.argwhere(losses == np.nanmin(losses)).flatten() best = self.random_state.choice(all_best) ensemble.append(predictions[best]) - trajectory.append(scores[best]) + trajectory.append(losses[best]) order.append(best) # Handle special case @@ -133,9 +151,15 @@ def _fit( self.indices_ = order self.trajectory_ = trajectory - self.train_score_ = trajectory[-1] + self.train_loss_ = trajectory[-1] def _calculate_weights(self) -> None: + """ + Calculates the contribution each of the individual models + should have, in the final ensemble soft voting. It does so by + a frequency counting scheme. In particular, how many times a model + was used during hill climbing optimization. + """ ensemble_members = Counter(self.indices_).most_common() weights = np.zeros( (self.num_input_models_,), @@ -151,6 +175,19 @@ def _calculate_weights(self) -> None: self.weights_ = weights def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. 
+ + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. + + Returns: + average (np.array): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ average = np.zeros_like(predictions[0], dtype=np.float64) tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) @@ -191,6 +228,19 @@ def get_models_with_weights( self, models: Dict[Any, BasePipeline] ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ output = [] for i, weight in enumerate(self.weights_): if weight > 0.0: @@ -203,6 +253,15 @@ def get_models_with_weights( return output def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. + + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ output = [] for i, weight in enumerate(self.weights_): @@ -213,4 +272,11 @@ def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: return output def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ return self.trajectory_[-1] diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index c1f7da60d..6f2d93feb 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -38,7 +38,7 @@ from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import ( - calculate_score, + calculate_loss, get_metrics, ) from autoPyTorch.utils.backend import Backend @@ -364,30 +364,21 @@ def _get_pipeline(self) -> BaseEstimator: def _loss(self, y_true: np.ndarray, y_hat: np.ndarray) -> Dict[str, float]: """SMAC follows a minimization goal, so the make_scorer sign is used as a guide to obtain the value to reduce. + The calculate_loss internally translate a score function to + a minimization problem - On this regard, to optimize a metric: - 1- score is calculared with calculate_score, with the caveat, that if - for the metric greater is not better, a negative score is returned. 
- 2- the err (the optimization goal) is then: - optimum - (metric.sign * actual_score) - For accuracy for example: optimum(1) - (+1 * actual score) - For logloss for example: optimum(0) - (-1 * actual score) """ if not isinstance(self.configuration, Configuration): - return {self.metric.name: 1.0} + return {self.metric.name: self.metric._worst_possible_result} if self.additional_metrics is not None: metrics = self.additional_metrics else: metrics = [self.metric] - score = calculate_score( - y_true, y_hat, self.task_type, metrics) - - err = {metric.name: metric._optimum - score[metric.name] for metric in metrics - if metric.name in score.keys()} - return err + return calculate_loss( + y_true, y_hat, self.task_type, metrics) def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], opt_pred: np.ndarray, valid_pred: Optional[np.ndarray], diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index d386ce47e..012d04e49 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -104,17 +104,17 @@ def get_metrics(dataset_properties: Dict[str, Any], def calculate_score( - target: np.ndarray, - prediction: np.ndarray, - task_type: int, - metrics: Iterable[autoPyTorchMetric], + target: np.ndarray, + prediction: np.ndarray, + task_type: int, + metrics: Iterable[autoPyTorchMetric], ) -> Dict[str, float]: score_dict = dict() if task_type in REGRESSION_TASKS: cprediction = sanitize_array(prediction) for metric_ in metrics: try: - score_dict[metric_.name] = metric_(target, cprediction) + score_dict[metric_.name] = metric_._sign * metric_(target, cprediction) except ValueError as e: warnings.warn(f"{e} {e.args[0]}") if e.args[0] == "Mean Squared Logarithmic Error cannot be used when " \ @@ -126,7 +126,7 @@ def calculate_score( else: for metric_ in metrics: try: - score_dict[metric_.name] = metric_(target, prediction) + score_dict[metric_.name] = metric_._sign * metric_(target, prediction) except ValueError as e: if e.args[0] == 'multiclass format is not supported': continue @@ -143,3 +143,49 @@ def calculate_score( else: raise e return score_dict + + +def calculate_loss( + target: np.ndarray, + prediction: np.ndarray, + task_type: int, + metrics: Iterable[autoPyTorchMetric], +) -> Dict[str, float]: + """ + Returns a loss (a magnitude that allows casting the + optimization problem, as a minimization one) for the + given Auto-Sklearn Scorer object + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. 
+ scoring_functions: List[Scorer] + A list of metrics to calculate multiple losses + Returns + ------- + float or Dict[str, float] + A loss function for each of the provided scorer objects + """ + score = calculate_score( + target=target, + prediction=prediction, + task_type=task_type, + metrics=metrics, + ) + + loss_dict = dict() + for metric_ in metrics: + # TODO: When metrics are annotated with type_of_target support + # we can remove this check + if metric_.name not in score: + continue + loss_dict[metric_.name] = metric_._optimum - metric_._sign * score[metric_.name] + return loss_dict diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index ea7cccd72..1aca1fdb6 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -83,7 +83,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): '.autoPyTorch/ensemble_read_preds.pkl', '.autoPyTorch/start_time_1', '.autoPyTorch/ensemble_history.json', - '.autoPyTorch/ensemble_read_scores.pkl', + '.autoPyTorch/ensemble_read_losses.pkl', '.autoPyTorch/true_targets_ensemble.npy', ] for expected_file in expected_files: @@ -244,7 +244,7 @@ def test_tabular_regression(openml_name, resampling_strategy, backend): '.autoPyTorch/ensemble_read_preds.pkl', '.autoPyTorch/start_time_1', '.autoPyTorch/ensemble_history.json', - '.autoPyTorch/ensemble_read_scores.pkl', + '.autoPyTorch/ensemble_read_losses.pkl', '.autoPyTorch/true_targets_ensemble.npy', ] for expected_file in expected_files: diff --git a/test/test_ensemble/test_ensemble.py b/test/test_ensemble/test_ensemble.py index fb2f6fcec..b797f8272 100644 --- a/test/test_ensemble/test_ensemble.py +++ b/test/test_ensemble/test_ensemble.py @@ -125,18 +125,18 @@ def testRead(ensemble_backend): seed=0, # important to find the test files ) - success = ensbuilder.score_ensemble_preds() + success = ensbuilder.compute_loss_per_model() assert success, str(ensbuilder.read_preds) assert len(ensbuilder.read_preds) == 3, ensbuilder.read_preds.keys() - assert len(ensbuilder.read_scores) == 3, ensbuilder.read_scores.keys() + assert len(ensbuilder.read_losses) == 3, ensbuilder.read_losses.keys() filename = os.path.join( ensemble_backend.temporary_directory, ".autoPyTorch/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" ) np.testing.assert_almost_equal( - ensbuilder.read_scores[filename]["ens_score"], - np.array(0.8) + ensbuilder.read_losses[filename]["ens_loss"], + np.array(0.2) ) filename = os.path.join( @@ -144,8 +144,8 @@ def testRead(ensemble_backend): ".autoPyTorch/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" ) np.testing.assert_almost_equal( - ensbuilder.read_scores[filename]["ens_score"], - np.array(1.0) + ensbuilder.read_losses[filename]["ens_loss"], + np.array(0.0) ) @@ -173,7 +173,7 @@ def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp): max_models_on_disc=max_models_on_disc, ) - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() sel_keys = ensbuilder.get_n_best_preds() assert len(sel_keys) == exp @@ -216,7 +216,7 @@ def testMaxModelsOnDisc(ensemble_backend, test_case, exp): with unittest.mock.patch('os.path.getsize') as mock: mock.return_value = 100 * 1024 * 1024 - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() sel_keys = ensbuilder.get_n_best_preds() assert len(sel_keys) == exp, test_case @@ -237,8 +237,8 @@ def testMaxModelsOnDisc2(ensemble_backend): ) ensbuilder.read_preds = {} for i in range(50): - ensbuilder.read_scores['pred' + str(i)] = { - 'ens_score': i * 10, + 
ensbuilder.read_losses['pred' + str(i)] = { + 'ens_loss': -i * 10, 'num_run': i, 'loaded': 1, "seed": 1, @@ -270,16 +270,16 @@ def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold, ensemble_nbest=100, performance_range_threshold=performance_range_threshold ) - ensbuilder.read_scores = { - 'A': {'ens_score': 1, 'num_run': 1, 'loaded': -1, "seed": 1}, - 'B': {'ens_score': 2, 'num_run': 2, 'loaded': -1, "seed": 1}, - 'C': {'ens_score': 3, 'num_run': 3, 'loaded': -1, "seed": 1}, - 'D': {'ens_score': 4, 'num_run': 4, 'loaded': -1, "seed": 1}, - 'E': {'ens_score': 5, 'num_run': 5, 'loaded': -1, "seed": 1}, + ensbuilder.read_losses = { + 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1}, + 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1}, + 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1}, + 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1}, + 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1}, } ensbuilder.read_preds = { key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_TEST)} - for key in ensbuilder.read_scores + for key in ensbuilder.read_losses } sel_keys = ensbuilder.get_n_best_preds() @@ -307,16 +307,16 @@ def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_thr performance_range_threshold=performance_range_threshold, max_models_on_disc=None, ) - ensbuilder.read_scores = { - 'A': {'ens_score': 1, 'num_run': 1, 'loaded': -1, "seed": 1}, - 'B': {'ens_score': 2, 'num_run': 2, 'loaded': -1, "seed": 1}, - 'C': {'ens_score': 3, 'num_run': 3, 'loaded': -1, "seed": 1}, - 'D': {'ens_score': 4, 'num_run': 4, 'loaded': -1, "seed": 1}, - 'E': {'ens_score': 5, 'num_run': 5, 'loaded': -1, "seed": 1}, + ensbuilder.read_losses = { + 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1}, + 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1}, + 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1}, + 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1}, + 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1}, } ensbuilder.read_preds = { key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_TEST)} - for key in ensbuilder.read_scores + for key in ensbuilder.read_losses } sel_keys = ensbuilder.get_n_best_preds() @@ -335,25 +335,25 @@ def testFallBackNBest(ensemble_backend): ensemble_nbest=1 ) - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() filename = os.path.join( ensemble_backend.temporary_directory, ".autoPyTorch/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" ) - ensbuilder.read_scores[filename]["ens_score"] = -1 + ensbuilder.read_losses[filename]["ens_loss"] = -1 filename = os.path.join( ensemble_backend.temporary_directory, ".autoPyTorch/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy" ) - ensbuilder.read_scores[filename]["ens_score"] = -1 + ensbuilder.read_losses[filename]["ens_loss"] = -1 filename = os.path.join( ensemble_backend.temporary_directory, ".autoPyTorch/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" ) - ensbuilder.read_scores[filename]["ens_score"] = -1 + ensbuilder.read_losses[filename]["ens_loss"] = -1 sel_keys = ensbuilder.get_n_best_preds() @@ -377,7 +377,7 @@ def testGetTestPreds(ensemble_backend): ensemble_nbest=1 ) - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() d1 = os.path.join( ensemble_backend.temporary_directory, @@ -426,7 +426,7 @@ def testEntireEnsembleBuilder(ensemble_backend): ) ensbuilder.SAVE2DISC = False - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() d2 = 
os.path.join( ensemble_backend.temporary_directory, @@ -502,7 +502,7 @@ def test_main(ensemble_backend): os.path.join(ensemble_backend.internals_directory, 'ensemble_read_preds.pkl') ), os.listdir(ensemble_backend.internals_directory) assert os.path.exists( - os.path.join(ensemble_backend.internals_directory, 'ensemble_read_scores.pkl') + os.path.join(ensemble_backend.internals_directory, 'ensemble_read_losses.pkl') ), os.listdir(ensemble_backend.internals_directory) @@ -543,9 +543,9 @@ def testLimit(ensemble_backend): ) ensbuilder.SAVE2DISC = False - read_scores_file = os.path.join( + read_losses_file = os.path.join( ensemble_backend.internals_directory, - 'ensemble_read_scores.pkl' + 'ensemble_read_losses.pkl' ) read_preds_file = os.path.join( ensemble_backend.internals_directory, @@ -559,15 +559,15 @@ def testLimit(ensemble_backend): get_logger_mock.return_value = logger_mock ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 1 ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 2 ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 3 @@ -575,7 +575,7 @@ def testLimit(ensemble_backend): assert ensbuilder.ensemble_nbest == 1 ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 4 @@ -585,7 +585,7 @@ def testLimit(ensemble_backend): # And then it still runs, but basically won't do anything any more except for raising error # messages via the logger ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 4 @@ -627,15 +627,15 @@ def test_read_pickle_read_preds(ensemble_backend): ensemble_memory_file = os.path.join( ensemble_backend.internals_directory, - 'ensemble_read_scores.pkl' + 'ensemble_read_losses.pkl' ) assert os.path.exists(ensemble_memory_file) # Make sure we pickle the correct read scores with (open(ensemble_memory_file, "rb")) as memory: - read_scores = pickle.load(memory) + read_losses = pickle.load(memory) - compare_read_preds(read_scores, ensbuilder.read_scores) + compare_read_preds(read_losses, ensbuilder.read_losses) # Then create a new instance, which should automatically read this file ensbuilder2 = EnsembleBuilder( @@ -650,7 +650,7 @@ def test_read_pickle_read_preds(ensemble_backend): max_models_on_disc=None, ) compare_read_preds(ensbuilder2.read_preds, ensbuilder.read_preds) - compare_read_preds(ensbuilder2.read_scores, ensbuilder.read_scores) + compare_read_preds(ensbuilder2.read_losses, ensbuilder.read_losses) assert ensbuilder2.last_hash == ensbuilder.last_hash diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py index 225193217..c2769b2cd 100644 --- 
a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py +++ b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py @@ -66,7 +66,10 @@ def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor): cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() pipeline.set_hyperparameters(config) - pipeline.fit(fit_dictionary_tabular) + try: + pipeline.fit(fit_dictionary_tabular) + except Exception as e: + pytest.fail(f"For config {config} failed with {e}") # To make sure we fitted the model, there should be a # run summary object with accuracy diff --git a/test/test_pipeline/test_metrics.py b/test/test_pipeline/test_metrics.py index 153995cff..1f9889807 100644 --- a/test/test_pipeline/test_metrics.py +++ b/test/test_pipeline/test_metrics.py @@ -14,14 +14,18 @@ TABULAR_REGRESSION, TASK_TYPES_TO_STRING ) -from autoPyTorch.metrics import accuracy +from autoPyTorch.metrics import accuracy, balanced_accuracy, mean_squared_error from autoPyTorch.pipeline.components.training.metrics.base import ( _PredictMetric, _ThresholdMetric, autoPyTorchMetric, make_metric, ) -from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics +from autoPyTorch.pipeline.components.training.metrics.utils import ( + calculate_loss, + calculate_score, + get_metrics, +) @pytest.mark.parametrize('output_type', ['multiclass', @@ -190,3 +194,64 @@ def test_classification_only_metric(): previous_score = scorer._optimum assert score['accuracy'] == pytest.approx(previous_score) + + +def test_calculate_loss(): + # In a 0-1 ranged scorer, make sure that the loss + # has a expected positive value + y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0]) + y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) + score = sklearn.metrics.accuracy_score(y_true, y_pred) + assert pytest.approx(score) == calculate_score( + target=y_true, + prediction=y_pred, + task_type=TABULAR_CLASSIFICATION, + metrics=[accuracy], + )['accuracy'] + loss = 1.0 - score + assert pytest.approx(loss) == calculate_loss( + target=y_true, + prediction=y_pred, + task_type=TABULAR_CLASSIFICATION, + metrics=[accuracy], + )['accuracy'] + + # Test the dictionary case + score_dict = calculate_score( + target=y_true, + prediction=y_pred, + task_type=TABULAR_CLASSIFICATION, + metrics=[accuracy, balanced_accuracy], + ) + expected_score_dict = { + 'accuracy': 0.9, + 'balanced_accuracy': 0.9285714285714286, + } + loss_dict = calculate_loss( + target=y_true, + prediction=y_pred, + task_type=TABULAR_CLASSIFICATION, + metrics=[accuracy, balanced_accuracy], + ) + for expected_metric, expected_score in expected_score_dict.items(): + assert pytest.approx(expected_score) == score_dict[expected_metric] + assert pytest.approx(1 - expected_score) == loss_dict[expected_metric] + + # Lastly make sure that metrics whose optimum is zero + # are also properly working + y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) + score = sklearn.metrics.mean_squared_error(y_true, y_pred) + assert pytest.approx(score) == calculate_score( + target=y_true, + prediction=y_pred, + task_type=TABULAR_REGRESSION, + metrics=[mean_squared_error], + )['mean_squared_error'] + loss = score + assert pytest.approx(loss) == calculate_loss( + target=y_true, + prediction=y_pred, + task_type=TABULAR_REGRESSION, + metrics=[mean_squared_error], + )['mean_squared_error'] diff --git a/test/test_pipeline/test_tabular_classification.py 
b/test/test_pipeline/test_tabular_classification.py index 260587adb..d00ae377e 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -7,6 +7,8 @@ UniformIntegerHyperparameter, ) +import flaky + import numpy as np import pytest @@ -45,6 +47,7 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): elif isinstance(hyperparameter, CategoricalHyperparameter): assert update.value_range == hyperparameter.choices + @flaky.flaky(max_runs=2) def test_pipeline_fit(self, fit_dictionary_tabular): """This test makes sure that the pipeline is able to fit given random combinations of hyperparameters across the pipeline""" @@ -95,8 +98,9 @@ def test_pipeline_score(self, fit_dictionary_tabular_dummy, fit_dictionary_tabul # we should be able to get a decent score on this dummy data accuracy = metrics.accuracy(y, prediction.squeeze()) - assert accuracy >= 0.8 + assert accuracy >= 0.8, f"Pipeline:{pipeline} Config:{config} FitDict: {fit_dictionary_tabular_dummy}" + @flaky.flaky(max_runs=3) def test_pipeline_predict(self, fit_dictionary_tabular): """This test makes sure that the pipeline is able to predict given a random configuration""" diff --git a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py index 15b8351f9..ea7a70d86 100644 --- a/test/test_pipeline/test_tabular_regression.py +++ b/test/test_pipeline/test_tabular_regression.py @@ -109,7 +109,7 @@ def test_pipeline_score(self, fit_dictionary_tabular_dummy, fit_dictionary_tabul # we should be able to get a decent score on this dummy data r2_score = metrics.r2(y, prediction) - assert r2_score >= 0.5 + assert r2_score >= 0.5, f"Pipeline:{pipeline} Config:{config} FitDict: {fit_dictionary_tabular_dummy}" def test_pipeline_predict(self, fit_dictionary_tabular): """This test makes sure that the pipeline is able to predict
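
The ordering change in _get_list_of_sorted_preds above can be exercised in isolation. A minimal sketch, using made-up read_losses entries, of the single sorted() call that replaces the previous double reversed-sort: ascending ens_loss first, with ascending num_run as the tie-breaker.

    # Illustrative read_losses entries; keys and loss values are invented.
    read_losses = {
        "pred_a": {"ens_loss": 0.20, "num_run": 3},
        "pred_b": {"ens_loss": 0.10, "num_run": 5},
        "pred_c": {"ens_loss": 0.10, "num_run": 2},
    }
    sorted_keys = sorted(
        [(k, v["ens_loss"], v["num_run"]) for k, v in read_losses.items()],
        # smaller loss first; among equal losses, smaller num_run first
        key=lambda x: (x[1], x[2]),
    )
    print(sorted_keys)  # [('pred_c', 0.1, 2), ('pred_b', 0.1, 5), ('pred_a', 0.2, 3)]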
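
The rewritten inner loop of EnsembleSelection._fit keeps a running sum of the already-selected predictions and averages each candidate in place, instead of rebuilding the weighted average from scratch every round. A minimal sketch of that update, assuming synthetic data and a plain mean-squared-error loss in place of calculate_loss; variable names are illustrative.

    import numpy as np

    rng = np.random.RandomState(0)
    labels = rng.rand(100, 3)
    predictions = [labels + rng.normal(scale=0.3, size=labels.shape) for _ in range(5)]
    ensemble_size = 10

    running_sum = np.zeros_like(predictions[0])  # sum of selected predictions so far
    fant = np.empty_like(predictions[0])         # scratch buffer, reused every round
    order, ensemble = [], []

    for _ in range(ensemble_size):
        s = len(ensemble)
        if s > 0:
            # only the most recently selected model needs to be added to the sum
            np.add(running_sum, ensemble[-1], out=running_sum)
        losses = np.zeros(len(predictions))
        for j, pred in enumerate(predictions):
            # candidate ensemble average = (sum of selected + candidate) / (s + 1)
            np.add(running_sum, pred, out=fant)
            np.multiply(fant, 1.0 / (s + 1), out=fant)
            losses[j] = np.mean((fant - labels) ** 2)
        best = int(np.argmin(losses))
        ensemble.append(predictions[best])
        order.append(best)

    print(order)  # model index chosen at each hill-climbing step (repeats allowed)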
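
calculate_loss reduces every metric to a minimization target via optimum - sign * score. A small standalone sketch of that mapping, consistent with the expectations in test_calculate_loss above; the tiny Metric class below only stands in for autoPyTorchMetric's (_optimum, _sign) convention and is illustrative, not the project's API.

    import numpy as np
    from sklearn.metrics import accuracy_score, mean_squared_error


    class Metric:
        def __init__(self, name, score_fn, optimum, sign):
            self.name = name
            self.score_fn = score_fn
            self._optimum = optimum  # best achievable raw score of the metric
            self._sign = sign        # +1 if greater is better, -1 otherwise

        def __call__(self, y_true, y_pred):
            return self.score_fn(y_true, y_pred)


    def to_loss(metric, y_true, y_pred):
        # loss = optimum - sign * raw score, so smaller is always better
        return metric._optimum - metric._sign * metric(y_true, y_pred)


    accuracy = Metric("accuracy", accuracy_score, optimum=1.0, sign=+1)
    mse = Metric("mean_squared_error", mean_squared_error, optimum=0.0, sign=-1)

    y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0])
    y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    print(to_loss(accuracy, y_true, y_pred))  # 0.1 == 1 - 0.9 accuracy

    y_true_reg = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
    y_pred_reg = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66])
    print(to_loss(mse, y_true_reg, y_pred_reg))  # equals the raw MSE (~0.0015)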