diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index c4d2e2396..59f70facf 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -30,6 +30,8 @@ jobs: echo "::set-output name=BEFORE::$(git status --porcelain -b)" - name: Run tests run: | - python examples/example_tabular_classification.py - python examples/example_tabular_regression.py + python examples/tabular/20_basics/example_tabular_classification.py + python examples/tabular/20_basics/example_tabular_regression.py + python examples/tabular/40_advanced/example_custom_configuration_space.py + python examples/tabular/40_advanced/example_resampling_strategy.py python examples/example_image_classification.py \ No newline at end of file diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 3bd481995..04d30dd9e 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -27,27 +27,36 @@ class TabularClassificationTask(BaseTask): """ Tabular Classification API to the pipelines. Args: - seed (int): seed to be used for reproducibility. - n_jobs (int), (default=1): number of consecutive processes to spawn. - logging_config (Optional[Dict]): specifies configuration - for logging, if None, it is loaded from the logging.yaml - ensemble_size (int), (default=50): Number of models added to the ensemble built by + seed (int): + seed to be used for reproducibility. + n_jobs (int), (default=1): + number of consecutive processes to spawn. + logging_config (Optional[Dict]): + specifies configuration for logging, if None, it is loaded from the logging.yaml + ensemble_size (int), (default=50): + Number of models added to the ensemble built by Ensemble selection from libraries of models. Models are drawn with replacement. 
- ensemble_nbest (int), (default=50): only consider the ensemble_nbest + ensemble_nbest (int), (default=50): + only consider the ensemble_nbest models to build the ensemble - max_models_on_disc (int), (default=50): maximum number of models saved to disc. + max_models_on_disc (int), (default=50): + maximum number of models saved to disc. Also, controls the size of the ensemble as any additional models will be deleted. Must be greater than or equal to 1. - temporary_directory (str): folder to store configuration output and log file - output_directory (str): folder to store predictions for optional test set - delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory, - when finished - include_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components not to use. Incompatible with include - components + temporary_directory (str): + folder to store configuration output and log file + output_directory (str): + folder to store predictions for optional test set + delete_tmp_folder_after_terminate (bool): + determines whether to delete the temporary directory, when finished + include_components (Optional[Dict]): + If None, all possible components are used. Otherwise + specifies set of components to use. + exclude_components (Optional[Dict]): + If None, all possible components are used. Otherwise + specifies set of components not to use. 
Incompatible + with include components """ def __init__( self, diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 1d0bc3077..b853fac0a 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -97,7 +97,7 @@ def holdout_validation(val_share: float, indices: np.ndarray, **kwargs: Any) -> def stratified_holdout_validation(val_share: float, indices: np.ndarray, **kwargs: Any) \ -> Tuple[np.ndarray, np.ndarray]: - train, val = train_test_split(indices, test_size=val_share, shuffle=False, stratify=kwargs["stratify"]) + train, val = train_test_split(indices, test_size=val_share, shuffle=True, stratify=kwargs["stratify"]) return train, val diff --git a/autoPyTorch/utils/hyperparameter_search_space_update.py b/autoPyTorch/utils/hyperparameter_search_space_update.py index 3f2937686..e2ef1c85f 100644 --- a/autoPyTorch/utils/hyperparameter_search_space_update.py +++ b/autoPyTorch/utils/hyperparameter_search_space_update.py @@ -6,7 +6,25 @@ from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -class HyperparameterSearchSpaceUpdate(): +class HyperparameterSearchSpaceUpdate: + """ + Allows specifying update to the search space of a + particular hyperparameter. + + Args: + node_name (str): + The name of the node in the pipeline + hyperparameter (str): + The name of the hyperparameter + value_range (Union[List, Tuple]): + In case of categorical hyperparameter, defines the new categorical choices. 
+ In case of numerical hyperparameter, defines the new range + in the form of (LOWER, UPPER) + default_value (Union[int, float, str]): + New default value for the hyperparameter + log (bool) (default=False): + In case of numerical hyperparameters, whether to sample on a log scale + """ def __init__(self, node_name: str, hyperparameter: str, value_range: Union[List, Tuple], default_value: Union[int, float, str], log: bool = False) -> None: self.node_name = node_name @@ -16,6 +34,15 @@ def __init__(self, node_name: str, hyperparameter: str, value_range: Union[List, self.default_value = default_value def apply(self, pipeline: List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]) -> None: + """ + Applies the update to the appropriate hyperparameter of the pipeline + Args: + pipeline (List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]): + The named steps of the current autoPyTorch pipeline + + Returns: + None + """ [node[1]._apply_search_space_update(name=self.hyperparameter, new_value_range=self.value_range, log=self.log, @@ -29,16 +56,47 @@ def __str__(self) -> str: (" log" if self.log else "")) -class HyperparameterSearchSpaceUpdates(): +class HyperparameterSearchSpaceUpdates: + """ Contains a collection of HyperparameterSearchSpaceUpdate """ def __init__(self, updates: Optional[List[HyperparameterSearchSpaceUpdate]] = None) -> None: self.updates = updates if updates is not None else [] def apply(self, pipeline: List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]) -> None: + """ + Iteratively applies updates to the pipeline + + Args: + pipeline (List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]): + The named steps of the current autoPyTorch pipeline + + Returns: + None + """ for update in self.updates: update.apply(pipeline) def append(self, node_name: str, hyperparameter: str, value_range: Union[List, Tuple], default_value: Union[int, float, str], log: bool = False) -> None: + """ + Add a new update + + Args: 
node_name (str): + The name of the node in the pipeline + hyperparameter (str): + The name of the hyperparameter + value_range (Union[List, Tuple]): + In case of categorical hyperparameter, defines the new categorical choices. + In case of numerical hyperparameter, defines the new range + in the form of (LOWER, UPPER) + default_value (Union[int, float, str]): + New default value for the hyperparameter + log (bool) (default=False): + In case of numerical hyperparameters, whether to sample on a log scale + + Returns: + None + """ self.updates.append(HyperparameterSearchSpaceUpdate(node_name=node_name, hyperparameter=hyperparameter, value_range=value_range, @@ -46,13 +104,21 @@ def append(self, node_name: str, hyperparameter: str, value_range: Union[List, T log=log)) def save_as_file(self, path: str) -> None: + """ + Save the updates as a file to reuse later + + Args: + path (str): path of the file + + Returns: + None + """ with open(path, "w") as f: - with open(path, "w") as f: - for update in self.updates: - print(update.node_name, update.hyperparameter, # noqa: T001 - str(update.value_range), "'{}'".format(update.default_value) - if isinstance(update.default_value, str) else update.default_value, - (" log" if update.log else ""), file=f) + for update in self.updates: + print(update.node_name, update.hyperparameter, # noqa: T001 + str(update.value_range), "'{}'".format(update.default_value) + if isinstance(update.default_value, str) else update.default_value, + (" log" if update.log else ""), file=f) def parse_hyperparameter_search_space_updates(updates_file: Optional[str] diff --git a/docs/conf.py b/docs/conf.py index 3bde1c842..1cbe6fe56 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -68,9 +68,9 @@ sphinx_gallery_conf = { # path to the examples - 'examples_dirs': '../examples', + 'examples_dirs': ['../examples/tabular/20_basics', '../examples/tabular/40_advanced'], # path where to save gallery generated examples - 'gallery_dirs': 'examples', + 'gallery_dirs': 
['basics_tabular', 'advanced_tabular'], #TODO: fix back/forward references for the examples. #'doc_module': ('autoPyTorch'), #'reference_url': { diff --git a/examples/tabular/20_basics/README.txt b/examples/tabular/20_basics/README.txt new file mode 100644 index 000000000..a7a739910 --- /dev/null +++ b/examples/tabular/20_basics/README.txt @@ -0,0 +1,8 @@ +.. _examples_tabular_basics: + + +============================== +Basic Tabular Dataset Examples +============================== + +Basic examples for using *Auto-PyTorch* on tabular datasets diff --git a/examples/example_tabular_classification.py b/examples/tabular/20_basics/example_tabular_classification.py similarity index 62% rename from examples/example_tabular_classification.py rename to examples/tabular/20_basics/example_tabular_classification.py index 6263233a0..047f01842 100644 --- a/examples/example_tabular_classification.py +++ b/examples/tabular/20_basics/example_tabular_classification.py @@ -22,32 +22,10 @@ import sklearn.model_selection from autoPyTorch.api.tabular_classification import TabularClassificationTask -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -def get_search_space_updates(): - """ - Search space updates to the task can be added using HyperparameterSearchSpaceUpdates - Returns: - HyperparameterSearchSpaceUpdates - """ - updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="data_loader", - hyperparameter="batch_size", - value_range=[16, 512], - default_value=32) - updates.append(node_name="lr_scheduler", - hyperparameter="CosineAnnealingLR:T_max", - value_range=[50, 60], - default_value=55) - updates.append(node_name='network_backbone', - hyperparameter='ResNetBackbone:dropout', - value_range=[0, 0.5], - default_value=0.2) - return updates if __name__ == '__main__': + ############################################################################ # Data Loading # ============ @@ -62,16 +40,23 @@ def 
get_search_space_updates(): # Build and fit a classifier # ========================== api = TabularClassificationTask( - delete_tmp_folder_after_terminate=False, - search_space_updates=get_search_space_updates() + temporary_directory='./tmp/autoPyTorch_example_tmp_01', + output_directory='./tmp/autoPyTorch_example_out_01', + # To maintain logs of the run, set the next two as False + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True ) + + ############################################################################ + # Search for an ensemble of machine learning algorithms + # ===================================================== api.search( X_train=X_train, y_train=y_train, X_test=X_test.copy(), y_test=y_test.copy(), optimize_metric='accuracy', - total_walltime_limit=500, + total_walltime_limit=300, func_eval_time_limit=50 ) @@ -82,4 +67,5 @@ def get_search_space_updates(): y_pred = api.predict(X_test) score = api.score(y_pred, y_test) print(score) + # Print the final ensemble built by AutoPyTorch print(api.show_models()) diff --git a/examples/example_tabular_regression.py b/examples/tabular/20_basics/example_tabular_regression.py similarity index 53% rename from examples/example_tabular_regression.py rename to examples/tabular/20_basics/example_tabular_regression.py index 43c901827..7bd48155f 100644 --- a/examples/example_tabular_regression.py +++ b/examples/tabular/20_basics/example_tabular_regression.py @@ -3,17 +3,15 @@ Tabular Regression ====================== -The following example shows how to fit a sample classification model +The following example shows how to fit a sample regression model with AutoPyTorch """ import os import tempfile as tmp -import typing import warnings -from sklearn.datasets import make_regression - -from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +import sklearn.datasets +import sklearn.model_selection os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() 
os.environ['OMP_NUM_THREADS'] = '1' @@ -23,54 +21,16 @@ warnings.simplefilter(action='ignore', category=UserWarning) warnings.simplefilter(action='ignore', category=FutureWarning) -from sklearn import model_selection, preprocessing - from autoPyTorch.api.tabular_regression import TabularRegressionTask -from autoPyTorch.datasets.tabular_dataset import TabularDataset -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -def get_search_space_updates(): - """ - Search space updates to the task can be added using HyperparameterSearchSpaceUpdates - Returns: - HyperparameterSearchSpaceUpdates - """ - updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="data_loader", - hyperparameter="batch_size", - value_range=[16, 512], - default_value=32) - updates.append(node_name="lr_scheduler", - hyperparameter="CosineAnnealingLR:T_max", - value_range=[50, 60], - default_value=55) - updates.append(node_name='network_backbone', - hyperparameter='ResNetBackbone:dropout', - value_range=[0, 0.5], - default_value=0.2) - return updates if __name__ == '__main__': + ############################################################################ # Data Loading # ============ - - # Get the training data for tabular regression - # X, y = datasets.fetch_openml(name="cholesterol", return_X_y=True) - - # Use dummy data for now since there are problems with categorical columns - X, y = make_regression( - n_samples=5000, - n_features=4, - n_informative=3, - n_targets=1, - shuffle=True, - random_state=0 - ) - - X_train, X_test, y_train, y_test = model_selection.train_test_split( + X, y = sklearn.datasets.fetch_openml(name='boston', return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, y, random_state=1, @@ -89,16 +49,23 @@ def get_search_space_updates(): # Build and fit a regressor # ========================== api = TabularRegressionTask( - 
- delete_tmp_folder_after_terminate=False, - search_space_updates=get_search_space_updates() + temporary_directory='./tmp/autoPyTorch_example_tmp_02', + output_directory='./tmp/autoPyTorch_example_out_02', + # To maintain logs of the run, set the next two as False + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True ) + + ############################################################################ + # Search for an ensemble of machine learning algorithms + # ===================================================== api.search( X_train=X_train, y_train=y_train_scaled, X_test=X_test.copy(), y_test=y_test_scaled.copy(), optimize_metric='r2', - total_walltime_limit=500, + total_walltime_limit=300, func_eval_time_limit=50, traditional_per_total_budget=0 ) @@ -114,3 +81,5 @@ def get_search_space_updates(): score = api.score(y_pred, y_test) print(score) + # Print the final ensemble built by AutoPyTorch + print(api.show_models()) diff --git a/examples/tabular/40_advanced/README.txt b/examples/tabular/40_advanced/README.txt new file mode 100644 index 000000000..f3293bf16 --- /dev/null +++ b/examples/tabular/40_advanced/README.txt @@ -0,0 +1,11 @@ +.. _examples_tabular_advanced: + + +================================= +Advanced Tabular Dataset Examples +================================= + +Advanced examples for using *Auto-PyTorch* on tabular datasets. +We explain + 1. How to customise the search space + 2. How to split the data according to different resampling strategies diff --git a/examples/tabular/40_advanced/example_custom_configuration_space.py b/examples/tabular/40_advanced/example_custom_configuration_space.py new file mode 100644 index 000000000..772c268b9 --- /dev/null +++ b/examples/tabular/40_advanced/example_custom_configuration_space.py @@ -0,0 +1,132 @@ +""" +====================================================== +Tabular Classification with Custom Configuration Space +====================================================== + +The following example shows how to adjust the configuration space of +the search. 
Currently, there are two changes that can be made to the space:- +1. Adjust individual hyperparameters in the pipeline +2. Include or exclude components: + a) include: Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + b) exclude: Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +def get_search_space_updates(): + """ + Search space updates to the task can be added using HyperparameterSearchSpaceUpdates + Returns: + HyperparameterSearchSpaceUpdates + """ + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="data_loader", + hyperparameter="batch_size", + value_range=[16, 512], + default_value=32) + updates.append(node_name="lr_scheduler", + hyperparameter="CosineAnnealingLR:T_max", + value_range=[50, 60], + default_value=55) + updates.append(node_name='network_backbone', + hyperparameter='ResNetBackbone:dropout', + value_range=[0, 0.5], + default_value=0.2) + return updates + + +if __name__ == '__main__': + + ############################################################################ + # Data Loading + # ============ + X, y = 
sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, + ) + + ############################################################################ + # Build and fit a classifier with include components + # ================================================== + api = TabularClassificationTask( + search_space_updates=get_search_space_updates(), + include_components={'network_backbone': ['MLPBackbone', 'ResNetBackbone'], + 'encoder': ['OneHotEncoder']} + ) + + ############################################################################ + # Search for an ensemble of machine learning algorithms + # ===================================================== + api.search( + X_train=X_train.copy(), + y_train=y_train.copy(), + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=300, + func_eval_time_limit=50 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + print(api.show_models()) + + ############################################################################ + # Build and fit a classifier with exclude components + # ================================================== + api = TabularClassificationTask( + search_space_updates=get_search_space_updates(), + exclude_components={'network_backbone': ['MLPBackbone'], + 'encoder': ['OneHotEncoder']} + ) + + ############################################################################ + # Search for an ensemble of machine learning algorithms + # ===================================================== + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + 
+ total_walltime_limit=300, + func_eval_time_limit=50 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + print(api.show_models()) diff --git a/examples/tabular/40_advanced/example_resampling_strategy.py b/examples/tabular/40_advanced/example_resampling_strategy.py new file mode 100644 index 000000000..5217afab2 --- /dev/null +++ b/examples/tabular/40_advanced/example_resampling_strategy.py @@ -0,0 +1,159 @@ +""" +========================================================= +Tabular Classification with different resampling strategy +========================================================= + +The following example shows how to fit a sample classification model +with different resampling strategies in AutoPyTorch. +By default, AutoPyTorch uses Holdout Validation with +a 67% train size split. +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes + + +if __name__ == '__main__': + + ############################################################################ + # Data Loading + # ============ + X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, + ) + + ############################################################################ + # Build and fit a 
classifier with default resampling strategy + # =========================================================== + api = TabularClassificationTask( + temporary_directory='./tmp/autoPyTorch_example_tmp_03', + output_directory='./tmp/autoPyTorch_example_out_03', + # To maintain logs of the run, set the next two as False + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, + # 'HoldoutValTypes.holdout_validation' with 'val_share': 0.33 + # is the default argument setting for TabularClassificationTask. + # It is explicitly specified in this example for demonstrational + # purpose. + resampling_strategy=HoldoutValTypes.holdout_validation, + resampling_strategy_args={'val_share': 0.33} + ) + + ############################################################################ + # Search for an ensemble of machine learning algorithms + # ===================================================== + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=30 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + # Print the final ensemble built by AutoPyTorch + print(api.show_models()) + + ############################################################################ + + ############################################################################ + # Build and fit a classifier with Cross validation resampling strategy + # ==================================================================== + api = TabularClassificationTask( + temporary_directory='./tmp/autoPyTorch_example_tmp_04', + output_directory='./tmp/autoPyTorch_example_out_04', + # To maintain logs of the run, set the next two as False + 
delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, + resampling_strategy=CrossValTypes.k_fold_cross_validation, + resampling_strategy_args={'num_splits': 3} + ) + + ############################################################################ + # Search for an ensemble of machine learning algorithms + # ===================================================== + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=30 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + # Print the final ensemble built by AutoPyTorch + print(api.show_models()) + + ############################################################################ + + ############################################################################ + # Build and fit a classifier with Stratified resampling strategy + # ============================================================== + api = TabularClassificationTask( + temporary_directory='./tmp/autoPyTorch_example_tmp_05', + output_directory='./tmp/autoPyTorch_example_out_05', + # To maintain logs of the run, set the next two as False + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, + # For demonstration purposes, we use + # Stratified hold out validation. However, + # one can also use CrossValTypes.stratified_k_fold_cross_validation. 
+ resampling_strategy=HoldoutValTypes.stratified_holdout_validation, + resampling_strategy_args={'val_share': 0.33} + ) + + ############################################################################ + # Search for an ensemble of machine learning algorithms + # ===================================================== + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=30 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + # Print the final ensemble built by AutoPyTorch + print(api.show_models())