diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index dd26ebf99..6e52f0d6a 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -54,7 +54,9 @@ setup_logger, start_log_server, ) +from autoPyTorch.utils.parallel import preload_modules from autoPyTorch.utils.pipeline import get_configuration_space, get_dataset_requirements +from autoPyTorch.utils.single_thread_client import SingleThreadedClient from autoPyTorch.utils.stopwatch import StopWatch @@ -190,7 +192,16 @@ def __init__( self.stop_logging_server = None # type: Optional[multiprocessing.synchronize.Event] + # Single core, local runs should use fork + # to prevent the __main__ requirements in + # examples. Nevertheless, multi-process runs + # have spawn as requirement to reduce the + # possibility of a deadlock self._dask_client = None + self._multiprocessing_context = 'forkserver' + if self.n_jobs == 1: + self._multiprocessing_context = 'fork' + self._dask_client = SingleThreadedClient() self.search_space_updates = search_space_updates if search_space_updates is not None: @@ -300,7 +311,8 @@ def _get_logger(self, name: str) -> PicklableClientLogger: # under the above logging configuration setting # We need to specify the logger_name so that received records # are treated under the logger_name ROOT logger setting - context = multiprocessing.get_context('spawn') + context = multiprocessing.get_context(self._multiprocessing_context) + preload_modules(context) self.stop_logging_server = context.Event() port = context.Value('l') # be safe by using a long port.value = -1 @@ -505,6 +517,7 @@ def _do_dummy_prediction(self) -> None: stats = Stats(scenario_mock) stats.start_timing() ta = ExecuteTaFuncWithQueue( + pynisher_context=self._multiprocessing_context, backend=self._backend, seed=self.seed, metric=self._metric, @@ -599,6 +612,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: stats = Stats(scenario_mock) stats.start_timing() ta = ExecuteTaFuncWithQueue( + pynisher_context=self._multiprocessing_context, backend=self._backend, seed=self.seed, metric=self._metric, @@ -929,6 +943,7 @@ def _search( random_state=self.seed, precision=precision, logger_port=self._logger_port, + pynisher_context=self._multiprocessing_context, ) self._stopwatch.stop_task(ensemble_task_name) @@ -969,6 +984,7 @@ def _search( start_num_run=self._backend.get_next_num_run(peek=True), search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, + pynisher_context=self._multiprocessing_context, ) try: run_history, self.trajectory, budget_type = \ @@ -1299,5 +1315,6 @@ def _print_debug_info_to_log(self) -> None: self._logger.debug(' System: %s', platform.system()) self._logger.debug(' Machine: %s', platform.machine()) self._logger.debug(' Platform: %s', platform.platform()) + self._logger.debug(' multiprocessing_context: %s', str(self._multiprocessing_context)) for key, value in vars(self).items(): self._logger.debug(f"\t{key}->{value}") diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 8be96a339..fc9f0d054 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -36,6 +36,7 @@ from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger +from autoPyTorch.utils.parallel import 
preload_modules Y_ENSEMBLE = 0 Y_TEST = 1 @@ -64,6 +65,7 @@ def __init__( ensemble_memory_limit: Optional[int], random_state: int, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + pynisher_context: str = 'fork', ): """ SMAC callback to handle ensemble building Args: @@ -111,6 +113,8 @@ def __init__( read at most n new prediction files in each iteration logger_port: int port in where to publish a msg + pynisher_context: str + The multiprocessing context for pynisher. One of spawn/fork/forkserver. Returns: List[Tuple[int, float, float, float]]: @@ -135,6 +139,7 @@ def __init__( self.ensemble_memory_limit = ensemble_memory_limit self.random_state = random_state self.logger_port = logger_port + self.pynisher_context = pynisher_context # Store something similar to SMAC's runhistory self.history = [] # type: List[Dict[str, float]] @@ -160,7 +165,6 @@ def __call__( def build_ensemble( self, dask_client: dask.distributed.Client, - pynisher_context: str = 'spawn', unit_test: bool = False ) -> None: @@ -236,7 +240,7 @@ def build_ensemble( iteration=self.iteration, return_predictions=False, priority=100, - pynisher_context=pynisher_context, + pynisher_context=self.pynisher_context, logger_port=self.logger_port, unit_test=unit_test, )) @@ -585,11 +589,11 @@ def __init__( def run( self, iteration: int, + pynisher_context: str, time_left: Optional[float] = None, end_at: Optional[float] = None, time_buffer: int = 5, return_predictions: bool = False, - pynisher_context: str = 'spawn', # only change for unit testing! ) -> Tuple[ List[Dict[str, float]], int, @@ -655,6 +659,7 @@ def run( if wall_time_in_s < 1: break context = multiprocessing.get_context(pynisher_context) + preload_modules(context) safe_ensemble_script = pynisher.enforce_limits( wall_time_in_s=wall_time_in_s, diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 9c3ec2635..32e869ba7 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -29,6 +29,7 @@ from autoPyTorch.utils.common import replace_string_bool_to_bool from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger +from autoPyTorch.utils.parallel import preload_modules def fit_predict_try_except_decorator( @@ -92,29 +93,29 @@ class ExecuteTaFuncWithQueue(AbstractTAFunc): """ def __init__( - self, - backend: Backend, - seed: int, - metric: autoPyTorchMetric, - cost_for_crash: float, - abort_on_first_run_crash: bool, - pipeline_config: typing.Optional[typing.Dict[str, typing.Any]] = None, - initial_num_run: int = 1, - stats: typing.Optional[Stats] = None, - run_obj: str = 'quality', - par_factor: int = 1, - output_y_hat_optimization: bool = True, - include: typing.Optional[typing.Dict[str, typing.Any]] = None, - exclude: typing.Optional[typing.Dict[str, typing.Any]] = None, - memory_limit: typing.Optional[int] = None, - disable_file_output: bool = False, - init_params: typing.Dict[str, typing.Any] = None, - budget_type: str = None, - ta: typing.Optional[typing.Callable] = None, - logger_port: int = None, - all_supported_metrics: bool = True, - pynisher_context: str = 'spawn', - search_space_updates: typing.Optional[HyperparameterSearchSpaceUpdates] = None + self, + backend: Backend, + seed: int, + metric: autoPyTorchMetric, + cost_for_crash: float, + abort_on_first_run_crash: bool, + pynisher_context: str, + pipeline_config: typing.Optional[typing.Dict[str, typing.Any]] = None, + 
initial_num_run: int = 1, + stats: typing.Optional[Stats] = None, + run_obj: str = 'quality', + par_factor: int = 1, + output_y_hat_optimization: bool = True, + include: typing.Optional[typing.Dict[str, typing.Any]] = None, + exclude: typing.Optional[typing.Dict[str, typing.Any]] = None, + memory_limit: typing.Optional[int] = None, + disable_file_output: bool = False, + init_params: typing.Dict[str, typing.Any] = None, + budget_type: str = None, + ta: typing.Optional[typing.Callable] = None, + logger_port: int = None, + all_supported_metrics: bool = True, + search_space_updates: typing.Optional[HyperparameterSearchSpaceUpdates] = None ): eval_function = autoPyTorch.evaluation.train_evaluator.eval_function @@ -249,6 +250,7 @@ def run( ) -> typing.Tuple[StatusType, float, float, typing.Dict[str, typing.Any]]: context = multiprocessing.get_context(self.pynisher_context) + preload_modules(context) queue: multiprocessing.queues.Queue = context.Queue() if not (instance_specific is None or instance_specific == '0'): diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 2ae894e8b..d1cd7d55d 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -109,7 +109,8 @@ def __init__(self, ensemble_callback: typing.Optional[EnsembleBuilderManager] = None, logger_port: typing.Optional[int] = None, search_space_updates: typing.Optional[HyperparameterSearchSpaceUpdates] = None, - portfolio_selection: typing.Optional[str] = None + portfolio_selection: typing.Optional[str] = None, + pynisher_context: str = 'spawn', ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -156,6 +157,8 @@ def __init__(self, Additional arguments to the smac scenario get_smac_object_callback (typing.Optional[typing.Callable]): Allows to create a user specified SMAC object + pynisher_context (str): + A string indicating the multiprocessing context to use ensemble_callback (typing.Optional[EnsembleBuilderManager]): A callback used in this scenario to start ensemble building subtasks portfolio_selection (str), (default=None): @@ -204,6 +207,7 @@ def __init__(self, self.disable_file_output = disable_file_output self.smac_scenario_args = smac_scenario_args self.get_smac_object_callback = get_smac_object_callback + self.pynisher_context = pynisher_context self.ensemble_callback = ensemble_callback @@ -274,7 +278,8 @@ def run_smbo(self, func: typing.Optional[typing.Callable] = None logger_port=self.logger_port, all_supported_metrics=self.all_supported_metrics, pipeline_config=self.pipeline_config, - search_space_updates=self.search_space_updates + search_space_updates=self.search_space_updates, + pynisher_context=self.pynisher_context, ) ta = ExecuteTaFuncWithQueue self.logger.info("Created TA") diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/__init__.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/__init__.py index e69de29bb..33dd0cd32 100644 --- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/__init__.py @@ -0,0 +1,106 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + 
ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import BaseNormalizer + + +normalise_directory = os.path.split(__file__)[0] +_normalizers = find_components(__package__, + normalise_directory, + BaseNormalizer) + +_addons = ThirdPartyComponents(BaseNormalizer) + + +def add_normalizer(normalizer: BaseNormalizer) -> None: + _addons.add_component(normalizer) + + +class NormalizerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing normalizer component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available normalizer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseNormalizer components available + as choices for encoding the categorical columns + """ + components = OrderedDict() + components.update(_normalizers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_preprocessors = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_preprocessors) == 0: + raise ValueError("no image normalizers found, please add an image normalizer") + + if default is None: + defaults = ['ImageNormalizer', 'NoNormalizer'] + for default_ in defaults: + if default_ in available_preprocessors: + if include is not None and default_ not in include: + continue + if exclude is not None and default_ in exclude: + continue + default = default_ + break + + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_preprocessors): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_preprocessors, + choice_hyperparameter.value_range)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_preprocessors.keys()), + default_value=default) + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of preprocessor choices + for name in preprocessor.choices: + preprocessor_configuration_space = available_preprocessors[name].\ + get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/base_normalizer_choice.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/base_normalizer_choice.py deleted file mode 100644 index acc9f5b64..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/base_normalizer_choice.py 
+++ /dev/null @@ -1,106 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import BaseNormalizer - - -normalise_directory = os.path.split(__file__)[0] -_normalizers = find_components(__package__, - normalise_directory, - BaseNormalizer) - -_addons = ThirdPartyComponents(BaseNormalizer) - - -def add_normalizer(normalizer: BaseNormalizer) -> None: - _addons.add_component(normalizer) - - -class NormalizerChoice(autoPyTorchChoice): - """ - Allows for dynamically choosing encoding component at runtime - """ - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available normalizer components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseNormalise components available - as choices for encoding the categorical columns - """ - components = OrderedDict() - components.update(_normalizers) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, Any]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None) -> ConfigurationSpace: - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = dict() - - dataset_properties = {**self.dataset_properties, **dataset_properties} - - available_preprocessors = self.get_available_components(dataset_properties=dataset_properties, - include=include, - exclude=exclude) - - if len(available_preprocessors) == 0: - raise ValueError("no image normalizers found, please add an image normalizer") - - if default is None: - defaults = ['ImageNormalizer', 'NoNormalizer'] - for default_ in defaults: - if default_ in available_preprocessors: - if include is not None and default_ not in include: - continue - if exclude is not None and default_ in exclude: - continue - default = default_ - break - - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_preprocessors): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_preprocessors, - choice_hyperparameter.value_range)) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_preprocessors.keys()), - default_value=default) - cs.add_hyperparameter(preprocessor) - - # add only child hyperparameters of early_preprocessor choices - for name in preprocessor.choices: - preprocessor_configuration_space = available_preprocessors[name].\ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs 
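
The base_task.py hunks above replace the hard-coded 'spawn' start method with a per-instance _multiprocessing_context: single-job local runs fall back to 'fork' (so the examples do not need a __main__ guard) and use a SingleThreadedClient in place of a Dask cluster, while multi-process runs use 'forkserver', and preload_modules(context) is called before the log-server and pynisher subprocesses are created. A minimal sketch of that pattern follows; the preload_modules body, the helper name get_preloaded_context and the package list are illustrative assumptions, not the exact helper added in autoPyTorch/utils/parallel.py, and fork/forkserver are POSIX-only start methods.

import multiprocessing
import sys
from multiprocessing.context import BaseContext


def preload_modules(context: BaseContext) -> None:
    # Hint the forkserver to import heavy, already-loaded packages once,
    # instead of re-importing them in every subprocess. A plain 'fork'
    # context never consults this list, so the call is harmless there.
    preload = [
        name for name in sys.modules.keys()
        if name.split('.')[0] in ('numpy', 'scipy', 'sklearn', 'torch', 'smac')
        and 'logging' not in name
    ]
    context.set_forkserver_preload(preload)


def get_preloaded_context(n_jobs: int) -> BaseContext:
    # Hypothetical helper mirroring the selection logic in BaseTask.__init__:
    # 'fork' for single-core runs, 'forkserver' for multi-process runs.
    name = 'fork' if n_jobs == 1 else 'forkserver'
    context = multiprocessing.get_context(name)
    preload_modules(context)
    return context

The chosen string is then threaded through as pynisher_context to ExecuteTaFuncWithQueue, EnsembleBuilderManager and AutoMLSMBO, so evaluation, ensemble-building and logging subprocesses all share one consistent start method.
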
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py index e69de29bb..f7399005a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py @@ -0,0 +1,137 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder + + +encoding_directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + encoding_directory, + BaseEncoder) +_addons = ThirdPartyComponents(BaseEncoder) + + +def add_encoder(encoder: BaseEncoder) -> None: + _addons.add_component(encoder) + + +class EncoderChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing encoding component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available encoder components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseEncoder components available + as choices for encoding the categorical columns + """ + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_preprocessors = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_preprocessors) == 0: + raise ValueError("no encoders found, please add a encoder") + + if default is None: + defaults = ['OneHotEncoder', 'NoEncoder'] + for default_ in defaults: + if default_ in available_preprocessors: + if include is not None and default_ not in include: + continue + if exclude is not None and default_ in exclude: + continue + default = default_ + break + + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_preprocessors): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_preprocessors, + choice_hyperparameter.value_range)) + if len(dataset_properties['categorical_columns']) == 0: + assert len(choice_hyperparameter.value_range) == 1 + assert 'NoEncoder' in choice_hyperparameter.value_range, \ + "Provided {} in choices, however, the dataset " \ + "is incompatible with it".format(choice_hyperparameter.value_range) + + preprocessor = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + # add only no encoder to 
choice hyperparameters in case the dataset is only numerical + if len(dataset_properties['categorical_columns']) == 0: + default = 'NoEncoder' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, the dataset " + "is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoEncoder'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_preprocessors.keys()), + default_value=default) + + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of preprocessor choices + for name in preprocessor.choices: + preprocessor_configuration_space = available_preprocessors[name].\ + get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. + Args: + dataset_properties: + + """ + super()._check_dataset_properties(dataset_properties) + assert 'numerical_columns' in dataset_properties.keys(), \ + "Dataset properties must contain information about numerical columns" + assert 'categorical_columns' in dataset_properties.keys(), \ + "Dataset properties must contain information about categorical columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py deleted file mode 100644 index 7ddbf8eaf..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py +++ /dev/null @@ -1,137 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder - - -encoding_directory = os.path.split(__file__)[0] -_encoders = find_components(__package__, - encoding_directory, - BaseEncoder) -_addons = ThirdPartyComponents(BaseEncoder) - - -def add_encoder(encoder: BaseEncoder) -> None: - _addons.add_component(encoder) - - -class EncoderChoice(autoPyTorchChoice): - """ - Allows for dynamically choosing encoding component at runtime - """ - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available encoder components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseEncoder components available - as choices for encoding the categorical columns - """ - components = OrderedDict() - components.update(_encoders) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, Any]] = None, - default: Optional[str] = None, - 
include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None) -> ConfigurationSpace: - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = dict() - - dataset_properties = {**self.dataset_properties, **dataset_properties} - - available_preprocessors = self.get_available_components(dataset_properties=dataset_properties, - include=include, - exclude=exclude) - - if len(available_preprocessors) == 0: - raise ValueError("no encoders found, please add a encoder") - - if default is None: - defaults = ['OneHotEncoder', 'NoEncoder'] - for default_ in defaults: - if default_ in available_preprocessors: - if include is not None and default_ not in include: - continue - if exclude is not None and default_ in exclude: - continue - default = default_ - break - - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_preprocessors): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_preprocessors, - choice_hyperparameter.value_range)) - if len(dataset_properties['categorical_columns']) == 0: - assert len(choice_hyperparameter.value_range) == 1 - assert 'NoEncoder' in choice_hyperparameter.value_range, \ - "Provided {} in choices, however, the dataset " \ - "is incompatible with it".format(choice_hyperparameter.value_range) - - preprocessor = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - # add only no encoder to choice hyperparameters in case the dataset is only numerical - if len(dataset_properties['categorical_columns']) == 0: - default = 'NoEncoder' - if include is not None and default not in include: - raise ValueError("Provided {} in include, however, the dataset " - "is incompatible with it".format(include)) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - ['NoEncoder'], - default_value=default) - else: - preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_preprocessors.keys()), - default_value=default) - - cs.add_hyperparameter(preprocessor) - - # add only child hyperparameters of early_preprocessor choices - for name in preprocessor.choices: - preprocessor_configuration_space = available_preprocessors[name].\ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, preprocessor_configuration_space, - parent_hyperparameter=parent_hyperparameter) - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs - - def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: - """ - A mechanism in code to ensure the correctness of the fit dictionary - It recursively makes sure that the children and parent level requirements - are honored before fit. 
- Args: - dataset_properties: - - """ - super()._check_dataset_properties(dataset_properties) - assert 'numerical_columns' in dataset_properties.keys(), \ - "Dataset properties must contain information about numerical columns" - assert 'categorical_columns' in dataset_properties.keys(), \ - "Dataset properties must contain information about categorical columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py index e69de29bb..be678da94 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py @@ -0,0 +1,132 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent + +preprocessing_directory = os.path.split(__file__)[0] +_preprocessors = find_components(__package__, + preprocessing_directory, + autoPyTorchFeaturePreprocessingComponent) +_addons = ThirdPartyComponents(autoPyTorchFeaturePreprocessingComponent) + + +def add_feature_preprocessor(feature_preprocessor: autoPyTorchFeaturePreprocessingComponent) -> None: + _addons.add_component(feature_preprocessor) + + +class FeatureProprocessorChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing feature_preprocessor component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available feature_preprocessor components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all feature preprocessor components available + as choices for encoding the categorical columns + """ + components: Dict = OrderedDict() + components.update(_preprocessors) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_ = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_) == 0: + raise ValueError("no feature preprocessors found, please add a feature preprocessor") + + if default is None: + defaults = ['NoFeaturePreprocessor', + 'FastICA', + 'KernelPCA', + 'RandomKitchenSinks', + 'Nystroem', + 'PolynomialFeatures', + 'PowerTransformer', + 'TruncatedSVD', + ] + for default_ in defaults: + if default_ in available_: + if include is not None and default_ not in include: + continue + if exclude is not None and default_ in exclude: + continue + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + 
choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_, + choice_hyperparameter.value_range)) + if len(dataset_properties['numerical_columns']) == 0: + assert len(choice_hyperparameter.value_range) == 1 + assert 'NoFeaturePreprocessor' in choice_hyperparameter.value_range, \ + "Provided {} in choices, however, the dataset " \ + "is incompatible with it".format(choice_hyperparameter.value_range) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + # add only no feature preprocessor to choice hyperparameters in case the dataset is only categorical + if len(dataset_properties['numerical_columns']) == 0: + default = 'NoFeaturePreprocessor' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, " + "the dataset is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoFeaturePreprocessor'], + default_value=default) + else: + # Truncated SVD requires n_features > n_components + if len(dataset_properties['numerical_columns']) == 1: + del available_['TruncatedSVD'] + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_.keys()), + default_value=default) + + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of preprocessor choices + for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, config_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor_choice.py deleted file mode 100644 index 43a1e1a66..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor_choice.py +++ /dev/null @@ -1,132 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ - base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent - -preprocessing_directory = os.path.split(__file__)[0] -_preprocessors = find_components(__package__, - preprocessing_directory, - autoPyTorchFeaturePreprocessingComponent) -_addons = ThirdPartyComponents(autoPyTorchFeaturePreprocessingComponent) - - -def add_feature_preprocessor(feature_preprocessor: autoPyTorchFeaturePreprocessingComponent) -> None: - _addons.add_component(feature_preprocessor) - - -class FeatureProprocessorChoice(autoPyTorchChoice): - """ - Allows for dynamically choosing feature_preprocessor component at runtime - """ - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available feature_preprocessor components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all feature preprocessor components available - as choices for encoding the categorical columns - """ - components: Dict = OrderedDict() - components.update(_preprocessors) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, Any]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None) -> ConfigurationSpace: - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = dict() - - dataset_properties = {**self.dataset_properties, **dataset_properties} - - available_ = self.get_available_components(dataset_properties=dataset_properties, - include=include, - exclude=exclude) - - if len(available_) == 0: - raise ValueError("no feature preprocessors found, please add a feature preprocessor") - - if default is None: - defaults = ['NoFeaturePreprocessor', - 'FastICA', - 'KernelPCA', - 'RandomKitchenSinks', - 'Nystroem', - 'PolynomialFeatures', - 'PowerTransformer', - 'TruncatedSVD', - ] - for default_ in defaults: - if default_ in available_: - if include is not None and default_ not in include: - continue - if exclude is not None and default_ in exclude: - continue - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_, - choice_hyperparameter.value_range)) - if len(dataset_properties['numerical_columns']) == 0: - assert len(choice_hyperparameter.value_range) == 1 - assert 'NoFeaturePreprocessor' in choice_hyperparameter.value_range, \ - "Provided {} in choices, however, the dataset " \ - "is incompatible with it".format(choice_hyperparameter.value_range) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - # add only no feature preprocessor to choice hyperparameters in case the dataset is only categorical - if len(dataset_properties['numerical_columns']) == 0: - default = 'NoFeaturePreprocessor' - if include is not None and default not in include: - raise ValueError("Provided {} in include, however, " - "the dataset is incompatible with it".format(include)) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - ['NoFeaturePreprocessor'], - default_value=default) - else: - # Truncated SVD requires n_features > n_components - if len(dataset_properties['numerical_columns']) == 1: - del available_['TruncatedSVD'] 
- preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_.keys()), - default_value=default) - - cs.add_hyperparameter(preprocessor) - - # add only child hyperparameters of early_preprocessor choices - for name in preprocessor.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, config_space, - parent_hyperparameter=parent_hyperparameter) - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py index e69de29bb..3cdd81676 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py @@ -0,0 +1,131 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler + +scaling_directory = os.path.split(__file__)[0] +_scalers = find_components(__package__, + scaling_directory, + BaseScaler) + +_addons = ThirdPartyComponents(BaseScaler) + + +def add_scaler(scaler: BaseScaler) -> None: + _addons.add_component(scaler) + + +class ScalerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing scaling component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available scaler components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseScalers components available + as choices for scaling + """ + components = OrderedDict() + components.update(_scalers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_scalers = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_scalers) == 0: + raise ValueError("no scalers found, please add a scaler") + + if default is None: + defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler'] + for default_ in defaults: + if default_ in available_scalers: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_scalers): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_scalers, + 
choice_hyperparameter.value_range)) + if len(dataset_properties['numerical_columns']) == 0: + assert len(choice_hyperparameter.value_range) == 1 + if 'NoScaler' not in choice_hyperparameter.value_range: + raise ValueError("Provided {} in choices, however, the dataset " + "is incompatible with it".format(choice_hyperparameter.value_range)) + + preprocessor = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + # add only no scaler to choice hyperparameters in case the dataset is only categorical + if len(dataset_properties['numerical_columns']) == 0: + default = 'NoScaler' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, " + "the dataset is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoScaler'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_scalers.keys()), + default_value=default) + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of preprocessor choices + for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, config_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. 
+ Args: + dataset_properties: + + """ + super()._check_dataset_properties(dataset_properties) + assert 'numerical_columns' in dataset_properties.keys() and \ + 'categorical_columns' in dataset_properties.keys(), \ + "Dataset properties must contain information about the type of columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler_choice.py deleted file mode 100644 index 7c5f22fd5..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler_choice.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler - -scaling_directory = os.path.split(__file__)[0] -_scalers = find_components(__package__, - scaling_directory, - BaseScaler) - -_addons = ThirdPartyComponents(BaseScaler) - - -def add_scaler(scaler: BaseScaler) -> None: - _addons.add_component(scaler) - - -class ScalerChoice(autoPyTorchChoice): - """ - Allows for dynamically choosing scaling component at runtime - """ - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available scaler components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseScalers components available - as choices for scaling - """ - components = OrderedDict() - components.update(_scalers) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, Any]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None) -> ConfigurationSpace: - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = dict() - - dataset_properties = {**self.dataset_properties, **dataset_properties} - - available_scalers = self.get_available_components(dataset_properties=dataset_properties, - include=include, - exclude=exclude) - - if len(available_scalers) == 0: - raise ValueError("no scalers found, please add a scaler") - - if default is None: - defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler'] - for default_ in defaults: - if default_ in available_scalers: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_scalers): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_scalers, - choice_hyperparameter.value_range)) - if len(dataset_properties['numerical_columns']) == 0: - assert len(choice_hyperparameter.value_range) == 1 - if 'NoScaler' not in choice_hyperparameter.value_range: - raise ValueError("Provided {} in choices, however, the dataset " - "is incompatible with it".format(choice_hyperparameter.value_range)) - - preprocessor = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - 
default_value=choice_hyperparameter.default_value) - else: - # add only no scaler to choice hyperparameters in case the dataset is only categorical - if len(dataset_properties['numerical_columns']) == 0: - default = 'NoScaler' - if include is not None and default not in include: - raise ValueError("Provided {} in include, however, " - "the dataset is incompatible with it".format(include)) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - ['NoScaler'], - default_value=default) - else: - preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_scalers.keys()), - default_value=default) - cs.add_hyperparameter(preprocessor) - - # add only child hyperparameters of early_preprocessor choices - for name in preprocessor.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, config_space, - parent_hyperparameter=parent_hyperparameter) - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs - - def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: - """ - A mechanism in code to ensure the correctness of the fit dictionary - It recursively makes sure that the children and parent level requirements - are honored before fit. - Args: - dataset_properties: - - """ - super()._check_dataset_properties(dataset_properties) - assert 'numerical_columns' in dataset_properties.keys() and \ - 'categorical_columns' in dataset_properties.keys(), \ - "Dataset properties must contain information about the type of columns" diff --git a/autoPyTorch/pipeline/components/setup/lr_scheduler/__init__.py b/autoPyTorch/pipeline/components/setup/lr_scheduler/__init__.py index e69de29bb..9349ab642 100644 --- a/autoPyTorch/pipeline/components/setup/lr_scheduler/__init__.py +++ b/autoPyTorch/pipeline/components/setup/lr_scheduler/__init__.py @@ -0,0 +1,187 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler import BaseLRComponent + +directory = os.path.split(__file__)[0] +_schedulers = find_components(__package__, + directory, + BaseLRComponent) +_addons = ThirdPartyComponents(BaseLRComponent) + + +def add_scheduler(scheduler: BaseLRComponent) -> None: + _addons.add_component(scheduler) + + +class SchedulerChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available scheduler components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all baseScheduler components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_schedulers) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + 
include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate schedulers + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == SchedulerChoice or hasattr(entry, 'get_components'): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # schedulers based on the dataset! + # TODO: Think if there is any case where a scheduler + # is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default scheduler to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. 
+ exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_schedulers = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_schedulers) == 0: + raise ValueError("No scheduler found") + + if default is None: + defaults = [ + 'ReduceLROnPlateau', + 'CosineAnnealingLR', + 'no_LRScheduler', + 'LambdaLR', + 'StepLR', + 'ExponentialLR', + ] + for default_ in defaults: + if default_ in available_schedulers: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_schedulers): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_schedulers, + choice_hyperparameter.value_range)) + scheduler = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + scheduler = CSH.CategoricalHyperparameter( + '__choice__', + list(available_schedulers.keys()), + default_value=default + ) + cs.add_hyperparameter(scheduler) + for name in scheduler.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_schedulers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': scheduler, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler_choice.py b/autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler_choice.py deleted file mode 100644 index 9349ab642..000000000 --- a/autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler_choice.py +++ /dev/null @@ -1,187 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler import BaseLRComponent - -directory = os.path.split(__file__)[0] -_schedulers = find_components(__package__, - directory, - BaseLRComponent) -_addons = ThirdPartyComponents(BaseLRComponent) - - -def add_scheduler(scheduler: BaseLRComponent) -> None: - _addons.add_component(scheduler) - - -class SchedulerChoice(autoPyTorchChoice): - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available scheduler components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all baseScheduler components available - as choices for 
learning rate scheduling - """ - components = OrderedDict() - components.update(_schedulers) - components.update(_addons.components) - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, str]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate schedulers - - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == SchedulerChoice or hasattr(entry, 'get_components'): - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here for - # schedulers based on the dataset! - # TODO: Think if there is any case where a scheduler - # is not recommended for a certain dataset - - components_dict[name] = entry - - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default scheduler to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. 
- exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_schedulers = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_schedulers) == 0: - raise ValueError("No scheduler found") - - if default is None: - defaults = [ - 'ReduceLROnPlateau', - 'CosineAnnealingLR', - 'no_LRScheduler', - 'LambdaLR', - 'StepLR', - 'ExponentialLR', - ] - for default_ in defaults: - if default_ in available_schedulers: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_schedulers): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_schedulers, - choice_hyperparameter.value_range)) - scheduler = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - scheduler = CSH.CategoricalHyperparameter( - '__choice__', - list(available_schedulers.keys()), - default_value=default - ) - cs.add_hyperparameter(scheduler) - for name in scheduler.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_schedulers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': scheduler, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 81fd8e5f4..ef02f3c22 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -31,6 +31,7 @@ def __init__( FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) + self.network = network self.final_activation = None def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index e69de29bb..13793c393 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -0,0 +1,196 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + 
ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import ( + NetworkBackboneComponent, +) + +directory = os.path.split(__file__)[0] +_backbones = find_components(__package__, + directory, + NetworkBackboneComponent) +_addons = ThirdPartyComponents(NetworkBackboneComponent) + + +def add_backbone(backbone: NetworkBackboneComponent) -> None: + _addons.add_component(backbone) + + +class NetworkBackboneChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_backbones) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate backbones + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): + continue + + task_type = dataset_properties['task_type'] + properties = entry.get_properties() + if 'tabular' in task_type and not properties['handles_tabular']: + continue + elif 'image' in task_type and not properties['handles_image']: + continue + elif 'time_series' in task_type and not properties['handles_time_series']: + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # backbones based on the dataset! 
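To make the filtering step explicit: each component class reports, through `get_properties()`, which task families it supports (`handles_tabular`, `handles_image`, `handles_time_series`), and `get_available_components` drops any component whose flags do not match the dataset's `task_type`, as in the loop above. A small standalone sketch of that predicate, using made-up component entries rather than the real component classes:

from collections import OrderedDict

# Illustrative stand-ins for component classes and their get_properties() output
candidates = OrderedDict([
    ('ShapedMLPBackbone', {'handles_tabular': True, 'handles_image': False, 'handles_time_series': False}),
    ('ConvNetImageBackbone', {'handles_tabular': False, 'handles_image': True, 'handles_time_series': False}),
    ('InceptionTimeBackbone', {'handles_tabular': False, 'handles_image': False, 'handles_time_series': True}),
])


def filter_by_task_type(components, task_type):
    """Keep only components whose capability flags match the dataset's task type."""
    kept = OrderedDict()
    for name, properties in components.items():
        if 'tabular' in task_type and not properties['handles_tabular']:
            continue
        if 'image' in task_type and not properties['handles_image']:
            continue
        if 'time_series' in task_type and not properties['handles_time_series']:
            continue
        kept[name] = properties
    return kept


print(list(filter_by_task_type(candidates, 'tabular_classification')))   # ['ShapedMLPBackbone']
print(list(filter_by_task_type(candidates, 'image_classification')))     # ['ConvNetImageBackbone']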
+ # TODO: Think if there is any case where a backbone + # is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default backbone to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_backbones = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_backbones) == 0: + raise ValueError("No backbone found") + + if default is None: + defaults = [ + 'ShapedMLPBackbone', + 'MLPBackbone', + 'ConvNetImageBackbone', + 'InceptionTimeBackbone', + ] + for default_ in defaults: + if default_ in available_backbones: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_backbones): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_backbones, + choice_hyperparameter.value_range)) + backbone = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + backbone = CSH.CategoricalHyperparameter( + '__choice__', + list(available_backbones.keys()), + default_value=default + ) + cs.add_hyperparameter(backbone) + for name in backbone.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_backbones[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': backbone, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone_choice.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone_choice.py deleted file mode 100644 index 13793c393..000000000 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone_choice.py +++ /dev/null @@ -1,196 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from 
autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import ( - NetworkBackboneComponent, -) - -directory = os.path.split(__file__)[0] -_backbones = find_components(__package__, - directory, - NetworkBackboneComponent) -_addons = ThirdPartyComponents(NetworkBackboneComponent) - - -def add_backbone(backbone: NetworkBackboneComponent) -> None: - _addons.add_component(backbone) - - -class NetworkBackboneChoice(autoPyTorchChoice): - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available backbone components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all basebackbone components available - as choices for learning rate scheduling - """ - components = OrderedDict() - components.update(_backbones) - components.update(_addons.components) - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, str]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate backbones - - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): - continue - - task_type = dataset_properties['task_type'] - properties = entry.get_properties() - if 'tabular' in task_type and not properties['handles_tabular']: - continue - elif 'image' in task_type and not properties['handles_image']: - continue - elif 'time_series' in task_type and not properties['handles_time_series']: - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here for - # backbones based on the dataset! 
- # TODO: Think if there is any case where a backbone - # is not recommended for a certain dataset - - components_dict[name] = entry - - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default backbone to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. - exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_backbones = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_backbones) == 0: - raise ValueError("No backbone found") - - if default is None: - defaults = [ - 'ShapedMLPBackbone', - 'MLPBackbone', - 'ConvNetImageBackbone', - 'InceptionTimeBackbone', - ] - for default_ in defaults: - if default_ in available_backbones: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_backbones): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_backbones, - choice_hyperparameter.value_range)) - backbone = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - backbone = CSH.CategoricalHyperparameter( - '__choice__', - list(available_backbones.keys()), - default_value=default - ) - cs.add_hyperparameter(backbone) - for name in backbone.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_backbones[name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - parent_hyperparameter = {'parent': backbone, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py index e69de29bb..2d634c0bb 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py @@ -0,0 +1,204 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from 
autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import ( + NetworkEmbeddingComponent, +) + +directory = os.path.split(__file__)[0] +_embeddings = find_components(__package__, + directory, + NetworkEmbeddingComponent) +_addons = ThirdPartyComponents(NetworkEmbeddingComponent) + + +def add_embedding(embedding: NetworkEmbeddingComponent) -> None: + _addons.add_component(embedding) + + +class NetworkEmbeddingChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available embedding components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all NetworkEmbeddingComponents available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_embeddings) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate embeddings + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkEmbeddingChoice or hasattr(entry, 'get_components'): + continue + + task_type = dataset_properties['task_type'] + properties = entry.get_properties() + if 'tabular' in task_type and not properties['handles_tabular']: + continue + elif 'image' in task_type and not properties['handles_image']: + continue + elif 'time_series' in task_type and not properties['handles_time_series']: + continue + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default embedding to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. 
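The method body that follows special-cases datasets without categorical features: when `dataset_properties['categorical_columns']` is empty there is nothing to embed, so the `__choice__` hyperparameter collapses to `NoEmbedding`; otherwise every available embedding is offered. A standalone sketch of that branch, assuming a `dataset_properties` dict shaped like the one used here (illustrative values only):

import ConfigSpace.hyperparameters as CSH
from ConfigSpace.configuration_space import ConfigurationSpace


def embedding_choice_space(dataset_properties, available_embedding, default='NoEmbedding'):
    """Mirror of the branch below: no categorical columns -> only NoEmbedding is offered."""
    cs = ConfigurationSpace()
    if len(dataset_properties['categorical_columns']) == 0:
        choices = ['NoEmbedding']
        default = 'NoEmbedding'
    else:
        choices = list(available_embedding)
    cs.add_hyperparameter(
        CSH.CategoricalHyperparameter('__choice__', choices, default_value=default))
    return cs


numerical_only = {'task_type': 'tabular_classification', 'categorical_columns': []}
mixed_types = {'task_type': 'tabular_classification', 'categorical_columns': [0, 3]}

print(embedding_choice_space(numerical_only, ['NoEmbedding', 'LearnedEntityEmbedding']))
print(embedding_choice_space(mixed_types, ['NoEmbedding', 'LearnedEntityEmbedding']))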
+ exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_embedding = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_embedding) == 0 and 'tabular' in dataset_properties['task_type']: + raise ValueError("No embedding found") + + if available_embedding == 0: + return cs + + if default is None: + defaults = [ + 'NoEmbedding', + 'LearnedEntityEmbedding', + ] + for default_ in defaults: + if default_ in available_embedding: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_embedding): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_embedding, + choice_hyperparameter.value_range)) + if len(dataset_properties['categorical_columns']) == 0: + assert len(choice_hyperparameter.value_range) == 1 + if 'NoEmbedding' not in choice_hyperparameter.value_range: + raise ValueError("Provided {} in choices, however, the dataset " + "is incompatible with it".format(choice_hyperparameter.value_range)) + embedding = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + if len(dataset_properties['categorical_columns']) == 0: + default = 'NoEmbedding' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, the dataset " + "is incompatible with it".format(include)) + embedding = CSH.CategoricalHyperparameter('__choice__', + ['NoEmbedding'], + default_value=default) + else: + embedding = CSH.CategoricalHyperparameter('__choice__', + list(available_embedding.keys()), + default_value=default) + + cs.add_hyperparameter(embedding) + for name in embedding.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_embedding[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': embedding, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py deleted file mode 100644 index 14a5c93d9..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py +++ /dev/null @@ -1,204 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from 
autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import ( - NetworkEmbeddingComponent, -) - -directory = os.path.split(__file__)[0] -_embeddings = find_components(__package__, - directory, - NetworkEmbeddingComponent) -_addons = ThirdPartyComponents(NetworkEmbeddingComponent) - - -def add_embedding(embedding: NetworkEmbeddingComponent) -> None: - _addons.add_component(embedding) - - -class NetworkEmbeddingChoice(autoPyTorchChoice): - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available embedding components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all baseembedding components available - as choices for learning rate scheduling - """ - components = OrderedDict() - components.update(_embeddings) - components.update(_addons.components) - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, str]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate embeddings - - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == NetworkEmbeddingChoice or hasattr(entry, 'get_components'): - continue - - task_type = dataset_properties['task_type'] - properties = entry.get_properties() - if 'tabular' in task_type and not properties['handles_tabular']: - continue - elif 'image' in task_type and not properties['handles_image']: - continue - elif 'time_series' in task_type and not properties['handles_time_series']: - continue - - components_dict[name] = entry - - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default embedding to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. 
- exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_embedding = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_embedding) == 0 and 'tabular' in dataset_properties['task_type']: - raise ValueError("No embedding found") - - if available_embedding == 0: - return cs - - if default is None: - defaults = [ - 'NoEmbedding', - 'LearnedEntityEmbedding', - ] - for default_ in defaults: - if default_ in available_embedding: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_embedding): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_embedding, - choice_hyperparameter.value_range)) - if len(dataset_properties['categorical_columns']) == 0: - assert len(choice_hyperparameter.value_range) == 1 - if 'NoEmbedding' not in choice_hyperparameter.value_range: - raise ValueError("Provided {} in choices, however, the dataset " - "is incompatible with it".format(choice_hyperparameter.value_range)) - embedding = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - if len(dataset_properties['categorical_columns']) == 0: - default = 'NoEmbedding' - if include is not None and default not in include: - raise ValueError("Provided {} in include, however, the dataset " - "is incompatible with it".format(include)) - embedding = CSH.CategoricalHyperparameter('__choice__', - ['NoEmbedding'], - default_value=default) - else: - embedding = CSH.CategoricalHyperparameter('__choice__', - list(available_embedding.keys()), - default_value=default) - - cs.add_hyperparameter(embedding) - for name in embedding.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_embedding[name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - parent_hyperparameter = {'parent': embedding, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index e69de29bb..346f5fba3 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -0,0 +1,193 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + 
autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_head.base_network_head import ( + NetworkHeadComponent, +) + +directory = os.path.split(__file__)[0] +_heads = find_components(__package__, + directory, + NetworkHeadComponent) +_addons = ThirdPartyComponents(NetworkHeadComponent) + + +def add_head(head: NetworkHeadComponent) -> None: + _addons.add_component(head) + + +class NetworkHeadChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available head components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all NetworkHeadComponents available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_heads) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate heads + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkHeadChoice or hasattr(entry, 'get_components'): + continue + + task_type = dataset_properties['task_type'] + properties = entry.get_properties() + if 'tabular' in task_type and not properties['handles_tabular']: + continue + elif 'image' in task_type and not properties['handles_image']: + continue + elif 'time_series' in task_type and not properties['handles_time_series']: + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # heads based on the dataset! + # TODO: Think if there is any case where a head + # is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default head to use + include: Optional[Dict[str, Any]]: what components to include. 
It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_heads = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_heads) == 0: + raise ValueError("No head found") + + if default is None: + defaults = [ + 'FullyConnectedHead', + 'FullyConvolutional2DHead', + ] + for default_ in defaults: + if default_ in available_heads: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_heads): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_heads, + choice_hyperparameter.value_range)) + head = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + head = CSH.CategoricalHyperparameter( + '__choice__', + list(available_heads.keys()), + default_value=default) + cs.add_hyperparameter(head) + for name in head.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_heads[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': head, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_head/base_network_head_choice.py b/autoPyTorch/pipeline/components/setup/network_head/base_network_head_choice.py deleted file mode 100644 index c03e860fc..000000000 --- a/autoPyTorch/pipeline/components/setup/network_head/base_network_head_choice.py +++ /dev/null @@ -1,193 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.network_head.base_network_head import ( - NetworkHeadComponent, -) - -directory = os.path.split(__file__)[0] -_heads = find_components(__package__, - directory, - NetworkHeadComponent) -_addons = ThirdPartyComponents(NetworkHeadComponent) - - -def add_head(head: NetworkHeadComponent) -> None: - _addons.add_component(head) - - -class NetworkHeadChoice(autoPyTorchChoice): - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available head components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all basehead components available - as choices for learning rate scheduling - """ - components = 
OrderedDict() - components.update(_heads) - components.update(_addons.components) - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, str]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate heads - - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == NetworkHeadChoice or hasattr(entry, 'get_components'): - continue - - task_type = dataset_properties['task_type'] - properties = entry.get_properties() - if 'tabular' in task_type and not properties['handles_tabular']: - continue - elif 'image' in task_type and not properties['handles_image']: - continue - elif 'time_series' in task_type and not properties['handles_time_series']: - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here for - # heads based on the dataset! - # TODO: Think if there is any case where a head - # is not recommended for a certain dataset - - components_dict[name] = entry - - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default head to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. 
- exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_heads = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_heads) == 0: - raise ValueError("No head found") - - if default is None: - defaults = [ - 'FullyConnectedHead', - 'FullyConvolutional2DHead', - ] - for default_ in defaults: - if default_ in available_heads: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_heads): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_heads, - choice_hyperparameter.value_range)) - head = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - head = CSH.CategoricalHyperparameter( - '__choice__', - list(available_heads.keys()), - default_value=default) - cs.add_hyperparameter(head) - for name in head.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_heads[name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - parent_hyperparameter = {'parent': head, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py index e69de29bb..f75f00c65 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py @@ -0,0 +1,179 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_initializer.base_network_initializer import ( + BaseNetworkInitializerComponent +) + +directory = os.path.split(__file__)[0] +_initializers = find_components(__package__, + directory, + BaseNetworkInitializerComponent) +_addons = ThirdPartyComponents(BaseNetworkInitializerComponent) + + +def add_network_initializer(initializer: BaseNetworkInitializerComponent) -> None: + _addons.add_component(initializer) + + +class NetworkInitializerChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available initializer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseNetworkInitializerComponents 
available + as choices + """ + components = OrderedDict() + components.update(_initializers) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of initializer + components + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkInitializerChoice or hasattr(entry, 'get_components'): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here based on dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default component to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. 
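The `include`/`exclude` arguments behave the same way here as in the other choice modules: `include` is an exhaustive whitelist (unknown names raise a `ValueError`), `exclude` is a blacklist, and passing both together is rejected. A self-contained sketch of that selection logic, with placeholder component names:

from collections import OrderedDict


def select_components(available, include=None, exclude=None):
    """Apply the include/exclude rules used by get_available_components."""
    if include is not None and exclude is not None:
        raise ValueError("The argument include and exclude cannot be used together.")
    if include is not None:
        for name in include:
            if name not in available:
                raise ValueError("Trying to include unknown component: %s" % name)
    selected = OrderedDict()
    for name, component in available.items():
        if include is not None and name not in include:
            continue
        if exclude is not None and name in exclude:
            continue
        selected[name] = component
    return selected


initializers = OrderedDict([('XavierInit', object()), ('KaimingInit', object())])
print(list(select_components(initializers, include=['XavierInit'])))   # ['XavierInit']
print(list(select_components(initializers, exclude=['XavierInit'])))   # ['KaimingInit']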
+ exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + initializers = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(initializers) == 0: + raise ValueError("No initializers found") + + if default is None: + defaults = ['XavierInit', + ] + for default_ in defaults: + if default_ in initializers: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(initializers): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + initializers, + choice_hyperparameter.value_range)) + initializer = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + initializer = CSH.CategoricalHyperparameter( + '__choice__', + list(initializers.keys()), + default_value=default + ) + cs.add_hyperparameter(initializer) + for name in initializer.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = initializers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': initializer, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/base_network_init_choice.py b/autoPyTorch/pipeline/components/setup/network_initializer/base_network_init_choice.py deleted file mode 100644 index cc7dfcfc6..000000000 --- a/autoPyTorch/pipeline/components/setup/network_initializer/base_network_init_choice.py +++ /dev/null @@ -1,179 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.network_initializer.base_network_initializer import ( - BaseNetworkInitializerComponent -) - -directory = os.path.split(__file__)[0] -_initializers = find_components(__package__, - directory, - BaseNetworkInitializerComponent) -_addons = ThirdPartyComponents(BaseNetworkInitializerComponent) - - -def add_network_initializer(initializer: BaseNetworkInitializerComponent) -> None: - _addons.add_component(initializer) - - -class NetworkInitializerChoice(autoPyTorchChoice): - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available initializer components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseInitializerComponent components available - as 
choices - """ - components = OrderedDict() - components.update(_initializers) - components.update(_addons.components) - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, str]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of initializer - components - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == NetworkInitializerChoice or hasattr(entry, 'get_components'): - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here based on dataset - - components_dict[name] = entry - - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default component to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. 
- exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - initializers = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(initializers) == 0: - raise ValueError("No initializers found") - - if default is None: - defaults = ['XavierInit', - ] - for default_ in defaults: - if default_ in initializers: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(initializers): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - initializers, - choice_hyperparameter.value_range)) - initializer = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - initializer = CSH.CategoricalHyperparameter( - '__choice__', - list(initializers.keys()), - default_value=default - ) - cs.add_hyperparameter(initializer) - for name in initializer.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = initializers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': initializer, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py index e69de29bb..93f61e74b 100644 --- a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py @@ -0,0 +1,181 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer import BaseOptimizerComponent + +directory = os.path.split(__file__)[0] +_optimizers = find_components(__package__, + directory, + BaseOptimizerComponent) +_addons = ThirdPartyComponents(BaseOptimizerComponent) + + +def add_optimizer(optimizer: BaseOptimizerComponent) -> None: + _addons.add_component(optimizer) + + +class OptimizerChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available optimizer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseOptimizerComponents available + as choices + """ + components = OrderedDict() + components.update(_optimizers) + components.update(_addons.components) + return components + + def 
get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of Optimizer + components + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == OptimizerChoice or hasattr(entry, 'get_components'): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here based on dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default component to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. 
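Two small helpers capture what the method below does before constructing the `__choice__` hyperparameter: a user-supplied update for `__choice__` must only request optimizers that survived filtering, and when no default is given, the first available entry of a preferred ordering is used. A standalone sketch of both steps (the preferred ordering mirrors the list used below; everything else is illustrative):

def validate_choice_update(value_range, available, owner='OptimizerChoice'):
    """Reject a '__choice__' update that requests unavailable components."""
    if not set(value_range).issubset(available):
        raise ValueError("Expected given update for {} to have choices in {} got {}".format(
            owner, sorted(available), value_range))
    return list(value_range)


def pick_default(preferences, available):
    """Return the first preferred component that is actually available, if any."""
    for candidate in preferences:
        if candidate in available:
            return candidate
    return None


available_optimizer = {'AdamOptimizer', 'AdamWOptimizer', 'SGDOptimizer', 'RMSpropOptimizer'}
print(validate_choice_update(['AdamOptimizer', 'SGDOptimizer'], available_optimizer))
print(pick_default(['AdamOptimizer', 'AdamWOptimizer', 'SGDOptimizer', 'RMSpropOptimizer'],
                   {'SGDOptimizer', 'RMSpropOptimizer'}))  # -> 'SGDOptimizer'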
+ exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_optimizer = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_optimizer) == 0: + raise ValueError("No Optimizer found") + + if default is None: + defaults = [ + 'AdamOptimizer', + 'AdamWOptimizer', + 'SGDOptimizer', + 'RMSpropOptimizer' + ] + for default_ in defaults: + if default_ in available_optimizer: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_optimizer): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_optimizer, + choice_hyperparameter.value_range)) + optimizer = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + optimizer = CSH.CategoricalHyperparameter( + '__choice__', + list(available_optimizer.keys()), + default_value=default + ) + cs.add_hyperparameter(optimizer) + for name in optimizer.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_optimizer[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': optimizer, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/optimizer/base_optimizer_choice.py b/autoPyTorch/pipeline/components/setup/optimizer/base_optimizer_choice.py deleted file mode 100644 index 93f61e74b..000000000 --- a/autoPyTorch/pipeline/components/setup/optimizer/base_optimizer_choice.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer import BaseOptimizerComponent - -directory = os.path.split(__file__)[0] -_optimizers = find_components(__package__, - directory, - BaseOptimizerComponent) -_addons = ThirdPartyComponents(BaseOptimizerComponent) - - -def add_optimizer(optimizer: BaseOptimizerComponent) -> None: - _addons.add_component(optimizer) - - -class OptimizerChoice(autoPyTorchChoice): - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available optimizer components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseOptimizerComponents available - as choices - """ - components = OrderedDict() 
- components.update(_optimizers) - components.update(_addons.components) - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, str]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of Optimizer - components - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == OptimizerChoice or hasattr(entry, 'get_components'): - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here based on dataset - - components_dict[name] = entry - - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default component to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. 
- exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_optimizer = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_optimizer) == 0: - raise ValueError("No Optimizer found") - - if default is None: - defaults = [ - 'AdamOptimizer', - 'AdamWOptimizer', - 'SGDOptimizer', - 'RMSpropOptimizer' - ] - for default_ in defaults: - if default_ in available_optimizer: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_optimizer): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_optimizer, - choice_hyperparameter.value_range)) - optimizer = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - optimizer = CSH.CategoricalHyperparameter( - '__choice__', - list(available_optimizer.keys()), - default_value=default - ) - cs.add_hyperparameter(optimizer) - for name in optimizer.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_optimizer[name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - parent_hyperparameter = {'parent': optimizer, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/__init__.py index e69de29bb..3512fa6ce 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/__init__.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/__init__.py @@ -0,0 +1,162 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.traditional_ml.base_model import BaseModelComponent + + +directory = os.path.split(__file__)[0] +_models = find_components(__package__, + directory, + BaseModelComponent) +_addons = ThirdPartyComponents(BaseModelComponent) + + +def add_model(model: BaseModelComponent) -> None: + _addons.add_component(model) + + +class ModelChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available model components + Args: + None + Returns: + Dict[str, autoPyTorchComponent]: all baseNetwork components available + as choices + """ + components = OrderedDict() + 
components.update(_models) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of Network + components + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == ModelChoice or hasattr(entry, 'get_components'): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here based on dataset + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default component to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_models = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_models) == 0: + raise ValueError("No Network found") + + if default is None: + defaults: List[Any] = [] + for default_ in defaults: + if default_ in available_models: + default = default_ + break + + model = CSH.CategoricalHyperparameter( + '__choice__', + list(available_models.keys()), + default_value=default + ) + cs.add_hyperparameter(model) + for name in available_models: + model_configuration_space = available_models[name]. 
\ + get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {'parent': model, 'value': name} + cs.add_configuration_space( + name, + model_configuration_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call predict before the object is initialized" + return self.choice.predict_proba(X) diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model_choice.py b/autoPyTorch/pipeline/components/setup/traditional_ml/base_model_choice.py deleted file mode 100644 index 3512fa6ce..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model_choice.py +++ /dev/null @@ -1,162 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.traditional_ml.base_model import BaseModelComponent - - -directory = os.path.split(__file__)[0] -_models = find_components(__package__, - directory, - BaseModelComponent) -_addons = ThirdPartyComponents(BaseModelComponent) - - -def add_model(model: BaseModelComponent) -> None: - _addons.add_component(model) - - -class ModelChoice(autoPyTorchChoice): - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available model components - Args: - None - Returns: - Dict[str, autoPyTorchComponent]: all baseNetwork components available - as choices - """ - components = OrderedDict() - components.update(_models) - components.update(_addons.components) - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, str]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of Network - components - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - 
continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == ModelChoice or hasattr(entry, 'get_components'): - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here based on dataset - components_dict[name] = entry - - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default component to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. - exclude: Optional[Dict[str, Any]]: which components to skip - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_models = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_models) == 0: - raise ValueError("No Network found") - - if default is None: - defaults: List[Any] = [] - for default_ in defaults: - if default_ in available_models: - default = default_ - break - - model = CSH.CategoricalHyperparameter( - '__choice__', - list(available_models.keys()), - default_value=default - ) - cs.add_hyperparameter(model) - for name in available_models: - model_configuration_space = available_models[name]. 
\ - get_hyperparameter_search_space(dataset_properties) - parent_hyperparameter = {'parent': model, 'value': name} - cs.add_configuration_space( - name, - model_configuration_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - assert self.choice is not None, "Cannot call predict before the object is initialized" - return self.choice.predict_proba(X) diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index e69de29bb..248d8085b 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -0,0 +1,508 @@ +import collections +import logging.handlers +import os +import tempfile +import time +from typing import Any, Dict, List, Optional, Tuple, cast + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, +) + +import numpy as np + +import torch +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils.tensorboard.writer import SummaryWriter + +from autoPyTorch.constants import STRING_TO_TASK_TYPES +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.training.losses import get_loss +from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics +from autoPyTorch.pipeline.components.training.trainer.base_trainer import ( + BaseTrainerComponent, + BudgetTracker, + RunSummary, +) +from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary +from autoPyTorch.utils.logging_ import get_named_client_logger + +trainer_directory = os.path.split(__file__)[0] +_trainers = find_components(__package__, + trainer_directory, + BaseTrainerComponent) +_addons = ThirdPartyComponents(BaseTrainerComponent) + + +def add_trainer(trainer: BaseTrainerComponent) -> None: + _addons.add_component(trainer) + + +class TrainerChoice(autoPyTorchChoice): + """This class is an interface to the PyTorch trainer. + + + To map to pipeline terminology, a choice component will implement the epoch + loop through fit, whereas the component who is chosen will dictate how a single + epoch happens, that is, how batches of data are fed and used to train the network. 
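Each of these choice classes (OptimizerChoice, ModelChoice, TrainerChoice) assembles its search space the same way: a categorical '__choice__' hyperparameter selects the component, and each component's own hyperparameters are attached as a conditional child space. Below is a minimal, self-contained sketch of that ConfigSpace pattern; the component names are an illustrative subset and the single 'lr' hyperparameter merely stands in for the real per-component subspaces.

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter

# Sketch of the choice-component pattern: '__choice__' selects the component,
# and each component's hyperparameters live in a conditional child space.
cs = ConfigurationSpace()
choice = CategoricalHyperparameter('__choice__',
                                   ['AdamOptimizer', 'SGDOptimizer'],  # illustrative subset
                                   default_value='AdamOptimizer')
cs.add_hyperparameter(choice)

for name in choice.choices:
    child = ConfigurationSpace()
    # Hypothetical hyperparameter standing in for the component's real subspace
    child.add_hyperparameter(UniformFloatHyperparameter('lr', 1e-5, 1e-1, log=True))
    # The child space is only active when '__choice__' equals this component
    cs.add_configuration_space(name, child,
                               parent_hyperparameter={'parent': choice, 'value': name})

# Sampling yields '__choice__' plus only the selected component's
# hyperparameters, e.g. 'AdamOptimizer:lr'
print(cs.sample_configuration())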
+ + """ + + def __init__(self, + dataset_properties: Dict[str, Any], + random_state: Optional[np.random.RandomState] = None + ): + + super().__init__(dataset_properties=dataset_properties, + random_state=random_state) + self.run_summary = None # type: Optional[RunSummary] + self.writer = None # type: Optional[SummaryWriter] + self._fit_requirements: Optional[List[FitRequirement]] = [ + FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), + FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), + FitRequirement( + "optimizer", (Optimizer,), user_defined=False, dataset_property=False), + FitRequirement("train_data_loader", + (torch.utils.data.DataLoader,), + user_defined=False, dataset_property=False), + FitRequirement("val_data_loader", + (torch.utils.data.DataLoader,), + user_defined=False, dataset_property=False)] + self.checkpoint_dir = None # type: Optional[str] + + def get_fit_requirements(self) -> Optional[List[FitRequirement]]: + return self._fit_requirements + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available trainer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all components available + as choices for learning rate scheduling + """ + components = collections.OrderedDict() # type: Dict[str, autoPyTorchComponent] + components.update(_trainers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default scheduler to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. 
+ exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + # Compile a list of legal trainers for this problem + available_trainers = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_trainers) == 0: + raise ValueError("No trainer found") + + if default is None: + defaults = ['StandardTrainer', + ] + for default_ in defaults: + if default_ in available_trainers: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_trainers): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_trainers, + choice_hyperparameter.value_range)) + trainer = CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + trainer = CategoricalHyperparameter( + '__choice__', + list(available_trainers.keys()), + default_value=default + ) + cs.add_hyperparameter(trainer) + for name in trainer.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_trainers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': trainer, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """The transform function calls the transform function of the + underlying model and returns the transformed array. + + Args: + X (np.ndarray): input features + + Returns: + np.ndarray: Transformed features + """ + X.update({'run_summary': self.run_summary}) + return X + + def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchComponent: + """ + Fits a component by using an input dictionary with pre-requisites + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + y (Any): not used. To comply with sklearn API + + Returns: + A instance of self + """ + # Make sure that the prerequisites are there + self.check_requirements(X, y) + + # Setup the logger + self.logger = get_named_client_logger( + name=f"{X['num_run']}_{time.time()}", + # Log to a user provided port else to the default logging port + port=X['logger_port' + ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ) + + # Call the actual fit function. + self._fit( + X=X, + y=y, + **kwargs + ) + + return cast(autoPyTorchComponent, self.choice) + + def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice': + """ + Fits a component by using an input dictionary with pre-requisites + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + y (Any): not used. 
To comply with sklearn API + + Returns: + A instance of self + """ + + # Comply with mypy + # Notice that choice here stands for the component choice framework, + # where we dynamically build the configuration space by selecting the available + # component choices. In this case, is what trainer choices are available + assert self.choice is not None + + # Setup a Logger and other logging support + # Writer is not pickable -- make sure it is not saved in self + writer = None + if 'use_tensorboard_logger' in X and X['use_tensorboard_logger']: + writer = SummaryWriter(log_dir=X['backend'].temporary_directory) + + if X["torch_num_threads"] > 0: + torch.set_num_threads(X["torch_num_threads"]) + + budget_tracker = BudgetTracker( + budget_type=X['budget_type'], + max_runtime=X['runtime'] if 'runtime' in X else None, + max_epochs=X['epochs'] if 'epochs' in X else None, + ) + + # Support additional user metrics + additional_metrics = X['additional_metrics'] if 'additional_metrics' in X else None + additional_losses = X['additional_losses'] if 'additional_losses' in X else None + self.choice.prepare( + model=X['network'], + metrics=get_metrics(dataset_properties=X['dataset_properties'], + names=additional_metrics), + criterion=get_loss(X['dataset_properties'], + name=additional_losses), + budget_tracker=budget_tracker, + optimizer=X['optimizer'], + device=get_device_from_fit_dictionary(X), + metrics_during_training=X['metrics_during_training'], + scheduler=X['lr_scheduler'], + task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], + labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] + ) + total_parameter_count, trainable_parameter_count = self.count_parameters(X['network']) + self.run_summary = RunSummary( + total_parameter_count, + trainable_parameter_count, + ) + + epoch = 1 + + while True: + + # prepare epoch + start_time = time.time() + + self.choice.on_epoch_start(X=X, epoch=epoch) + + # training + train_loss, train_metrics = self.choice.train_epoch( + train_loader=X['train_data_loader'], + epoch=epoch, + writer=writer, + ) + + val_loss, val_metrics, test_loss, test_metrics = None, {}, None, {} + if self.eval_valid_each_epoch(X): + val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer) + if 'test_data_loader' in X and X['test_data_loader']: + test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer) + + # Save training information + self.run_summary.add_performance( + epoch=epoch, + start_time=start_time, + end_time=time.time(), + train_loss=train_loss, + val_loss=val_loss, + test_loss=test_loss, + train_metrics=train_metrics, + val_metrics=val_metrics, + test_metrics=test_metrics, + ) + + # Save the weights of the best model and, if patience + # exhausted break training + if self.early_stop_handler(X): + break + + if self.choice.on_epoch_end(X=X, epoch=epoch): + break + + self.logger.debug(self.run_summary.repr_last_epoch()) + + # Reached max epoch on next iter, don't even go there + if budget_tracker.is_max_epoch_reached(epoch + 1): + break + + epoch += 1 + + if 'cuda' in X['device']: + torch.cuda.empty_cache() + + # wrap up -- add score if not evaluating every epoch + if not self.eval_valid_each_epoch(X): + val_loss, val_metrics = self.choice.evaluate(X['val_data_loader']) + if 'test_data_loader' in X and X['val_data_loader']: + test_loss, test_metrics = self.choice.evaluate(X['test_data_loader']) + self.run_summary.add_performance( + epoch=epoch, + start_time=start_time, + end_time=time.time(), + 
train_loss=train_loss, + val_loss=val_loss, + test_loss=test_loss, + train_metrics=train_metrics, + val_metrics=val_metrics, + test_metrics=test_metrics, + ) + self.save_model_for_ensemble() + + self.logger.info(f"Finished training with {self.run_summary.repr_last_epoch()}") + + # Tag as fitted + self.fitted_ = True + + return self + + def early_stop_handler(self, X: Dict[str, Any]) -> bool: + """ + If early stopping is enabled, this procedure stops the training after a + given patience + Args: + X (Dict[str, Any]): Dictionary with fitted parameters. It is a message passing + mechanism, in which during a transform, a components adds relevant information + so that further stages can be properly fitted + + Returns: + bool: If true, training should be stopped + """ + assert self.run_summary is not None + + # Allow to disable early stopping + if X['early_stopping'] is None or X['early_stopping'] < 0: + return False + + # Store the best weights seen so far: + if self.checkpoint_dir is None: + self.checkpoint_dir = tempfile.mkdtemp(dir=X['backend'].temporary_directory) + + epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch() + + # Save the checkpoint if there is a new best epoch + best_path = os.path.join(self.checkpoint_dir, 'best.pth') + if epochs_since_best == 0: + torch.save(X['network'].state_dict(), best_path) + + if epochs_since_best > X['early_stopping']: + self.logger.debug(f" Early stopped model {X['num_run']} on epoch {self.run_summary.get_best_epoch()}") + # We will stop the training. Load the last best performing weights + X['network'].load_state_dict(torch.load(best_path)) + + # Let the tempfile module clean the temp dir + self.checkpoint_dir = None + return True + + return False + + def eval_valid_each_epoch(self, X: Dict[str, Any]) -> bool: + """ + Returns true if we are supposed to evaluate the model on every epoch, + on the validation data. Usually, we only validate the data at the end, + but in the case of early stopping, is appealing to evaluate each epoch. + Args: + X (Dict[str, Any]): Dictionary with fitted parameters. It is a message passing + mechanism, in which during a transform, a components adds relevant information + so that further stages can be properly fitted + + Returns: + bool: if True, the model is evaluated in every epoch + + """ + if 'early_stopping' in X and X['early_stopping']: + return True + + # We need to know if we should reduce the rate based on val loss + if 'ReduceLROnPlateau' in X['lr_scheduler'].__class__.__name__: + return True + + return False + + def check_requirements(self, X: Dict[str, Any], y: Any = None) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. + + Args: + X (Dict[str, Any]): Dictionary with fitted parameters. 
It is a message passing + mechanism, in which during a transform, a component adds relevant information + so that further stages can be properly fitted + """ + + # make sure the parent requirements are honored + super().check_requirements(X, y) + + # We need a working dir in which to put our data + if 'backend' not in X: + raise ValueError('Need a backend to provide the working directory, ' + "yet 'backend' was not found in the fit dictionary") + + # Whether we should evaluate metrics during training or not + if 'metrics_during_training' not in X: + raise ValueError('Missing metrics_during_training in the fit dictionary') + + # Setup Components + if 'lr_scheduler' not in X: + raise ValueError("Learning rate scheduler not found in the fit dictionary!") + + if 'network' not in X: + raise ValueError("Network not found in the fit dictionary!") + + if 'optimizer' not in X: + raise ValueError("Optimizer not found in the fit dictionary!") + + # Training Components + if 'train_data_loader' not in X: + raise ValueError("train_data_loader not found in the fit dictionary!") + + if 'val_data_loader' not in X: + raise ValueError("val_data_loader not found in the fit dictionary!") + + if 'budget_type' not in X: + raise ValueError("Budget type not found in the fit dictionary!") + else: + if 'epochs' not in X or 'runtime' not in X or 'epoch_or_time' not in X: + if X['budget_type'] in ['epochs', 'epoch_or_time'] and 'epochs' not in X: + raise ValueError("Budget type is epochs but " + "epochs was not found in the fit dictionary!") + elif X['budget_type'] in ['runtime', 'epoch_or_time'] and 'runtime' not in X: + raise ValueError("Budget type is runtime but " + "no maximum number of seconds was provided!") + else: + raise ValueError("Unsupported budget type provided: {}".format( + X['budget_type'] + )) + + if 'num_run' not in X: + raise ValueError('To fit a trainer, expected fit dictionary to have a num_run') + + for config_option in ["torch_num_threads", 'device']: + if config_option not in X: + raise ValueError("To fit a trainer, expected fit dictionary to have a {}".format( + config_option + )) + + # For early stopping, we need to know the patience + if 'early_stopping' not in X: + raise ValueError('To fit a Trainer, expected fit dictionary to have early_stopping') + + @staticmethod + def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: + """ + A method to get the total/trainable parameter count from the model + + Args: + model (torch.nn.Module): the module from which to count parameters + + Returns: + total_parameter_count: the total number of parameters of the model + trainable_parameter_count: only the parameters being optimized + """ + total_parameter_count = sum( + p.numel() for p in model.parameters()) + trainable_parameter_count = sum( + p.numel() for p in model.parameters() if p.requires_grad) + return total_parameter_count, trainable_parameter_count + + def save_model_for_ensemble(self) -> str: + raise NotImplementedError() + + def __str__(self) -> str: + """ Allow a nice understanding of what components were used """ + string = str(self.run_summary) + return string diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py deleted file mode 100755 index 248d8085b..000000000 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py +++ /dev/null @@ -1,508 +0,0 @@ -import collections -import logging.handlers -import os -import tempfile -import time -from typing 
import Any, Dict, List, Optional, Tuple, cast - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, -) - -import numpy as np - -import torch -from torch.optim import Optimizer -from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.tensorboard.writer import SummaryWriter - -from autoPyTorch.constants import STRING_TO_TASK_TYPES -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.training.losses import get_loss -from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics -from autoPyTorch.pipeline.components.training.trainer.base_trainer import ( - BaseTrainerComponent, - BudgetTracker, - RunSummary, -) -from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary -from autoPyTorch.utils.logging_ import get_named_client_logger - -trainer_directory = os.path.split(__file__)[0] -_trainers = find_components(__package__, - trainer_directory, - BaseTrainerComponent) -_addons = ThirdPartyComponents(BaseTrainerComponent) - - -def add_trainer(trainer: BaseTrainerComponent) -> None: - _addons.add_component(trainer) - - -class TrainerChoice(autoPyTorchChoice): - """This class is an interface to the PyTorch trainer. - - - To map to pipeline terminology, a choice component will implement the epoch - loop through fit, whereas the component who is chosen will dictate how a single - epoch happens, that is, how batches of data are fed and used to train the network. - - """ - - def __init__(self, - dataset_properties: Dict[str, Any], - random_state: Optional[np.random.RandomState] = None - ): - - super().__init__(dataset_properties=dataset_properties, - random_state=random_state) - self.run_summary = None # type: Optional[RunSummary] - self.writer = None # type: Optional[SummaryWriter] - self._fit_requirements: Optional[List[FitRequirement]] = [ - FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), - FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), - FitRequirement( - "optimizer", (Optimizer,), user_defined=False, dataset_property=False), - FitRequirement("train_data_loader", - (torch.utils.data.DataLoader,), - user_defined=False, dataset_property=False), - FitRequirement("val_data_loader", - (torch.utils.data.DataLoader,), - user_defined=False, dataset_property=False)] - self.checkpoint_dir = None # type: Optional[str] - - def get_fit_requirements(self) -> Optional[List[FitRequirement]]: - return self._fit_requirements - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available trainer components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all components available - as choices for learning rate scheduling - """ - components = collections.OrderedDict() # type: Dict[str, autoPyTorchComponent] - components.update(_trainers) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, str]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, str]]): Describes the 
dataset to work on - default (Optional[str]): Default scheduler to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. - exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - dataset_properties = {**self.dataset_properties, **dataset_properties} - - # Compile a list of legal trainers for this problem - available_trainers = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_trainers) == 0: - raise ValueError("No trainer found") - - if default is None: - defaults = ['StandardTrainer', - ] - for default_ in defaults: - if default_ in available_trainers: - default = default_ - break - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_trainers): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_trainers, - choice_hyperparameter.value_range)) - trainer = CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - trainer = CategoricalHyperparameter( - '__choice__', - list(available_trainers.keys()), - default_value=default - ) - cs.add_hyperparameter(trainer) - for name in trainer.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_trainers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': trainer, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs - - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """The transform function calls the transform function of the - underlying model and returns the transformed array. - - Args: - X (np.ndarray): input features - - Returns: - np.ndarray: Transformed features - """ - X.update({'run_summary': self.run_summary}) - return X - - def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchComponent: - """ - Fits a component by using an input dictionary with pre-requisites - - Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - y (Any): not used. To comply with sklearn API - - Returns: - A instance of self - """ - # Make sure that the prerequisites are there - self.check_requirements(X, y) - - # Setup the logger - self.logger = get_named_client_logger( - name=f"{X['num_run']}_{time.time()}", - # Log to a user provided port else to the default logging port - port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, - ) - - # Call the actual fit function. - self._fit( - X=X, - y=y, - **kwargs - ) - - return cast(autoPyTorchComponent, self.choice) - - def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice': - """ - Fits a component by using an input dictionary with pre-requisites - - Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - y (Any): not used. 
To comply with sklearn API - - Returns: - A instance of self - """ - - # Comply with mypy - # Notice that choice here stands for the component choice framework, - # where we dynamically build the configuration space by selecting the available - # component choices. In this case, is what trainer choices are available - assert self.choice is not None - - # Setup a Logger and other logging support - # Writer is not pickable -- make sure it is not saved in self - writer = None - if 'use_tensorboard_logger' in X and X['use_tensorboard_logger']: - writer = SummaryWriter(log_dir=X['backend'].temporary_directory) - - if X["torch_num_threads"] > 0: - torch.set_num_threads(X["torch_num_threads"]) - - budget_tracker = BudgetTracker( - budget_type=X['budget_type'], - max_runtime=X['runtime'] if 'runtime' in X else None, - max_epochs=X['epochs'] if 'epochs' in X else None, - ) - - # Support additional user metrics - additional_metrics = X['additional_metrics'] if 'additional_metrics' in X else None - additional_losses = X['additional_losses'] if 'additional_losses' in X else None - self.choice.prepare( - model=X['network'], - metrics=get_metrics(dataset_properties=X['dataset_properties'], - names=additional_metrics), - criterion=get_loss(X['dataset_properties'], - name=additional_losses), - budget_tracker=budget_tracker, - optimizer=X['optimizer'], - device=get_device_from_fit_dictionary(X), - metrics_during_training=X['metrics_during_training'], - scheduler=X['lr_scheduler'], - task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], - labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] - ) - total_parameter_count, trainable_parameter_count = self.count_parameters(X['network']) - self.run_summary = RunSummary( - total_parameter_count, - trainable_parameter_count, - ) - - epoch = 1 - - while True: - - # prepare epoch - start_time = time.time() - - self.choice.on_epoch_start(X=X, epoch=epoch) - - # training - train_loss, train_metrics = self.choice.train_epoch( - train_loader=X['train_data_loader'], - epoch=epoch, - writer=writer, - ) - - val_loss, val_metrics, test_loss, test_metrics = None, {}, None, {} - if self.eval_valid_each_epoch(X): - val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer) - if 'test_data_loader' in X and X['test_data_loader']: - test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer) - - # Save training information - self.run_summary.add_performance( - epoch=epoch, - start_time=start_time, - end_time=time.time(), - train_loss=train_loss, - val_loss=val_loss, - test_loss=test_loss, - train_metrics=train_metrics, - val_metrics=val_metrics, - test_metrics=test_metrics, - ) - - # Save the weights of the best model and, if patience - # exhausted break training - if self.early_stop_handler(X): - break - - if self.choice.on_epoch_end(X=X, epoch=epoch): - break - - self.logger.debug(self.run_summary.repr_last_epoch()) - - # Reached max epoch on next iter, don't even go there - if budget_tracker.is_max_epoch_reached(epoch + 1): - break - - epoch += 1 - - if 'cuda' in X['device']: - torch.cuda.empty_cache() - - # wrap up -- add score if not evaluating every epoch - if not self.eval_valid_each_epoch(X): - val_loss, val_metrics = self.choice.evaluate(X['val_data_loader']) - if 'test_data_loader' in X and X['val_data_loader']: - test_loss, test_metrics = self.choice.evaluate(X['test_data_loader']) - self.run_summary.add_performance( - epoch=epoch, - start_time=start_time, - end_time=time.time(), - 
train_loss=train_loss, - val_loss=val_loss, - test_loss=test_loss, - train_metrics=train_metrics, - val_metrics=val_metrics, - test_metrics=test_metrics, - ) - self.save_model_for_ensemble() - - self.logger.info(f"Finished training with {self.run_summary.repr_last_epoch()}") - - # Tag as fitted - self.fitted_ = True - - return self - - def early_stop_handler(self, X: Dict[str, Any]) -> bool: - """ - If early stopping is enabled, this procedure stops the training after a - given patience - Args: - X (Dict[str, Any]): Dictionary with fitted parameters. It is a message passing - mechanism, in which during a transform, a components adds relevant information - so that further stages can be properly fitted - - Returns: - bool: If true, training should be stopped - """ - assert self.run_summary is not None - - # Allow to disable early stopping - if X['early_stopping'] is None or X['early_stopping'] < 0: - return False - - # Store the best weights seen so far: - if self.checkpoint_dir is None: - self.checkpoint_dir = tempfile.mkdtemp(dir=X['backend'].temporary_directory) - - epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch() - - # Save the checkpoint if there is a new best epoch - best_path = os.path.join(self.checkpoint_dir, 'best.pth') - if epochs_since_best == 0: - torch.save(X['network'].state_dict(), best_path) - - if epochs_since_best > X['early_stopping']: - self.logger.debug(f" Early stopped model {X['num_run']} on epoch {self.run_summary.get_best_epoch()}") - # We will stop the training. Load the last best performing weights - X['network'].load_state_dict(torch.load(best_path)) - - # Let the tempfile module clean the temp dir - self.checkpoint_dir = None - return True - - return False - - def eval_valid_each_epoch(self, X: Dict[str, Any]) -> bool: - """ - Returns true if we are supposed to evaluate the model on every epoch, - on the validation data. Usually, we only validate the data at the end, - but in the case of early stopping, is appealing to evaluate each epoch. - Args: - X (Dict[str, Any]): Dictionary with fitted parameters. It is a message passing - mechanism, in which during a transform, a components adds relevant information - so that further stages can be properly fitted - - Returns: - bool: if True, the model is evaluated in every epoch - - """ - if 'early_stopping' in X and X['early_stopping']: - return True - - # We need to know if we should reduce the rate based on val loss - if 'ReduceLROnPlateau' in X['lr_scheduler'].__class__.__name__: - return True - - return False - - def check_requirements(self, X: Dict[str, Any], y: Any = None) -> None: - """ - A mechanism in code to ensure the correctness of the fit dictionary - It recursively makes sure that the children and parent level requirements - are honored before fit. - - Args: - X (Dict[str, Any]): Dictionary with fitted parameters. 
It is a message passing - mechanism, in which during a transform, a components adds relevant information - so that further stages can be properly fitted - """ - - # make sure the parent requirements are honored - super().check_requirements(X, y) - - # We need a working dir in where to put our data - if 'backend' not in X: - raise ValueError('Need a backend to provide the working directory, ' - "yet 'backend' was not found in the fit dictionary") - - # Whether we should evaluate metrics during training or no - if 'metrics_during_training' not in X: - raise ValueError('Missing metrics_during_training in the fit dictionary') - - # Setup Components - if 'lr_scheduler' not in X: - raise ValueError("Learning rate scheduler not found in the fit dictionary!") - - if 'network' not in X: - raise ValueError("Network not found in the fit dictionary!") - - if 'optimizer' not in X: - raise ValueError("Optimizer not found in the fit dictionary!") - - # Training Components - if 'train_data_loader' not in X: - raise ValueError("train_data_loader not found in the fit dictionary!") - - if 'val_data_loader' not in X: - raise ValueError("val_data_loader not found in the fit dictionary!") - - if 'budget_type' not in X: - raise ValueError("Budget type not found in the fit dictionary!") - else: - if 'epochs' not in X or 'runtime' not in X or 'epoch_or_time' not in X: - if X['budget_type'] in ['epochs', 'epoch_or_time'] and 'epochs' not in X: - raise ValueError("Budget type is epochs but " - "no epochs was not found in the fit dictionary!") - elif X['budget_type'] in ['runtime', 'epoch_or_time'] and 'runtime' not in X: - raise ValueError("Budget type is runtime but " - "no maximum number of seconds was provided!") - else: - raise ValueError("Unsupported budget type provided: {}".format( - X['budget_type'] - )) - - if 'num_run' not in X: - raise ValueError('To fit a trainer, expected fit dictionary to have a num_run') - - for config_option in ["torch_num_threads", 'device']: - if config_option not in X: - raise ValueError("To fit a trainer, expected fit dictionary to have a {}".format( - config_option - )) - - # For early stopping, we need to know the patience - if 'early_stopping' not in X: - raise ValueError('To fit a Trainer, expected fit dictionary to have early_stopping') - - @staticmethod - def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: - """ - A method to get the total/trainable parameter count from the model - - Args: - model (torch.nn.Module): the module from which to count parameters - - Returns: - total_parameter_count: the total number of parameters of the model - trainable_parameter_count: only the parameters being optimized - """ - total_parameter_count = sum( - p.numel() for p in model.parameters()) - trainable_parameter_count = sum( - p.numel() for p in model.parameters() if p.requires_grad) - return total_parameter_count, trainable_parameter_count - - def save_model_for_ensemble(self) -> str: - raise NotImplementedError() - - def __str__(self) -> str: - """ Allow a nice understanding of what components where used """ - string = str(self.run_summary) - return string diff --git a/autoPyTorch/pipeline/image_classification.py b/autoPyTorch/pipeline/image_classification.py index bf15d738a..d013f7027 100644 --- a/autoPyTorch/pipeline/image_classification.py +++ b/autoPyTorch/pipeline/image_classification.py @@ -8,7 +8,7 @@ from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from 
autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer_choice import ( +from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise import ( NormalizerChoice ) from autoPyTorch.pipeline.components.setup.augmentation.image.ImageAugmenter import ImageAugmenter diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index ef57a8569..fd607bf70 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -18,27 +18,24 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import ( +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ - base_feature_preprocessor_choice import FeatureProprocessorChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( + FeatureProprocessorChoice +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing -from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( - NetworkInitializerChoice -) -from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice +from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer import NetworkInitializerChoice +from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader -from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( - TrainerChoice -) +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 2650868b6..27a3ae314 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ 
b/autoPyTorch/pipeline/tabular_regression.py @@ -16,27 +16,26 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import ( +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ - base_feature_preprocessor_choice import FeatureProprocessorChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( + FeatureProprocessorChoice, +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing -from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice +from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) -from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice +from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader -from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( - TrainerChoice -) +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates diff --git a/autoPyTorch/pipeline/traditional_tabular_classification.py b/autoPyTorch/pipeline/traditional_tabular_classification.py index 49be2a1fa..5b0471e87 100644 --- a/autoPyTorch/pipeline/traditional_tabular_classification.py +++ b/autoPyTorch/pipeline/traditional_tabular_classification.py @@ -9,7 +9,7 @@ from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.setup.traditional_ml.base_model_choice import ModelChoice +from autoPyTorch.pipeline.components.setup.traditional_ml import ModelChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates diff --git 
a/autoPyTorch/utils/parallel.py b/autoPyTorch/utils/parallel.py new file mode 100644 index 000000000..0b2f9f390 --- /dev/null +++ b/autoPyTorch/utils/parallel.py @@ -0,0 +1,41 @@ +import multiprocessing +import sys + + +def preload_modules(context: multiprocessing.context.BaseContext) -> None: + """ + This function is meant to be used with the forkserver multiprocessing context. + More details about it can be found here: + https://docs.python.org/3/library/multiprocessing.html + + Forkserver is known to be slower than other contexts. We use it because it helps + reduce the probability of a deadlock. To keep it fast, we pre-load modules so that + the forked children already have the required modules available. + + We deliberately do not preload deadlock-prone modules such as logging. + + Args: + context (multiprocessing.context.BaseContext): One of the three supported multiprocessing + contexts: fork, forkserver or spawn. + """ + all_loaded_modules = sys.modules.keys() + preload = [ + loaded_module for loaded_module in all_loaded_modules + if loaded_module.split('.')[0] in ( + 'smac', + 'autoPyTorch', + 'numpy', + 'scipy', + 'pandas', + 'pynisher', + 'sklearn', + 'ConfigSpace', + 'torch', + 'torchvision', + 'tensorboard', + 'imgaug', + 'catboost', + 'lightgbm', + ) and 'logging' not in loaded_module + ] + context.set_forkserver_preload(preload) diff --git a/autoPyTorch/utils/single_thread_client.py b/autoPyTorch/utils/single_thread_client.py new file mode 100644 index 000000000..3d8455800 --- /dev/null +++ b/autoPyTorch/utils/single_thread_client.py @@ -0,0 +1,93 @@ +import typing +from pathlib import Path + +import dask.distributed + + +class DummyFuture(dask.distributed.Future): + """ + A class that mimics a distributed Future, the outcome of + performing submit on a distributed client. + """ + def __init__(self, result: typing.Any) -> None: + self._result = result # type: typing.Any + + def result(self, timeout: typing.Optional[int] = None) -> typing.Any: + return self._result + + def cancel(self) -> None: + pass + + def done(self) -> bool: + return True + + def __repr__(self) -> str: + return "DummyFuture: {}".format(self._result) + + def __del__(self) -> None: + pass + + +class SingleThreadedClient(dask.distributed.Client): + """ + A class that mocks the dask distributed Client class. + + Using dask requires a scheduler that submits jobs in a separate process. On top of + that, pynisher runs each job in yet another process. + + When using a single core, we prefer to stay in the main process and avoid this + multiprocessing overhead (that is, the LocalCluster that a regular + dask.distributed.Client would spawn). This class therefore extends the Client() class + with the capability to run a future in the same thread (without any deadlock).
+ """ + def __init__(self) -> None: + + # Raise a not implemented error if using a method from Client + implemented_methods = ['submit', 'close', 'shutdown', 'write_scheduler_file', + '_get_scheduler_info', 'nthreads'] + method_list = [func for func in dir(dask.distributed.Client) if callable( + getattr(dask.distributed.Client, func)) and not func.startswith('__')] + for method in method_list: + if method in implemented_methods: + continue + setattr(self, method, self._unsupported_method) + pass + + def _unsupported_method(self) -> None: + raise NotImplementedError() + + def submit( + self, + func: typing.Callable, + *args: typing.List, + priority: int = 0, + **kwargs: typing.Any, + ) -> typing.Any: + return DummyFuture(func(*args, **kwargs)) + + def close(self) -> None: + pass + + def shutdown(self) -> None: + pass + + def write_scheduler_file(self, scheduler_file: str) -> None: + Path(scheduler_file).touch() + return + + def _get_scheduler_info(self) -> typing.Dict: + return { + 'workers': ['127.0.0.1'], + 'type': 'Scheduler', + } + + def nthreads(self) -> typing.Dict: + return { + '127.0.0.1': 1, + } + + def __repr__(self) -> str: + return 'SingleThreadedClient()' + + def __del__(self) -> None: + pass diff --git a/examples/20_basics/example_tabular_classification.py b/examples/20_basics/example_tabular_classification.py index c0251fa90..7b1aa9995 100644 --- a/examples/20_basics/example_tabular_classification.py +++ b/examples/20_basics/example_tabular_classification.py @@ -24,50 +24,48 @@ from autoPyTorch.api.tabular_classification import TabularClassificationTask -if __name__ == '__main__': +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, +) - ############################################################################ - # Data Loading - # ============ - X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=42, - ) +############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask( + # To maintain logs of the run, you can uncomment the + # Following lines + # temporary_directory='./tmp/autoPyTorch_example_tmp_01', + # output_directory='./tmp/autoPyTorch_example_out_01', + # delete_tmp_folder_after_terminate=False, + # delete_output_folder_after_terminate=False, + seed=42, +) - ############################################################################ - # Build and fit a classifier - # ========================== - api = TabularClassificationTask( - # To maintain logs of the run, you can uncomment the - # Following lines - # temporary_directory='./tmp/autoPyTorch_example_tmp_01', - # output_directory='./tmp/autoPyTorch_example_out_01', - # delete_tmp_folder_after_terminate=False, - # delete_output_folder_after_terminate=False, - seed=42, - ) +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=300, + 
func_eval_time_limit_secs=50 +) - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=300, - func_eval_time_limit_secs=50 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - # Print the final ensemble built by AutoPyTorch - print(api.show_models()) +############################################################################ +# Print the final ensemble performance +# ==================================== +print(api.run_history, api.trajectory) +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) diff --git a/examples/20_basics/example_tabular_regression.py b/examples/20_basics/example_tabular_regression.py index ef8cacb37..836d4d6d6 100644 --- a/examples/20_basics/example_tabular_regression.py +++ b/examples/20_basics/example_tabular_regression.py @@ -24,56 +24,44 @@ from autoPyTorch.api.tabular_regression import TabularRegressionTask -if __name__ == '__main__': +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(name='boston', return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, +) - ############################################################################ - # Data Loading - # ============ - X, y = sklearn.datasets.fetch_openml(name='boston', return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=1, - ) +############################################################################ +# Build and fit a regressor +# ========================== +api = TabularRegressionTask() - # Scale the regression targets to have zero mean and unit variance. - # This is important for Neural Networks since predicting large target values would require very large weights. 
- # One can later rescale the network predictions like this: y_pred = y_pred_scaled * y_train_std + y_train_mean - y_train_mean = y_train.mean() - y_train_std = y_train.std() +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='r2', + total_walltime_limit=300, + func_eval_time_limit_secs=50, + enable_traditional_pipeline=False, +) - y_train_scaled = (y_train - y_train_mean) / y_train_std - y_test_scaled = (y_test - y_train_mean) / y_train_std +############################################################################ +# Print the final ensemble performance +# ==================================== +print(api.run_history, api.trajectory) +y_pred = api.predict(X_test) - ############################################################################ - # Build and fit a regressor - # ========================== - api = TabularRegressionTask() +# Rescale the Neural Network predictions into the original target range +score = api.score(y_pred, y_test) - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train, - y_train=y_train_scaled, - X_test=X_test.copy(), - y_test=y_test_scaled.copy(), - optimize_metric='r2', - total_walltime_limit=300, - func_eval_time_limit_secs=50, - enable_traditional_pipeline=False, - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred_scaled = api.predict(X_test) - - # Rescale the Neural Network predictions into the original target range - y_pred = y_pred_scaled * y_train_std + y_train_mean - score = api.score(y_pred, y_test) - - print(score) - # Print the final ensemble built by AutoPyTorch - print(api.show_models()) +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) diff --git a/examples/40_advanced/README.txt b/examples/40_advanced/README.txt index f3293bf16..fb68a2c31 100644 --- a/examples/40_advanced/README.txt +++ b/examples/40_advanced/README.txt @@ -1,11 +1,12 @@ .. _examples_tabular_basics: -============================== +================================= Advanced Tabular Dataset Examples -============================== +================================= Advanced examples for using *Auto-PyTorch* on tabular datasets. We explain 1. How to customise the search space 2. How to split the data according to different resampling strategies + 3. How to visualize the results of Auto-PyTorch diff --git a/examples/40_advanced/example_custom_configuration_space.py b/examples/40_advanced/example_custom_configuration_space.py index 6a3764b94..bd02e51f1 100644 --- a/examples/40_advanced/example_custom_configuration_space.py +++ b/examples/40_advanced/example_custom_configuration_space.py @@ -72,7 +72,11 @@ def get_search_space_updates(): ############################################################################ # Build and fit a classifier with include components # ================================================== + # AutoPyTorch can search for multiple configurations at the same time + # if multiple cores are allocated, using the n_jobs argument. 
By default, + # Only 1 core is used while searching for configurations. api = TabularClassificationTask( + n_jobs=2, search_space_updates=get_search_space_updates(), include_components={'network_backbone': ['MLPBackbone', 'ResNetBackbone'], 'encoder': ['OneHotEncoder']} diff --git a/examples/40_advanced/example_resampling_strategy.py b/examples/40_advanced/example_resampling_strategy.py index 270f518c8..9fb77b76d 100644 --- a/examples/40_advanced/example_resampling_strategy.py +++ b/examples/40_advanced/example_resampling_strategy.py @@ -27,118 +27,116 @@ from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes -if __name__ == '__main__': - - ############################################################################ - # Data Loading - # ============ - X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=1, - ) - - ############################################################################ - # Build and fit a classifier with default resampling strategy - # =========================================================== - api = TabularClassificationTask( - # 'HoldoutValTypes.holdout_validation' with 'val_share': 0.33 - # is the default argument setting for TabularClassificationTask. - # It is explicitly specified in this example for demonstrational - # purpose. - resampling_strategy=HoldoutValTypes.holdout_validation, - resampling_strategy_args={'val_share': 0.33} - ) - - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit_secs=30 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - # Print the final ensemble built by AutoPyTorch - print(api.show_models()) - - ############################################################################ - - ############################################################################ - # Build and fit a classifier with Cross validation resampling strategy - # ==================================================================== - api = TabularClassificationTask( - resampling_strategy=CrossValTypes.k_fold_cross_validation, - resampling_strategy_args={'num_splits': 3} - ) - - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit_secs=30 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - # Print the final ensemble built by AutoPyTorch - print(api.show_models()) - - 
############################################################################ - - ############################################################################ - # Build and fit a classifier with Stratified resampling strategy - # ============================================================== - api = TabularClassificationTask( - # For demonstration purposes, we use - # Stratified hold out validation. However, - # one can also use CrossValTypes.stratified_k_fold_cross_validation. - resampling_strategy=HoldoutValTypes.stratified_holdout_validation, - resampling_strategy_args={'val_share': 0.33} - ) - - ############################################################################ - # Search for an ensemble of machine learning algorithms - # ===================================================== - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit_secs=30 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - # Print the final ensemble built by AutoPyTorch - print(api.show_models()) +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, +) + +############################################################################ +# Build and fit a classifier with default resampling strategy +# =========================================================== +api = TabularClassificationTask( + # 'HoldoutValTypes.holdout_validation' with 'val_share': 0.33 + # is the default argument setting for TabularClassificationTask. + # It is explicitly specified in this example for demonstrational + # purpose. 
+ resampling_strategy=HoldoutValTypes.holdout_validation, + resampling_strategy_args={'val_share': 0.33} +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit_secs=30 +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +print(api.run_history, api.trajectory) +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) + +############################################################################ + +############################################################################ +# Build and fit a classifier with Cross validation resampling strategy +# ==================================================================== +api = TabularClassificationTask( + resampling_strategy=CrossValTypes.k_fold_cross_validation, + resampling_strategy_args={'num_splits': 3} +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit_secs=30 +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +print(api.run_history, api.trajectory) +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) + +############################################################################ + +############################################################################ +# Build and fit a classifier with Stratified resampling strategy +# ============================================================== +api = TabularClassificationTask( + # For demonstration purposes, we use + # Stratified hold out validation. However, + # one can also use CrossValTypes.stratified_k_fold_cross_validation. 
+ resampling_strategy=HoldoutValTypes.stratified_holdout_validation, + resampling_strategy_args={'val_share': 0.33} +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit_secs=30 +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +print(api.run_history, api.trajectory) +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) diff --git a/examples/40_advanced/example_visualization.py b/examples/40_advanced/example_visualization.py index 011ea9b78..107d07a47 100644 --- a/examples/40_advanced/example_visualization.py +++ b/examples/40_advanced/example_visualization.py @@ -50,119 +50,117 @@ from autoPyTorch.metrics import accuracy -if __name__ == '__main__': - - ############################################################################ - # Data Loading - # ============ - - # We will use the iris dataset for this Toy example - seed = 42 - X, y = sklearn.datasets.fetch_openml(data_id=61, return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=42, - ) - - ############################################################################ - # Build and fit a classifier - # ========================== - api = TabularClassificationTask(seed=seed) - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric=accuracy.name, - total_walltime_limit=200, - func_eval_time_limit_secs=50 - ) - - ############################################################################ - # One can also save the model for future inference - # ================================================ - - # For more details on how to deploy a model, please check - # `Scikit-Learn persistence - # `_ support. - with open('estimator.pickle', 'wb') as handle: - pickle.dump(api, handle, protocol=pickle.HIGHEST_PROTOCOL) - - # Then let us read it back and use it for our analysis - with open('estimator.pickle', 'rb') as handle: - estimator = pickle.load(handle) - - ############################################################################ - # Plotting the model performance - # ============================== - - # We will plot the search incumbent through time. 
- - # Collect the performance of individual machine learning algorithms - # found by SMAC - individual_performances = [] - for run_key, run_value in estimator.run_history.data.items(): - if run_value.status != StatusType.SUCCESS: - # Ignore crashed runs - continue - individual_performances.append({ - 'Timestamp': pd.Timestamp( - time.strftime( - '%Y-%m-%d %H:%M:%S', - time.localtime(run_value.endtime) - ) - ), - 'single_best_optimization_accuracy': accuracy._optimum - run_value.cost, - 'single_best_test_accuracy': np.nan if run_value.additional_info is None else - accuracy._optimum - run_value.additional_info['test_loss'], - }) - individual_performance_frame = pd.DataFrame(individual_performances) - - # Collect the performance of the ensemble through time - # This ensemble is built from the machine learning algorithms - # found by SMAC - ensemble_performance_frame = pd.DataFrame(estimator.ensemble_performance_history) - - # As we are tracking the incumbent, we are interested in the cummax() performance - ensemble_performance_frame['ensemble_optimization_accuracy'] = ensemble_performance_frame[ - 'train_accuracy' - ].cummax() - ensemble_performance_frame['ensemble_test_accuracy'] = ensemble_performance_frame[ - 'test_accuracy' - ].cummax() - ensemble_performance_frame.drop(columns=['test_accuracy', 'train_accuracy'], inplace=True) - individual_performance_frame['single_best_optimization_accuracy'] = individual_performance_frame[ - 'single_best_optimization_accuracy' - ].cummax() - individual_performance_frame['single_best_test_accuracy'] = individual_performance_frame[ - 'single_best_test_accuracy' - ].cummax() - - pd.merge( - ensemble_performance_frame, - individual_performance_frame, - on="Timestamp", how='outer' - ).sort_values('Timestamp').fillna(method='ffill').plot( - x='Timestamp', - kind='line', - legend=True, - title='Auto-PyTorch accuracy over time', - grid=True, - ) - plt.show() - - # We then can understand the importance of each input feature using - # a permutation importance analysis. This is done as a proof of concept, to - # showcase that we can leverage of scikit-learn API. - result = permutation_importance(estimator, X_train, y_train, n_repeats=5, - scoring='accuracy', - random_state=seed) - sorted_idx = result.importances_mean.argsort() - - fig, ax = plt.subplots() - ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_test.columns[sorted_idx]) - ax.set_title("Permutation Importances (Train set)") - fig.tight_layout() - plt.show() +############################################################################ +# Data Loading +# ============ + +# We will use the iris dataset for this Toy example +seed = 42 +X, y = sklearn.datasets.fetch_openml(data_id=61, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=42, +) + +############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask(seed=seed) +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric=accuracy.name, + total_walltime_limit=200, + func_eval_time_limit_secs=50 +) + +############################################################################ +# One can also save the model for future inference +# ================================================ + +# For more details on how to deploy a model, please check +# `Scikit-Learn persistence +# `_ support. 
+with open('estimator.pickle', 'wb') as handle: + pickle.dump(api, handle, protocol=pickle.HIGHEST_PROTOCOL) + +# Then let us read it back and use it for our analysis +with open('estimator.pickle', 'rb') as handle: + estimator = pickle.load(handle) + +############################################################################ +# Plotting the model performance +# ============================== + +# We will plot the search incumbent through time. + +# Collect the performance of individual machine learning algorithms +# found by SMAC +individual_performances = [] +for run_key, run_value in estimator.run_history.data.items(): + if run_value.status != StatusType.SUCCESS: + # Ignore crashed runs + continue + individual_performances.append({ + 'Timestamp': pd.Timestamp( + time.strftime( + '%Y-%m-%d %H:%M:%S', + time.localtime(run_value.endtime) + ) + ), + 'single_best_optimization_accuracy': accuracy._optimum - run_value.cost, + 'single_best_test_accuracy': np.nan if run_value.additional_info is None else + accuracy._optimum - run_value.additional_info['test_loss'], + }) +individual_performance_frame = pd.DataFrame(individual_performances) + +# Collect the performance of the ensemble through time +# This ensemble is built from the machine learning algorithms +# found by SMAC +ensemble_performance_frame = pd.DataFrame(estimator.ensemble_performance_history) + +# As we are tracking the incumbent, we are interested in the cummax() performance +ensemble_performance_frame['ensemble_optimization_accuracy'] = ensemble_performance_frame[ + 'train_accuracy' +].cummax() +ensemble_performance_frame['ensemble_test_accuracy'] = ensemble_performance_frame[ + 'test_accuracy' +].cummax() +ensemble_performance_frame.drop(columns=['test_accuracy', 'train_accuracy'], inplace=True) +individual_performance_frame['single_best_optimization_accuracy'] = individual_performance_frame[ + 'single_best_optimization_accuracy' +].cummax() +individual_performance_frame['single_best_test_accuracy'] = individual_performance_frame[ + 'single_best_test_accuracy' +].cummax() + +pd.merge( + ensemble_performance_frame, + individual_performance_frame, + on="Timestamp", how='outer' +).sort_values('Timestamp').fillna(method='ffill').plot( + x='Timestamp', + kind='line', + legend=True, + title='Auto-PyTorch accuracy over time', + grid=True, +) +plt.show() + +# We then can understand the importance of each input feature using +# a permutation importance analysis. This is done as a proof of concept, to +# showcase that we can leverage of scikit-learn API. 
+result = permutation_importance(estimator, X_train, y_train, n_repeats=5, + scoring='accuracy', + random_state=seed) +sorted_idx = result.importances_mean.argsort() + +fig, ax = plt.subplots() +ax.boxplot(result.importances[sorted_idx].T, + vert=False, labels=X_test.columns[sorted_idx]) +ax.set_title("Permutation Importances (Train set)") +fig.tight_layout() +plt.show() diff --git a/test/test_ensemble/test_ensemble.py b/test/test_ensemble/test_ensemble.py index cd0f02e72..913fda548 100644 --- a/test/test_ensemble/test_ensemble.py +++ b/test/test_ensemble/test_ensemble.py @@ -525,7 +525,7 @@ def test_run_end_at(ensemble_backend): current_time = time.time() - ensbuilder.run(end_at=current_time + 10, iteration=1) + ensbuilder.run(end_at=current_time + 10, iteration=1, pynisher_context='forkserver') # 4 seconds left because: 10 seconds - 5 seconds overhead - very little overhead, # but then rounded to an integer assert pynisher_mock.call_args_list[0][1]["wall_time_in_s"], 4 @@ -718,9 +718,10 @@ def test_ensemble_builder_nbest_remembered(fit_ensemble, ensemble_backend, dask_ ensemble_memory_limit=1000, random_state=0, max_iterations=None, + pynisher_context='fork', ) - manager.build_ensemble(dask_client, unit_test=True, pynisher_context='fork') + manager.build_ensemble(dask_client, unit_test=True) future = manager.futures[0] dask.distributed.wait([future]) # wait for the ensemble process to finish assert future.result() == ([], 5, None, None), vars(future.result()) diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 015b78dca..9afa8969f 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -353,6 +353,7 @@ def test_exception_in_target_function(self, eval_holdout_mock): self.assertIn('traceback', info[1].additional_info) self.assertNotIn('exitcode', info[1].additional_info) + @unittest.skipIf(sys.version_info < (3, 7), reason="requires python3.7 or higher") def test_silent_exception_in_target_function(self): config = unittest.mock.Mock(spec=int) config.config_id = 198 @@ -380,6 +381,7 @@ def test_silent_exception_in_target_function(self): """'save_targets_ensemble'",)""", """AttributeError("'BackendMock' object has no attribute """ """'save_targets_ensemble'")""", + """AttributeError('save_targets_ensemble')""" """AttributeError("'BackendMock' object has no attribute """ """'setup_logger'",)""", """AttributeError("'BackendMock' object has no attribute """ diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py index 875ed399c..d9f5170c5 100644 --- a/test/test_pipeline/components/preprocessing/base.py +++ b/test/test_pipeline/components/preprocessing/base.py @@ -3,10 +3,9 @@ from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ TabularColumnTransformer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ - EncoderChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import 
ScalerChoice from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline diff --git a/test/test_pipeline/components/preprocessing/test_encoder_choice.py b/test/test_pipeline/components/preprocessing/test_encoder_choice.py index f4dbcc119..860fd8eac 100644 --- a/test/test_pipeline/components/preprocessing/test_encoder_choice.py +++ b/test/test_pipeline/components/preprocessing/test_encoder_choice.py @@ -1,7 +1,7 @@ import copy import unittest -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import ( +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py index 822112fca..99fad6b1f 100644 --- a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py +++ b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py @@ -7,10 +7,11 @@ from sklearn.base import BaseEstimator from sklearn.compose import make_column_transformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( + FeatureProprocessorChoice +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ NoFeaturePreprocessor import NoFeaturePreprocessor -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ - base_feature_preprocessor_choice import FeatureProprocessorChoice from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py index 52d55c6df..57841aef0 100644 --- a/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py +++ b/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py @@ -1,8 +1,9 @@ import copy import unittest -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ - base_feature_preprocessor_choice import FeatureProprocessorChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( + FeatureProprocessorChoice +) class TestFeaturePreprocessorChoice(unittest.TestCase): diff --git a/test/test_pipeline/components/preprocessing/test_normalizer_choice.py b/test/test_pipeline/components/preprocessing/test_normalizer_choice.py index dbb711ab0..42b79d72f 100644 --- a/test/test_pipeline/components/preprocessing/test_normalizer_choice.py +++ b/test/test_pipeline/components/preprocessing/test_normalizer_choice.py @@ -1,7 +1,7 @@ import copy import unittest -from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer_choice import ( +from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise import ( NormalizerChoice ) diff --git a/test/test_pipeline/components/preprocessing/test_scaler_choice.py b/test/test_pipeline/components/preprocessing/test_scaler_choice.py index 9d10af59f..3e4b6a3e5 100644 --- a/test/test_pipeline/components/preprocessing/test_scaler_choice.py +++ b/test/test_pipeline/components/preprocessing/test_scaler_choice.py @@ -1,7 +1,7 @@ import copy import unittest -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice class TestRescalerChoice(unittest.TestCase): diff --git a/test/test_pipeline/components/setup/test_setup.py b/test/test_pipeline/components/setup/test_setup.py index aae1c8ff4..9d9b6f7ad 100644 --- a/test/test_pipeline/components/setup/test_setup.py +++ b/test/test_pipeline/components/setup/test_setup.py @@ -10,27 +10,26 @@ import torch from torch import nn -import autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice as lr_components -import \ - autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice as network_initializer_components # noqa: E501 -import autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice as optimizer_components +import autoPyTorch.pipeline.components.setup.lr_scheduler as lr_components +import autoPyTorch.pipeline.components.setup.network_backbone as base_network_backbone_choice +import autoPyTorch.pipeline.components.setup.network_head as base_network_head_choice +import autoPyTorch.pipeline.components.setup.network_initializer as network_initializer_components # noqa: E501 +import autoPyTorch.pipeline.components.setup.optimizer as optimizer_components from autoPyTorch import constants from autoPyTorch.pipeline.components.base_component import ThirdPartyComponents -from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import ( +from autoPyTorch.pipeline.components.setup.lr_scheduler import ( BaseLRComponent, SchedulerChoice ) -from autoPyTorch.pipeline.components.setup.network_backbone import base_network_backbone_choice +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_head import base_network_head_choice +from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice from 
autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent -from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( +from autoPyTorch.pipeline.components.setup.network_initializer import ( BaseNetworkInitializerComponent, NetworkInitializerChoice ) -from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import ( +from autoPyTorch.pipeline.components.setup.optimizer import ( BaseOptimizerComponent, OptimizerChoice ) diff --git a/test/test_pipeline/components/setup/test_setup_traditional_classification.py b/test/test_pipeline/components/setup/test_setup_traditional_classification.py index ea3100724..90c7f18f6 100644 --- a/test/test_pipeline/components/setup/test_setup_traditional_classification.py +++ b/test/test_pipeline/components/setup/test_setup_traditional_classification.py @@ -6,7 +6,7 @@ import pytest -from autoPyTorch.pipeline.components.setup.traditional_ml.base_model_choice import ModelChoice +from autoPyTorch.pipeline.components.setup.traditional_ml import ModelChoice from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models.classifiers import ( CatboostModel, ExtraTreesModel, diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index 38b6b5007..98ab27b31 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -23,8 +23,8 @@ def prepare_trainer(self, trainer: BaseTrainerComponent, task_type: int, epochs=50): + # make this test reproducible torch.manual_seed(1) - if task_type in CLASSIFICATION_TASKS: X, y = make_classification( n_samples=n_samples, diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index 36670e325..c55bd967c 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -16,6 +16,9 @@ from autoPyTorch.pipeline.components.training.data_loader.base_data_loader import ( BaseDataLoaderComponent, ) +from autoPyTorch.pipeline.components.training.trainer import ( + TrainerChoice, +) from autoPyTorch.pipeline.components.training.trainer.MixUpTrainer import ( MixUpTrainer ) @@ -24,9 +27,6 @@ ) from autoPyTorch.pipeline.components.training.trainer.base_trainer import ( BaseTrainerComponent, ) -from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( - TrainerChoice, -) sys.path.append(os.path.dirname(__file__)) from test.test_pipeline.components.training.base import BaseTraining # noqa (E402: module level import not at top of file) diff --git a/test/test_utils/test_single_thread_client.py b/test/test_utils/test_single_thread_client.py new file mode 100644 index 000000000..2338d4cf7 --- /dev/null +++ b/test/test_utils/test_single_thread_client.py @@ -0,0 +1,32 @@ +import dask.distributed + +from distributed.utils_test import inc + +import pytest + +from autoPyTorch.utils.single_thread_client import SingleThreadedClient + + +def test_single_thread_client_like_dask_client(): + single_thread_client = SingleThreadedClient() + assert isinstance(single_thread_client, dask.distributed.Client) + future = single_thread_client.submit(inc, 1) + assert isinstance(future, dask.distributed.Future) + assert future.done() + assert future.result() == 2 + assert 
sum(single_thread_client.nthreads().values()) == 1 + single_thread_client.close() + single_thread_client.shutdown() + + # Clients/Futures get printed, so make sure str works + # str calls __repr__, which is the purpose of the check below + assert str(future) != "" + assert str(single_thread_client) != "" + + # SingleThreadedClient is an inherited version of the dask client, + # so that futures run in the same thread as the main job. + # We carefully selected which methods are inherited, and any other + # method should raise a NotImplementedError to be safe against major + # dask client API changes. + with pytest.raises(NotImplementedError): + single_thread_client.get_scheduler_logs()
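
As a reading aid for the new utilities introduced above, here is a minimal, standalone sketch (not part of the patch) of how the preload_modules helper from autoPyTorch/utils/parallel.py is meant to be combined with a forkserver context. The worker function _dummy_job and the queue-based hand-off are assumptions made purely for illustration.

import multiprocessing

from autoPyTorch.utils.parallel import preload_modules


def _dummy_job(q):
    # In a forkserver child, the preloaded modules (numpy, torch, sklearn, ...)
    # should already be imported by the fork server, so startup stays cheap.
    q.put('done')


if __name__ == '__main__':
    # Build the forkserver context and hint which heavy modules the fork
    # server should import once before it starts forking children.
    context = multiprocessing.get_context('forkserver')
    preload_modules(context)

    q = context.Queue()
    worker = context.Process(target=_dummy_job, args=(q,))
    worker.start()
    print(q.get())  # prints 'done'
    worker.join()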
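
Along the same lines, a hypothetical usage sketch of the SingleThreadedClient from autoPyTorch/utils/single_thread_client.py, intended for single-core runs where the overhead of a real dask scheduler is unnecessary; the expensive_evaluation function is invented for this example.

from autoPyTorch.utils.single_thread_client import SingleThreadedClient


def expensive_evaluation(x):
    # Stand-in for a real target function, e.g. fitting one pipeline
    return x * x


client = SingleThreadedClient()
# submit() executes the callable immediately in the calling thread and
# wraps the return value in a DummyFuture, so no scheduler process is used.
future = client.submit(expensive_evaluation, 4)
assert future.done() and future.result() == 16
client.close()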