diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index 11c6cf577..4f64b429a 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -1,20 +1,14 @@
 import logging
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union

 import numpy as np

-import pandas as pd
-
-from scipy.sparse import spmatrix
-
 from sklearn.base import BaseEstimator

+from autoPyTorch.data.utils import SupportedFeatTypes, list_to_pandas
 from autoPyTorch.utils.logging_ import PicklableClientLogger


-SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, spmatrix]
-
-
 class BaseFeatureValidator(BaseEstimator):
     """
     A class to pre-process features. In this regards, the format of the data is checked,
@@ -27,8 +21,8 @@ class BaseFeatureValidator(BaseEstimator):
         column_transformer (Optional[BaseEstimator])
             Host a encoder object if the data requires transformation (for example,
             if provided a categorical column in a pandas DataFrame)
-        transformed_columns (List[str])
-            List of columns that were encoded.
+        enc_columns (List[str]):
+            The list of column names that should be encoded.
     """
     def __init__(
         self,
@@ -37,11 +31,11 @@ def __init__(
         # Register types to detect unsupported data format changes
         self.feat_type: Optional[List[str]] = None
         self.data_type: Optional[type] = None
-        self.dtypes: List[str] = []
+        self.dtypes: Dict[str, str] = {}
         self.column_order: List[str] = []

         self.column_transformer: Optional[BaseEstimator] = None
-        self.transformed_columns: List[str] = []
+        self.enc_columns: List[str] = []

         self.logger: Union[
             PicklableClientLogger, logging.Logger
@@ -75,7 +69,8 @@ def fit(

         # If a list was provided, it will be converted to pandas
         if isinstance(X_train, list):
-            X_train, X_test = self.list_to_dataframe(X_train, X_test)
+            X_train = list_to_pandas(X_train, self.logger)
+            X_test = list_to_pandas(X_test, self.logger) if X_test is not None else None

         self._check_data(X_train)
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
index 530675fbd..f5099ff62 100644
--- a/autoPyTorch/data/base_target_validator.py
+++ b/autoPyTorch/data/base_target_validator.py
@@ -1,18 +1,14 @@
 import logging
-from typing import List, Optional, Union, cast
+from typing import Optional, Union, cast

 import numpy as np

 import pandas as pd

-from scipy.sparse import spmatrix
-
 from sklearn.base import BaseEstimator

+from autoPyTorch.data.utils import SupportedTargetTypes
 from autoPyTorch.utils.logging_ import PicklableClientLogger
-
-
-SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix]


 class BaseTargetValidator(BaseEstimator):
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 3e8c316b0..cecd90257 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -1,6 +1,12 @@
+"""
+TODO:
+    1. Add dtypes argument to TabularFeatureValidator
+    2. Modify dtypes from List[str] to Dict[str, str]
+    3. 
Enforce the provided dtypes on the incoming data
+"""
 import functools
 from logging import Logger
-from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, Union, cast

 import numpy as np

@@ -9,7 +15,6 @@

 from scipy.sparse import issparse, spmatrix

-import sklearn.utils
 from sklearn import preprocessing
 from sklearn.base import BaseEstimator
 from sklearn.compose import ColumnTransformer
@@ -19,9 +24,15 @@

 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
 from autoPyTorch.data.utils import (
+    ColumnDTypes,
     DatasetCompressionInputType,
     DatasetDTypeContainerType,
-    reduce_dataset_size_if_too_large
+    _categorical_left_mover,
+    _check_and_to_array,
+    _get_columns_to_encode,
+    has_object_columns,
+    reduce_dataset_size_if_too_large,
+    to_pandas,
 )
 from autoPyTorch.utils.common import ispandas
 from autoPyTorch.utils.logging_ import PicklableClientLogger
@@ -86,7 +97,7 @@ class TabularFeatureValidator(BaseFeatureValidator):
             List for which an element at each index is a list containing the categories
             for the respective categorical column.
-        transformed_columns (List[str])
+        enc_columns (List[str])
             List of columns that were transformed.
         column_transformer (Optional[BaseEstimator])
             Hosts an imputer and an encoder object if the data
@@ -104,38 +115,79 @@ def __init__(
         self,
         logger: Optional[Union[PicklableClientLogger, Logger]] = None,
         dataset_compression: Optional[Mapping[str, Any]] = None,
+        dtypes: Optional[Dict[str, str]] = None,
     ) -> None:
+        super().__init__(logger)
         self._dataset_compression = dataset_compression
         self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
-        super().__init__(logger)
+        self.all_nan_columns: Optional[Set[str]] = None
+        self.dtypes = dtypes if dtypes is not None else {}
+        self._called_infer_object = False
+
+    def _convert_all_nan_columns_to_numeric(self, X: pd.DataFrame, fit: bool = False) -> pd.DataFrame:
+        """
+        Convert columns whose values were all nan in the training dataset to numeric.
+
+        Args:
+            X (pd.DataFrame):
+                The data to transform.
+            fit (bool):
+                Whether this call fits X or transforms it using the already-fitted transformer.
+        """
+        if not fit and not issparse(X) and self.all_nan_columns is None:
+            raise ValueError('_fit must be called before calling transform')
+
+        if fit:
+            all_nan_columns = X.columns[X.isna().all()]
+        else:
+            assert self.all_nan_columns is not None
+            all_nan_columns = list(self.all_nan_columns)
+
+        for col in all_nan_columns:
+            X[col] = np.nan
+            X[col] = pd.to_numeric(X[col])
+            if len(self.dtypes):
+                self.dtypes[col] = X[col].dtype.name
+
+        if has_object_columns(X.dtypes.values):
+            X = self.infer_objects(X)
+
+        if fit:
+            # TODO: Check how to integrate below
+            # self.dtypes = [dt.name for dt in X.dtypes]
+            self.all_nan_columns = set(all_nan_columns)
+
+        return X

     @staticmethod
     def _comparator(cmp1: str, cmp2: str) -> int:
-        """Order so that categorical columns come left and numerical columns come right
+        return _categorical_left_mover(cmp1, cmp2)

-        Args:
-            cmp1 (str): First variable to compare
-            cmp2 (str): Second variable to compare
+    def _encode_categories(self, X: pd.DataFrame) -> None:
+        preprocessors = get_tabular_preprocessors()
+        self.column_transformer = _create_column_transformer(
+            preprocessors=preprocessors,
+            categorical_columns=self.enc_columns,
+        )

-        Raises:
-            ValueError: if the values of the variables to compare
-            are not in 'categorical' or 'numerical'
+        assert self.column_transformer is not None  # Mypy redefinition
+        self.column_transformer.fit(X)

-        Returns:
-            int: either [0, -1, 1]
-        """
-        choices = ['categorical', 'numerical']
-        if cmp1 not in choices or cmp2 not in choices:
-            raise ValueError('The comparator for the column order only accepts {}, '
-                             'but got {} and {}'.format(choices, cmp1, cmp2))
+        # The column transformer moves categoricals to the left side
+        assert self.feat_type is not None
+        self.feat_type = sorted(self.feat_type, key=functools.cmp_to_key(self._comparator))

-        idx1, idx2 = choices.index(cmp1), choices.index(cmp2)
-        return idx1 - idx2
+        encoded_categories = self.column_transformer.\
+            named_transformers_['categorical_pipeline'].\
+            named_steps['ordinalencoder'].categories_

-    def _fit(
-        self,
-        X: SupportedFeatTypes,
-    ) -> BaseEstimator:
+        # One ordinal encoder per categorical column
+        self.categories = [
+            list(range(len(cat)))
+            for cat in encoded_categories
+        ]
+
+    def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
         """
         In case input data is a pandas DataFrame, this utility encodes the user provided
         features (from categorical for example) to a numerical value that further stages
@@ -151,78 +203,27 @@

             The fitted base estimator
         """

-        # The final output of a validator is a numpy array. But pandas
-        # gives us information about the column dtype
-        if isinstance(X, np.ndarray):
-            X = self.numpy_array_to_pandas(X)
+        X = to_pandas(X)  # convert to pandas, which carries the column dtype information

         if ispandas(X) and not issparse(X):
             X = cast(pd.DataFrame, X)

-            # Treat a column with all instances a NaN as numerical
-            # This will prevent doing encoding to a categorical column made completely
-            # out of nan values -- which will trigger a fail, as encoding is not supported
-            # with nan values. 
-            # Columns that are completely made of NaN values are provided to the pipeline
-            # so that later stages decide how to handle them
-            if np.any(pd.isnull(X)):
-                for column in X.columns:
-                    if X[column].isna().all():
-                        X[column] = pd.to_numeric(X[column])
-                        # Also note this change in self.dtypes
-                        if len(self.dtypes) != 0:
-                            self.dtypes[list(X.columns).index(column)] = X[column].dtype
-
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
-
-        self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
+            X = self._convert_all_nan_columns_to_numeric(X, fit=True)

+        self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
         assert self.feat_type is not None
+        if len(self.enc_columns) > 0:
+            self._encode_categories(X)

-        if len(self.transformed_columns) > 0:
-
-            preprocessors = get_tabular_preprocessors()
-            self.column_transformer = _create_column_transformer(
-                preprocessors=preprocessors,
-                categorical_columns=self.transformed_columns,
-            )
-
-            # Mypy redefinition
-            assert self.column_transformer is not None
-            self.column_transformer.fit(X)
-
-            # The column transformer reorders the feature types
-            # therefore, we need to change the order of columns as well
-            # This means categorical columns are shifted to the left
-            self.feat_type = sorted(
-                self.feat_type,
-                key=functools.cmp_to_key(self._comparator)
-            )
-
-            encoded_categories = self.column_transformer.\
-                named_transformers_['categorical_pipeline'].\
-                named_steps['ordinalencoder'].categories_
-            self.categories = [
-                # We fit an ordinal encoder, where all categorical
-                # columns are shifted to the left
-                list(range(len(cat)))
-                for cat in encoded_categories
-            ]
-
-        for i, type_ in enumerate(self.feat_type):
-            if 'numerical' in type_:
+        for i, type_name in enumerate(self.feat_type):
+            if str(ColumnDTypes.numerical) in type_name:
                 self.numerical_columns.append(i)
             else:
                 self.categorical_columns.append(i)

-        # Lastly, store the number of features
         self.num_features = np.shape(X)[1]

         return self

-    def transform(
-        self,
-        X: SupportedFeatTypes,
-    ) -> Union[np.ndarray, spmatrix, pd.DataFrame]:
+    def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.DataFrame]:
         """
         Validates and fit a categorical encoder (if needed) to the features.
         The supported data types are List, numpy arrays and pandas DataFrames.
@@ -235,40 +236,54 @@

         Return:
             np.ndarray:
                 The transformed array
+
+        Note:
+            The default transform performs the following:
+                * simple imputation for both numerical and categorical
+                * scaling for numerical
+                * one-hot encoding for categorical
+            For example, consider a simple case
+            in which all the columns are categorical:
+                data = [
+                    {'A': 1, 'B': np.nan, 'C': np.nan},
+                    {'A': np.nan, 'B': 3, 'C': np.nan},
+                    {'A': 2, 'B': np.nan, 'C': np.nan}
+                ]
+            The value set of each column is
+            then
+                * `A` in {np.nan, 1, 2}
+                * `B` in {np.nan, 3}
+                * `C` in {np.nan} <=== this column will be dropped. 
+ + So in the column A, + * np.nan ==> [1, 0, 0] (always the index 0) + * 1 ==> [0, 1, 0] + * 2 ==> [0, 0, 1] + in the column B, + * np.nan ==> [1, 0] + * 3 ==> [0, 1] + Therefore, by concatenating, + * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0] + * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1] + * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0] + ==> [ + [0, 1, 0, 1, 0], + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 0] + ] """ if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") - # If a list was provided, it will be converted to pandas - if isinstance(X, list): - X, _ = self.list_to_dataframe(X) - - if isinstance(X, np.ndarray): - X = self.numpy_array_to_pandas(X) - + X = to_pandas(X) if ispandas(X) and not issparse(X): - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - - # Also remove the object dtype for new data - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) + X = self._convert_all_nan_columns_to_numeric(X) # Check the data here so we catch problems on new test data self._check_data(X) # Pandas related transformations if ispandas(X) and self.column_transformer is not None: - if np.any(pd.isnull(X)): - # After above check it means that if there is a NaN - # the whole column must be NaN - # Make sure it is numerical and let the pipeline handle it - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - X = self.column_transformer.transform(X) # Sparse related transformations @@ -276,20 +291,7 @@ def transform( if issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() - try: - X = sklearn.utils.check_array( - X, - force_all_finite=False, - accept_sparse='csr' - ) - except Exception as e: - self.logger.exception(f"Conversion failed for input {X.dtypes} {X}" - "This means AutoPyTorch was not able to properly " - "Extract the dtypes of the provided input features. " - "Please try to manually cast it to a supported " - "numerical or categorical values.") - raise e - + X = _check_and_to_array(X, logger=self.logger) X = self._compress_dataset(X) return X @@ -301,7 +303,6 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio the testing data is converted to the same dtype as the training data. 
-

         Args:
             X (DatasetCompressionInputType):
                 Dataset

@@ -322,10 +323,26 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
             self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype

         return X

-    def _check_data(
-        self,
-        X: SupportedFeatTypes,
-    ) -> None:
+    def _check_dataframe(self, X: pd.DataFrame) -> None:
+        err_msg = " of the features must be identical before/after fit(), "
+        err_msg += "but they differ between the training and test datasets:\n"
+
+        # Define the columns to be encoded here, as the feature validator is fitted only once per estimator
+        self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
+
+        column_order = list(X.columns)
+        if len(self.column_order) == 0:
+            self.column_order = column_order
+        elif self.column_order != column_order:
+            raise ValueError(f"The column order{err_msg}train: {self.column_order}\ntest: {column_order}")
+
+        dtypes = {col: dtype.name for col, dtype in zip(X.columns, X.dtypes)}
+        if len(self.dtypes) == 0:
+            self.dtypes = dtypes
+        elif self.dtypes != dtypes:
+            raise ValueError(f"The dtypes{err_msg}train: {self.dtypes}\ntest: {dtypes}")
+
+    def _check_data(self, X: SupportedFeatTypes) -> None:
         """
         Feature dimensionality and data type checks

@@ -336,73 +353,31 @@

         """
         if not isinstance(X, (np.ndarray, pd.DataFrame)) and not issparse(X):
-            raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
-                             " scipy sparse and Python Lists, yet, the provided input is"
-                             " of type {}".format(type(X))
-                             )
+            raise TypeError(
+                "AutoPyTorch only supports numpy.ndarray, pandas.DataFrame,"
+                f" scipy.sparse matrices and Python lists, but got {type(X)}"
+            )

         if self.data_type is None:
             self.data_type = type(X)

         if self.data_type != type(X):
-            self.logger.warning("AutoPyTorch previously received features of type %s "
-                                "yet the current features have type %s. Changing the dtype "
-                                "of inputs to an estimator might cause problems" % (
-                                    str(self.data_type),
-                                    str(type(X)),
-                                ),
-                                )
-
-        # Do not support category/string numpy data. Only numbers
-        if hasattr(X, "dtype"):
-            if not np.issubdtype(X.dtype.type, np.number):  # type: ignore[union-attr]
-                raise ValueError(
-                    "When providing a numpy array to AutoPyTorch, the only valid "
-                    "dtypes are numerical ones. The provided data type {} is not supported."
-                    "".format(
-                        X.dtype.type,  # type: ignore[union-attr]
-                    )
-                )
+            self.logger.warning(
+                f"AutoPyTorch previously received features of type {str(self.data_type)}, "
+                f"but got type {str(type(X))} in the current features. This change might cause problems"
+            )

-        # Then for Pandas, we do not support Nan in categorical columns
-        if ispandas(X):
-            # If entered here, we have a pandas dataframe
+        if ispandas(X):  # for pandas, NaN is not supported in categorical columns
             X = cast(pd.DataFrame, X)
+            self._check_dataframe(X)

-            # Handle objects if possible
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
-
-            # Define the column to be encoded here as the feature validator is fitted once
-            # per estimator
-            self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
-
-            column_order = [column for column in X.columns]
-            if len(self.column_order) > 0:
-                if self.column_order != column_order:
-                    raise ValueError("Changing the column order of the features after fit() is "
-                                     "not supported. 
Fit() method was called with "
-                                     "{} whereas the new features have {} as type".format(self.column_order,
-                                                                                          column_order,)
-                                     )
-            else:
-                self.column_order = column_order
-
-            dtypes = [dtype.name for dtype in X.dtypes]
-            if len(self.dtypes) > 0:
-                if self.dtypes != dtypes:
-                    raise ValueError("Changing the dtype of the features after fit() is "
-                                     "not supported. Fit() method was called with "
-                                     "{} whereas the new features have {} as type".format(self.dtypes,
-                                                                                          dtypes,
-                                                                                          )
-                                     )
-            else:
-                self.dtypes = dtypes
-
-    def _get_columns_to_encode(
-        self,
-        X: pd.DataFrame,
-    ) -> Tuple[List[str], List[str]]:
+        # For ndarray, category/string dtypes are not supported
+        if isinstance(X, np.ndarray) and not np.issubdtype(X.dtype.type, np.number):
+            dt = X.dtype.type
+            raise TypeError(
+                f"AutoPyTorch does not support numpy.ndarray with non-numerical dtype, but got {dt}"
+            )
+
+    def _get_columns_to_encode(self, X: pd.DataFrame) -> Tuple[List[str], List[str]]:
         """
         Return the columns to be encoded from a pandas dataframe

@@ -412,121 +387,16 @@

             checks) and an encoder fitted in the case the data needs encoding

         Returns:
-            transformed_columns (List[str]):
-                Columns to encode, if any
-            feat_type:
-                Type of each column numerical/categorical
-        """
-
-        if len(self.transformed_columns) > 0 and self.feat_type is not None:
-            return self.transformed_columns, self.feat_type
-
-        # Register if a column needs encoding
-        transformed_columns = []
-
-        # Also, register the feature types for the estimator
-        feat_type = []
-
-        # Make sure each column is a valid type
-        for i, column in enumerate(X.columns):
-            if X[column].dtype.name in ['category', 'bool']:
-
-                transformed_columns.append(column)
-                feat_type.append('categorical')
-            # Move away from np.issubdtype as it causes
-            # TypeError: data type not understood in certain pandas types
-            elif not is_numeric_dtype(X[column]):
-                if X[column].dtype.name == 'object':
-                    raise ValueError(
-                        "Input Column {} has invalid type object. "
-                        "Cast it to a valid dtype before using it in AutoPyTorch. "
-                        "Valid types are numerical, categorical or boolean. "
-                        "You can cast it to a valid dtype using "
-                        "pandas.Series.astype ."
-                        "If working with string objects, the following "
-                        "tutorial illustrates how to work with text data: "
-                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
-                            # noqa: E501
-                            column,
-                        )
-                    )
-                elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
-                    X[column].dtype
-                ):
-                    raise ValueError(
-                        "AutoPyTorch does not support time and/or date datatype as given "
-                        "in column {}. Please convert the time information to a numerical value "
-                        "first. One example on how to do this can be found on "
-                        "https://stats.stackexchange.com/questions/311494/".format(
-                            column,
-                        )
-                    )
-                else:
-                    raise ValueError(
-                        "Input Column {} has unsupported dtype {}. "
-                        "Supported column types are categorical/bool/numerical dtypes. "
-                        "Make sure your data is formatted in a correct way, "
-                        "before feeding it to AutoPyTorch.".format(
-                            column,
-                            X[column].dtype.name,
-                        )
-                    )
-            else:
-                feat_type.append('numerical')
-        return transformed_columns, feat_type
-
-    def list_to_dataframe(
-        self,
-        X_train: SupportedFeatTypes,
-        X_test: Optional[SupportedFeatTypes] = None,
-    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
-        """
-        Converts a list to a pandas DataFrame. In this process, column types are inferred. 
-
-        If test data is provided, we proactively match it to train data
-
-        Args:
-            X_train (SupportedFeatTypes):
-                A set of features that are going to be validated (type and dimensionality
-                checks) and a encoder fitted in the case the data needs encoding
-            X_test (Optional[SupportedFeatTypes]):
-                A hold out set of data used for checking
-
-        Returns:
-            pd.DataFrame:
-                transformed train data from list to pandas DataFrame
-            pd.DataFrame:
-                transformed test data from list to pandas DataFrame
-        """
-
-        # If a list was provided, it will be converted to pandas
-        X_train = pd.DataFrame(data=X_train).infer_objects()
-        self.logger.warning("The provided feature types to AutoPyTorch are of type list."
-                            "Features have been interpreted as: {}".format([(col, t) for col, t in
                                                                             zip(X_train.columns, X_train.dtypes)]))
-        if X_test is not None:
-            if not isinstance(X_test, list):
-                self.logger.warning("Train features are a list while the provided test data"
-                                    "is {}. X_test will be casted as DataFrame.".format(type(X_test))
-                                    )
-            X_test = pd.DataFrame(data=X_test).infer_objects()
-        return X_train, X_test
-
-    def numpy_array_to_pandas(
-        self,
-        X: np.ndarray,
-    ) -> pd.DataFrame:
-        """
-        Converts a numpy array to pandas for type inference
-
-        Args:
-            X (np.ndarray):
-                data to be interpreted.
-
-        Returns:
-            pd.DataFrame
-        """
-        return pd.DataFrame(X).infer_objects().convert_dtypes()
+            enc_columns (List[str]):
+                Columns to encode
+            feat_type (List[str]):
+                Whether each column is numerical or categorical
         """
+        if len(self.enc_columns) > 0 and self.feat_type is not None:
+            return self.enc_columns, self.feat_type
+        else:
+            return _get_columns_to_encode(X)

     def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
         """
@@ -541,26 +411,25 @@

         Returns:
             pd.DataFrame
         """
-        if hasattr(self, 'object_dtype_mapping'):
-            # Mypy does not process the has attr. This dict is defined below
-            for key, dtype in self.object_dtype_mapping.items():  # type: ignore[has-type]
-                if 'int' in dtype.name:
-                    # In the case train data was interpreted as int
-                    # and test data was interpreted as float, because of 0.0
-                    # for example, honor training data
-                    X[key] = X[key].applymap(np.int64)
-                else:
-                    try:
-                        X[key] = X[key].astype(dtype.name)
-                    except Exception as e:
-                        # Try inference if possible
-                        self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
-                        pass
-        else:
+        if self._called_infer_object:
+            # Honor the training data types
+            try:
+                # Mypy cannot track the attribute type here. 
+ X = X.astype(self.dtypes) # type: ignore[has-type] + except Exception as e: + self.logger.warning( + 'Casting the columns to training dtypes ' + f'{self.dtypes} caused the exception {e}' # type: ignore[has-type] + ) + elif len(self.dtypes): # Overwrite the dtypes in test data by those in the training data + X = X.astype(self.dtypes) + else: # Calling for the first time to infer the categories X = X.infer_objects() - for column in X.columns: - if not is_numeric_dtype(X[column]): - X[column] = X[column].astype('category') - self.object_dtype_mapping = {column: X[column].dtype for column in X.columns} - self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") + cat_dtypes = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)} + X = X.astype(cat_dtypes) + + self.dtypes.update({col: dtype.name for col, dtype in zip(X.columns, X.dtypes)}) + self.logger.debug(f"New dtypes of data: {self.dtypes}") + self._called_infer_object = True + return X diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 22cabb999..693a24cae 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, cast +from typing import List, Optional, cast import numpy as np @@ -7,24 +7,16 @@ from scipy.sparse import issparse, spmatrix -import sklearn.utils from sklearn import preprocessing from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError from sklearn.utils.multiclass import type_of_target from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes +from autoPyTorch.data.utils import ArrayType, _check_and_to_array from autoPyTorch.utils.common import ispandas -ArrayType = Union[np.ndarray, spmatrix] - - -def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType: - """ sklearn check array will make sure we have the correct numerical features for the array """ - return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) - - def _modify_regression_target(y: ArrayType) -> ArrayType: # Regression targets must have numbers after a decimal point. 
# Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
@@ -124,8 +116,9 @@

         return self

     def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray:
+        kwargs = dict(force_all_finite=True, ensure_2d=False)
         if self.encoder is None:
-            return _check_and_to_array(y)
+            return _check_and_to_array(y, **kwargs)

         # remove ravel warning from pandas Series
         shape = np.shape(y)
@@ -139,7 +132,7 @@

         else:
             y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1)

-        return _check_and_to_array(y)
+        return _check_and_to_array(y, **kwargs)

     def transform(self, y: SupportedTargetTypes) -> np.ndarray:
         """
diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py
index 03375ce27..40dc7aa11 100644
--- a/autoPyTorch/data/utils.py
+++ b/autoPyTorch/data/utils.py
@@ -1,5 +1,6 @@
 # Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py
 import warnings
+from logging import Logger
 from math import floor
 from typing import (
     Any,
@@ -18,11 +19,18 @@

 import numpy as np

 import pandas as pd
+from pandas.api.types import is_numeric_dtype

 from scipy.sparse import issparse, spmatrix

-from autoPyTorch.utils.common import ispandas
+from sklearn.utils import check_array

+from autoPyTorch.utils.common import autoPyTorchEnum, ispandas
+
+
+ArrayType = Union[np.ndarray, spmatrix]
+SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, spmatrix]
+SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix]

 # TODO: TypedDict with python 3.8
 #
@@ -39,6 +47,188 @@

 }


+class ColumnDTypes(autoPyTorchEnum):
+    numerical = "numerical"
+    categorical = "categorical"
+
+
+def convert_dtype_enum_dict_to_str_dict(dtype_dict: Dict[str, ColumnDTypes]) -> Dict[str, str]:
+    enum2str = {type_choice: str(type_choice) for type_choice in ColumnDTypes}
+    return {col_name: enum2str[dtype_choice] for col_name, dtype_choice in dtype_dict.items()}
+
+
+def list_to_pandas(data: List, logger: Optional[Logger] = None) -> pd.DataFrame:
+    """
+    Convert a list to a pandas DataFrame. In this process, column types are inferred.
+
+    Args:
+        data (List):
+            A list of features.
+
+    Returns:
+        pd.DataFrame:
+            transformed data from list to pandas DataFrame
+    """
+    if not isinstance(data, list):
+        raise TypeError(f"data must be a list, but got {type(data)}")
+
+    # If a list was provided, it will be converted to pandas
+    data = pd.DataFrame(data=data).infer_objects()
+    data_info = [(col, t) for col, t in zip(data.columns, data.dtypes)]
+
+    if logger is not None:
+        logger.warning(
+            "The features provided to AutoPyTorch are a list. "
+            f"They have been interpreted as: {data_info}"
+        )
+
+    return data
+
+
+def numpy_to_pandas(data: np.ndarray) -> pd.DataFrame:
+    """
+    Converts a numpy array to pandas for type inference
+
+    Args:
+        data (np.ndarray):
+            The data to be interpreted. 
+
+    Returns:
+        pd.DataFrame
+    """
+    if not isinstance(data, np.ndarray):
+        raise TypeError(f"data must be np.ndarray, but got {type(data)}")
+
+    return pd.DataFrame(data).infer_objects().convert_dtypes()
+
+
+def to_pandas(data: SupportedFeatTypes, logger: Optional[Logger] = None) -> SupportedFeatTypes:
+    if isinstance(data, list):
+        data = list_to_pandas(data, logger)
+    elif isinstance(data, np.ndarray):
+        data = numpy_to_pandas(data)
+
+    return data
+
+
+def has_object_columns(feature_types: pd.Series) -> bool:
+    """
+    Indicate whether a Series of dtypes for a pandas DataFrame
+    contains one or more object columns.
+    Args:
+        feature_types (pd.Series): The feature types for a DataFrame.
+    Returns:
+        bool:
+            True if the DataFrame dtypes contain an object column, False
+            otherwise.
+    """
+    return np.dtype('O') in feature_types
+
+
+def _check_and_to_array(
+    data: Union[SupportedFeatTypes, SupportedTargetTypes],
+    logger: Optional[Logger] = None,
+    **kwargs: Any
+) -> ArrayType:
+    """sklearn's check_array makes sure the array contains the correct numerical features"""
+    _kwargs = dict(accept_sparse='csr', force_all_finite=False)
+    _kwargs.update(kwargs)
+    try:
+        return check_array(data, **_kwargs)
+    except Exception as e:
+        if logger is not None:
+            logger.exception(
+                f"Conversion failed for input {data}. "
+                "This means AutoPyTorch was not able to properly "
+                "extract the dtypes of the provided input features. "
+                "Please try to manually cast it to a supported "
+                "numerical or categorical type."
+            )
+        raise e
+
+
+def _error_due_to_unsupported_column(X: pd.DataFrame, column: str) -> None:
+    # Move away from np.issubdtype as it causes
+    # TypeError: data type not understood in certain pandas types
+    def _generate_error_message_prefix(type_name: str, proc_type: Optional[str] = None) -> str:
+        msg1 = f"Column `{column}` has an invalid type `{type_name}`. "
+        msg2 = "Cast it to a numerical, category, or bool type via the astype method. 
" + msg3 = f"The following link might help you to know {proc_type} processing: " + return msg1 + msg2 + ("" if proc_type is None else msg3) + + dtype = X[column].dtype + if dtype.name == 'object': + err_msg = _generate_error_message_prefix(type_name="object", proc_type="string") + url = "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html" + raise TypeError(f"{err_msg}{url}") + elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(dtype): + err_msg = _generate_error_message_prefix(type_name="time and/or date datatype", proc_type="datetime") + raise TypeError(f"{err_msg}https://stats.stackexchange.com/questions/311494/") + else: + err_msg = _generate_error_message_prefix(type_name=dtype.name) + raise TypeError(err_msg) + + +def _get_columns_to_encode(X: pd.DataFrame) -> Tuple[List[str], Dict[str, str]]: + """ + In case input data is a pandas DataFrame, this utility encodes the user provided + features (from categorical for example) to a numerical value that further stages + will be able to use + + Args: + X (pd.DataFrame): + A set of features that are going to be validated (type and dimensionality + checks) and an encoder fitted in the case the data needs encoding + + Returns: + enc_columns (List[str]): + Columns to encode + feat_type (Dict[str, str]): + Whether each column is numerical or categorical + """ + enc_columns: List[str] = [] + # feat_type: Dict[str, str] = {} + feat_type: List[str] = [] + + for dtype, col in zip(X.dtypes, X.columns): + if dtype.name in ['category', 'bool']: + enc_columns.append(col) + # feat_type[col] = str(ColumnDTypes.categorical) + feat_type.append(str(ColumnDTypes.categorical)) + elif is_numeric_dtype(dtype): + # feat_type[col] = str(ColumnDTypes.numerical) + feat_type.append(str(ColumnDTypes.numerical)) + else: + _error_due_to_unsupported_column(X, col) + + return enc_columns, feat_type + + +def _categorical_left_mover(cmp1: str, cmp2: str) -> int: + """Order so that categorical columns come left and numerical columns come right + + Args: + cmp1 (str): First variable to compare + cmp2 (str): Second variable to compare + + Raises: + ValueError: if the values of the variables to compare + are not in 'categorical' or 'numerical' + + Returns: + int: either [0, -1, 1] + """ + choices = [str(ColumnDTypes.categorical), str(ColumnDTypes.numerical)] + if cmp1 not in choices or cmp2 not in choices: + raise ValueError( + f"The comparator for the column order only accepts {choices}, " + f"but got {cmp1} and {cmp2}" + ) + + idx1, idx2 = choices.index(cmp1), choices.index(cmp2) + return idx1 - idx2 + + def get_dataset_compression_mapping( memory_limit: int, dataset_compression: Union[bool, Mapping[str, Any]] diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 48302bdee..23d3908e7 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -101,6 +101,9 @@ def __eq__(self, other: Any) -> bool: def __hash__(self) -> int: return hash(self.value) + def __str__(self) -> str: + return str(self.value) + def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: """ diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 3d352d765..c0d497ad9 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -220,7 +220,7 @@ def test_featurevalidator_supported_types(input_data_featuretest): ) def test_featurevalidator_unsupported_numpy(input_data_featuretest): validator = TabularFeatureValidator() - with 
pytest.raises(ValueError, match=r".*When providing a numpy array.*not supported."):
+    with pytest.raises(TypeError, match=r"AutoPyTorch does not support numpy.ndarray with non-numerical dtype"):
         validator.fit(input_data_featuretest)


@@ -328,13 +328,11 @@ def test_features_unsupported_calls_are_raised():
         expected
     """
     validator = TabularFeatureValidator()
-    with pytest.raises(ValueError, match=r"AutoPyTorch does not support time"):
-        validator.fit(
-            pd.DataFrame({'datetime': [pd.Timestamp('20180310')]})
-        )
-    with pytest.raises(ValueError, match=r"AutoPyTorch only supports.*yet, the provided input"):
+    with pytest.raises(TypeError, match=r"invalid type `time and/or date datatype`."):
+        validator.fit(pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}))
+    with pytest.raises(TypeError, match=r"AutoPyTorch only supports numpy.ndarray, pandas.DataFrame"):
         validator.fit({'input1': 1, 'input2': 2})
-    with pytest.raises(ValueError, match=r"has unsupported dtype string"):
+    with pytest.raises(TypeError, match=r"invalid type `string`."):
         validator.fit(pd.DataFrame([{'A': 1, 'B': 2}], dtype='string'))
     with pytest.raises(ValueError, match=r"The feature dimensionality of the train and test"):
         validator.fit(X_train=np.array([[1, 2, 3], [4, 5, 6]]),
@@ -517,15 +515,16 @@ def test_featurevalidator_new_data_after_fit(openml_id,

     # And then check proper error messages
+    pattern = r"of the features must be identical before/after fit\(\)"
     if train_data_type == 'pandas':
         old_dtypes = copy.deepcopy(validator.dtypes)
         validator.dtypes = ['dummy' for dtype in X_train.dtypes]
-        with pytest.raises(ValueError, match=r"Changing the dtype of the features after fit"):
+        with pytest.raises(ValueError, match=pattern):
             transformed_X = validator.transform(X_test)
         validator.dtypes = old_dtypes
     if test_data_type == 'pandas':
         columns = X_test.columns.tolist()
         X_test = X_test[reversed(columns)]
-        with pytest.raises(ValueError, match=r"Changing the column order of the features"):
+        with pytest.raises(ValueError, match=pattern):
             transformed_X = validator.transform(X_test)
diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py
index 482c99769..cc89f5276 100644
--- a/test/test_data/test_validation.py
+++ b/test/test_data/test_validation.py
@@ -103,7 +103,7 @@ def test_sparse_data_validation_for_regression():

     validator.fit(X_train=X_sp, y_train=y)

-    X_t, y_t = validator.transform(X, y)
+    X_t, y_t = validator.transform(X_sp, y)
     assert np.shape(X) == np.shape(X_t)

     # make sure everything was encoded to number
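
Reviewer note (not part of the patch): a minimal sketch to sanity-check the worked example added to the new `transform` docstring and the categorical-left ordering of `_categorical_left_mover`. It uses a plain SimpleImputer + OneHotEncoder pipeline as a stand-in assumption; the validator actually builds its transformer via get_tabular_preprocessors(), which this patch does not show.

# Sketch only: the imputer/encoder combination below is assumed for
# illustration and is not the pipeline autoPyTorch constructs internally.
import functools

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Categorical columns sort to the left, mirroring _categorical_left_mover.
choices = ['categorical', 'numerical']
feat_type = ['numerical', 'categorical', 'numerical', 'categorical']
ordered = sorted(feat_type,
                 key=functools.cmp_to_key(lambda c1, c2: choices.index(c1) - choices.index(c2)))
assert ordered == ['categorical', 'categorical', 'numerical', 'numerical']

# The docstring example: column C is all-NaN, so the validator converts it to
# a numeric column and the encoder only ever sees A and B.
data = pd.DataFrame([
    {'A': 1, 'B': np.nan},
    {'A': np.nan, 'B': 3},
    {'A': 2, 'B': np.nan},
])
enc = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=-1),  # NaN becomes its own value
    OneHotEncoder(),  # categories are sorted, so the NaN placeholder takes index 0
)
print(enc.fit_transform(data).toarray())
# [[0. 1. 0. 1. 0.]
#  [1. 0. 0. 0. 1.]
#  [0. 0. 1. 1. 0.]]

The printed matrix matches the concatenated encoding table in the docstring, which supports the claim that the NaN placeholder always occupies index 0 of each one-hot group.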