19 changes: 6 additions & 13 deletions autoPyTorch/data/utils.py
@@ -18,7 +18,6 @@
 import numpy as np
 
 import pandas as pd
-from pandas.api.types import is_float_dtype, is_numeric_dtype
 
 from scipy.sparse import issparse, spmatrix
 
@@ -246,23 +245,17 @@ def reduce_precision(
                              f" {supported_precision_reductions}")
         reduced_dtypes = reduction_mapping[X.dtype]
         X = X.astype(reduced_dtypes)
 
     elif hasattr(X, 'iloc'):
         dtypes = dict(X.dtypes)
 
-        integer_columns = []
-        float_columns = []
+        col_names = X.dtypes.index
 
-        for col, dtype in dtypes.items():
-            if is_numeric_dtype(dtype):
-                if is_float_dtype(dtype):
-                    float_columns.append(col)
-                else:
-                    integer_columns.append(col)
+        float_cols = col_names[[dt.name.startswith("float") for dt in X.dtypes.values]]
+        int_cols = col_names[[dt.name.startswith("int") for dt in X.dtypes.values]]
+        X[int_cols] = X[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer'))
+        X[float_cols] = X[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float'))
 
-        if len(integer_columns) > 0:
-            X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer'))
-        if len(float_columns) > 0:
-            X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float'))
         reduced_dtypes = dict(X.dtypes)
     else:
         raise ValueError(f"Unrecognised data type of X, expected data type to "
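Aside (not part of the diff): a minimal standalone sketch of the downcasting approach the new code takes. The DataFrame and column names here are hypothetical; it assumes default int64/float64 columns, as pandas creates out of the box.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'ints': np.arange(10, dtype=np.int64),
    'floats': np.linspace(0.0, 1.0, 10),  # float64 by default
})

# Select columns by dtype-name prefix, mirroring the diff above
col_names = df.dtypes.index
float_cols = col_names[[dt.name.startswith("float") for dt in df.dtypes.values]]
int_cols = col_names[[dt.name.startswith("int") for dt in df.dtypes.values]]

# pd.to_numeric with downcast picks the smallest dtype that can hold the values
df[int_cols] = df[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer'))
df[float_cols] = df[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float'))

print(dict(df.dtypes))  # e.g. {'ints': dtype('int8'), 'floats': dtype('float32')}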
15 changes: 15 additions & 0 deletions test/test_data/test_utils.py
@@ -110,3 +110,18 @@ def test_get_dataset_compression_mapping(memory_limit):
         memory_limit=memory_limit
     )
     assert dataset_compression_mapping is None
+
+
+def test_unsupported_errors():
+    """
+    Checks that errors are raised when unsupported data is passed to reduce_dataset_size_if_too_large
+    """
+    X = np.array([
+        ['a', 'b', 'c', 'a', 'b', 'c'],
+        ['a', 'b', 'd', 'r', 'b', 'c']])
+    with pytest.raises(ValueError, match=r'X.dtype = .*'):
+        reduce_dataset_size_if_too_large(X, 0)
+
+    X = [[1, 2], [2, 3]]
+    with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'):
+        reduce_dataset_size_if_too_large(X, 0)
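For context (not part of the diff): the first test case fails because a string ndarray carries a fixed-width unicode dtype, which cannot appear in the supported precision reductions. A quick sketch, assuming numpy defaults:

import numpy as np

X = np.array([['a', 'b', 'c', 'a', 'b', 'c'],
              ['a', 'b', 'd', 'r', 'b', 'c']])
print(X.dtype)  # <U1: a unicode dtype, so reduce_precision has no mapping for it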