19 changes: 6 additions & 13 deletions autoPyTorch/data/utils.py
@@ -18,7 +18,6 @@
 import numpy as np
 
 import pandas as pd
-from pandas.api.types import is_float_dtype, is_numeric_dtype
 
 from scipy.sparse import issparse, spmatrix
 
@@ -246,23 +245,17 @@ def reduce_precision(
                              f" {supported_precision_reductions}")
         reduced_dtypes = reduction_mapping[X.dtype]
         X = X.astype(reduced_dtypes)
 
     elif hasattr(X, 'iloc'):
         dtypes = dict(X.dtypes)
 
-        integer_columns = []
-        float_columns = []
+        col_names = X.dtypes.index
 
-        for col, dtype in dtypes.items():
-            if is_numeric_dtype(dtype):
-                if is_float_dtype(dtype):
-                    float_columns.append(col)
-                else:
-                    integer_columns.append(col)
+        float_cols = col_names[[dt.name.startswith("float") for dt in X.dtypes.values]]
+        int_cols = col_names[[dt.name.startswith("int") for dt in X.dtypes.values]]
+        X[int_cols] = X[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer'))
+        X[float_cols] = X[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float'))
 
-        if len(integer_columns) > 0:
-            X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer'))
-        if len(float_columns) > 0:
-            X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float'))
         reduced_dtypes = dict(X.dtypes)
     else:
         raise ValueError(f"Unrecognised data type of X, expected data type to "
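Aside (not part of the diff): a minimal standalone sketch of the downcasting approach the new code takes. The DataFrame and column names here are hypothetical; it assumes default int64/float64 columns, as pandas creates out of the box.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'ints': np.arange(10, dtype=np.int64),
    'floats': np.linspace(0.0, 1.0, 10),  # float64 by default
})

# Select columns by dtype-name prefix, mirroring the diff above
col_names = df.dtypes.index
float_cols = col_names[[dt.name.startswith("float") for dt in df.dtypes.values]]
int_cols = col_names[[dt.name.startswith("int") for dt in df.dtypes.values]]

# pd.to_numeric with downcast picks the smallest dtype that can hold the values
df[int_cols] = df[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer'))
df[float_cols] = df[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float'))

print(dict(df.dtypes))  # e.g. {'ints': dtype('int8'), 'floats': dtype('float32')}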
15 changes: 15 additions & 0 deletions test/test_data/test_utils.py
@@ -110,3 +110,18 @@ def test_get_dataset_compression_mapping(memory_limit):
         memory_limit=memory_limit
     )
     assert dataset_compression_mapping is None
+
+
+def test_unsupported_errors():
+    """
+    Checks that errors are raised when unsupported data is passed to reduce_dataset_size_if_too_large
+    """
+    X = np.array([
+        ['a', 'b', 'c', 'a', 'b', 'c'],
+        ['a', 'b', 'd', 'r', 'b', 'c']])
+    with pytest.raises(ValueError, match=r'X.dtype = .*'):
+        reduce_dataset_size_if_too_large(X, 0)
+
+    X = [[1, 2], [2, 3]]
+    with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'):
+        reduce_dataset_size_if_too_large(X, 0)
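For context (not part of the diff): the first test case fails because a string ndarray carries a fixed-width unicode dtype, which cannot appear in the supported precision reductions. A quick sketch, assuming numpy defaults:

import numpy as np

X = np.array([['a', 'b', 'c', 'a', 'b', 'c'],
              ['a', 'b', 'd', 'r', 'b', 'c']])
print(X.dtype)  # <U1: a unicode dtype, so reduce_precision has no mapping for it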