From a85b11af96d310102f44a3ef83545975fd6be060 Mon Sep 17 00:00:00 2001 From: Suman Michael Date: Tue, 13 Jul 2021 14:53:05 +0530 Subject: [PATCH 01/10] added TabularClassificationData,TabularRegressionData extending TabularData --- flash/tabular/__init__.py | 3 +- flash/tabular/classification/__init__.py | 2 +- flash/tabular/classification/data.py | 395 +++-------------- flash/tabular/data.py | 515 +++++++++++++++++++++++ flash/tabular/regression/__init__.py | 1 + flash/tabular/regression/data.py | 216 ++++++++++ 6 files changed, 784 insertions(+), 348 deletions(-) create mode 100644 flash/tabular/data.py create mode 100644 flash/tabular/regression/__init__.py create mode 100644 flash/tabular/regression/data.py diff --git a/flash/tabular/__init__.py b/flash/tabular/__init__.py index a3b8e2ca2d..7938299366 100644 --- a/flash/tabular/__init__.py +++ b/flash/tabular/__init__.py @@ -1 +1,2 @@ -from flash.tabular.classification import TabularClassifier, TabularData # noqa: F401 +from flash.tabular.classification import TabularClassifier, TabularClassificationData # noqa: F401 +from flash.tabular.data import TabularData # noqa: F401 diff --git a/flash/tabular/classification/__init__.py b/flash/tabular/classification/__init__.py index 45724db27b..6134277abf 100644 --- a/flash/tabular/classification/__init__.py +++ b/flash/tabular/classification/__init__.py @@ -1,2 +1,2 @@ -from flash.tabular.classification.data import TabularData # noqa: F401 +from flash.tabular.classification.data import TabularClassificationData # noqa: F401 from flash.tabular.classification.model import TabularClassifier # noqa: F401 diff --git a/flash/tabular/classification/data.py b/flash/tabular/classification/data.py index c2a60e24da..69b20b6ea5 100644 --- a/flash/tabular/classification/data.py +++ b/flash/tabular/classification/data.py @@ -11,25 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from io import StringIO -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Optional, Union, List, Dict, Callable, Any -import numpy as np -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from flash.core.data.data_module import DataModule + +from flash.core.data.process import Preprocess -from flash.core.classification import LabelsState from flash.core.data.callback import BaseDataFetcher -from flash.core.data.data_module import DataModule -from flash.core.data.data_source import DataSource, DefaultDataKeys, DefaultDataSources -from flash.core.data.process import Deserializer, Postprocess, Preprocess + from flash.core.utilities.imports import _PANDAS_AVAILABLE -from flash.tabular.classification.utils import ( - _compute_normalization, - _generate_codes, - _pre_transform, - _to_cat_vars_numpy, - _to_num_vars_numpy, -) if _PANDAS_AVAILABLE: import pandas as pd @@ -37,316 +27,33 @@ else: DataFrame = object +from flash.tabular.data import TabularData -class TabularDataFrameDataSource(DataSource[DataFrame]): - - def __init__( - self, - cat_cols: Optional[List[str]] = None, - num_cols: Optional[List[str]] = None, - target_col: Optional[str] = None, - mean: Optional[DataFrame] = None, - std: Optional[DataFrame] = None, - codes: Optional[Dict[str, Any]] = None, - target_codes: Optional[Dict[str, Any]] = None, - classes: Optional[List[str]] = None, - is_regression: bool = True, - ): - super().__init__() - - self.cat_cols = cat_cols - self.num_cols = num_cols - self.target_col = target_col - self.mean = mean - self.std = std - self.codes = codes - self.target_codes = target_codes - self.is_regression = is_regression - - self.set_state(LabelsState(classes)) - self.num_classes = len(classes) - - def common_load_data( - self, - df: DataFrame, - dataset: Optional[Any] = None, - ): - # impute_data - # compute train dataset stats - dfs = _pre_transform([df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col, - self.target_codes) - - df = dfs[0] - - if dataset is not None: - dataset.num_samples = len(df) - - cat_vars = _to_cat_vars_numpy(df, self.cat_cols) - num_vars = _to_num_vars_numpy(df, self.num_cols) - - cat_vars = np.stack(cat_vars, 1) # if len(cat_vars) else np.zeros((len(self), 0)) - num_vars = np.stack(num_vars, 1) # if len(num_vars) else np.zeros((len(self), 0)) - return df, cat_vars, num_vars - - def load_data(self, data: DataFrame, dataset: Optional[Any] = None): - df, cat_vars, num_vars = self.common_load_data(data, dataset=dataset) - target = df[self.target_col].to_numpy().astype(np.float32 if self.is_regression else np.int64) - return [{ - DefaultDataKeys.INPUT: (c, n), - DefaultDataKeys.TARGET: t - } for c, n, t in zip(cat_vars, num_vars, target)] - - def predict_load_data(self, data: DataFrame, dataset: Optional[Any] = None): - _, cat_vars, num_vars = self.common_load_data(data, dataset=dataset) - return [{DefaultDataKeys.INPUT: (c, n)} for c, n in zip(cat_vars, num_vars)] - - -class TabularCSVDataSource(TabularDataFrameDataSource): - - def load_data(self, data: str, dataset: Optional[Any] = None): - return super().load_data(pd.read_csv(data), dataset=dataset) - - def predict_load_data(self, data: str, dataset: Optional[Any] = None): - return super().predict_load_data(pd.read_csv(data), dataset=dataset) - - -class TabularDeserializer(Deserializer): - - def __init__( - self, - cat_cols: Optional[List[str]] = None, - num_cols: Optional[List[str]] = None, - target_col: Optional[str] = None, - 
mean: Optional[DataFrame] = None, - std: Optional[DataFrame] = None, - codes: Optional[Dict[str, Any]] = None, - target_codes: Optional[Dict[str, Any]] = None, - classes: Optional[List[str]] = None, - is_regression: bool = True - ): - super().__init__() - self.cat_cols = cat_cols - self.num_cols = num_cols - self.target_col = target_col - self.mean = mean - self.std = std - self.codes = codes - self.target_codes = target_codes - self.classes = classes - self.is_regression = is_regression - - def deserialize(self, data: str) -> Any: - df = pd.read_csv(StringIO(data)) - df = _pre_transform([df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col, - self.target_codes)[0] - - cat_vars = _to_cat_vars_numpy(df, self.cat_cols) - num_vars = _to_num_vars_numpy(df, self.num_cols) - - cat_vars = np.stack(cat_vars, 1) - num_vars = np.stack(num_vars, 1) - - return [{DefaultDataKeys.INPUT: [c, n]} for c, n in zip(cat_vars, num_vars)] - - @property - def example_input(self) -> str: - row = {} - for cat_col in self.cat_cols: - row[cat_col] = ["test"] - for num_col in self.num_cols: - row[num_col] = [0] - return str(DataFrame.from_dict(row).to_csv()) - - -class TabularPreprocess(Preprocess): - - def __init__( - self, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - cat_cols: Optional[List[str]] = None, - num_cols: Optional[List[str]] = None, - target_col: Optional[str] = None, - mean: Optional[DataFrame] = None, - std: Optional[DataFrame] = None, - codes: Optional[Dict[str, Any]] = None, - target_codes: Optional[Dict[str, Any]] = None, - classes: Optional[List[str]] = None, - is_regression: bool = True, - deserializer: Optional[Deserializer] = None - ): - self.cat_cols = cat_cols - self.num_cols = num_cols - self.target_col = target_col - self.mean = mean - self.std = std - self.codes = codes - self.target_codes = target_codes - self.classes = classes - self.is_regression = is_regression - - super().__init__( - train_transform=train_transform, - val_transform=val_transform, - test_transform=test_transform, - predict_transform=predict_transform, - data_sources={ - DefaultDataSources.CSV: TabularCSVDataSource( - cat_cols, num_cols, target_col, mean, std, codes, target_codes, classes, is_regression - ), - "data_frame": TabularDataFrameDataSource( - cat_cols, num_cols, target_col, mean, std, codes, target_codes, classes, is_regression - ), - }, - default_data_source=DefaultDataSources.CSV, - deserializer=deserializer or TabularDeserializer( - cat_cols=cat_cols, - num_cols=num_cols, - target_col=target_col, - mean=mean, - std=std, - codes=codes, - target_codes=target_codes, - classes=classes, - is_regression=is_regression - ) - ) - - def get_state_dict(self, strict: bool = False) -> Dict[str, Any]: - return { - **self.transforms, - "cat_cols": self.cat_cols, - "num_cols": self.num_cols, - "target_col": self.target_col, - "mean": self.mean, - "std": self.std, - "codes": self.codes, - "target_codes": self.target_codes, - "classes": self.classes, - "is_regression": self.is_regression, - } - - @classmethod - def load_state_dict(cls, state_dict: Dict[str, Any], strict: bool = True) -> 'Preprocess': - return cls(**state_dict) - - -class TabularPostprocess(Postprocess): - - def uncollate(self, batch: Any) -> Any: - return batch - - -class TabularData(DataModule): - """Data module for tabular tasks""" - - 
preprocess_cls = TabularPreprocess - postprocess_cls = TabularPostprocess - - @property - def codes(self) -> Dict[str, str]: - return self._data_source.codes - - @property - def num_classes(self) -> int: - return self._data_source.num_classes - - @property - def cat_cols(self) -> Optional[List[str]]: - return self._data_source.cat_cols - - @property - def num_cols(self) -> Optional[List[str]]: - return self._data_source.num_cols - - @property - def num_features(self) -> int: - return len(self.cat_cols) + len(self.num_cols) - - @property - def emb_sizes(self) -> list: - """Recommended embedding sizes.""" - - # https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html - # The following "formula" provides a general rule of thumb about the number of embedding dimensions: - # embedding_dimensions = number_of_categories**0.25 - num_classes = [len(self.codes[cat]) for cat in self.cat_cols] - emb_dims = [max(int(n**0.25), 16) for n in num_classes] - return list(zip(num_classes, emb_dims)) - - @staticmethod - def _sanetize_cols(cat_cols: Optional[Union[str, List[str]]], num_cols: Optional[Union[str, List[str]]]): - if cat_cols is None and num_cols is None: - raise RuntimeError('Both `cat_cols` and `num_cols` are None!') - - return cat_cols or [], num_cols or [] - - @classmethod - def compute_state( - cls, - train_data_frame: DataFrame, - val_data_frame: Optional[DataFrame], - test_data_frame: Optional[DataFrame], - predict_data_frame: Optional[DataFrame], - target_fields: str, - numerical_fields: List[str], - categorical_fields: List[str], - ) -> Tuple[float, float, List[str], Dict[str, Any], Dict[str, Any]]: - - if train_data_frame is None: - raise MisconfigurationException( - "train_data_frame is required to instantiate the TabularDataFrameDataSource" - ) - - data_frames = [train_data_frame] - - if val_data_frame is not None: - data_frames += [val_data_frame] - - if test_data_frame is not None: - data_frames += [test_data_frame] - - if predict_data_frame is not None: - data_frames += [predict_data_frame] - - mean, std = _compute_normalization(data_frames[0], numerical_fields) - - classes = list(data_frames[0][target_fields].unique()) - - if data_frames[0][target_fields].dtype == object: - # if the target_fields is a category, not an int - target_codes = _generate_codes(data_frames, [target_fields]) - else: - target_codes = None - codes = _generate_codes(data_frames, categorical_fields) - - return mean, std, classes, codes, target_codes +class TabularClassificationData(TabularData): @classmethod def from_data_frame( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_data_frame: Optional[DataFrame] = None, - val_data_frame: Optional[DataFrame] = None, - test_data_frame: Optional[DataFrame] = None, - predict_data_frame: Optional[DataFrame] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - is_regression: bool = False, - **preprocess_kwargs: Any, + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = 
None, + train_data_frame: Optional[DataFrame] = None, + val_data_frame: Optional[DataFrame] = None, + test_data_frame: Optional[DataFrame] = None, + predict_data_frame: Optional[DataFrame] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, ): - """Creates a :class:`~flash.tabular.data.TabularData` object from the given data frames. + """Creates a :class:`~flash.tabular.classification.data.TabularClassificationData` object from the given data + frames. Args: categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. @@ -372,8 +79,6 @@ def from_data_frame( val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - is_regression: If ``True``, targets will be formatted as floating point. If ``False``, targets will be - formatted as integers. preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used if ``preprocess = None``. @@ -430,33 +135,33 @@ def from_data_frame( codes=codes, target_codes=target_codes, classes=classes, - is_regression=is_regression, + is_regression=False, **preprocess_kwargs, ) @classmethod def from_csv( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_file: Optional[str] = None, - val_file: Optional[str] = None, - test_file: Optional[str] = None, - predict_file: Optional[str] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - is_regression: bool = False, - **preprocess_kwargs: Any, + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_file: Optional[str] = None, + val_file: Optional[str] = None, + test_file: Optional[str] = None, + predict_file: Optional[str] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, ) -> 'DataModule': - """Creates a :class:`~flash.tabular.data.TabularData` object from the given CSV files. + """Creates a :class:`~flash.tabular.classification.data.TabularClassificationData` object from the given CSV + files. 
Args: categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. @@ -482,8 +187,6 @@ def from_csv( val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - is_regression: If ``True``, targets will be formatted as floating point. If ``False``, targets will be - formatted as integers. preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used if ``preprocess = None``. @@ -507,7 +210,7 @@ def from_csv( val_data_frame=pd.read_csv(val_file) if val_file is not None else None, test_data_frame=pd.read_csv(test_file) if test_file is not None else None, predict_data_frame=pd.read_csv(predict_file) if predict_file is not None else None, - is_regression=is_regression, + is_regression=False, preprocess=preprocess, val_split=val_split, batch_size=batch_size, diff --git a/flash/tabular/data.py b/flash/tabular/data.py new file mode 100644 index 0000000000..c2a60e24da --- /dev/null +++ b/flash/tabular/data.py @@ -0,0 +1,515 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from io import StringIO +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +from flash.core.classification import LabelsState +from flash.core.data.callback import BaseDataFetcher +from flash.core.data.data_module import DataModule +from flash.core.data.data_source import DataSource, DefaultDataKeys, DefaultDataSources +from flash.core.data.process import Deserializer, Postprocess, Preprocess +from flash.core.utilities.imports import _PANDAS_AVAILABLE +from flash.tabular.classification.utils import ( + _compute_normalization, + _generate_codes, + _pre_transform, + _to_cat_vars_numpy, + _to_num_vars_numpy, +) + +if _PANDAS_AVAILABLE: + import pandas as pd + from pandas.core.frame import DataFrame +else: + DataFrame = object + + +class TabularDataFrameDataSource(DataSource[DataFrame]): + + def __init__( + self, + cat_cols: Optional[List[str]] = None, + num_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + mean: Optional[DataFrame] = None, + std: Optional[DataFrame] = None, + codes: Optional[Dict[str, Any]] = None, + target_codes: Optional[Dict[str, Any]] = None, + classes: Optional[List[str]] = None, + is_regression: bool = True, + ): + super().__init__() + + self.cat_cols = cat_cols + self.num_cols = num_cols + self.target_col = target_col + self.mean = mean + self.std = std + self.codes = codes + self.target_codes = target_codes + self.is_regression = is_regression + + self.set_state(LabelsState(classes)) + self.num_classes = len(classes) + + def common_load_data( + self, + df: DataFrame, + dataset: Optional[Any] = None, + ): + # impute_data + # compute train dataset stats + dfs = _pre_transform([df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col, + self.target_codes) + + df = dfs[0] + + if dataset is not None: + dataset.num_samples = len(df) + + cat_vars = _to_cat_vars_numpy(df, self.cat_cols) + num_vars = _to_num_vars_numpy(df, self.num_cols) + + cat_vars = np.stack(cat_vars, 1) # if len(cat_vars) else np.zeros((len(self), 0)) + num_vars = np.stack(num_vars, 1) # if len(num_vars) else np.zeros((len(self), 0)) + return df, cat_vars, num_vars + + def load_data(self, data: DataFrame, dataset: Optional[Any] = None): + df, cat_vars, num_vars = self.common_load_data(data, dataset=dataset) + target = df[self.target_col].to_numpy().astype(np.float32 if self.is_regression else np.int64) + return [{ + DefaultDataKeys.INPUT: (c, n), + DefaultDataKeys.TARGET: t + } for c, n, t in zip(cat_vars, num_vars, target)] + + def predict_load_data(self, data: DataFrame, dataset: Optional[Any] = None): + _, cat_vars, num_vars = self.common_load_data(data, dataset=dataset) + return [{DefaultDataKeys.INPUT: (c, n)} for c, n in zip(cat_vars, num_vars)] + + +class TabularCSVDataSource(TabularDataFrameDataSource): + + def load_data(self, data: str, dataset: Optional[Any] = None): + return super().load_data(pd.read_csv(data), dataset=dataset) + + def predict_load_data(self, data: str, dataset: Optional[Any] = None): + return super().predict_load_data(pd.read_csv(data), dataset=dataset) + + +class TabularDeserializer(Deserializer): + + def __init__( + self, + cat_cols: Optional[List[str]] = None, + num_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + mean: Optional[DataFrame] = None, + std: Optional[DataFrame] = None, + codes: Optional[Dict[str, Any]] = None, + target_codes: Optional[Dict[str, Any]] = None, + classes: 
Optional[List[str]] = None, + is_regression: bool = True + ): + super().__init__() + self.cat_cols = cat_cols + self.num_cols = num_cols + self.target_col = target_col + self.mean = mean + self.std = std + self.codes = codes + self.target_codes = target_codes + self.classes = classes + self.is_regression = is_regression + + def deserialize(self, data: str) -> Any: + df = pd.read_csv(StringIO(data)) + df = _pre_transform([df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col, + self.target_codes)[0] + + cat_vars = _to_cat_vars_numpy(df, self.cat_cols) + num_vars = _to_num_vars_numpy(df, self.num_cols) + + cat_vars = np.stack(cat_vars, 1) + num_vars = np.stack(num_vars, 1) + + return [{DefaultDataKeys.INPUT: [c, n]} for c, n in zip(cat_vars, num_vars)] + + @property + def example_input(self) -> str: + row = {} + for cat_col in self.cat_cols: + row[cat_col] = ["test"] + for num_col in self.num_cols: + row[num_col] = [0] + return str(DataFrame.from_dict(row).to_csv()) + + +class TabularPreprocess(Preprocess): + + def __init__( + self, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + cat_cols: Optional[List[str]] = None, + num_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + mean: Optional[DataFrame] = None, + std: Optional[DataFrame] = None, + codes: Optional[Dict[str, Any]] = None, + target_codes: Optional[Dict[str, Any]] = None, + classes: Optional[List[str]] = None, + is_regression: bool = True, + deserializer: Optional[Deserializer] = None + ): + self.cat_cols = cat_cols + self.num_cols = num_cols + self.target_col = target_col + self.mean = mean + self.std = std + self.codes = codes + self.target_codes = target_codes + self.classes = classes + self.is_regression = is_regression + + super().__init__( + train_transform=train_transform, + val_transform=val_transform, + test_transform=test_transform, + predict_transform=predict_transform, + data_sources={ + DefaultDataSources.CSV: TabularCSVDataSource( + cat_cols, num_cols, target_col, mean, std, codes, target_codes, classes, is_regression + ), + "data_frame": TabularDataFrameDataSource( + cat_cols, num_cols, target_col, mean, std, codes, target_codes, classes, is_regression + ), + }, + default_data_source=DefaultDataSources.CSV, + deserializer=deserializer or TabularDeserializer( + cat_cols=cat_cols, + num_cols=num_cols, + target_col=target_col, + mean=mean, + std=std, + codes=codes, + target_codes=target_codes, + classes=classes, + is_regression=is_regression + ) + ) + + def get_state_dict(self, strict: bool = False) -> Dict[str, Any]: + return { + **self.transforms, + "cat_cols": self.cat_cols, + "num_cols": self.num_cols, + "target_col": self.target_col, + "mean": self.mean, + "std": self.std, + "codes": self.codes, + "target_codes": self.target_codes, + "classes": self.classes, + "is_regression": self.is_regression, + } + + @classmethod + def load_state_dict(cls, state_dict: Dict[str, Any], strict: bool = True) -> 'Preprocess': + return cls(**state_dict) + + +class TabularPostprocess(Postprocess): + + def uncollate(self, batch: Any) -> Any: + return batch + + +class TabularData(DataModule): + """Data module for tabular tasks""" + + preprocess_cls = TabularPreprocess + postprocess_cls = TabularPostprocess + + @property + def codes(self) -> Dict[str, str]: + return self._data_source.codes + + @property + def 
num_classes(self) -> int: + return self._data_source.num_classes + + @property + def cat_cols(self) -> Optional[List[str]]: + return self._data_source.cat_cols + + @property + def num_cols(self) -> Optional[List[str]]: + return self._data_source.num_cols + + @property + def num_features(self) -> int: + return len(self.cat_cols) + len(self.num_cols) + + @property + def emb_sizes(self) -> list: + """Recommended embedding sizes.""" + + # https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html + # The following "formula" provides a general rule of thumb about the number of embedding dimensions: + # embedding_dimensions = number_of_categories**0.25 + num_classes = [len(self.codes[cat]) for cat in self.cat_cols] + emb_dims = [max(int(n**0.25), 16) for n in num_classes] + return list(zip(num_classes, emb_dims)) + + @staticmethod + def _sanetize_cols(cat_cols: Optional[Union[str, List[str]]], num_cols: Optional[Union[str, List[str]]]): + if cat_cols is None and num_cols is None: + raise RuntimeError('Both `cat_cols` and `num_cols` are None!') + + return cat_cols or [], num_cols or [] + + @classmethod + def compute_state( + cls, + train_data_frame: DataFrame, + val_data_frame: Optional[DataFrame], + test_data_frame: Optional[DataFrame], + predict_data_frame: Optional[DataFrame], + target_fields: str, + numerical_fields: List[str], + categorical_fields: List[str], + ) -> Tuple[float, float, List[str], Dict[str, Any], Dict[str, Any]]: + + if train_data_frame is None: + raise MisconfigurationException( + "train_data_frame is required to instantiate the TabularDataFrameDataSource" + ) + + data_frames = [train_data_frame] + + if val_data_frame is not None: + data_frames += [val_data_frame] + + if test_data_frame is not None: + data_frames += [test_data_frame] + + if predict_data_frame is not None: + data_frames += [predict_data_frame] + + mean, std = _compute_normalization(data_frames[0], numerical_fields) + + classes = list(data_frames[0][target_fields].unique()) + + if data_frames[0][target_fields].dtype == object: + # if the target_fields is a category, not an int + target_codes = _generate_codes(data_frames, [target_fields]) + else: + target_codes = None + codes = _generate_codes(data_frames, categorical_fields) + + return mean, std, classes, codes, target_codes + + @classmethod + def from_data_frame( + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_data_frame: Optional[DataFrame] = None, + val_data_frame: Optional[DataFrame] = None, + test_data_frame: Optional[DataFrame] = None, + predict_data_frame: Optional[DataFrame] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + is_regression: bool = False, + **preprocess_kwargs: Any, + ): + """Creates a :class:`~flash.tabular.data.TabularData` object from the given data frames. + + Args: + categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. + numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. 
+ target_fields: The field or fields (columns) in the CSV file to use for the target. + train_data_frame: The pandas ``DataFrame`` containing the training data. + val_data_frame: The pandas ``DataFrame`` containing the validation data. + test_data_frame: The pandas ``DataFrame`` containing the testing data. + predict_data_frame: The pandas ``DataFrame`` containing the data to use when predicting. + train_transform: The dictionary of transforms to use during training which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + val_transform: The dictionary of transforms to use during validation which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + test_transform: The dictionary of transforms to use during testing which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + predict_transform: The dictionary of transforms to use during predicting which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the + :class:`~flash.core.data.data_module.DataModule`. + preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the + :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` + will be constructed and used. + val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + is_regression: If ``True``, targets will be formatted as floating point. If ``False``, targets will be + formatted as integers. + preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used + if ``preprocess = None``. + + Returns: + The constructed data module. 
+ + Examples:: + + data_module = TabularData.from_data_frame( + "categorical_input", + "numerical_input", + "target", + train_data_frame=train_data, + ) + """ + categorical_fields, numerical_fields = cls._sanetize_cols(categorical_fields, numerical_fields) + + if not isinstance(categorical_fields, list): + categorical_fields = [categorical_fields] + + if not isinstance(numerical_fields, list): + numerical_fields = [numerical_fields] + + mean, std, classes, codes, target_codes = cls.compute_state( + train_data_frame=train_data_frame, + val_data_frame=val_data_frame, + test_data_frame=test_data_frame, + predict_data_frame=predict_data_frame, + target_fields=target_fields, + numerical_fields=numerical_fields, + categorical_fields=categorical_fields, + ) + + return cls.from_data_source( + "data_frame", + train_data_frame, + val_data_frame, + test_data_frame, + predict_data_frame, + train_transform=train_transform, + val_transform=val_transform, + test_transform=test_transform, + predict_transform=predict_transform, + data_fetcher=data_fetcher, + preprocess=preprocess, + val_split=val_split, + batch_size=batch_size, + num_workers=num_workers, + cat_cols=categorical_fields, + num_cols=numerical_fields, + target_col=target_fields, + mean=mean, + std=std, + codes=codes, + target_codes=target_codes, + classes=classes, + is_regression=is_regression, + **preprocess_kwargs, + ) + + @classmethod + def from_csv( + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_file: Optional[str] = None, + val_file: Optional[str] = None, + test_file: Optional[str] = None, + predict_file: Optional[str] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + is_regression: bool = False, + **preprocess_kwargs: Any, + ) -> 'DataModule': + """Creates a :class:`~flash.tabular.data.TabularData` object from the given CSV files. + + Args: + categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. + numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. + target_fields: The field or fields (columns) in the CSV file to use for the target. + train_file: The CSV file containing the training data. + val_file: The CSV file containing the validation data. + test_file: The CSV file containing the testing data. + predict_file: The CSV file containing the data to use when predicting. + train_transform: The dictionary of transforms to use during training which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + val_transform: The dictionary of transforms to use during validation which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + test_transform: The dictionary of transforms to use during testing which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + predict_transform: The dictionary of transforms to use during predicting which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. 
+ data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the + :class:`~flash.core.data.data_module.DataModule`. + preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the + :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` + will be constructed and used. + val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + is_regression: If ``True``, targets will be formatted as floating point. If ``False``, targets will be + formatted as integers. + preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used + if ``preprocess = None``. + + Returns: + The constructed data module. + + Examples:: + + data_module = TabularData.from_csv( + "categorical_input", + "numerical_input", + "target", + train_file="train_data.csv", + ) + """ + return cls.from_data_frame( + categorical_fields=categorical_fields, + numerical_fields=numerical_fields, + target_fields=target_fields, + train_data_frame=pd.read_csv(train_file) if train_file is not None else None, + val_data_frame=pd.read_csv(val_file) if val_file is not None else None, + test_data_frame=pd.read_csv(test_file) if test_file is not None else None, + predict_data_frame=pd.read_csv(predict_file) if predict_file is not None else None, + is_regression=is_regression, + preprocess=preprocess, + val_split=val_split, + batch_size=batch_size, + num_workers=num_workers, + ) diff --git a/flash/tabular/regression/__init__.py b/flash/tabular/regression/__init__.py new file mode 100644 index 0000000000..a93e599ff0 --- /dev/null +++ b/flash/tabular/regression/__init__.py @@ -0,0 +1 @@ +from flash.tabular.regression.data import TabularRegressionData # noqa: F401 diff --git a/flash/tabular/regression/data.py b/flash/tabular/regression/data.py new file mode 100644 index 0000000000..a9d15fe5d3 --- /dev/null +++ b/flash/tabular/regression/data.py @@ -0,0 +1,216 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional, Union, List, Dict, Callable, Any + +from flash.core.data.data_module import DataModule + +from flash.core.data.process import Preprocess + +from flash.core.data.callback import BaseDataFetcher + +from flash.core.utilities.imports import _PANDAS_AVAILABLE + +if _PANDAS_AVAILABLE: + import pandas as pd + from pandas.core.frame import DataFrame +else: + DataFrame = object + +from flash.tabular.data import TabularData + + +class TabularRegressionData(TabularData): + @classmethod + def from_data_frame( + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_data_frame: Optional[DataFrame] = None, + val_data_frame: Optional[DataFrame] = None, + test_data_frame: Optional[DataFrame] = None, + predict_data_frame: Optional[DataFrame] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, + ): + """Creates a :class:`~flash.tabular.regression.data.TabularRegressionData` object from the given data frames. + + Args: + categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. + numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. + target_fields: The field or fields (columns) in the CSV file to use for the target. + train_data_frame: The pandas ``DataFrame`` containing the training data. + val_data_frame: The pandas ``DataFrame`` containing the validation data. + test_data_frame: The pandas ``DataFrame`` containing the testing data. + predict_data_frame: The pandas ``DataFrame`` containing the data to use when predicting. + train_transform: The dictionary of transforms to use during training which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + val_transform: The dictionary of transforms to use during validation which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + test_transform: The dictionary of transforms to use during testing which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + predict_transform: The dictionary of transforms to use during predicting which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the + :class:`~flash.core.data.data_module.DataModule`. + preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the + :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` + will be constructed and used. + val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used + if ``preprocess = None``. + + Returns: + The constructed data module. 
+ + Examples:: + + data_module = TabularData.from_data_frame( + "categorical_input", + "numerical_input", + "target", + train_data_frame=train_data, + ) + """ + categorical_fields, numerical_fields = cls._sanetize_cols(categorical_fields, numerical_fields) + + if not isinstance(categorical_fields, list): + categorical_fields = [categorical_fields] + + if not isinstance(numerical_fields, list): + numerical_fields = [numerical_fields] + + mean, std, classes, codes, target_codes = cls.compute_state( + train_data_frame=train_data_frame, + val_data_frame=val_data_frame, + test_data_frame=test_data_frame, + predict_data_frame=predict_data_frame, + target_fields=target_fields, + numerical_fields=numerical_fields, + categorical_fields=categorical_fields, + ) + + return cls.from_data_source( + "data_frame", + train_data_frame, + val_data_frame, + test_data_frame, + predict_data_frame, + train_transform=train_transform, + val_transform=val_transform, + test_transform=test_transform, + predict_transform=predict_transform, + data_fetcher=data_fetcher, + preprocess=preprocess, + val_split=val_split, + batch_size=batch_size, + num_workers=num_workers, + cat_cols=categorical_fields, + num_cols=numerical_fields, + target_col=target_fields, + mean=mean, + std=std, + codes=codes, + target_codes=target_codes, + classes=classes, + is_regression=True, + **preprocess_kwargs, + ) + + @classmethod + def from_csv( + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_file: Optional[str] = None, + val_file: Optional[str] = None, + test_file: Optional[str] = None, + predict_file: Optional[str] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, + ) -> 'DataModule': + """Creates a :class:`~flash.tabular.regression.data.TabularRegressionData` object from the given CSV files. + + Args: + categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. + numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. + target_fields: The field or fields (columns) in the CSV file to use for the target. + train_file: The CSV file containing the training data. + val_file: The CSV file containing the validation data. + test_file: The CSV file containing the testing data. + predict_file: The CSV file containing the data to use when predicting. + train_transform: The dictionary of transforms to use during training which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + val_transform: The dictionary of transforms to use during validation which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + test_transform: The dictionary of transforms to use during testing which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. + predict_transform: The dictionary of transforms to use during predicting which maps + :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. 
+ data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the + :class:`~flash.core.data.data_module.DataModule`. + preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the + :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` + will be constructed and used. + val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. + preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used + if ``preprocess = None``. + + Returns: + The constructed data module. + + Examples:: + + data_module = TabularData.from_csv( + "categorical_input", + "numerical_input", + "target", + train_file="train_data.csv", + ) + """ + return cls.from_data_frame( + categorical_fields=categorical_fields, + numerical_fields=numerical_fields, + target_fields=target_fields, + train_data_frame=pd.read_csv(train_file) if train_file is not None else None, + val_data_frame=pd.read_csv(val_file) if val_file is not None else None, + test_data_frame=pd.read_csv(test_file) if test_file is not None else None, + predict_data_frame=pd.read_csv(predict_file) if predict_file is not None else None, + is_regression=True, + preprocess=preprocess, + val_split=val_split, + batch_size=batch_size, + num_workers=num_workers, + ) From 4010015681ea7992d64515ddee3e4766bfcbccde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Jul 2021 09:36:21 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/tabular/__init__.py | 2 +- flash/tabular/classification/data.py | 80 ++++++++++++++-------------- flash/tabular/regression/data.py | 80 ++++++++++++++-------------- 3 files changed, 79 insertions(+), 83 deletions(-) diff --git a/flash/tabular/__init__.py b/flash/tabular/__init__.py index 7938299366..d4deb4c851 100644 --- a/flash/tabular/__init__.py +++ b/flash/tabular/__init__.py @@ -1,2 +1,2 @@ -from flash.tabular.classification import TabularClassifier, TabularClassificationData # noqa: F401 +from flash.tabular.classification import TabularClassificationData, TabularClassifier # noqa: F401 from flash.tabular.data import TabularData # noqa: F401 diff --git a/flash/tabular/classification/data.py b/flash/tabular/classification/data.py index 69b20b6ea5..b04b7a67b1 100644 --- a/flash/tabular/classification/data.py +++ b/flash/tabular/classification/data.py @@ -11,14 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Union, List, Dict, Callable, Any +from typing import Any, Callable, Dict, List, Optional, Union +from flash.core.data.callback import BaseDataFetcher from flash.core.data.data_module import DataModule - from flash.core.data.process import Preprocess - -from flash.core.data.callback import BaseDataFetcher - from flash.core.utilities.imports import _PANDAS_AVAILABLE if _PANDAS_AVAILABLE: @@ -31,26 +28,27 @@ class TabularClassificationData(TabularData): + @classmethod def from_data_frame( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_data_frame: Optional[DataFrame] = None, - val_data_frame: Optional[DataFrame] = None, - test_data_frame: Optional[DataFrame] = None, - predict_data_frame: Optional[DataFrame] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_data_frame: Optional[DataFrame] = None, + val_data_frame: Optional[DataFrame] = None, + test_data_frame: Optional[DataFrame] = None, + predict_data_frame: Optional[DataFrame] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, ): """Creates a :class:`~flash.tabular.classification.data.TabularClassificationData` object from the given data frames. 
@@ -141,24 +139,24 @@ def from_data_frame( @classmethod def from_csv( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_file: Optional[str] = None, - val_file: Optional[str] = None, - test_file: Optional[str] = None, - predict_file: Optional[str] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_file: Optional[str] = None, + val_file: Optional[str] = None, + test_file: Optional[str] = None, + predict_file: Optional[str] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, ) -> 'DataModule': """Creates a :class:`~flash.tabular.classification.data.TabularClassificationData` object from the given CSV files. diff --git a/flash/tabular/regression/data.py b/flash/tabular/regression/data.py index a9d15fe5d3..c7a7a47e42 100644 --- a/flash/tabular/regression/data.py +++ b/flash/tabular/regression/data.py @@ -11,14 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Union, List, Dict, Callable, Any +from typing import Any, Callable, Dict, List, Optional, Union +from flash.core.data.callback import BaseDataFetcher from flash.core.data.data_module import DataModule - from flash.core.data.process import Preprocess - -from flash.core.data.callback import BaseDataFetcher - from flash.core.utilities.imports import _PANDAS_AVAILABLE if _PANDAS_AVAILABLE: @@ -31,26 +28,27 @@ class TabularRegressionData(TabularData): + @classmethod def from_data_frame( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_data_frame: Optional[DataFrame] = None, - val_data_frame: Optional[DataFrame] = None, - test_data_frame: Optional[DataFrame] = None, - predict_data_frame: Optional[DataFrame] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_data_frame: Optional[DataFrame] = None, + val_data_frame: Optional[DataFrame] = None, + test_data_frame: Optional[DataFrame] = None, + predict_data_frame: Optional[DataFrame] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, ): """Creates a :class:`~flash.tabular.regression.data.TabularRegressionData` object from the given data frames. 
@@ -140,24 +138,24 @@ def from_data_frame( @classmethod def from_csv( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_file: Optional[str] = None, - val_file: Optional[str] = None, - test_file: Optional[str] = None, - predict_file: Optional[str] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, + cls, + categorical_fields: Optional[Union[str, List[str]]], + numerical_fields: Optional[Union[str, List[str]]], + target_fields: Optional[str] = None, + train_file: Optional[str] = None, + val_file: Optional[str] = None, + test_file: Optional[str] = None, + predict_file: Optional[str] = None, + train_transform: Optional[Dict[str, Callable]] = None, + val_transform: Optional[Dict[str, Callable]] = None, + test_transform: Optional[Dict[str, Callable]] = None, + predict_transform: Optional[Dict[str, Callable]] = None, + data_fetcher: Optional[BaseDataFetcher] = None, + preprocess: Optional[Preprocess] = None, + val_split: Optional[float] = None, + batch_size: int = 4, + num_workers: Optional[int] = None, + **preprocess_kwargs: Any, ) -> 'DataModule': """Creates a :class:`~flash.tabular.regression.data.TabularRegressionData` object from the given CSV files. From a0e8de0ceb505af785d9dc79223c6d36b3da6b2d Mon Sep 17 00:00:00 2001 From: Suman Michael Date: Tue, 13 Jul 2021 16:23:26 +0530 Subject: [PATCH 03/10] Update flash/tabular/regression/data.py Co-authored-by: thomas chaton --- flash/tabular/regression/data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flash/tabular/regression/data.py b/flash/tabular/regression/data.py index c7a7a47e42..98026a6619 100644 --- a/flash/tabular/regression/data.py +++ b/flash/tabular/regression/data.py @@ -29,7 +29,10 @@ class TabularRegressionData(TabularData): + is_regression: True + @classmethod + def from_data_frame( cls, categorical_fields: Optional[Union[str, List[str]]], From ca663a3a3ef97c442feb269df91386eae3d54724 Mon Sep 17 00:00:00 2001 From: Suman Michael Date: Tue, 13 Jul 2021 16:23:31 +0530 Subject: [PATCH 04/10] Update flash/tabular/classification/data.py Co-authored-by: thomas chaton --- flash/tabular/classification/data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flash/tabular/classification/data.py b/flash/tabular/classification/data.py index b04b7a67b1..8e5cdf4fc8 100644 --- a/flash/tabular/classification/data.py +++ b/flash/tabular/classification/data.py @@ -29,6 +29,8 @@ class TabularClassificationData(TabularData): + is_regression = False + @classmethod def from_data_frame( cls, From fdde1c22925aebb39b61b8b39c104d08d38373d0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Jul 2021 10:54:08 +0000 Subject: [PATCH 05/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/tabular/regression/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flash/tabular/regression/data.py b/flash/tabular/regression/data.py index 98026a6619..6398623b34 100644 --- a/flash/tabular/regression/data.py +++ 
b/flash/tabular/regression/data.py @@ -32,7 +32,6 @@ class TabularRegressionData(TabularData): is_regression: True @classmethod - def from_data_frame( cls, categorical_fields: Optional[Union[str, List[str]]], From 894f4e33ec5d1e616907c589d3d3b7d33da13d73 Mon Sep 17 00:00:00 2001 From: Suman Michael Date: Tue, 13 Jul 2021 16:33:52 +0530 Subject: [PATCH 06/10] added TabularClassificationData,TabularRegressionData extending TabularData --- flash/tabular/classification/data.py | 202 +-------------------------- flash/tabular/data.py | 12 +- flash/tabular/regression/data.py | 199 +------------------------- 3 files changed, 6 insertions(+), 407 deletions(-) diff --git a/flash/tabular/classification/data.py b/flash/tabular/classification/data.py index 8e5cdf4fc8..63cdda9ea2 100644 --- a/flash/tabular/classification/data.py +++ b/flash/tabular/classification/data.py @@ -11,208 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Dict, List, Optional, Union - -from flash.core.data.callback import BaseDataFetcher -from flash.core.data.data_module import DataModule -from flash.core.data.process import Preprocess -from flash.core.utilities.imports import _PANDAS_AVAILABLE - -if _PANDAS_AVAILABLE: - import pandas as pd - from pandas.core.frame import DataFrame -else: - DataFrame = object - from flash.tabular.data import TabularData class TabularClassificationData(TabularData): - - is_regression = False - - @classmethod - def from_data_frame( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_data_frame: Optional[DataFrame] = None, - val_data_frame: Optional[DataFrame] = None, - test_data_frame: Optional[DataFrame] = None, - predict_data_frame: Optional[DataFrame] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, - ): - """Creates a :class:`~flash.tabular.classification.data.TabularClassificationData` object from the given data - frames. - - Args: - categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. - numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. - target_fields: The field or fields (columns) in the CSV file to use for the target. - train_data_frame: The pandas ``DataFrame`` containing the training data. - val_data_frame: The pandas ``DataFrame`` containing the validation data. - test_data_frame: The pandas ``DataFrame`` containing the testing data. - predict_data_frame: The pandas ``DataFrame`` containing the data to use when predicting. - train_transform: The dictionary of transforms to use during training which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - val_transform: The dictionary of transforms to use during validation which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. 
- test_transform: The dictionary of transforms to use during testing which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - predict_transform: The dictionary of transforms to use during predicting which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the - :class:`~flash.core.data.data_module.DataModule`. - preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the - :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` - will be constructed and used. - val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used - if ``preprocess = None``. - - Returns: - The constructed data module. - - Examples:: - - data_module = TabularData.from_data_frame( - "categorical_input", - "numerical_input", - "target", - train_data_frame=train_data, - ) - """ - categorical_fields, numerical_fields = cls._sanetize_cols(categorical_fields, numerical_fields) - - if not isinstance(categorical_fields, list): - categorical_fields = [categorical_fields] - - if not isinstance(numerical_fields, list): - numerical_fields = [numerical_fields] - - mean, std, classes, codes, target_codes = cls.compute_state( - train_data_frame=train_data_frame, - val_data_frame=val_data_frame, - test_data_frame=test_data_frame, - predict_data_frame=predict_data_frame, - target_fields=target_fields, - numerical_fields=numerical_fields, - categorical_fields=categorical_fields, - ) - - return cls.from_data_source( - "data_frame", - train_data_frame, - val_data_frame, - test_data_frame, - predict_data_frame, - train_transform=train_transform, - val_transform=val_transform, - test_transform=test_transform, - predict_transform=predict_transform, - data_fetcher=data_fetcher, - preprocess=preprocess, - val_split=val_split, - batch_size=batch_size, - num_workers=num_workers, - cat_cols=categorical_fields, - num_cols=numerical_fields, - target_col=target_fields, - mean=mean, - std=std, - codes=codes, - target_codes=target_codes, - classes=classes, - is_regression=False, - **preprocess_kwargs, - ) - - @classmethod - def from_csv( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_file: Optional[str] = None, - val_file: Optional[str] = None, - test_file: Optional[str] = None, - predict_file: Optional[str] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, - ) -> 'DataModule': - """Creates a :class:`~flash.tabular.classification.data.TabularClassificationData` object from the given CSV - files. 
- - Args: - categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. - numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. - target_fields: The field or fields (columns) in the CSV file to use for the target. - train_file: The CSV file containing the training data. - val_file: The CSV file containing the validation data. - test_file: The CSV file containing the testing data. - predict_file: The CSV file containing the data to use when predicting. - train_transform: The dictionary of transforms to use during training which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - val_transform: The dictionary of transforms to use during validation which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - test_transform: The dictionary of transforms to use during testing which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - predict_transform: The dictionary of transforms to use during predicting which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the - :class:`~flash.core.data.data_module.DataModule`. - preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the - :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` - will be constructed and used. - val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used - if ``preprocess = None``. - - Returns: - The constructed data module. - - Examples:: - - data_module = TabularData.from_csv( - "categorical_input", - "numerical_input", - "target", - train_file="train_data.csv", - ) - """ - return cls.from_data_frame( - categorical_fields=categorical_fields, - numerical_fields=numerical_fields, - target_fields=target_fields, - train_data_frame=pd.read_csv(train_file) if train_file is not None else None, - val_data_frame=pd.read_csv(val_file) if val_file is not None else None, - test_data_frame=pd.read_csv(test_file) if test_file is not None else None, - predict_data_frame=pd.read_csv(predict_file) if predict_file is not None else None, - is_regression=False, - preprocess=preprocess, - val_split=val_split, - batch_size=batch_size, - num_workers=num_workers, - ) + is_regression = False diff --git a/flash/tabular/data.py b/flash/tabular/data.py index c2a60e24da..48e792c125 100644 --- a/flash/tabular/data.py +++ b/flash/tabular/data.py @@ -245,6 +245,8 @@ class TabularData(DataModule): preprocess_cls = TabularPreprocess postprocess_cls = TabularPostprocess + is_regression: Optional[bool] = None + @property def codes(self) -> Dict[str, str]: return self._data_source.codes @@ -343,7 +345,6 @@ def from_data_frame( val_split: Optional[float] = None, batch_size: int = 4, num_workers: Optional[int] = None, - is_regression: bool = False, **preprocess_kwargs: Any, ): """Creates a :class:`~flash.tabular.data.TabularData` object from the given data frames. 
@@ -372,8 +373,6 @@ def from_data_frame( val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - is_regression: If ``True``, targets will be formatted as floating point. If ``False``, targets will be - formatted as integers. preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used if ``preprocess = None``. @@ -430,7 +429,7 @@ def from_data_frame( codes=codes, target_codes=target_codes, classes=classes, - is_regression=is_regression, + is_regression=cls.is_regression, **preprocess_kwargs, ) @@ -453,7 +452,6 @@ def from_csv( val_split: Optional[float] = None, batch_size: int = 4, num_workers: Optional[int] = None, - is_regression: bool = False, **preprocess_kwargs: Any, ) -> 'DataModule': """Creates a :class:`~flash.tabular.data.TabularData` object from the given CSV files. @@ -482,8 +480,6 @@ def from_csv( val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - is_regression: If ``True``, targets will be formatted as floating point. If ``False``, targets will be - formatted as integers. preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used if ``preprocess = None``. @@ -507,7 +503,7 @@ def from_csv( val_data_frame=pd.read_csv(val_file) if val_file is not None else None, test_data_frame=pd.read_csv(test_file) if test_file is not None else None, predict_data_frame=pd.read_csv(predict_file) if predict_file is not None else None, - is_regression=is_regression, + is_regression=cls.is_regression, preprocess=preprocess, val_split=val_split, batch_size=batch_size, diff --git a/flash/tabular/regression/data.py b/flash/tabular/regression/data.py index 6398623b34..6f3ccec166 100644 --- a/flash/tabular/regression/data.py +++ b/flash/tabular/regression/data.py @@ -11,206 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Callable, Dict, List, Optional, Union - -from flash.core.data.callback import BaseDataFetcher -from flash.core.data.data_module import DataModule -from flash.core.data.process import Preprocess -from flash.core.utilities.imports import _PANDAS_AVAILABLE - -if _PANDAS_AVAILABLE: - import pandas as pd - from pandas.core.frame import DataFrame -else: - DataFrame = object - from flash.tabular.data import TabularData class TabularRegressionData(TabularData): + is_regression = True - is_regression: True - - @classmethod - def from_data_frame( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_data_frame: Optional[DataFrame] = None, - val_data_frame: Optional[DataFrame] = None, - test_data_frame: Optional[DataFrame] = None, - predict_data_frame: Optional[DataFrame] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, - ): - """Creates a :class:`~flash.tabular.regression.data.TabularRegressionData` object from the given data frames. - - Args: - categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. - numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. - target_fields: The field or fields (columns) in the CSV file to use for the target. - train_data_frame: The pandas ``DataFrame`` containing the training data. - val_data_frame: The pandas ``DataFrame`` containing the validation data. - test_data_frame: The pandas ``DataFrame`` containing the testing data. - predict_data_frame: The pandas ``DataFrame`` containing the data to use when predicting. - train_transform: The dictionary of transforms to use during training which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - val_transform: The dictionary of transforms to use during validation which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - test_transform: The dictionary of transforms to use during testing which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - predict_transform: The dictionary of transforms to use during predicting which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the - :class:`~flash.core.data.data_module.DataModule`. - preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the - :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` - will be constructed and used. - val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used - if ``preprocess = None``. 
- - Returns: - The constructed data module. - - Examples:: - - data_module = TabularData.from_data_frame( - "categorical_input", - "numerical_input", - "target", - train_data_frame=train_data, - ) - """ - categorical_fields, numerical_fields = cls._sanetize_cols(categorical_fields, numerical_fields) - - if not isinstance(categorical_fields, list): - categorical_fields = [categorical_fields] - - if not isinstance(numerical_fields, list): - numerical_fields = [numerical_fields] - - mean, std, classes, codes, target_codes = cls.compute_state( - train_data_frame=train_data_frame, - val_data_frame=val_data_frame, - test_data_frame=test_data_frame, - predict_data_frame=predict_data_frame, - target_fields=target_fields, - numerical_fields=numerical_fields, - categorical_fields=categorical_fields, - ) - - return cls.from_data_source( - "data_frame", - train_data_frame, - val_data_frame, - test_data_frame, - predict_data_frame, - train_transform=train_transform, - val_transform=val_transform, - test_transform=test_transform, - predict_transform=predict_transform, - data_fetcher=data_fetcher, - preprocess=preprocess, - val_split=val_split, - batch_size=batch_size, - num_workers=num_workers, - cat_cols=categorical_fields, - num_cols=numerical_fields, - target_col=target_fields, - mean=mean, - std=std, - codes=codes, - target_codes=target_codes, - classes=classes, - is_regression=True, - **preprocess_kwargs, - ) - - @classmethod - def from_csv( - cls, - categorical_fields: Optional[Union[str, List[str]]], - numerical_fields: Optional[Union[str, List[str]]], - target_fields: Optional[str] = None, - train_file: Optional[str] = None, - val_file: Optional[str] = None, - test_file: Optional[str] = None, - predict_file: Optional[str] = None, - train_transform: Optional[Dict[str, Callable]] = None, - val_transform: Optional[Dict[str, Callable]] = None, - test_transform: Optional[Dict[str, Callable]] = None, - predict_transform: Optional[Dict[str, Callable]] = None, - data_fetcher: Optional[BaseDataFetcher] = None, - preprocess: Optional[Preprocess] = None, - val_split: Optional[float] = None, - batch_size: int = 4, - num_workers: Optional[int] = None, - **preprocess_kwargs: Any, - ) -> 'DataModule': - """Creates a :class:`~flash.tabular.regression.data.TabularRegressionData` object from the given CSV files. - - Args: - categorical_fields: The field or fields (columns) in the CSV file containing categorical inputs. - numerical_fields: The field or fields (columns) in the CSV file containing numerical inputs. - target_fields: The field or fields (columns) in the CSV file to use for the target. - train_file: The CSV file containing the training data. - val_file: The CSV file containing the validation data. - test_file: The CSV file containing the testing data. - predict_file: The CSV file containing the data to use when predicting. - train_transform: The dictionary of transforms to use during training which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - val_transform: The dictionary of transforms to use during validation which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - test_transform: The dictionary of transforms to use during testing which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. - predict_transform: The dictionary of transforms to use during predicting which maps - :class:`~flash.core.data.process.Preprocess` hook names to callable transforms. 
- data_fetcher: The :class:`~flash.core.data.callback.BaseDataFetcher` to pass to the - :class:`~flash.core.data.data_module.DataModule`. - preprocess: The :class:`~flash.core.data.data.Preprocess` to pass to the - :class:`~flash.core.data.data_module.DataModule`. If ``None``, ``cls.preprocess_cls`` - will be constructed and used. - val_split: The ``val_split`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - batch_size: The ``batch_size`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - num_workers: The ``num_workers`` argument to pass to the :class:`~flash.core.data.data_module.DataModule`. - preprocess_kwargs: Additional keyword arguments to use when constructing the preprocess. Will only be used - if ``preprocess = None``. - - Returns: - The constructed data module. - - Examples:: - - data_module = TabularData.from_csv( - "categorical_input", - "numerical_input", - "target", - train_file="train_data.csv", - ) - """ - return cls.from_data_frame( - categorical_fields=categorical_fields, - numerical_fields=numerical_fields, - target_fields=target_fields, - train_data_frame=pd.read_csv(train_file) if train_file is not None else None, - val_data_frame=pd.read_csv(val_file) if val_file is not None else None, - test_data_frame=pd.read_csv(test_file) if test_file is not None else None, - predict_data_frame=pd.read_csv(predict_file) if predict_file is not None else None, - is_regression=True, - preprocess=preprocess, - val_split=val_split, - batch_size=batch_size, - num_workers=num_workers, - ) From 11bc09adc0cb2002ba65fcdab18daf5e48337d7f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Jul 2021 11:05:07 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/tabular/regression/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flash/tabular/regression/data.py b/flash/tabular/regression/data.py index 6f3ccec166..04dd8cd3b4 100644 --- a/flash/tabular/regression/data.py +++ b/flash/tabular/regression/data.py @@ -16,4 +16,3 @@ class TabularRegressionData(TabularData): is_regression = True - From e1190a88749de5010c0634e6e587e47b5652dd5b Mon Sep 17 00:00:00 2001 From: Suman Michael Date: Tue, 13 Jul 2021 16:43:08 +0530 Subject: [PATCH 08/10] PEP8 fix --- flash/tabular/regression/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flash/tabular/regression/data.py b/flash/tabular/regression/data.py index 6f3ccec166..04dd8cd3b4 100644 --- a/flash/tabular/regression/data.py +++ b/flash/tabular/regression/data.py @@ -16,4 +16,3 @@ class TabularRegressionData(TabularData): is_regression = True - From bb3ea2ee4619ebb33bb980d45906b3d937c45d73 Mon Sep 17 00:00:00 2001 From: Suman Michael Date: Tue, 13 Jul 2021 17:30:18 +0530 Subject: [PATCH 09/10] modified tests --- README.md | 4 ++-- flash/tabular/__init__.py | 1 + flash/tabular/data.py | 3 +-- flash_examples/tabular_classification.py | 4 ++-- tests/tabular/classification/test_data.py | 18 +++++++++--------- .../test_data_model_integration.py | 4 ++-- tests/tabular/classification/test_model.py | 5 ++--- 7 files changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index a950b6c458..59d855d358 100644 --- a/README.md +++ b/README.md @@ -260,13 +260,13 @@ To illustrate, say we want to build a model to predict if a passenger survived o from torchmetrics.classification import Accuracy, Precision, Recall import flash from 
flash.core.data.utils import download_data -from flash.tabular import TabularClassifier, TabularData +from flash.tabular import TabularClassifier, TabularClassificationData # 1. Download the data download_data("https://pl-flash-data.s3.amazonaws.com/titanic.zip", 'data/') # 2. Load the data -datamodule = TabularData.from_csv( +datamodule = TabularClassificationData.from_csv( ["Sex", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"], "Fare", target_fields="Survived", diff --git a/flash/tabular/__init__.py b/flash/tabular/__init__.py index d4deb4c851..3b343ea1f1 100644 --- a/flash/tabular/__init__.py +++ b/flash/tabular/__init__.py @@ -1,2 +1,3 @@ from flash.tabular.classification import TabularClassificationData, TabularClassifier # noqa: F401 +from flash.tabular.regression import TabularRegressionData # noqa: F401 from flash.tabular.data import TabularData # noqa: F401 diff --git a/flash/tabular/data.py b/flash/tabular/data.py index 48e792c125..f6a9d717e5 100644 --- a/flash/tabular/data.py +++ b/flash/tabular/data.py @@ -245,7 +245,7 @@ class TabularData(DataModule): preprocess_cls = TabularPreprocess postprocess_cls = TabularPostprocess - is_regression: Optional[bool] = None + is_regression: bool = False @property def codes(self) -> Dict[str, str]: @@ -503,7 +503,6 @@ def from_csv( val_data_frame=pd.read_csv(val_file) if val_file is not None else None, test_data_frame=pd.read_csv(test_file) if test_file is not None else None, predict_data_frame=pd.read_csv(predict_file) if predict_file is not None else None, - is_regression=cls.is_regression, preprocess=preprocess, val_split=val_split, batch_size=batch_size, diff --git a/flash_examples/tabular_classification.py b/flash_examples/tabular_classification.py index fa3a2cc23e..e639456507 100644 --- a/flash_examples/tabular_classification.py +++ b/flash_examples/tabular_classification.py @@ -13,12 +13,12 @@ # limitations under the License. import flash from flash.core.data.utils import download_data -from flash.tabular import TabularClassifier, TabularData +from flash.tabular import TabularClassifier, TabularClassificationData # 1. 
Create the DataModule download_data("https://pl-flash-data.s3.amazonaws.com/titanic.zip", "./data") -datamodule = TabularData.from_csv( +datamodule = TabularClassificationData.from_csv( ["Sex", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"], "Fare", target_fields="Survived", diff --git a/tests/tabular/classification/test_data.py b/tests/tabular/classification/test_data.py index baa87b3451..6bf2cae4fb 100644 --- a/tests/tabular/classification/test_data.py +++ b/tests/tabular/classification/test_data.py @@ -23,7 +23,7 @@ if _PANDAS_AVAILABLE: import pandas as pd - from flash.tabular import TabularData + from flash.tabular import TabularClassificationData from flash.tabular.classification.utils import _categorize, _normalize TEST_DF_1 = pd.DataFrame( @@ -73,19 +73,19 @@ def test_emb_sizes(): self.codes = {"category": [None, "a", "b", "c"]} self.cat_cols = ["category"] # use __get__ to test property with mocked self - es = TabularData.emb_sizes.__get__(self) # pylint: disable=E1101 + es = TabularClassificationData.emb_sizes.__get__(self) # pylint: disable=E1101 assert es == [(4, 16)] self.codes = {} self.cat_cols = [] # use __get__ to test property with mocked self - es = TabularData.emb_sizes.__get__(self) # pylint: disable=E1101 + es = TabularClassificationData.emb_sizes.__get__(self) # pylint: disable=E1101 assert es == [] self.codes = {"large": ["a"] * 100_000, "larger": ["b"] * 1_000_000} self.cat_cols = ["large", "larger"] # use __get__ to test property with mocked self - es = TabularData.emb_sizes.__get__(self) # pylint: disable=E1101 + es = TabularClassificationData.emb_sizes.__get__(self) # pylint: disable=E1101 assert es == [(100_000, 17), (1_000_000, 31)] @@ -94,7 +94,7 @@ def test_tabular_data(tmpdir): train_data_frame = TEST_DF_1.copy() val_data_frame = TEST_DF_2.copy() test_data_frame = TEST_DF_2.copy() - dm = TabularData.from_data_frame( + dm = TabularClassificationData.from_data_frame( categorical_fields=["category"], numerical_fields=["scalar_a", "scalar_b"], target_fields="label", @@ -122,7 +122,7 @@ def test_categorical_target(tmpdir): # change int label to string df["label"] = df["label"].astype(str) - dm = TabularData.from_data_frame( + dm = TabularClassificationData.from_data_frame( categorical_fields=["category"], numerical_fields=["scalar_a", "scalar_b"], target_fields="label", @@ -146,7 +146,7 @@ def test_from_data_frame(tmpdir): train_data_frame = TEST_DF_1.copy() val_data_frame = TEST_DF_2.copy() test_data_frame = TEST_DF_2.copy() - dm = TabularData.from_data_frame( + dm = TabularClassificationData.from_data_frame( categorical_fields=["category"], numerical_fields=["scalar_a", "scalar_b"], target_fields="label", @@ -173,7 +173,7 @@ def test_from_csv(tmpdir): TEST_DF_2.to_csv(val_csv) TEST_DF_2.to_csv(test_csv) - dm = TabularData.from_csv( + dm = TabularClassificationData.from_csv( categorical_fields=["category"], numerical_fields=["scalar_a", "scalar_b"], target_fields="label", @@ -196,7 +196,7 @@ def test_from_csv(tmpdir): def test_empty_inputs(): train_data_frame = TEST_DF_1.copy() with pytest.raises(RuntimeError): - TabularData.from_data_frame( + TabularClassificationData.from_data_frame( numerical_fields=None, categorical_fields=None, target_fields="label", diff --git a/tests/tabular/classification/test_data_model_integration.py b/tests/tabular/classification/test_data_model_integration.py index 349aeeaaba..26a0b5c3d7 100644 --- a/tests/tabular/classification/test_data_model_integration.py +++ b/tests/tabular/classification/test_data_model_integration.py 
@@ -15,7 +15,7 @@ import pytorch_lightning as pl from flash.core.utilities.imports import _TABULAR_AVAILABLE -from flash.tabular import TabularClassifier, TabularData +from flash.tabular import TabularClassifier, TabularClassificationData from tests.helpers.utils import _TABULAR_TESTING if _TABULAR_AVAILABLE: @@ -37,7 +37,7 @@ def test_classification(tmpdir): train_data_frame = TEST_DF_1.copy() val_data_frame = TEST_DF_1.copy() test_data_frame = TEST_DF_1.copy() - data = TabularData.from_data_frame( + data = TabularClassificationData.from_data_frame( categorical_fields=["category"], numerical_fields=["scalar_a", "scalar_b"], target_fields="label", diff --git a/tests/tabular/classification/test_model.py b/tests/tabular/classification/test_model.py index d3cc3db332..743f1e3a8a 100644 --- a/tests/tabular/classification/test_model.py +++ b/tests/tabular/classification/test_model.py @@ -21,8 +21,7 @@ from flash.core.data.data_source import DefaultDataKeys from flash.core.utilities.imports import _TABULAR_AVAILABLE -from flash.tabular import TabularClassifier -from flash.tabular.classification.data import TabularData +from flash.tabular import TabularClassifier, TabularClassificationData from tests.helpers.utils import _SERVE_TESTING, _TABULAR_TESTING # ======== Mock functions ======== @@ -100,7 +99,7 @@ def test_jit(tmpdir): @mock.patch("flash._IS_TESTING", True) def test_serve(): train_data = {"num_col": [1.4, 2.5], "cat_col": ["positive", "negative"], "target": [1, 2]} - datamodule = TabularData.from_data_frame( + datamodule = TabularClassificationData.from_data_frame( "cat_col", "num_col", "target", From 68df0eaf734c3e881c9c9bee6d833c5c3fc42ccb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Jul 2021 12:01:11 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/tabular/__init__.py | 2 +- flash_examples/tabular_classification.py | 2 +- tests/tabular/classification/test_data_model_integration.py | 2 +- tests/tabular/classification/test_model.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flash/tabular/__init__.py b/flash/tabular/__init__.py index 3b343ea1f1..22698efc99 100644 --- a/flash/tabular/__init__.py +++ b/flash/tabular/__init__.py @@ -1,3 +1,3 @@ from flash.tabular.classification import TabularClassificationData, TabularClassifier # noqa: F401 -from flash.tabular.regression import TabularRegressionData # noqa: F401 from flash.tabular.data import TabularData # noqa: F401 +from flash.tabular.regression import TabularRegressionData # noqa: F401 diff --git a/flash_examples/tabular_classification.py b/flash_examples/tabular_classification.py index e639456507..9e6b0ab049 100644 --- a/flash_examples/tabular_classification.py +++ b/flash_examples/tabular_classification.py @@ -13,7 +13,7 @@ # limitations under the License. import flash from flash.core.data.utils import download_data -from flash.tabular import TabularClassifier, TabularClassificationData +from flash.tabular import TabularClassificationData, TabularClassifier # 1. 
Create the DataModule download_data("https://pl-flash-data.s3.amazonaws.com/titanic.zip", "./data") diff --git a/tests/tabular/classification/test_data_model_integration.py b/tests/tabular/classification/test_data_model_integration.py index 26a0b5c3d7..e30cac67c8 100644 --- a/tests/tabular/classification/test_data_model_integration.py +++ b/tests/tabular/classification/test_data_model_integration.py @@ -15,7 +15,7 @@ import pytorch_lightning as pl from flash.core.utilities.imports import _TABULAR_AVAILABLE -from flash.tabular import TabularClassifier, TabularClassificationData +from flash.tabular import TabularClassificationData, TabularClassifier from tests.helpers.utils import _TABULAR_TESTING if _TABULAR_AVAILABLE: diff --git a/tests/tabular/classification/test_model.py b/tests/tabular/classification/test_model.py index 743f1e3a8a..a64c2d090d 100644 --- a/tests/tabular/classification/test_model.py +++ b/tests/tabular/classification/test_model.py @@ -21,7 +21,7 @@ from flash.core.data.data_source import DefaultDataKeys from flash.core.utilities.imports import _TABULAR_AVAILABLE -from flash.tabular import TabularClassifier, TabularClassificationData +from flash.tabular import TabularClassificationData, TabularClassifier from tests.helpers.utils import _SERVE_TESTING, _TABULAR_TESTING # ======== Mock functions ========
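Taken together, the patches above replace the old pattern of selecting classification vs. regression through an ``is_regression`` argument on ``TabularData.from_csv`` / ``from_data_frame`` with two thin subclasses that pin the flag as a class attribute (``TabularClassificationData.is_regression = False``, ``TabularRegressionData.is_regression = True``) and inherit the constructors unchanged. A minimal usage sketch of the resulting API follows; the file paths and the regression column names are illustrative assumptions, not taken from the patch series::

    from flash.tabular import TabularClassificationData, TabularRegressionData

    # Classification: the subclass fixes is_regression = False, so the flag is no
    # longer passed to from_csv / from_data_frame and targets are encoded as integers.
    classification_data = TabularClassificationData.from_csv(
        ["Sex", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"],  # categorical_fields
        "Fare",                                                           # numerical_fields
        target_fields="Survived",
        train_file="data/titanic.csv",  # illustrative path, not from the patches
        val_split=0.1,
        batch_size=8,
    )

    # Regression: identical call signature, but the subclass fixes is_regression = True,
    # so targets are kept as floating point. The columns and file below are hypothetical.
    regression_data = TabularRegressionData.from_csv(
        ["neighbourhood"],       # hypothetical categorical column
        ["area", "n_rooms"],     # hypothetical numerical columns
        target_fields="price",
        train_file="data/house_prices.csv",
        batch_size=8,
    )

Pinning the flag on the class keeps the shared ``TabularData`` constructors free of task-specific arguments, which is why the later patches can drop ``is_regression`` from both ``from_data_frame`` and ``from_csv`` and let ``from_csv`` simply delegate to ``from_data_frame``.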