From d3e523fb8320970d20a6fb0b66f568c36f02f5df Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 21:53:38 -0500 Subject: [PATCH 1/8] added .csv image loading utils --- flash/data/data_utils.py | 5 ++++- flash/vision/classification/data.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/flash/data/data_utils.py b/flash/data/data_utils.py index 50e6b3bf63c..07b4f1a5179 100644 --- a/flash/data/data_utils.py +++ b/flash/data/data_utils.py @@ -1,5 +1,5 @@ +import os from typing import Dict, List, Union - import pandas as pd @@ -19,6 +19,9 @@ def labels_from_categorical_csv(csv: str, index_col: str, return_dict: bool = Tr names = df[index_col].to_list() del df[index_col] + # remove extensions + names = [os.path.splitext(x)[0] for x in names] + # everything else is binary labels = df.to_numpy().argmax(1).tolist() diff --git a/flash/vision/classification/data.py b/flash/vision/classification/data.py index 4af59a87644..8d7871ff5c2 100644 --- a/flash/vision/classification/data.py +++ b/flash/vision/classification/data.py @@ -77,7 +77,8 @@ def __getitem__(self, index: int) -> Tuple[Any, Optional[int]]: img = self.transform(img) label = None if self.has_dict_labels: - name = os.path.basename(filename) + name = os.path.splitext(filename)[0] + name = os.path.basename(name) label = self.labels[name] elif self.has_labels: From 8179a5e8797b46ad6073cb751c237611cbe188fb Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 22:00:05 -0500 Subject: [PATCH 2/8] added .csv image loading utils --- flash/data/data_utils.py | 6 +++--- tests/vision/classification/test_data.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/flash/data/data_utils.py b/flash/data/data_utils.py index 07b4f1a5179..66ed2c21559 100644 --- a/flash/data/data_utils.py +++ b/flash/data/data_utils.py @@ -3,7 +3,7 @@ import pandas as pd -def labels_from_categorical_csv(csv: str, index_col: str, return_dict: bool = True) -> Union[Dict, List]: +def labels_from_categorical_csv(csv: str, index_col: str, feature_cols: List, return_dict: bool = True) -> Union[Dict, List]: """ Returns a dictionary with {index_col: label} for each entry in the csv. @@ -17,13 +17,13 @@ def labels_from_categorical_csv(csv: str, index_col: str, return_dict: bool = Tr df = pd.read_csv(csv) # get names names = df[index_col].to_list() - del df[index_col] # remove extensions names = [os.path.splitext(x)[0] for x in names] # everything else is binary - labels = df.to_numpy().argmax(1).tolist() + feature_df = df[feature_cols] + labels = feature_df.to_numpy().argmax(1).tolist() if return_dict: labels = {name: label for name, label in zip(names, labels)} diff --git a/tests/vision/classification/test_data.py b/tests/vision/classification/test_data.py index f05ecab3061..a349fa9781d 100644 --- a/tests/vision/classification/test_data.py +++ b/tests/vision/classification/test_data.py @@ -93,27 +93,27 @@ def test_categorical_csv_labels(tmpdir): train_csv = os.path.join(tmpdir, 'some_dataset', 'train.csv') text_file = open(train_csv, 'w') text_file.write( - 'my_id, label_a, label_b, label_c\n"train_1.png", 0, 1, 0\n"train_2.png", 0, 0, 1\n"train_2.png", 1, 0, 0\n' + 'my_id,label_a,label_b,label_c\n"train_1.png", 0, 1, 0\n"train_2.png", 0, 0, 1\n"train_2.png", 1, 0, 0\n' ) text_file.close() valid_csv = os.path.join(tmpdir, 'some_dataset', 'valid.csv') text_file = open(valid_csv, 'w') text_file.write( - 'my_id, label_a, label_b, label_c\n"valid_1.png", 0, 1, 0\n"valid_2.png", 0, 0, 1\n"valid_3.png", 1, 0, 0\n' + 'my_id,label_a,label_b,label_c\n"valid_1.png", 0, 1, 0\n"valid_2.png", 0, 0, 1\n"valid_3.png", 1, 0, 0\n' ) text_file.close() test_csv = os.path.join(tmpdir, 'some_dataset', 'test.csv') text_file = open(test_csv, 'w') text_file.write( - 'my_id, label_a, label_b, label_c\n"test_1.png", 0, 1, 0\n"test_2.png", 0, 0, 1\n"test_3.png", 1, 0, 0\n' + 'my_id,label_a,label_b,label_c\n"test_1.png", 0, 1, 0\n"test_2.png", 0, 0, 1\n"test_3.png", 1, 0, 0\n' ) text_file.close() - train_labels = labels_from_categorical_csv(train_csv, 'my_id') - valid_labels = labels_from_categorical_csv(valid_csv, 'my_id') - test_labels = labels_from_categorical_csv(test_csv, 'my_id') + train_labels = labels_from_categorical_csv(train_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c']) + valid_labels = labels_from_categorical_csv(valid_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c']) + test_labels = labels_from_categorical_csv(test_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c']) data = ImageClassificationData.from_filepaths( batch_size=2, From 93a7f865cadfbc879dc59cefb81cc8bedf41971b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 22:13:53 -0500 Subject: [PATCH 3/8] added .csv image loading utils --- flash/data/data_utils.py | 17 ++++++++++++----- tests/vision/classification/test_data.py | 12 +++++++++--- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/flash/data/data_utils.py b/flash/data/data_utils.py index 66ed2c21559..2978da54b5a 100644 --- a/flash/data/data_utils.py +++ b/flash/data/data_utils.py @@ -1,9 +1,14 @@ -import os -from typing import Dict, List, Union +from typing import Dict, List, Union, Any import pandas as pd -def labels_from_categorical_csv(csv: str, index_col: str, feature_cols: List, return_dict: bool = True) -> Union[Dict, List]: +def labels_from_categorical_csv( + csv: str, + index_col: str, + feature_cols: List, + return_dict: bool = True, + index_col_collate_fn: Any = None + ) -> Union[Dict, List]: """ Returns a dictionary with {index_col: label} for each entry in the csv. @@ -18,8 +23,10 @@ def labels_from_categorical_csv(csv: str, index_col: str, feature_cols: List, re # get names names = df[index_col].to_list() - # remove extensions - names = [os.path.splitext(x)[0] for x in names] + # apply colate fn to index_col + if index_col_collate_fn: + for i in range(len(names)): + names[i] = index_col_collate_fn(names[i]) # everything else is binary feature_df = df[feature_cols] diff --git a/tests/vision/classification/test_data.py b/tests/vision/classification/test_data.py index a349fa9781d..c200ca5c81b 100644 --- a/tests/vision/classification/test_data.py +++ b/tests/vision/classification/test_data.py @@ -111,9 +111,15 @@ def test_categorical_csv_labels(tmpdir): ) text_file.close() - train_labels = labels_from_categorical_csv(train_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c']) - valid_labels = labels_from_categorical_csv(valid_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c']) - test_labels = labels_from_categorical_csv(test_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c']) + def index_col_collate_fn(x): + return os.path.splitext(x)[0] + + train_labels = labels_from_categorical_csv( + train_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c'], index_col_collate_fn=index_col_collate_fn) + valid_labels = labels_from_categorical_csv( + valid_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c'], index_col_collate_fn=index_col_collate_fn) + test_labels = labels_from_categorical_csv( + test_csv, 'my_id', feature_cols=['label_a', 'label_b', 'label_c'], index_col_collate_fn=index_col_collate_fn) data = ImageClassificationData.from_filepaths( batch_size=2, From 5af9209e6094e106f70e7624fec348bda00363ee Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 22:26:58 -0500 Subject: [PATCH 4/8] added .csv image loading utils --- flash/vision/classification/data.py | 25 ++++++++++++++++-------- tests/vision/classification/test_data.py | 10 ++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/flash/vision/classification/data.py b/flash/vision/classification/data.py index 8d7871ff5c2..1277b6bf54e 100644 --- a/flash/vision/classification/data.py +++ b/flash/vision/classification/data.py @@ -257,6 +257,7 @@ def from_filepaths( train_filepaths: Union[str, Optional[Sequence[Union[str, pathlib.Path]]]] = None, train_labels: Optional[Sequence] = None, train_transform: Optional[Callable] = _default_train_transforms, + valid_split: Union[None, float] = None, valid_filepaths: Union[str, Optional[Sequence[Union[str, pathlib.Path]]]] = None, valid_labels: Optional[Sequence] = None, valid_transform: Optional[Callable] = _default_valid_transforms, @@ -273,6 +274,7 @@ def from_filepaths( train_filepaths: string or sequence of file paths for training dataset. Defaults to ``None``. train_labels: sequence of labels for training dataset. Defaults to ``None``. train_transform: transforms for training dataset. Defaults to ``None``. + valid_split: if not None, generates val split from train dataloader using this value. valid_filepaths: string or sequence of file paths for validation dataset. Defaults to ``None``. valid_labels: sequence of labels for validation dataset. Defaults to ``None``. valid_transform: transforms for validation and testing dataset. Defaults to ``None``. @@ -320,14 +322,21 @@ def from_filepaths( loader=loader, transform=train_transform, ) - valid_ds = ( - FilepathDataset( - filepaths=valid_filepaths, - labels=valid_labels, - loader=loader, - transform=valid_transform, - ) if valid_filepaths is not None else None - ) + + if valid_split: + full_length = len(train_ds) + train_split = int((1.0 - valid_split) * full_length) + valid_split = full_length - train_split + train_ds, valid_ds = torch.utils.data.random_split(train_ds, [train_split, valid_split]) + else: + valid_ds = ( + FilepathDataset( + filepaths=valid_filepaths, + labels=valid_labels, + loader=loader, + transform=valid_transform, + ) if valid_filepaths is not None else None + ) test_ds = ( FilepathDataset( diff --git a/tests/vision/classification/test_data.py b/tests/vision/classification/test_data.py index c200ca5c81b..8e442fec1e0 100644 --- a/tests/vision/classification/test_data.py +++ b/tests/vision/classification/test_data.py @@ -140,6 +140,16 @@ def index_col_collate_fn(x): for (x, y) in data.test_dataloader(): assert len(x) == 2 + data = ImageClassificationData.from_filepaths( + batch_size=2, + train_filepaths=os.path.join(tmpdir, 'some_dataset', 'train'), + train_labels=train_labels, + valid_split=0.5 + ) + + for (x, y) in data.val_dataloader(): + assert len(x) == 1 + def test_from_folders(tmpdir): train_dir = Path(tmpdir / "train") From bc33414a3fef95fa7ef0358ae92f49083917c4b8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 22:30:30 -0500 Subject: [PATCH 5/8] added .csv image loading utils --- flash/vision/classification/data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flash/vision/classification/data.py b/flash/vision/classification/data.py index 1277b6bf54e..9af8801e804 100644 --- a/flash/vision/classification/data.py +++ b/flash/vision/classification/data.py @@ -266,6 +266,7 @@ def from_filepaths( loader: Callable = _pil_loader, batch_size: int = 64, num_workers: Optional[int] = None, + seed: int = 1234, **kwargs ): """Creates a ImageClassificationData object from lists of image filepaths and labels @@ -284,6 +285,7 @@ def from_filepaths( batch_size: the batchsize to use for parallel loading. Defaults to ``64``. num_workers: The number of workers to use for parallelized loading. Defaults to ``None`` which equals the number of available CPU threads. + seed: Used for the train/val splits when valid_split is not None Returns: ImageClassificationData: The constructed data module. @@ -327,7 +329,9 @@ def from_filepaths( full_length = len(train_ds) train_split = int((1.0 - valid_split) * full_length) valid_split = full_length - train_split - train_ds, valid_ds = torch.utils.data.random_split(train_ds, [train_split, valid_split]) + train_ds, valid_ds = torch.utils.data.random_split(train_ds, + [train_split, valid_split], + generator=torch.Generator().manual_seed(seed)) else: valid_ds = ( FilepathDataset( From 891a6e9c2f72ed050182f390b27790cdfdeb3d72 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 23:26:28 -0500 Subject: [PATCH 6/8] added .csv image loading utils --- flash/data/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/data/data_utils.py b/flash/data/data_utils.py index 2978da54b5a..3a3a148edd1 100644 --- a/flash/data/data_utils.py +++ b/flash/data/data_utils.py @@ -8,7 +8,7 @@ def labels_from_categorical_csv( feature_cols: List, return_dict: bool = True, index_col_collate_fn: Any = None - ) -> Union[Dict, List]: +) -> Union[Dict, List]: """ Returns a dictionary with {index_col: label} for each entry in the csv. From b8b52e3b209b08e1fc4b5416d85e153f24c42b88 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 23:28:39 -0500 Subject: [PATCH 7/8] added .csv image loading utils --- flash/data/data_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flash/data/data_utils.py b/flash/data/data_utils.py index 3a3a148edd1..2cabd08552c 100644 --- a/flash/data/data_utils.py +++ b/flash/data/data_utils.py @@ -1,4 +1,5 @@ -from typing import Dict, List, Union, Any +from typing import Any, Dict, List, Union + import pandas as pd From f435f8f25765aaceb64fe9d6bb92a6f5b53b645e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 23:31:02 -0500 Subject: [PATCH 8/8] added .csv image loading utils --- flash/vision/classification/data.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/flash/vision/classification/data.py b/flash/vision/classification/data.py index 9af8801e804..6f90f2571d9 100644 --- a/flash/vision/classification/data.py +++ b/flash/vision/classification/data.py @@ -329,9 +329,11 @@ def from_filepaths( full_length = len(train_ds) train_split = int((1.0 - valid_split) * full_length) valid_split = full_length - train_split - train_ds, valid_ds = torch.utils.data.random_split(train_ds, - [train_split, valid_split], - generator=torch.Generator().manual_seed(seed)) + train_ds, valid_ds = torch.utils.data.random_split( + train_ds, + [train_split, valid_split], + generator=torch.Generator().manual_seed(seed) + ) else: valid_ds = ( FilepathDataset(