Lightning-Universe · tchaton · Apr 30, 2021 · Apr 15, 2021 · Apr 15, 2021 · Apr 15, 2021
@@ -42,6 +42,15 @@ jobs:
       run: |
         python -c "req = open('requirements.txt').read().replace('>', '=') ; open('requirements.txt', 'w').write(req)"
 
+    - name: Filter requirements
+      run: |
+        # Drop the pytorchvideo requirement on interpreters older than 3.7
+        # (presumably it is not installable there -- confirm).
+        import sys
+        # Compare (major, minor) together instead of the minor number alone.
+        if sys.version_info < (3, 7):
+          fname = 'requirements.txt'
+          # Read everything first and close the handle before re-opening for writing.
+          with open(fname) as fp:
+            kept = [line for line in fp if not line.startswith('pytorchvideo')]
+          with open(fname, 'w') as fp:
+            fp.writelines(kept)
+      shell: python
+
     # Note: This uses an internal pip API and may not always work
     # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
     - name: Get pip cache

@@ -148,3 +148,4 @@ imdb
 xsum
 coco128
 wmt_en_ro
+action_youtube_naudio
@@ -14,6 +14,7 @@
 from typing import Any
 
 import torch
+import torch.nn.functional as F
 
 from flash.core.model import Task
 from flash.data.process import Postprocess
@@ -29,3 +30,6 @@ class ClassificationTask(Task):
 
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, default_postprocess=ClassificationPostprocess(), **kwargs)
+
+    def to_metrics_format(self, x: torch.Tensor) -> torch.Tensor:
+        return F.softmax(x, -1)
@@ -96,6 +96,7 @@ def step(self, batch: Any, batch_idx: int) -> Any:
         output = {"y_hat": y_hat}
         losses = {name: l_fn(y_hat, y) for name, l_fn in self.loss_fn.items()}
         logs = {}
+        y_hat = self.to_metrics_format(y_hat)
         for name, metric in self.metrics.items():
             if isinstance(metric, torchmetrics.metric.Metric):
                 metric(y_hat, y)
@@ -111,6 +112,9 @@ def step(self, batch: Any, batch_idx: int) -> Any:
         output["y"] = y
         return output
 
+    def to_metrics_format(self, x: torch.Tensor) -> torch.Tensor:
+        """Hook that ``step`` applies to ``y_hat`` before the metrics are computed.
+
+        The default implementation returns ``x`` unchanged.
+        """
+        return x
+
     def forward(self, x: Any) -> Any:
         return self.model(x)
 
@@ -172,10 +176,10 @@ def configure_finetune_callback(self) -> List[Callback]:
 
     @staticmethod
     def _resolve(
-            old_preprocess: Optional[Preprocess],
-            old_postprocess: Optional[Postprocess],
-            new_preprocess: Optional[Preprocess],
-            new_postprocess: Optional[Postprocess],
+        old_preprocess: Optional[Preprocess],
+        old_postprocess: Optional[Postprocess],
+        new_preprocess: Optional[Preprocess],
+        new_postprocess: Optional[Postprocess],
     ) -> Tuple[Optional[Preprocess], Optional[Postprocess]]:
         """Resolves the correct :class:`.Preprocess` and :class:`.Postprocess` to use, choosing ``new_*`` if it is not
         None or a base class (:class:`.Preprocess` or :class:`.Postprocess`) and ``old_*`` otherwise.
@@ -308,3 +312,10 @@ def available_backbones(cls) -> List[str]:
         if registry is None:
             return []
         return registry.available_keys()
+
+    @classmethod
+    def available_models(cls) -> List[str]:
+        registry: Optional[FlashRegistry] = getattr(cls, "models", None)
+        if registry is None:
+            return []
+        return registry.available_keys()
diff --git a/flash/data/auto_dataset.py b/flash/data/auto_dataset.py
@@ -11,13 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from contextlib import contextmanager
 from inspect import signature
-from typing import Any, Callable, Iterable, Optional, TYPE_CHECKING
+from typing import Any, Callable, Iterable, Iterator, Optional, TYPE_CHECKING
 
+import torch
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.utilities.warning_utils import rank_zero_warn
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, IterableDataset
 
 from flash.data.callback import ControlFlow
 from flash.data.process import Preprocess
@@ -27,13 +27,13 @@
     from flash.data.data_pipeline import DataPipeline
 
 
-class AutoDataset(Dataset):
+class BaseAutoDataset:
 
     DATASET_KEY = "dataset"
     """
         This class is used to encapsulate a Preprocess Object ``load_data`` and ``load_sample`` functions.
         ``load_data`` will be called within the ``__init__`` function of the AutoDataset if ``running_stage``
-        is provided and ``load_sample`` within ``__getitem__`` function.
+        is provided and ``load_sample`` within ``__getitem__``.
     """
 
     def __init__(
@@ -103,6 +103,12 @@ def _call_load_sample(self, sample: Any) -> Any:
         else:
             return self.load_sample(sample)
 
+    def _setup(self, stage: Optional[RunningStage]) -> None:
+        """Stage-dependent setup hook; the base class has no implementation, so subclasses must override it."""
+        raise NotImplementedError
+
+
+class AutoDataset(BaseAutoDataset, Dataset):
+
     def _setup(self, stage: Optional[RunningStage]) -> None:
         assert not stage or _STAGES_PREFIX[stage] in _STAGES_PREFIX_VALUES
         previous_load_data = self.load_data.__code__ if self.load_data else None
@@ -141,3 +147,48 @@ def __len__(self) -> int:
         if not self.load_sample and not self.load_data:
             raise RuntimeError("`__len__` for `load_sample` and `load_data` could not be inferred.")
         return len(self.preprocessed_data)
+
+
+class IterableAutoDataset(BaseAutoDataset, IterableDataset):
+
+    def _setup(self, stage: Optional[RunningStage]) -> None:
+        assert not stage or _STAGES_PREFIX[stage] in _STAGES_PREFIX_VALUES
+        previous_load_data = self.load_data.__code__ if self.load_data else None
+
+        if self._running_stage and self.data_pipeline and (not self.load_data or not self.load_sample) and stage:
+            self.load_data = getattr(
+                self.preprocess,
+                self.data_pipeline._resolve_function_hierarchy('load_data', self.preprocess, stage, Preprocess)
+            )
+            self.load_sample = getattr(
+                self.preprocess,
+                self.data_pipeline._resolve_function_hierarchy('load_sample', self.preprocess, stage, Preprocess)
+            )
+        if self.load_data and (previous_load_data != self.load_data.__code__ or not self._load_data_called):
+            if previous_load_data:
+                rank_zero_warn(
+                    "The load_data function of the Autogenerated Dataset changed. "
+                    "This is not expected! Preloading Data again to ensure compatibility. This may take some time."
+                )
+            with self._load_data_context:
+                self.dataset = self._call_load_data(self.data)
+                self.dataset_iter = None
+            self._load_data_called = True
+
+    def __iter__(self):
+        self.dataset_iter = iter(self.dataset)
+        return self
+
+    def __next__(self) -> Any:
+        if not self.load_sample and not self.load_data:
+            raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
+
+        data = next(self.dataset_iter)
+
+        if self.load_sample:
+            with self._load_sample_context:
+                data: Any = self._call_load_sample(data)
+                if self.control_flow_callback:
+                    self.control_flow_callback.on_load_sample(data, self.running_stage)
+                return data
+        return data
diff --git a/flash/data/data_module.py b/flash/data/data_module.py
@@ -21,9 +21,9 @@
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from torch.nn import Module
 from torch.utils.data import DataLoader, Dataset
-from torch.utils.data.dataset import Subset
+from torch.utils.data.dataset import IterableDataset, Subset
 
-from flash.data.auto_dataset import AutoDataset
+from flash.data.auto_dataset import BaseAutoDataset, IterableAutoDataset
 from flash.data.base_viz import BaseVisualization
 from flash.data.callback import BaseDataFetcher
 from flash.data.data_pipeline import DataPipeline, Postprocess, Preprocess
@@ -207,15 +207,16 @@ def set_running_stages(self):
             self.set_dataset_attribute(self._predict_ds, 'running_stage', RunningStage.PREDICTING)
 
     def _resolve_collate_fn(self, dataset: Dataset, running_stage: RunningStage) -> Optional[Callable]:
+        """Return the worker preprocessor for ``running_stage`` if ``dataset`` is an auto dataset, else ``None``."""
-        if isinstance(dataset, AutoDataset):
+        if isinstance(dataset, BaseAutoDataset):
             return self.data_pipeline.worker_preprocessor(running_stage)
 
     def _train_dataloader(self) -> DataLoader:
         train_ds: Dataset = self._train_ds() if isinstance(self._train_ds, Callable) else self._train_ds
+        shuffle = not isinstance(train_ds, (IterableDataset, IterableAutoDataset))
         return DataLoader(
             train_ds,
             batch_size=self.batch_size,
-            shuffle=True,
+            shuffle=shuffle,
             num_workers=self.num_workers,
             pin_memory=True,
             drop_last=True,
@@ -258,6 +259,13 @@ def generate_auto_dataset(self, *args, **kwargs):
             return None
         return self.data_pipeline._generate_auto_dataset(*args, **kwargs)
 
+    @property
+    def num_classes(self) -> Optional[int]:
+        return (
+            getattr(self.train_dataset, "num_classes", None) or getattr(self.val_dataset, "num_classes", None)
+            or getattr(self.test_dataset, "num_classes", None)
+        )
+
     @property
     def preprocess(self) -> Preprocess:
         return self._preprocess or self.preprocess_cls()
@@ -287,9 +295,10 @@ def autogenerate_dataset(
         whole_data_load_fn: Optional[Callable] = None,
         per_sample_load_fn: Optional[Callable] = None,
         data_pipeline: Optional[DataPipeline] = None,
-    ) -> AutoDataset:
+        use_iterable_auto_dataset: bool = False,
+    ) -> Union[BaseAutoDataset]:
         """
-        This function is used to generate an ``AutoDataset`` from a ``DataPipeline`` if provided
+        This function is used to generate a ``BaseAutoDataset`` from a ``DataPipeline`` if provided
         or from the provided ``whole_data_load_fn``, ``per_sample_load_fn`` functions directly
         """
 
@@ -304,7 +313,11 @@ def autogenerate_dataset(
                 cls.preprocess_cls,
                 DataPipeline._resolve_function_hierarchy('load_sample', cls.preprocess_cls, running_stage, Preprocess)
             )
-        return AutoDataset(data, whole_data_load_fn, per_sample_load_fn, data_pipeline, running_stage=running_stage)
+        if use_iterable_auto_dataset:
+            return IterableAutoDataset(
+                data, whole_data_load_fn, per_sample_load_fn, data_pipeline, running_stage=running_stage
+            )
+        return BaseAutoDataset(data, whole_data_load_fn, per_sample_load_fn, data_pipeline, running_stage=running_stage)
 
     @staticmethod
     def train_val_test_split(
@@ -374,15 +387,27 @@ def _generate_dataset_if_possible(
         running_stage: RunningStage,
         whole_data_load_fn: Optional[Callable] = None,
         per_sample_load_fn: Optional[Callable] = None,
-        data_pipeline: Optional[DataPipeline] = None
-    ) -> Optional[AutoDataset]:
+        data_pipeline: Optional[DataPipeline] = None,
+        use_iterable_auto_dataset: bool = False,
+    ) -> Optional[BaseAutoDataset]:
         if data is None:
             return
 
         if data_pipeline:
-            return data_pipeline._generate_auto_dataset(data, running_stage=running_stage)
+            return data_pipeline._generate_auto_dataset(
+                data,
+                running_stage=running_stage,
+                use_iterable_auto_dataset=use_iterable_auto_dataset,
+            )
 
-        return cls.autogenerate_dataset(data, running_stage, whole_data_load_fn, per_sample_load_fn, data_pipeline)
+        return cls.autogenerate_dataset(
+            data,
+            running_stage,
+            whole_data_load_fn,
+            per_sample_load_fn,
+            data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset,
+        )
 
     @classmethod
     def from_load_data_inputs(
@@ -393,6 +418,7 @@ def from_load_data_inputs(
         predict_load_data_input: Optional[Any] = None,
         preprocess: Optional[Preprocess] = None,
         postprocess: Optional[Postprocess] = None,
+        use_iterable_auto_dataset: bool = False,
         **kwargs,
     ) -> 'DataModule':
         """
@@ -424,16 +450,28 @@ def from_load_data_inputs(
         data_fetcher.attach_to_preprocess(data_pipeline._preprocess_pipeline)
 
         train_dataset = cls._generate_dataset_if_possible(
-            train_load_data_input, running_stage=RunningStage.TRAINING, data_pipeline=data_pipeline
+            train_load_data_input,
+            running_stage=RunningStage.TRAINING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset,
         )
         val_dataset = cls._generate_dataset_if_possible(
-            val_load_data_input, running_stage=RunningStage.VALIDATING, data_pipeline=data_pipeline
+            val_load_data_input,
+            running_stage=RunningStage.VALIDATING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset,
         )
         test_dataset = cls._generate_dataset_if_possible(
-            test_load_data_input, running_stage=RunningStage.TESTING, data_pipeline=data_pipeline
+            test_load_data_input,
+            running_stage=RunningStage.TESTING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset,
         )
         predict_dataset = cls._generate_dataset_if_possible(
-            predict_load_data_input, running_stage=RunningStage.PREDICTING, data_pipeline=data_pipeline
+            predict_load_data_input,
+            running_stage=RunningStage.PREDICTING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset,
         )
         datamodule = cls(
             train_dataset=train_dataset,
-Original file line number
+Diff line change
@@ Expand Up / @@ -148,3 +148,4 @@ imdb @@
     xsum
     coco128
     wmt_en_ro
+    action_youtube_naudio