Lightning-Universe · tchaton · Apr 30, 2021 · Apr 15, 2021 · Apr 15, 2021 · Apr 15, 2021
diff --git a/flash/data/auto_dataset.py b/flash/data/auto_dataset.py
@@ -11,13 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from contextlib import contextmanager
 from inspect import signature
 from typing import Any, Callable, Iterable, Optional, TYPE_CHECKING
 
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.utilities.warning_utils import rank_zero_warn
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, IterableDataset
 
 from flash.data.callback import ControlFlow
 from flash.data.process import Preprocess
@@ -123,21 +122,139 @@ def _setup(self, stage: Optional[RunningStage]) -> None:
                     "This is not expected! Preloading Data again to ensure compatibility. This may take some time."
                 )
             with self._load_data_context:
-                self.preprocessed_data = self._call_load_data(self.data)
+                self.processed_data = self._call_load_data(self.data)
             self._load_data_called = True
 
     def __getitem__(self, index: int) -> Any:
+        """Return the item at ``index`` from the data produced by ``load_data``.
+
+        When ``load_sample`` is set, it is applied to the item inside the ``load_sample`` context and the
+        control-flow callback (if there is one) is notified through ``on_load_sample``. Otherwise the item
+        is returned unchanged.
+
+        Raises:
+            RuntimeError: If neither ``load_sample`` nor ``load_data`` is set.
+        """
         if not self.load_sample and not self.load_data:
             raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
         if self.load_sample:
             with self._load_sample_context:
-                data: Any = self._call_load_sample(self.preprocessed_data[index])
+                data: Any = self._call_load_sample(self.processed_data[index])
                 if self.control_flow_callback:
                     self.control_flow_callback.on_load_sample(data, self.running_stage)
                 return data
-        return self.preprocessed_data[index]
+        return self.processed_data[index]
 
     def __len__(self) -> int:
+        """Return the number of items in the data produced by ``load_data``.
+
+        Raises:
+            RuntimeError: If neither ``load_sample`` nor ``load_data`` is set.
+        """
         if not self.load_sample and not self.load_data:
             raise RuntimeError("`__len__` for `load_sample` and `load_data` could not be inferred.")
-        return len(self.preprocessed_data)
+        return len(self.processed_data)
+
+
+class IterableAutoDataset(IterableDataset):
+    """Iterable variant of ``AutoDataset`` driven by ``load_data`` and ``load_sample`` functions.
+
+    ``load_data`` is invoked from ``_setup``, which runs whenever ``running_stage`` is assigned
+    (``__init__`` assigns it once). Its result is stored as ``self.sampler`` and must be iterable.
+    ``load_sample``, when set, is applied to every element drawn from ``self.sampler`` in ``__next__``.
+
+    A load function that takes more than one parameter and names one of them ``dataset``
+    (see ``DATASET_KEY``) additionally receives this dataset instance as its second argument.
+
+    Args:
+        data: Input forwarded to ``load_data``.
+        load_data: Optional callable turning ``data`` into an iterable of samples.
+        load_sample: Optional callable applied to each element of that iterable.
+        data_pipeline: Optional ``DataPipeline``. When a running stage is set and at least one of
+            ``load_data`` / ``load_sample`` is missing, both functions are resolved from its preprocess.
+        running_stage: Optional stage used to resolve the stage-specific functions.
+    """
+
+    DATASET_KEY = "dataset"
+
+    def __init__(
+        self,
+        data: Any,
+        load_data: Optional[Callable] = None,
+        load_sample: Optional[Callable] = None,
+        data_pipeline: Optional['DataPipeline'] = None,
+        running_stage: Optional[RunningStage] = None
+    ) -> None:
+        super().__init__()
+
+        if (load_data or load_sample) and data_pipeline:
+            rank_zero_warn(
+                "``datapipeline`` is specified but load_sample and/or load_data are also specified. "
+                "Won't use datapipeline"
+            )
+        # initial states
+        self._load_data_called = False
+        self._running_stage = None
+        # Defined up front so ``__iter__`` / ``__next__`` can detect that ``load_data`` never ran
+        # instead of failing with an ``AttributeError``.
+        self.sampler = None
+        self.sampler_iter = None
+
+        self.data = data
+        self.data_pipeline = data_pipeline
+        self.load_data = load_data
+        self.load_sample = load_sample
+
+        # Must come last: the setter builds the contexts and calls ``_setup``,
+        # which reads the attributes assigned above.
+        self.running_stage = running_stage
+
+    @property
+    def running_stage(self) -> Optional[RunningStage]:
+        """Stage this dataset is currently configured for."""
+        return self._running_stage
+
+    @running_stage.setter
+    def running_stage(self, running_stage: RunningStage) -> None:
+        # Rebuild the contexts and re-run the setup when the stage changes or no stage was set so far.
+        if self._running_stage != running_stage or (not self._running_stage):
+            self._running_stage = running_stage
+            self._load_data_context = CurrentRunningStageFuncContext(self._running_stage, "load_data", self.preprocess)
+            self._load_sample_context = CurrentRunningStageFuncContext(
+                self._running_stage, "load_sample", self.preprocess
+            )
+            self._setup(running_stage)
+
+    @property
+    def preprocess(self) -> Optional[Preprocess]:
+        """Preprocess object of the attached data pipeline, or ``None`` without a pipeline."""
+        if self.data_pipeline is not None:
+            return self.data_pipeline._preprocess_pipeline
+        return None
+
+    @property
+    def control_flow_callback(self) -> Optional[ControlFlow]:
+        """``ControlFlow`` wrapping the preprocess callbacks, or ``None`` without a preprocess."""
+        preprocess = self.preprocess
+        if preprocess is not None:
+            return ControlFlow(preprocess.callbacks)
+        return None
+
+    def _call_load_data(self, data: Any) -> Iterable:
+        """Call ``load_data``, also passing this dataset when its signature asks for ``dataset``."""
+        parameters = signature(self.load_data).parameters
+        if len(parameters) > 1 and self.DATASET_KEY in parameters:
+            return self.load_data(data, self)
+        return self.load_data(data)
+
+    def _call_load_sample(self, sample: Any) -> Any:
+        """Call ``load_sample``, also passing this dataset when its signature asks for ``dataset``."""
+        parameters = signature(self.load_sample).parameters
+        if len(parameters) > 1 and self.DATASET_KEY in parameters:
+            return self.load_sample(sample, self)
+        return self.load_sample(sample)
+
+    def _setup(self, stage: Optional[RunningStage]) -> None:
+        """Resolve the load functions for ``stage`` and (re)run ``load_data`` when required.
+
+        ``load_data`` runs when it has not been called yet or when its code object differs from the
+        one held on entry. Its result becomes ``self.sampler`` and the running iterator is discarded.
+        """
+        assert not stage or _STAGES_PREFIX[stage] in _STAGES_PREFIX_VALUES
+        previous_load_data = self.load_data.__code__ if self.load_data else None
+
+        if self._running_stage and self.data_pipeline and (not self.load_data or not self.load_sample) and stage:
+            self.load_data = getattr(
+                self.preprocess,
+                self.data_pipeline._resolve_function_hierarchy('load_data', self.preprocess, stage, Preprocess)
+            )
+            self.load_sample = getattr(
+                self.preprocess,
+                self.data_pipeline._resolve_function_hierarchy('load_sample', self.preprocess, stage, Preprocess)
+            )
+        if self.load_data and (previous_load_data != self.load_data.__code__ or not self._load_data_called):
+            # NOTE(review): this also warns on the very first load when ``load_data`` was passed
+            # explicitly, although nothing changed - confirm whether that is intended.
+            if previous_load_data:
+                rank_zero_warn(
+                    "The load_data function of the Autogenerated Dataset changed. "
+                    "This is not expected! Preloading Data again to ensure compatibility. This may take some time."
+                )
+            with self._load_data_context:
+                self.sampler = self._call_load_data(self.data)
+                self.sampler_iter = None
+            self._load_data_called = True
+
+    def _check_can_iterate(self) -> None:
+        """Raise ``RuntimeError`` when there is nothing to iterate over."""
+        if not self.load_sample and not self.load_data:
+            raise RuntimeError("`__next__` for `load_sample` and `load_data` could not be inferred.")
+        if self.sampler is None:
+            raise RuntimeError("`load_data` did not provide any data to iterate over.")
+
+    def __iter__(self) -> 'IterableAutoDataset':
+        """Restart iteration over ``self.sampler`` and return this dataset as the iterator.
+
+        ``IterableDataset`` leaves ``__iter__`` unimplemented, so it has to be provided here
+        for the dataset to be consumable by ``iter()`` / a ``DataLoader``.
+
+        Raises:
+            RuntimeError: If no load function is set or ``load_data`` produced no data.
+        """
+        self._check_can_iterate()
+        self.sampler_iter = iter(self.sampler)
+        return self
+
+    def __next__(self) -> Any:
+        """Return the next element of ``self.sampler``, passed through ``load_sample`` when set.
+
+        Raises:
+            RuntimeError: If no load function is set or ``load_data`` produced no data.
+            StopIteration: When ``self.sampler`` is exhausted.
+        """
+        self._check_can_iterate()
+
+        # Supports calling ``next(dataset)`` directly without a prior ``iter(dataset)``.
+        if self.sampler_iter is None:
+            self.sampler_iter = iter(self.sampler)
+
+        data = next(self.sampler_iter)
+
+        if not self.load_sample:
+            return data
+
+        with self._load_sample_context:
+            data = self._call_load_sample(data)
+            # The property builds a new ``ControlFlow`` on every access, so read it once.
+            callback = self.control_flow_callback
+            if callback:
+                callback.on_load_sample(data, self.running_stage)
+            return data
diff --git a/flash/data/data_module.py b/flash/data/data_module.py
@@ -23,7 +23,7 @@
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataset import Subset
 
-from flash.data.auto_dataset import AutoDataset
+from flash.data.auto_dataset import AutoDataset, IterableAutoDataset
 from flash.data.base_viz import BaseVisualization
 from flash.data.callback import BaseDataFetcher
 from flash.data.data_pipeline import DataPipeline, Postprocess, Preprocess
@@ -287,7 +287,8 @@ def autogenerate_dataset(
         whole_data_load_fn: Optional[Callable] = None,
         per_sample_load_fn: Optional[Callable] = None,
         data_pipeline: Optional[DataPipeline] = None,
-    ) -> AutoDataset:
+        use_iterable_auto_dataset: bool = False,
+    ) -> Union[AutoDataset, IterableAutoDataset]:
         """
         This function is used to generate an ``AutoDataset`` from a ``DataPipeline`` if provided
         or from the provided ``whole_data_load_fn``, ``per_sample_load_fn`` functions directly
@@ -304,6 +305,10 @@ def autogenerate_dataset(
                 cls.preprocess_cls,
                 DataPipeline._resolve_function_hierarchy('load_sample', cls.preprocess_cls, running_stage, Preprocess)
             )
+        if use_iterable_auto_dataset:
+            return IterableAutoDataset(
+                data, whole_data_load_fn, per_sample_load_fn, data_pipeline, running_stage=running_stage
+            )
         return AutoDataset(data, whole_data_load_fn, per_sample_load_fn, data_pipeline, running_stage=running_stage)
 
     @staticmethod
@@ -374,15 +379,25 @@ def _generate_dataset_if_possible(
         running_stage: RunningStage,
         whole_data_load_fn: Optional[Callable] = None,
         per_sample_load_fn: Optional[Callable] = None,
-        data_pipeline: Optional[DataPipeline] = None
+        data_pipeline: Optional[DataPipeline] = None,
+        use_iterable_auto_dataset: bool = False,
     ) -> Optional[AutoDataset]:
         if data is None:
             return
 
         if data_pipeline:
-            return data_pipeline._generate_auto_dataset(data, running_stage=running_stage)
+            return data_pipeline._generate_auto_dataset(
+                data, running_stage=running_stage, use_iterable_auto_dataset=use_iterable_auto_dataset
+            )
 
-        return cls.autogenerate_dataset(data, running_stage, whole_data_load_fn, per_sample_load_fn, data_pipeline)
+        return cls.autogenerate_dataset(
+            data,
+            running_stage,
+            whole_data_load_fn,
+            per_sample_load_fn,
+            data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset
+        )
 
     @classmethod
     def from_load_data_inputs(
@@ -393,6 +408,7 @@ def from_load_data_inputs(
         predict_load_data_input: Optional[Any] = None,
         preprocess: Optional[Preprocess] = None,
         postprocess: Optional[Postprocess] = None,
+        use_iterable_auto_dataset: bool = False,
         **kwargs,
     ) -> 'DataModule':
         """
@@ -424,16 +440,28 @@ def from_load_data_inputs(
         data_fetcher.attach_to_preprocess(data_pipeline._preprocess_pipeline)
 
         train_dataset = cls._generate_dataset_if_possible(
-            train_load_data_input, running_stage=RunningStage.TRAINING, data_pipeline=data_pipeline
+            train_load_data_input,
+            running_stage=RunningStage.TRAINING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset
         )
         val_dataset = cls._generate_dataset_if_possible(
-            val_load_data_input, running_stage=RunningStage.VALIDATING, data_pipeline=data_pipeline
+            val_load_data_input,
+            running_stage=RunningStage.VALIDATING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset
         )
         test_dataset = cls._generate_dataset_if_possible(
-            test_load_data_input, running_stage=RunningStage.TESTING, data_pipeline=data_pipeline
+            test_load_data_input,
+            running_stage=RunningStage.TESTING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset
         )
         predict_dataset = cls._generate_dataset_if_possible(
-            predict_load_data_input, running_stage=RunningStage.PREDICTING, data_pipeline=data_pipeline
+            predict_load_data_input,
+            running_stage=RunningStage.PREDICTING,
+            data_pipeline=data_pipeline,
+            use_iterable_auto_dataset=use_iterable_auto_dataset
         )
         datamodule = cls(
             train_dataset=train_dataset,

diff --git a/flash/data/data_pipeline.py b/flash/data/data_pipeline.py
@@ -23,7 +23,7 @@
 from torch.utils.data._utils.collate import default_collate, default_convert
 from torch.utils.data.dataloader import DataLoader
 
-from flash.data.auto_dataset import AutoDataset
+from flash.data.auto_dataset import AutoDataset, IterableAutoDataset
 from flash.data.batch import _PostProcessor, _PreProcessor, _Sequential
 from flash.data.process import Postprocess, Preprocess
 from flash.data.utils import _POSTPROCESS_FUNCS, _PREPROCESS_FUNCS, _STAGES_PREFIX
@@ -458,7 +458,16 @@ def fn():
 
         return fn
 
-    def _generate_auto_dataset(self, data: Union[Iterable, Any], running_stage: RunningStage = None) -> AutoDataset:
+    def _generate_auto_dataset(
+        self,
+        data: Union[Iterable, Any],
+        running_stage: RunningStage = None,
+        use_iterable_auto_dataset: bool = False
+    ) -> Union[AutoDataset, IterableAutoDataset]:
+        """Wrap ``data`` in a dataset bound to this pipeline.
+
+        Args:
+            data: Input handed to the dataset.
+            running_stage: Stage forwarded to the dataset.
+            use_iterable_auto_dataset: When ``True`` an ``IterableAutoDataset`` is built,
+                otherwise an ``AutoDataset``.
+
+        Returns:
+            The dataset, created with ``data_pipeline=self``.
+        """
+        if use_iterable_auto_dataset:
+            # Only ``data`` and ``running_stage`` exist in this scope; the load functions are
+            # left for the dataset to resolve from this pipeline, exactly as for ``AutoDataset``.
+            return IterableAutoDataset(data=data, data_pipeline=self, running_stage=running_stage)
         return AutoDataset(data=data, data_pipeline=self, running_stage=running_stage)
 
     def to_dataloader(

@@ -5,3 +5,4 @@
 _COCO_AVAILABLE = _module_available("pycocotools")
 _TIMM_AVAILABLE = _module_available("timm")
 _TORCHVISION_AVAILABLE = _module_available("torchvision")
+_PYTORCH_VIDEO_AVAILABLE = _module_available("pytorchvideo")
diff --git a/flash/vision/video/__init__.py b/flash/vision/video/__init__.py
@@ -0,0 +1,2 @@
+from flash.vision.video.classification.data import VideoClassificationData
+from flash.vision.video.classification.model import VideoClassifier
diff --git a/flash/vision/video/classification/__init__.py b/flash/vision/video/classification/__init__.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from flash.vision.video.classification.data import VideoClassificationData
		from flash.vision.video.classification.model import VideoClassifier