This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

[1/N] Data Sources #256

Merged: 84 commits into master from feature/data_sources on May 7, 2021
Note: the diff below shows changes from 10 of the 84 commits.

Commits (84)
735740e Initial commit (ethanwharris, Apr 28, 2021)
be01397 POC Initial commit (ethanwharris, Apr 29, 2021)
214df85 Remove unused code (ethanwharris, Apr 29, 2021)
8f93bfb Some fixes (ethanwharris, Apr 30, 2021)
e8ee4c0 Simplify data source (ethanwharris, Apr 30, 2021)
653057d Expand preprocess (ethanwharris, Apr 30, 2021)
0184332 Fixes (ethanwharris, Apr 30, 2021)
5172a06 Fixes (ethanwharris, Apr 30, 2021)
5c3f597 Cleaning (ethanwharris, Apr 30, 2021)
44d70e1 Fixes (ethanwharris, Apr 30, 2021)
08657ea Remove un-needed code (ethanwharris, May 4, 2021)
73be792 Remove sequence data source (ethanwharris, May 4, 2021)
3381840 Simplify data source (ethanwharris, May 4, 2021)
e01987d Fix FilesDataSource (ethanwharris, May 4, 2021)
e385dfa Minor fix (ethanwharris, May 4, 2021)
dc90754 Add numpy and tensor data sources (ethanwharris, May 4, 2021)
c437043 Fixes (ethanwharris, May 4, 2021)
b32ee34 Onboard object detection (ethanwharris, May 5, 2021)
bfd320d update (tchaton, May 5, 2021)
7e050be Add text classification (ethanwharris, May 5, 2021)
34c41d4 Merge branch 'feature/data_sources' of https://github.com/PyTorchLigh… (ethanwharris, May 5, 2021)
6e0f69d Small update (ethanwharris, May 5, 2021)
a2082bc Add tabular (ethanwharris, May 5, 2021)
fd07644 Fixes (ethanwharris, May 5, 2021)
19e966d Fixes (ethanwharris, May 5, 2021)
3b7ab0e Add summarization example (ethanwharris, May 6, 2021)
d9c00c5 Add translation (ethanwharris, May 6, 2021)
2da3339 Merge branch 'master' into feature/data_sources (ethanwharris, May 6, 2021)
d5b8c4a assert empty data_source in datapipeline creation (edgarriba, May 6, 2021)
dd35da6 add more assertions for test_classification_task_predict_folder_path (edgarriba, May 6, 2021)
f2c3f20 Add video (ethanwharris, May 6, 2021)
e186926 Merge branch 'feature/data_sources' of https://github.com/PyTorchLigh… (ethanwharris, May 6, 2021)
83024bb add smoke tests for autodataset (edgarriba, May 6, 2021)
8309080 improve autodataset test (edgarriba, May 6, 2021)
f1c44a1 Fix some tests (ethanwharris, May 6, 2021)
47e8f3f Fix a test (ethanwharris, May 6, 2021)
f3a238e Fixes (ethanwharris, May 6, 2021)
eb5cfdd add tests for base and iterable (edgarriba, May 6, 2021)
a997b9d add todo with detected error in callbacks test (edgarriba, May 6, 2021)
b18f0fd fix test_data_pipeline_init_and_assignement (edgarriba, May 6, 2021)
bda0a12 fix test_data_pipeline_is_overriden_and_resolve_function_hierarchy (edgarriba, May 6, 2021)
e4a4f8a Fix some tests (ethanwharris, May 6, 2021)
f5f000f Merge branch 'feature/data_sources' of https://github.com/PyTorchLigh… (ethanwharris, May 6, 2021)
464fffe Fix some tests (ethanwharris, May 6, 2021)
e7d6b66 Fix some tests (ethanwharris, May 6, 2021)
cc57f86 Fix a test (ethanwharris, May 6, 2021)
3a63083 Fixes (ethanwharris, May 6, 2021)
3489953 Fixes (ethanwharris, May 6, 2021)
1ccf7ab Fixes (ethanwharris, May 6, 2021)
64aff9e Fixes (ethanwharris, May 6, 2021)
33506b3 Fixes (ethanwharris, May 6, 2021)
1f50432 deprecate csv test for image classification (edgarriba, May 6, 2021)
6b587fe Fix video (ethanwharris, May 6, 2021)
2794a98 Merge branch 'feature/data_sources' of https://github.com/PyTorchLigh… (ethanwharris, May 6, 2021)
4215a47 fix test_from_filepaths_splits (edgarriba, May 6, 2021)
0256c04 Fixes (ethanwharris, May 6, 2021)
9806b85 Merge branch 'feature/data_sources' of https://github.com/PyTorchLigh… (ethanwharris, May 6, 2021)
1d5c41b Fixes (ethanwharris, May 6, 2021)
8064c65 Fixes (ethanwharris, May 6, 2021)
a32560c Fixes (ethanwharris, May 6, 2021)
4d34d94 Fixes (ethanwharris, May 6, 2021)
c85a8db Fixes (ethanwharris, May 6, 2021)
c93a649 Fixes (ethanwharris, May 6, 2021)
02fd77b Fixes (ethanwharris, May 6, 2021)
3d780fa Fix docs build (ethanwharris, May 6, 2021)
704f558 Fix docs build (ethanwharris, May 7, 2021)
edfc38e Fix examples (ethanwharris, May 7, 2021)
4679cb5 Fixes (ethanwharris, May 7, 2021)
05a1e98 Fixes (ethanwharris, May 7, 2021)
46b6a4f Bump huggingface minimal (ethanwharris, May 7, 2021)
5b2013e debugging (ethanwharris, May 7, 2021)
75f3469 debugging (ethanwharris, May 7, 2021)
950b13f Fixes (ethanwharris, May 7, 2021)
f47208c Fixes (ethanwharris, May 7, 2021)
db0c991 Respond to comments (ethanwharris, May 7, 2021)
db1cdf1 feedback (ethanwharris, May 7, 2021)
88cbc65 Updates (ethanwharris, May 7, 2021)
4ee1dd4 Fixes (ethanwharris, May 7, 2021)
ce3fcf2 Fixes (ethanwharris, May 7, 2021)
ed22b10 revert (ethanwharris, May 7, 2021)
f453d03 Updates (ethanwharris, May 7, 2021)
1ae8c56 Fixes (ethanwharris, May 7, 2021)
1088022 Fixes (ethanwharris, May 7, 2021)
9032be4 Fixes (ethanwharris, May 7, 2021)
14 changes: 5 additions & 9 deletions flash/core/classification.py
@@ -20,20 +20,15 @@
 from pytorch_lightning.utilities import rank_zero_warn
 
 from flash.core.model import Task
-from flash.data.process import ProcessState, Serializer
+from flash.data.data_source import LabelsState
+from flash.data.process import Serializer
 
 
 def binary_cross_entropy_with_logits(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     """Calls BCE with logits and cast the target one_hot (y) encoding to floating point precision."""
     return F.binary_cross_entropy_with_logits(x, y.float())
 
 
-@dataclass(unsafe_hash=True, frozen=True)
-class ClassificationState(ProcessState):
-
-    labels: Optional[List[str]]
-
-
 class ClassificationTask(Task):
 
     def __init__(
@@ -140,15 +135,16 @@ class Labels(Classes):
     def __init__(self, labels: Optional[List[str]] = None, multi_label: bool = False, threshold: float = 0.5):
         super().__init__(multi_label=multi_label, threshold=threshold)
         self._labels = labels
-        self.set_state(ClassificationState(labels))
+        if labels is not None:
+            self.set_state(LabelsState(labels))
 
     def serialize(self, sample: Any) -> Union[int, List[int], str, List[str]]:
         labels = None
 
         if self._labels is not None:
             labels = self._labels
         else:
-            state = self.get_state(ClassificationState)
+            state = self.get_state(LabelsState)
             if state is not None:
                 labels = state.labels
 
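The upshot of this file's change: the serializer-local `ClassificationState` is replaced by the shared `LabelsState` from `flash.data.data_source`, and `Labels` only publishes state when labels were actually provided. A minimal, self-contained sketch of the resolution order in `serialize` (the classes below are simplified stand-ins for illustration, not the flash API):

```python
from typing import Any, List, Optional


class LabelsState:
    """Stand-in for flash.data.data_source.LabelsState: shared label metadata."""

    def __init__(self, labels: Optional[List[str]]):
        self.labels = labels


class LabelsSerializerSketch:
    """Mimics Labels.serialize: explicit labels win, then shared state, then the raw index."""

    def __init__(self, labels: Optional[List[str]] = None):
        self._labels = labels
        self._state = {}  # stand-in for the process-state mechanism
        if labels is not None:
            self.set_state(LabelsState(labels))

    def set_state(self, state: Any) -> None:
        self._state[type(state)] = state

    def get_state(self, state_type: type) -> Optional[Any]:
        return self._state.get(state_type)

    def serialize(self, class_idx: int) -> Any:
        labels = self._labels
        if labels is None:
            state = self.get_state(LabelsState)
            if state is not None:
                labels = state.labels
        # Fall back to the raw class index when no labels are known.
        return labels[class_idx] if labels is not None else class_idx


print(LabelsSerializerSketch(["cat", "dog"]).serialize(1))  # -> dog
```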
40 changes: 31 additions & 9 deletions flash/core/model.py
@@ -20,7 +20,6 @@
 from pytorch_lightning import LightningModule
 from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.trainer.states import RunningStage
-from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from torch import nn
 from torch.optim.lr_scheduler import _LRScheduler
@@ -29,7 +28,8 @@
 from flash.core.registry import FlashRegistry
 from flash.core.schedulers import _SCHEDULERS_REGISTRY
 from flash.core.utils import get_callable_dict
-from flash.data.data_pipeline import DataPipeline
+from flash.data.data_pipeline import DataPipeline, DataPipelineState
+from flash.data.data_source import DataSource, DefaultDataSource
 from flash.data.process import Postprocess, Preprocess, Serializer, SerializerMapping
@@ -103,14 +103,17 @@ def __init__(
         self._postprocess: Optional[Postprocess] = postprocess
         self._serializer: Optional[Serializer] = None
 
+        self._data_pipeline_state: Optional[DataPipelineState] = None
+
         # Explicitly set the serializer to call the setter
         self.serializer = serializer
 
     def step(self, batch: Any, batch_idx: int) -> Any:
         """
         The training/validation/test step. Override for custom behavior.
         """
-        x, y = batch
+        x, y = batch['input'], batch['target']
+        # x, y = batch
         y_hat = self(x)
         output = {"y_hat": y_hat}
         losses = {name: l_fn(y_hat, y) for name, l_fn in self.loss_fn.items()}
@@ -154,6 +157,7 @@ def test_step(self, batch: Any, batch_idx: int) -> None:
     def predict(
         self,
         x: Any,
+        data_source: Union[str, DefaultDataSource, DataSource] = DefaultDataSource.FILES,
         data_pipeline: Optional[DataPipeline] = None,
     ) -> Any:
         """
@@ -169,9 +173,9 @@
         """
         running_stage = RunningStage.PREDICTING
 
-        data_pipeline = self.build_data_pipeline(data_pipeline)
+        data_pipeline = self.build_data_pipeline(data_source, data_pipeline)
 
-        x = [x for x in data_pipeline._generate_auto_dataset(x, running_stage)]
+        x = [x for x in data_pipeline._data_source.generate_dataset(x, running_stage)]
         x = data_pipeline.worker_preprocessor(running_stage)(x)
         # switch to self.device when #7188 merge in Lightning
         x = self.transfer_batch_to_device(x, next(self.parameters()).device)
@@ -181,6 +185,7 @@
         return predictions
 
     def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
+        batch = batch['input']
         if isinstance(batch, tuple):
             batch = batch[0]
         elif isinstance(batch, list):
@@ -252,7 +257,11 @@ def serializer(self, serializer: Union[Serializer, Mapping[str, Serializer]]):
             serializer = SerializerMapping(serializer)
         self._serializer = serializer
 
-    def build_data_pipeline(self, data_pipeline: Optional[DataPipeline] = None) -> Optional[DataPipeline]:
+    def build_data_pipeline(
+        self,
+        data_source: Optional[Union[str, DefaultDataSource, DataSource]] = None,
+        data_pipeline: Optional[DataPipeline] = None,
+    ) -> Optional[DataPipeline]:
         """Build a :class:`.DataPipeline` incorporating available
         :class:`~flash.data.process.Preprocess` and :class:`~flash.data.process.Postprocess`
         objects. These will be overridden in the following resolution order (lowest priority first):
@@ -269,17 +278,19 @@
         Returns:
             The fully resolved :class:`.DataPipeline`.
         """
-        preprocess, postprocess, serializer = None, None, None
+        old_data_source, preprocess, postprocess, serializer = None, None, None, None
 
         # Datamodule
         if self.datamodule is not None and getattr(self.datamodule, 'data_pipeline', None) is not None:
+            old_data_source = getattr(self.datamodule.data_pipeline, '_data_source', None)
             preprocess = getattr(self.datamodule.data_pipeline, '_preprocess_pipeline', None)
             postprocess = getattr(self.datamodule.data_pipeline, '_postprocess_pipeline', None)
             serializer = getattr(self.datamodule.data_pipeline, '_serializer', None)
 
         elif self.trainer is not None and hasattr(
             self.trainer, 'datamodule'
         ) and getattr(self.trainer.datamodule, 'data_pipeline', None) is not None:
+            old_data_source = getattr(self.trainer.datamodule.data_pipeline, '_data_source', None)
             preprocess = getattr(self.trainer.datamodule.data_pipeline, '_preprocess_pipeline', None)
             postprocess = getattr(self.trainer.datamodule.data_pipeline, '_postprocess_pipeline', None)
             serializer = getattr(self.trainer.datamodule.data_pipeline, '_serializer', None)
@@ -305,8 +316,19 @@
             getattr(data_pipeline, '_serializer', None),
         )
 
-        data_pipeline = DataPipeline(preprocess, postprocess, serializer)
-        data_pipeline.initialize()
+        data_source = data_source or old_data_source
+
+        if str(data_source) == data_source:
+            data_source = DefaultDataSource(data_source)
+
+        if not isinstance(data_source, DataSource):
+            data_source = preprocess.data_source_of_type(data_source.as_type())()
+
+        if old_data_source is not None:
+            data_source._state.update(old_data_source._state)  # TODO: This is a hack
+
+        data_pipeline = DataPipeline(data_source, preprocess, postprocess, serializer)
+        self._data_pipeline_state = data_pipeline.initialize(self._data_pipeline_state)
        return data_pipeline
 
     @property
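Two behavioural points fall out of this file: batches now flow through `step` as dictionaries with 'input' and 'target' keys, and `build_data_pipeline` accepts a `data_source` that may be a string, a `DefaultDataSource` member, or a `DataSource` instance. A self-contained sketch of that string -> enum -> instance resolution chain (the registry dict stands in for `preprocess.data_source_of_type`, and all class bodies are simplified assumptions, not the flash API):

```python
from enum import Enum


class DataSource:
    """Stand-in base class."""


class FilesDataSource(DataSource):
    """Stand-in for a data source that loads from file paths."""


class DefaultDataSource(Enum):
    FILES = "files"
    FOLDERS = "folders"


# Stand-in for preprocess.data_source_of_type(data_source.as_type()):
_SOURCES = {DefaultDataSource.FILES: FilesDataSource}


def resolve_data_source(data_source):
    # Strings are coerced to the enum first, as in the diff above:
    if str(data_source) == data_source:
        data_source = DefaultDataSource(data_source)
    # Anything that is not already a DataSource instance is looked up
    # and instantiated:
    if not isinstance(data_source, DataSource):
        data_source = _SOURCES[data_source]()
    return data_source


assert isinstance(resolve_data_source("files"), FilesDataSource)
assert isinstance(resolve_data_source(DefaultDataSource.FILES), FilesDataSource)
```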
154 changes: 39 additions & 115 deletions flash/data/auto_dataset.py
@@ -12,22 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from inspect import signature
-from typing import Any, Callable, Iterable, Iterator, Optional, TYPE_CHECKING
+from typing import Any, Generic, Iterable, Sequence, TYPE_CHECKING, TypeVar
 
-import torch
 from pytorch_lightning.trainer.states import RunningStage
-from pytorch_lightning.utilities.warning_utils import rank_zero_warn
 from torch.utils.data import Dataset, IterableDataset
 
 from flash.data.callback import ControlFlow
-from flash.data.process import Preprocess
-from flash.data.utils import _STAGES_PREFIX, _STAGES_PREFIX_VALUES, CurrentRunningStageFuncContext
+from flash.data.utils import CurrentRunningStageFuncContext
 
 if TYPE_CHECKING:
-    from flash.data.data_pipeline import DataPipeline
+    from flash.data.data_source import DataSource
Review comment (Member), on the forward-declared DataSource import:

    Not sure, but I think sphinx will have issues with forward declarations like this.

Reply (Collaborator, Author):

    Docs build is working for now, I'm not sure we can avoid a circular import here but could maybe just import the module and type as data_source.DataSource.
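For readers following the thread: the pattern in question imports `DataSource` only under `typing.TYPE_CHECKING` and refers to it via a string (a forward reference), so no circular import happens at runtime. A tiny sketch of the idea (the `describe` function is hypothetical, added only for illustration):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers (mypy, IDEs), never at runtime.
    from flash.data.data_source import DataSource


def describe(source: 'DataSource') -> str:
    # The quoted annotation is a forward reference, so importing this module
    # never imports flash.data.data_source at runtime.
    return type(source).__name__
```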


 
+DATA_TYPE = TypeVar('DATA_TYPE')
 
 
-class BaseAutoDataset:
+class BaseAutoDataset(Generic[DATA_TYPE]):
 
     DATASET_KEY = "dataset"
     """
@@ -38,141 +37,66 @@ class BaseAutoDataset:
 
     def __init__(
         self,
-        data: Any,
-        load_data: Optional[Callable] = None,
-        load_sample: Optional[Callable] = None,
-        data_pipeline: Optional['DataPipeline'] = None,
-        running_stage: Optional[RunningStage] = None
+        data: DATA_TYPE,
+        data_source: 'DataSource',
+        running_stage: RunningStage,
     ) -> None:
         super().__init__()
 
-        if load_data or load_sample:
-            if data_pipeline:
-                rank_zero_warn(
-                    "``datapipeline`` is specified but load_sample and/or load_data are also specified. "
-                    "Won't use datapipeline"
-                )
-        # initial states
-        self._load_data_called = False
-        self._running_stage = None
-
         self.data = data
-        self.data_pipeline = data_pipeline
-        self.load_data = load_data
-        self.load_sample = load_sample
+        self.data_source = data_source
 
-        # trigger the setup only if `running_stage` is provided
+        self._running_stage = None
         self.running_stage = running_stage
 
     @property
-    def running_stage(self) -> Optional[RunningStage]:
+    def running_stage(self) -> RunningStage:
         return self._running_stage
 
     @running_stage.setter
     def running_stage(self, running_stage: RunningStage) -> None:
-        if self._running_stage != running_stage or (not self._running_stage):
-            self._running_stage = running_stage
-            self._load_data_context = CurrentRunningStageFuncContext(self._running_stage, "load_data", self.preprocess)
-            self._load_sample_context = CurrentRunningStageFuncContext(
-                self._running_stage, "load_sample", self.preprocess
-            )
-            self._setup(running_stage)
+        from flash.data.data_pipeline import DataPipeline
+        from flash.data.data_source import DataSource  # Hack to avoid circular import TODO: something better than this
 
-    @property
-    def preprocess(self) -> Optional[Preprocess]:
-        if self.data_pipeline is not None:
-            return self.data_pipeline._preprocess_pipeline
+        self._running_stage = running_stage
 
-    @property
-    def control_flow_callback(self) -> Optional[ControlFlow]:
-        preprocess = self.preprocess
-        if preprocess is not None:
-            return ControlFlow(preprocess.callbacks)
-
-    def _call_load_data(self, data: Any) -> Iterable:
-        parameters = signature(self.load_data).parameters
-        if len(parameters) > 1 and self.DATASET_KEY in parameters:
-            return self.load_data(data, self)
-        else:
-            return self.load_data(data)
+        self._load_sample_context = CurrentRunningStageFuncContext(self.running_stage, "load_sample", self.data_source)
 
-    def _call_load_sample(self, sample: Any) -> Any:
-        parameters = signature(self.load_sample).parameters
-        if len(parameters) > 1 and self.DATASET_KEY in parameters:
-            return self.load_sample(sample, self)
-        else:
-            return self.load_sample(sample)
-
-    def _setup(self, stage: Optional[RunningStage]) -> None:
-        assert not stage or _STAGES_PREFIX[stage] in _STAGES_PREFIX_VALUES
-        previous_load_data = self.load_data.__code__ if self.load_data else None
-
-        if self._running_stage and self.data_pipeline and (not self.load_data or not self.load_sample) and stage:
-            self.load_data = getattr(
-                self.preprocess,
-                self.data_pipeline._resolve_function_hierarchy('load_data', self.preprocess, stage, Preprocess)
+        self.load_sample = getattr(
+            self.data_source,
+            DataPipeline._resolve_function_hierarchy(
+                'load_sample',
+                self.data_source,
+                self.running_stage,
+                DataSource,
             )
-            self.load_sample = getattr(
-                self.preprocess,
-                self.data_pipeline._resolve_function_hierarchy('load_sample', self.preprocess, stage, Preprocess)
-            )
-        if self.load_data and (previous_load_data != self.load_data.__code__ or not self._load_data_called):
-            if previous_load_data:
-                rank_zero_warn(
-                    "The load_data function of the Autogenerated Dataset changed. "
-                    "This is not expected! Preloading Data again to ensure compatibility. This may take some time."
-                )
-            self.setup()
-            self._load_data_called = True
-
-    def setup(self):
-        raise NotImplementedError
+        )
 
+    def _call_load_sample(self, sample: Any) -> Any:
+        if self.load_sample:
+            with self._load_sample_context:
+                parameters = signature(self.load_sample).parameters
+                if len(parameters) > 1 and self.DATASET_KEY in parameters:
+                    sample = self.load_sample(sample, self)
+                else:
+                    sample = self.load_sample(sample)
+        return sample
 
-class AutoDataset(BaseAutoDataset, Dataset):
 
-    def setup(self):
-        with self._load_data_context:
-            self.preprocessed_data = self._call_load_data(self.data)
+class AutoDataset(BaseAutoDataset[Sequence[Any]], Dataset):
 
     def __getitem__(self, index: int) -> Any:
-        if not self.load_sample and not self.load_data:
-            raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
-        if self.load_sample:
-            with self._load_sample_context:
-                data: Any = self._call_load_sample(self.preprocessed_data[index])
-                if self.control_flow_callback:
-                    self.control_flow_callback.on_load_sample(data, self.running_stage)
-            return data
-        return self.preprocessed_data[index]
+        return self._call_load_sample(self.data[index])
 
     def __len__(self) -> int:
-        if not self.load_sample and not self.load_data:
-            raise RuntimeError("`__len__` for `load_sample` and `load_data` could not be inferred.")
-        return len(self.preprocessed_data)
+        return len(self.data)
 
 
-class IterableAutoDataset(BaseAutoDataset, IterableDataset):
+class IterableAutoDataset(BaseAutoDataset[Iterable[Any]], IterableDataset):
 
-    def setup(self):
-        with self._load_data_context:
-            self.dataset = self._call_load_data(self.data)
-        self.dataset_iter = None
-
     def __iter__(self):
-        self.dataset_iter = iter(self.dataset)
+        self.data_iter = iter(self.data)
         return self
 
     def __next__(self) -> Any:
-        if not self.load_sample and not self.load_data:
-            raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
-
-        data = next(self.dataset_iter)
-
-        if self.load_sample:
-            with self._load_sample_context:
-                data: Any = self._call_load_sample(data)
-                if self.control_flow_callback:
-                    self.control_flow_callback.on_load_sample(data, self.running_stage)
-            return data
-        return data
+        return self._call_load_sample(next(self.data_iter))
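After this rewrite, the `AutoDataset` classes are thin wrappers: `load_data` has moved out (the data passed in is assumed to be already loaded), and `load_sample` is owned by the `DataSource` and resolved per running stage. A runnable sketch of the resulting division of labour, with simplified stand-in classes rather than the flash API:

```python
from torch.utils.data import Dataset


class ParityDataSource:
    """Stand-in DataSource: load_sample turns raw items into input/target dicts."""

    def load_sample(self, sample):
        return {"input": sample, "target": sample % 2}


class AutoDatasetSketch(Dataset):
    """Mimics the slimmed-down AutoDataset: index into data, then load_sample."""

    def __init__(self, data, data_source):
        self.data = data  # already-loaded sequence, no load_data step
        self.data_source = data_source

    def __getitem__(self, index):
        return self.data_source.load_sample(self.data[index])

    def __len__(self):
        return len(self.data)


dataset = AutoDatasetSketch(list(range(4)), ParityDataSource())
print(dataset[3])  # {'input': 3, 'target': 1}
```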
2 changes: 2 additions & 0 deletions flash/data/batch.py
@@ -57,6 +57,8 @@ def __init__(
         self._post_tensor_transform_context = CurrentFuncContext("post_tensor_transform", preprocess)
 
     def forward(self, sample: Any) -> Any:
+        self.callback.on_load_sample(sample, self.stage)
+
         with self._current_stage_context:
             with self._pre_tensor_transform_context:
                 sample = self.pre_tensor_transform(sample)
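The two added lines mean the `on_load_sample` callback now fires on every raw sample entering the worker preprocessor, before any transform runs. A minimal sketch of the observable effect (the callback protocol here is a simplified assumption):

```python
class RecordingCallback:
    """Stand-in data-fetcher callback that records raw samples."""

    def __init__(self):
        self.seen = []

    def on_load_sample(self, sample, stage):
        self.seen.append((stage, sample))


def preprocessor_forward(sample, callback, stage="train"):
    callback.on_load_sample(sample, stage)  # fires first, as in the diff
    # ... pre_tensor_transform / to_tensor_transform / post_tensor_transform
    return sample


callback = RecordingCallback()
preprocessor_forward({"input": 1}, callback)
print(callback.seen)  # [('train', {'input': 1})]
```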
3 changes: 0 additions & 3 deletions flash/data/callback.py
@@ -190,9 +190,6 @@ def enable(self):
         yield
         self.enabled = False
 
-    def attach_to_datamodule(self, datamodule) -> None:
-        datamodule.data_fetcher = self
-
     def attach_to_preprocess(self, preprocess: 'flash.data.process.Preprocess') -> None:
         preprocess.add_callbacks([self])
         self._preprocess = preprocess