Lightning-Universe · tchaton · Apr 30, 2021 · Apr 15, 2021 · Apr 15, 2021 · Apr 15, 2021
@@ -42,6 +42,15 @@ jobs:
       run: |
         python -c "req = open('requirements.txt').read().replace('>', '=') ; open('requirements.txt', 'w').write(req)"
 
+    # Removes every requirement line starting with 'pytorchvideo' when the job runs on
+    # Python < 3.7 (presumably because pytorchvideo is not installable there - confirm).
+    - name: Filter requirements
+      run: |
+        import sys
+        # Compare the full (major, minor) tuple instead of the minor number alone.
+        if sys.version_info < (3, 7):
+          fname = 'requirements.txt'
+          # Context managers make sure the file is closed (and flushed) before pip reads it.
+          with open(fname) as fp:
+            lines = [line for line in fp if not line.startswith('pytorchvideo')]
+          with open(fname, 'w') as fp:
+            fp.writelines(lines)
+      shell: python
+
     # Note: This uses an internal pip API and may not always work
     # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
     - name: Get pip cache

@@ -148,3 +148,5 @@ imdb
 xsum
 coco128
 wmt_en_ro
+action_youtube_naudio
+kinetics
@@ -27,6 +27,8 @@ Lightning Flash
    reference/tabular_classification
    reference/translation
    reference/object_detection
+   reference/video_classification
+
 
 .. toctree::
    :maxdepth: 1

diff --git a/docs/source/reference/image_classification.rst b/docs/source/reference/image_classification.rst
@@ -8,7 +8,7 @@ Image Classification
 ********
 The task
 ********
-The task of identifying what is in an image is called image classification. Typically, Image Classification is used to identify images containing a single object. The task predicts which ‘class’ the image most likely belongs to with a degree of certainty.  A class is a label that desecribes what is in an image, such as ‘car’, ‘house’, ‘cat’ etc. For example, we can train the image classifier task on images of ants and it will learn to predict the probability that an image contains an ant.
+The task of identifying what is in an image is called image classification. Typically, Image Classification is used to identify images containing a single object. The task predicts which ‘class’ the image most likely belongs to with a degree of certainty.  A class is a label that describes what is in an image, such as ‘car’, ‘house’, ‘cat’ etc. For example, we can train the image classifier task on images of ants and it will learn to predict the probability that an image contains an ant.
 
 ------
 

@@ -0,0 +1,156 @@
+
+.. _video_classification:
+
+####################
+Video Classification
+####################
+
+********
+The task
+********
+
+Typically, Video Classification refers to the task of producing a label for actions identified in a given video.
+
+The task predicts which ‘class’ the video clip most likely belongs to with a degree of certainty.
+
+A class is a label that describes what action is being performed within the video clip, such as **swimming**, **playing piano**, etc.
+
+For example, we can train the video classifier task on video clips with human actions
+and it will learn to predict the probability that a video contains a certain human action.
+
+Lightning Flash :class:`~flash.video.VideoClassifier` and :class:`~flash.video.VideoClassificationData`
+rely on `PyTorchVideo <https://pytorchvideo.readthedocs.io/en/latest/index.html>`_ internally.
+
+You can use any models from `PyTorchVideo Model Zoo <https://pytorchvideo.readthedocs.io/en/latest/model_zoo.html>`_
+with the :class:`~flash.video.VideoClassifier`.
+
+------
+
+**********
+Finetuning
+**********
+
+Let's say you wanted to develop a model that could determine whether a video clip contains a human **swimming** or **playing piano**,
+using the `Kinetics dataset <https://deepmind.com/research/open-source/kinetics>`_.
+Once we download the data using :func:`~flash.data.download_data`, all we need is the train data and validation data folders to create the :class:`~flash.video.VideoClassificationData`.
+
+.. code-block::
+
+    video_dataset
+    ├── train
+    │   ├── class_1
+    │   │   ├── a.ext
+    │   │   ├── b.ext
+    │   │   ...
+    │   └── class_n
+    │       ├── c.ext
+    │       ├── d.ext
+    │       ...
+    └── val
+        ├── class_1
+        │   ├── e.ext
+        │   ├── f.ext
+        │   ...
+        └── class_n
+            ├── g.ext
+            ├── h.ext
+            ...
+
+
+.. code-block:: python
+
+    import sys
+
+    import torch
+    from torch.utils.data import SequentialSampler
+
+    import flash
+    from flash.data.utils import download_data
+    from flash.video import VideoClassificationData, VideoClassifier
+    import kornia.augmentation as K
+    from pytorchvideo.transforms import ApplyTransformToKey, RandomShortSideScale, UniformTemporalSubsample
+    from torchvision.transforms import Compose, RandomCrop, RandomHorizontalFlip
+
+    # 1. Download a video clip dataset. Find more datasets at https://pytorchvideo.readthedocs.io/en/latest/data.html
+    download_data("https://pl-flash-data.s3.amazonaws.com/kinetics.zip")
+
+    # 2. [Optional] Specify transforms to be used during training.
+    # Flash helps you to place your transform exactly where you want.
+    # Learn more at https://lightning-flash.readthedocs.io/en/latest/general/data.html#flash.data.process.Preprocess
+    train_transform = {
+        "post_tensor_transform": Compose([
+            ApplyTransformToKey(
+                key="video",
+                transform=Compose([
+                    UniformTemporalSubsample(8),
+                    RandomShortSideScale(min_size=256, max_size=320),
+                    RandomCrop(244),
+                    RandomHorizontalFlip(p=0.5),
+                ]),
+            ),
+        ]),
+        "per_batch_transform_on_device": Compose([
+            ApplyTransformToKey(
+                key="video",
+                transform=K.VideoSequential(
+                    K.Normalize(torch.tensor([0.45, 0.45, 0.45]), torch.tensor([0.225, 0.225, 0.225])),
+                    K.augmentation.ColorJitter(0.1, 0.1, 0.1, 0.1, p=1.0),
+                    data_format="BCTHW",
+                    same_on_frame=False
+                )
+            ),
+        ]),
+    }
+
+    # 3. Load the data from directories.
+    datamodule = VideoClassificationData.from_paths(
+        train_data_path="data/kinetics/train",
+        val_data_path="data/kinetics/val",
+        predict_data_path="data/kinetics/predict",
+        clip_sampler="uniform",
+        clip_duration=2,
+        video_sampler=SequentialSampler,
+        decode_audio=False,
+        train_transform=train_transform
+    )
+
+    # 4. List the available models
+    print(VideoClassifier.available_models())
+    # out: ['efficient_x3d_s', 'efficient_x3d_xs', ... , 'slowfast_r50', 'x3d_m', 'x3d_s', 'x3d_xs']
+
+    # 5. Build the model
+    model = VideoClassifier(model="x3d_xs", num_classes=datamodule.num_classes, pretrained=False)
+
+    # 6. Create the trainer
+    trainer = flash.Trainer(fast_dev_run=True)
+
+    # 7. Finetune the model
+    trainer.finetune(model, datamodule=datamodule)
+
+    predictions = model.predict("data/kinetics/train/archery/-1q7jA3DXQM_000005_000015.mp4")
+    print(predictions)
+
+
+------
+
+*************
+API reference
+*************
+
+.. _video_classifier:
+
+VideoClassifier
+---------------
+
+.. autoclass:: flash.video.VideoClassifier
+    :members:
+    :exclude-members: forward
+
+.. _video_classification_data:
+
+VideoClassificationData
+-----------------------
+
+.. autoclass:: flash.video.VideoClassificationData
+
+.. automethod:: flash.video.VideoClassificationData.from_paths
@@ -14,6 +14,7 @@
 from typing import Any
 
 import torch
+import torch.nn.functional as F
 
 from flash.core.model import Task
 from flash.data.process import Postprocess
@@ -29,3 +30,6 @@ class ClassificationTask(Task):
 
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, default_postprocess=ClassificationPostprocess(), **kwargs)
+
+    def to_metrics_format(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the softmax of ``x`` taken over its last dimension."""
+        return F.softmax(x, -1)
@@ -96,6 +96,7 @@ def step(self, batch: Any, batch_idx: int) -> Any:
         output = {"y_hat": y_hat}
         losses = {name: l_fn(y_hat, y) for name, l_fn in self.loss_fn.items()}
         logs = {}
+        y_hat = self.to_metrics_format(y_hat)
         for name, metric in self.metrics.items():
             if isinstance(metric, torchmetrics.metric.Metric):
                 metric(y_hat, y)
@@ -111,6 +112,9 @@ def step(self, batch: Any, batch_idx: int) -> Any:
         output["y"] = y
         return output
 
+    def to_metrics_format(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``x`` unchanged.
+
+        ``step`` calls this on ``y_hat`` right before the metrics are updated.
+        """
+        return x
+
     def forward(self, x: Any) -> Any:
         return self.model(x)
 
@@ -172,10 +176,10 @@ def configure_finetune_callback(self) -> List[Callback]:
 
     @staticmethod
     def _resolve(
-            old_preprocess: Optional[Preprocess],
-            old_postprocess: Optional[Postprocess],
-            new_preprocess: Optional[Preprocess],
-            new_postprocess: Optional[Postprocess],
+        old_preprocess: Optional[Preprocess],
+        old_postprocess: Optional[Postprocess],
+        new_preprocess: Optional[Preprocess],
+        new_postprocess: Optional[Postprocess],
     ) -> Tuple[Optional[Preprocess], Optional[Postprocess]]:
         """Resolves the correct :class:`.Preprocess` and :class:`.Postprocess` to use, choosing ``new_*`` if it is not
         None or a base class (:class:`.Preprocess` or :class:`.Postprocess`) and ``old_*`` otherwise.
@@ -308,3 +312,10 @@ def available_backbones(cls) -> List[str]:
         if registry is None:
             return []
         return registry.available_keys()
+
+    @classmethod
+    def available_models(cls) -> List[str]:
+        """Return the keys of the ``models`` registry attached to this class.
+
+        Returns:
+            ``registry.available_keys()`` when the class has a non-``None`` ``models``
+            attribute, otherwise an empty list.
+        """
+        # ``getattr`` with a default so classes that never defined ``models`` do not raise.
+        registry: Optional[FlashRegistry] = getattr(cls, "models", None)
+        if registry is None:
+            return []
+        return registry.available_keys()
diff --git a/flash/data/auto_dataset.py b/flash/data/auto_dataset.py
@@ -11,13 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from contextlib import contextmanager
 from inspect import signature
-from typing import Any, Callable, Iterable, Optional, TYPE_CHECKING
+from typing import Any, Callable, Iterable, Iterator, Optional, TYPE_CHECKING
 
+import torch
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.utilities.warning_utils import rank_zero_warn
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, IterableDataset
 
 from flash.data.callback import ControlFlow
 from flash.data.process import Preprocess
@@ -27,13 +27,13 @@
     from flash.data.data_pipeline import DataPipeline
 
 
-class AutoDataset(Dataset):
+class BaseAutoDataset:
 
     DATASET_KEY = "dataset"
     """
         This class is used to encapsulate a Preprocess Object ``load_data`` and ``load_sample`` functions.
         ``load_data`` will be called within the ``__init__`` function of the AutoDataset if ``running_stage``
-        is provided and ``load_sample`` within ``__getitem__`` function.
+        is provided and ``load_sample`` within ``__getitem__``.
     """
 
     def __init__(
@@ -122,10 +122,19 @@ def _setup(self, stage: Optional[RunningStage]) -> None:
                     "The load_data function of the Autogenerated Dataset changed. "
                     "This is not expected! Preloading Data again to ensure compatibility. This may take some time."
                 )
-            with self._load_data_context:
-                self.preprocessed_data = self._call_load_data(self.data)
+            self.setup()
             self._load_data_called = True
 
+    def setup(self):
+        """Hook invoked by ``_setup``; this base version always raises.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError
+
+
+class AutoDataset(BaseAutoDataset, Dataset):
+
+    def setup(self):
+        """Store the result of ``_call_load_data(self.data)`` in ``self.preprocessed_data``."""
+        with self._load_data_context:
+            self.preprocessed_data = self._call_load_data(self.data)
+
     def __getitem__(self, index: int) -> Any:
         if not self.load_sample and not self.load_data:
             raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
@@ -141,3 +150,29 @@ def __len__(self) -> int:
         if not self.load_sample and not self.load_data:
             raise RuntimeError("`__len__` for `load_sample` and `load_data` could not be inferred.")
         return len(self.preprocessed_data)
+
+
+class IterableAutoDataset(BaseAutoDataset, IterableDataset):
+
+    def setup(self):
+        with self._load_data_context:
+            self.dataset = self._call_load_data(self.data)
+            self.dataset_iter = None
+
+    def __iter__(self):
+        self.dataset_iter = iter(self.dataset)
+        return self
+
+    def __next__(self) -> Any:
+        if not self.load_sample and not self.load_data:
+            raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
+
+        data = next(self.dataset_iter)
+
+        if self.load_sample:
+            with self._load_sample_context:
+                data: Any = self._call_load_sample(data)
+                if self.control_flow_callback:
+                    self.control_flow_callback.on_load_sample(data, self.running_stage)
+                return data
+        return data