From 2f39872e6e1f6986c4a6369814637b38cf87ebd1 Mon Sep 17 00:00:00 2001 From: nateraw Date: Wed, 9 Nov 2022 19:01:48 -0500 Subject: [PATCH 01/17] :construction: wip video classification pipeline --- src/transformers/pipelines/__init__.py | 11 +- .../pipelines/video_classification.py | 130 ++++++++++++++++++ 2 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 src/transformers/pipelines/video_classification.py diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 7d160b61a8ac..685e8e16e5f8 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -79,6 +79,7 @@ TokenClassificationArgumentHandler, TokenClassificationPipeline, ) +from .video_classification import VideoClassificationPipeline from .visual_question_answering import VisualQuestionAnsweringPipeline from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline from .zero_shot_image_classification import ZeroShotImageClassificationPipeline @@ -133,6 +134,7 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, + AutoModelForVideoClassification, AutoModelForVision2Seq, AutoModelForVisualQuestionAnswering, AutoModelForZeroShotObjectDetection, @@ -361,6 +363,13 @@ "default": {"model": {"pt": ("Intel/dpt-large", "e93beec")}}, "type": "image", }, + "video-classification": { + "impl": VideoClassificationPipeline, + "tf": (), + "pt": (AutoModelForVideoClassification,) if is_torch_available() else (), + "default": {"model": {"pt": ("MCG-NJU/videomae-base-finetuned-kinetics", "4800870")}}, + "type": "video", + }, } NO_FEATURE_EXTRACTOR_TASKS = set() @@ -373,7 +382,7 @@ for task, values in SUPPORTED_TASKS.items(): if values["type"] == "text": NO_FEATURE_EXTRACTOR_TASKS.add(task) - elif values["type"] in {"audio", "image"}: + elif values["type"] in {"audio", "image", "video"}: NO_TOKENIZER_TASKS.add(task) elif values["type"] != "multimodal": raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}") diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py new file mode 100644 index 000000000000..61b33486290e --- /dev/null +++ b/src/transformers/pipelines/video_classification.py @@ -0,0 +1,130 @@ +from typing import List, Union + +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import PIPELINE_INIT_ARGS, Pipeline + + +def is_decord_available(): + try: + import decord + + return True + except ImportError: + return False + +if is_decord_available(): + from decord import VideoReader, cpu + +if is_vision_available(): + from PIL import Image + import numpy as np + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(PIPELINE_INIT_ARGS) +class VideoClassificationPipeline(Pipeline): + """ + Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a + video. + + This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"video-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=video-classification). 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING) + + self.frame_sample_rate = kwargs.pop("frame_sample_rate", 4) + self.num_frames = self.model.config.num_frames + + def _sanitize_parameters(self, top_k=None): + postprocess_params = {} + if top_k is not None: + postprocess_params["top_k"] = top_k + return {}, {}, postprocess_params + + def __call__(self, videos: Union[str, List[str], List["Image.Image"], List[List["Image.Image"]]], **kwargs): + """ + Assign labels to the image(s) passed as inputs. + + Args: + videos (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of videos: + + - A string containing a http link pointing to a video + - A string containing a local path to an video + - An video's frames loaded in PIL directly + + The pipeline accepts either a single video or a batch of videos, which must then be passed as a string. + Videos in a batch must all be in the same format: all as http links, all as local paths, or all as a + list (or list of lists) of PIL Images containing the frames of the video(s). + top_k (`int`, *optional*, defaults to 5): + The number of top labels that will be returned by the pipeline. If the provided number is higher than + the number of labels available in the model configuration, it will default to the number of labels. + + Return: + A dictionary or a list of dictionaries containing result. If the input is a single video, will return a + dictionary, if the input is a list of several videos, will return a list of dictionaries corresponding to + the videos. + + The dictionaries contain the following keys: + + - **label** (`str`) -- The label identified by the model. + - **score** (`int`) -- The score attributed by the model for that label. 
+ """ + return super().__call__(videos, **kwargs) + + def preprocess(self, video): + + if isinstance(video, str): + videoreader = VideoReader(video, num_threads=1, ctx=cpu(0)) + videoreader.seek(0) + + converted_len = int(self.num_frames * self.frame_sample_rate) + + seg_len = len(videoreader) + end_idx = np.random.randint(converted_len, seg_len) + start_idx = end_idx - converted_len + indices = np.linspace(start_idx, end_idx, num=self.num_frames) + indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + + video = videoreader.get_batch(indices).asnumpy() + video = list(video) + + model_inputs = self.feature_extractor(list(video), return_tensors=self.framework) + return model_inputs + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, top_k=5): + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + + if self.framework == "pt": + probs = model_outputs.logits.softmax(-1)[0] + scores, ids = probs.topk(top_k) + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + scores = scores.tolist() + ids = ids.tolist() + return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)] From 8d80c25612f2603fcc6d8b92c299cb5f43d37f56 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 14:22:32 -0500 Subject: [PATCH 02/17] :construction: wip - add is_decord_available check --- .../pipelines/video_classification.py | 12 ++---------- src/transformers/utils/import_utils.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 61b33486290e..f3591b760acc 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -2,7 +2,7 @@ from ..utils import ( add_end_docstrings, - is_tf_available, + is_decord_available, is_torch_available, is_vision_available, logging, @@ -11,20 +11,12 @@ from .base import PIPELINE_INIT_ARGS, Pipeline -def is_decord_available(): - try: - import decord - - return True - except ImportError: - return False - if is_decord_available(): from decord import VideoReader, cpu if is_vision_available(): - from PIL import Image import numpy as np + from PIL import Image if is_torch_available(): diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index cc00acefa399..b38d44e78537 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -268,6 +268,13 @@ except importlib_metadata.PackageNotFoundError: _is_ccl_available = False +_decord_availale = importlib.util.find_spec("decord") is not None +try: + _decord_version = importlib_metadata.version("decord") + logger.debug(f"Successfully imported decord version {_decord_version}") +except importlib_metadata.PackageNotFoundError: + _decord_availale = False + # This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs. 
TORCH_FX_REQUIRED_VERSION = version.parse("1.10") TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8") @@ -697,6 +704,10 @@ def is_ccl_available(): return _is_ccl_available +def is_decord_available(): + return _decord_availale + + def is_sudachi_available(): return importlib.util.find_spec("sudachipy") is not None @@ -944,6 +955,11 @@ def is_jumanpp_available(): Please note that you may need to restart your runtime after installation. """ +DECORD_IMPORT_ERROR = """ +{0} requires the decord library but it was not found in your environment. You can install it with pip: `pip install +decord`. Please note that you may need to restart your runtime after installation. +""" + BACKENDS_MAPPING = OrderedDict( [ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), @@ -973,6 +989,7 @@ def is_jumanpp_available(): ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), ("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)), ("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)), + ("decord", (is_decord_available, DECORD_IMPORT_ERROR)), ] ) From f161a90c8166a67f41eafcd8faa99d2df9ab2ef5 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 14:26:30 -0500 Subject: [PATCH 03/17] :bug: add missing import --- src/transformers/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 53220c3fe541..e073e68b423c 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -103,6 +103,7 @@ is_bitsandbytes_available, is_bs4_available, is_coloredlogs_available, + is_decord_available, is_datasets_available, is_detectron2_available, is_faiss_available, From a7e7cfd3bbc4b55a094f9fb8825ff34dcde0ef2f Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 16:48:17 -0500 Subject: [PATCH 04/17] :white_check_mark: add tests --- src/transformers/__init__.py | 1 + .../pipelines/video_classification.py | 2 +- src/transformers/testing_utils.py | 8 ++ src/transformers/utils/__init__.py | 2 +- .../test_pipelines_video_classification.py | 108 ++++++++++++++++++ 5 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 tests/pipelines/test_pipelines_video_classification.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2e477e7a09d1..ed0304f773cf 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -532,6 +532,7 @@ "add_start_docstrings", "is_apex_available", "is_datasets_available", + "is_decord_available", "is_faiss_available", "is_flax_available", "is_keras_nlp_available", diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index f3591b760acc..c32a7fcc1b76 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -100,7 +100,7 @@ def preprocess(self, video): video = videoreader.get_batch(indices).asnumpy() video = list(video) - model_inputs = self.feature_extractor(list(video), return_tensors=self.framework) + model_inputs = self.feature_extractor(video, return_tensors=self.framework) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 5ef6bfd36aa9..31760557aa9c 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -51,6 +51,7 @@ is_apex_available, is_bitsandbytes_available, is_bs4_available, + is_decord_available, is_detectron2_available, is_faiss_available, 
is_flax_available, @@ -446,6 +447,13 @@ def require_spacy(test_case): return unittest.skipUnless(is_spacy_available(), "test requires spacy")(test_case) +def require_decord(test_case): + """ + Decorator marking a test that requires decord. These tests are skipped when decord isn't installed. + """ + return unittest.skipUnless(is_decord_available(), "test requires decord")(test_case) + + def require_torch_multi_gpu(test_case): """ Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index e073e68b423c..3e4c19bbf8c4 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -103,8 +103,8 @@ is_bitsandbytes_available, is_bs4_available, is_coloredlogs_available, - is_decord_available, is_datasets_available, + is_decord_available, is_detectron2_available, is_faiss_available, is_flax_available, diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py new file mode 100644 index 000000000000..3ec93dc52474 --- /dev/null +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -0,0 +1,108 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from huggingface_hub import hf_hub_download +from transformers import ( + MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, + VideoMAEFeatureExtractor, + is_decord_available, + is_vision_available, +) +from transformers.pipelines import VideoClassificationPipeline, pipeline +from transformers.testing_utils import ( + nested_simplify, + require_decord, + require_tf, + require_torch, + require_torch_or_tf, + require_vision, + slow, +) + +from .test_pipelines_common import ANY, PipelineTestCaseMeta + + +if is_vision_available(): + from PIL import Image +else: + + class Image: + @staticmethod + def open(*args, **kwargs): + pass + + +@require_torch_or_tf +@require_vision +@require_decord +class VideoClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): + model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING + + def get_test_pipeline(self, model, tokenizer, feature_extractor): + example_video_filepath = hf_hub_download( + repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset" + ) + video_classifier = VideoClassificationPipeline(model=model, feature_extractor=feature_extractor, top_k=2) + examples = [ + example_video_filepath, + ] + return video_classifier, examples + + def run_pipeline_test(self, video_classifier, examples): + + outputs = video_classifier(examples[0]) + + self.assertEqual( + outputs, + [ + {"score": ANY(float), "label": ANY(str)}, + {"score": ANY(float), "label": ANY(str)}, + ], + ) + + @require_torch + def test_small_model_pt(self): + small_model = "hf-internal-testing/tiny-random-VideoMAEForVideoClassification" + small_feature_extractor = VideoMAEFeatureExtractor(size=10, crop_size=dict(height=10, width=10)) + video_classifier = pipeline( + "video-classification", model=small_model, feature_extractor=small_feature_extractor + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + outputs = video_classifier(video_file_path, top_k=2) + self.assertEqual( + nested_simplify(outputs, decimals=3), + [{"score": 0.521, "label": "LABEL_0"}, {"score": 0.479, "label": "LABEL_1"}], + ) + + outputs = video_classifier( + [ + video_file_path, + video_file_path, + ], + top_k=2, + ) + self.assertEqual( + nested_simplify(outputs, decimals=3), + [ + [{"score": 0.521, "label": "LABEL_0"}, {"score": 0.479, "label": "LABEL_1"}], + [{"score": 0.521, "label": "LABEL_0"}, {"score": 0.479, "label": "LABEL_1"}], + ], + ) + + @require_tf + def test_small_model_tf(self): + pass From 7ebe5fe7100d5a63e0d9e73ed9f65f489661f801 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 17:30:48 -0500 Subject: [PATCH 05/17] :wrench: add decord to setup extras --- setup.py | 4 +++- src/transformers/dependency_versions_table.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cba614baae0e..34e604e50c29 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,7 @@ "cookiecutter==1.7.3", "dataclasses", "datasets!=2.5.0", + "decord==0.6.0", "deepspeed>=0.6.5", "dill<0.3.5", "evaluate>=0.2.0", @@ -286,7 +287,7 @@ def run(self): extras["timm"] = deps_list("timm") extras["natten"] = deps_list("natten") extras["codecarbon"] = deps_list("codecarbon") - +extras["video"] = deps_list("decord") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( @@ -332,6 +333,7 @@ def run(self): + extras["timm"] + extras["codecarbon"] + extras["accelerate"] + + extras["video"] ) # Might need to add doc-builder and some specific deps in the future 
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 5ee9317270fd..21b047dcd923 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -9,6 +9,7 @@ "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", "datasets": "datasets!=2.5.0", + "decord": "decord==0.6.0", "deepspeed": "deepspeed>=0.6.5", "dill": "dill<0.3.5", "evaluate": "evaluate>=0.2.0", From 932507939a80d23ad69c20429bcc88089cf44b6b Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 18:05:00 -0500 Subject: [PATCH 06/17] :construction: add is_decord_available --- src/transformers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ed0304f773cf..865ce6b475ec 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3750,6 +3750,7 @@ add_start_docstrings, is_apex_available, is_datasets_available, + is_decord_available, is_faiss_available, is_flax_available, is_keras_nlp_available, From aa4405465a7c9103b98a251858b71b6e787ad52a Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 18:45:08 -0500 Subject: [PATCH 07/17] :sparkles: add video-classification pipeline --- .../pipelines/video_classification.py | 45 ++++++------------ .../test_pipelines_video_classification.py | 46 +++++++------------ utils/update_metadata.py | 1 + 3 files changed, 32 insertions(+), 60 deletions(-) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index c32a7fcc1b76..5f411cde3e99 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -1,22 +1,13 @@ from typing import List, Union -from ..utils import ( - add_end_docstrings, - is_decord_available, - is_torch_available, - is_vision_available, - logging, - requires_backends, -) +from ..utils import add_end_docstrings, is_decord_available, is_torch_available, logging, requires_backends from .base import PIPELINE_INIT_ARGS, Pipeline if is_decord_available(): - from decord import VideoReader, cpu - -if is_vision_available(): import numpy as np - from PIL import Image + + from decord import VideoReader, cpu if is_torch_available(): @@ -52,21 +43,19 @@ def _sanitize_parameters(self, top_k=None): postprocess_params["top_k"] = top_k return {}, {}, postprocess_params - def __call__(self, videos: Union[str, List[str], List["Image.Image"], List[List["Image.Image"]]], **kwargs): + def __call__(self, videos: Union[str, List[str]], **kwargs): """ - Assign labels to the image(s) passed as inputs. + Assign labels to the video(s) passed as inputs. Args: - videos (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + videos (`str`, `List[str]`): The pipeline handles three types of videos: - A string containing a http link pointing to a video - A string containing a local path to an video - - An video's frames loaded in PIL directly The pipeline accepts either a single video or a batch of videos, which must then be passed as a string. - Videos in a batch must all be in the same format: all as http links, all as local paths, or all as a - list (or list of lists) of PIL Images containing the frames of the video(s). + Videos in a batch must all be in the same format: all as http links or all as local paths. top_k (`int`, *optional*, defaults to 5): The number of top labels that will be returned by the pipeline. 
If the provided number is higher than the number of labels available in the model configuration, it will default to the number of labels. @@ -85,20 +74,16 @@ def __call__(self, videos: Union[str, List[str], List["Image.Image"], List[List[ def preprocess(self, video): - if isinstance(video, str): - videoreader = VideoReader(video, num_threads=1, ctx=cpu(0)) - videoreader.seek(0) - - converted_len = int(self.num_frames * self.frame_sample_rate) + videoreader = VideoReader(video, num_threads=1, ctx=cpu(0)) + videoreader.seek(0) - seg_len = len(videoreader) - end_idx = np.random.randint(converted_len, seg_len) - start_idx = end_idx - converted_len - indices = np.linspace(start_idx, end_idx, num=self.num_frames) - indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + start_idx = 0 + end_idx = int(self.num_frames * self.frame_sample_rate) + indices = np.linspace(start_idx, end_idx, num=self.num_frames) + indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) - video = videoreader.get_batch(indices).asnumpy() - video = list(video) + video = videoreader.get_batch(indices).asnumpy() + video = list(video) model_inputs = self.feature_extractor(video, return_tensors=self.framework) return model_inputs diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 3ec93dc52474..3d6798e31928 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -15,12 +15,7 @@ import unittest from huggingface_hub import hf_hub_download -from transformers import ( - MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, - VideoMAEFeatureExtractor, - is_decord_available, - is_vision_available, -) +from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor from transformers.pipelines import VideoClassificationPipeline, pipeline from transformers.testing_utils import ( nested_simplify, @@ -29,22 +24,11 @@ require_torch, require_torch_or_tf, require_vision, - slow, ) from .test_pipelines_common import ANY, PipelineTestCaseMeta -if is_vision_available(): - from PIL import Image -else: - - class Image: - @staticmethod - def open(*args, **kwargs): - pass - - @require_torch_or_tf @require_vision @require_decord @@ -58,20 +42,22 @@ def get_test_pipeline(self, model, tokenizer, feature_extractor): video_classifier = VideoClassificationPipeline(model=model, feature_extractor=feature_extractor, top_k=2) examples = [ example_video_filepath, + "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4", ] return video_classifier, examples def run_pipeline_test(self, video_classifier, examples): - outputs = video_classifier(examples[0]) + for example in examples: + outputs = video_classifier(example) - self.assertEqual( - outputs, - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - ) + self.assertEqual( + outputs, + [ + {"score": ANY(float), "label": ANY(str)}, + {"score": ANY(float), "label": ANY(str)}, + ], + ) @require_torch def test_small_model_pt(self): @@ -84,8 +70,8 @@ def test_small_model_pt(self): video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") outputs = video_classifier(video_file_path, top_k=2) self.assertEqual( - nested_simplify(outputs, decimals=3), - [{"score": 0.521, "label": "LABEL_0"}, {"score": 0.479, "label": "LABEL_1"}], + nested_simplify(outputs, decimals=4), + [{"score": 0.5209, "label": "LABEL_0"}, 
{"score": 0.4791, "label": "LABEL_1"}], ) outputs = video_classifier( @@ -96,10 +82,10 @@ def test_small_model_pt(self): top_k=2, ) self.assertEqual( - nested_simplify(outputs, decimals=3), + nested_simplify(outputs, decimals=4), [ - [{"score": 0.521, "label": "LABEL_0"}, {"score": 0.479, "label": "LABEL_1"}], - [{"score": 0.521, "label": "LABEL_0"}, {"score": 0.479, "label": "LABEL_1"}], + [{"score": 0.5209, "label": "LABEL_0"}, {"score": 0.4791, "label": "LABEL_1"}], + [{"score": 0.5209, "label": "LABEL_0"}, {"score": 0.4791, "label": "LABEL_1"}], ], ) diff --git a/utils/update_metadata.py b/utils/update_metadata.py index 5e7169c25585..e624759ebe25 100644 --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -102,6 +102,7 @@ "AutoModel", ), ("depth-estimation", "MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES", "AutoModelForDepthEstimation"), + ("video-classification", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES", "AutoModelForVideoClassification"), ] From cf4f421ba47e07f03c9da61d9f29333132957a81 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 18:57:52 -0500 Subject: [PATCH 08/17] :memo: add video classification pipe to docs --- docs/source/en/main_classes/pipelines.mdx | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/source/en/main_classes/pipelines.mdx b/docs/source/en/main_classes/pipelines.mdx index f6c63a983fc0..a029139884a1 100644 --- a/docs/source/en/main_classes/pipelines.mdx +++ b/docs/source/en/main_classes/pipelines.mdx @@ -424,6 +424,21 @@ See [`TokenClassificationPipeline`] for all details. - __call__ - all +<<<<<<< HEAD +======= +### VideoClassificationPipeline + +[[autodoc]] VideoClassificationPipeline + - __call__ + - all + +### VisualQuestionAnsweringPipeline + +[[autodoc]] VisualQuestionAnsweringPipeline + - __call__ + - all + +>>>>>>> 75d3a65ea (:memo: add video classification pipe to docs) ### ZeroShotClassificationPipeline [[autodoc]] ZeroShotClassificationPipeline From 80bf526832809b33c11f3fc1fae9fedf3b09f768 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 19:04:44 -0500 Subject: [PATCH 09/17] :bug: add missing VideoClassificationPipeline import --- src/transformers/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 865ce6b475ec..47771a1a8d9b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -487,6 +487,7 @@ "TextGenerationPipeline", "TokenClassificationPipeline", "TranslationPipeline", + "VideoClassificationPipeline", "VisualQuestionAnsweringPipeline", "ZeroShotClassificationPipeline", "ZeroShotImageClassificationPipeline", @@ -3700,6 +3701,7 @@ TextGenerationPipeline, TokenClassificationPipeline, TranslationPipeline, + VideoClassificationPipeline, VisualQuestionAnsweringPipeline, ZeroShotClassificationPipeline, ZeroShotImageClassificationPipeline, From 8ed1b7de4c78c7031e7126ce7bb7ef2bc64df461 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 10 Nov 2022 22:22:00 -0500 Subject: [PATCH 10/17] :pushpin: add decord install in test runner --- .circleci/create_circleci_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 8c00789fbb35..8cfbd37d15f0 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -188,7 +188,7 @@ def job_name(self): install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", "pip install --upgrade pip", - "pip 
install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", + "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]", ], pytest_options={"rA": None}, tests_to_run="tests/pipelines/" From 2f3c93f2ee6202bab27de5188ee19d26caed3209 Mon Sep 17 00:00:00 2001 From: nateraw Date: Fri, 11 Nov 2022 14:33:48 -0500 Subject: [PATCH 11/17] :white_check_mark: fix url inputs to video-classification pipeline --- src/transformers/pipelines/video_classification.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 5f411cde3e99..27d3e2c701e5 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -1,5 +1,8 @@ +from io import BytesIO from typing import List, Union +import requests + from ..utils import add_end_docstrings, is_decord_available, is_torch_available, logging, requires_backends from .base import PIPELINE_INIT_ARGS, Pipeline @@ -31,10 +34,10 @@ class VideoClassificationPipeline(Pipeline): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - requires_backends(self, "vision") + requires_backends(self, "decord") self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING) - self.frame_sample_rate = kwargs.pop("frame_sample_rate", 4) + self.frame_sampling_rate = kwargs.pop("frame_sample_rate", 4) self.num_frames = self.model.config.num_frames def _sanitize_parameters(self, top_k=None): @@ -74,13 +77,16 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): def preprocess(self, video): + if video.startswith("http://") or video.startswith("https://"): + video = BytesIO(requests.get(video).content) + videoreader = VideoReader(video, num_threads=1, ctx=cpu(0)) videoreader.seek(0) start_idx = 0 - end_idx = int(self.num_frames * self.frame_sample_rate) + end_idx = self.num_frames * self.frame_sampling_rate - 1 indices = np.linspace(start_idx, end_idx, num=self.num_frames) - indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + indices = np.clip(indices, start_idx, end_idx).astype(np.int64) video = videoreader.get_batch(indices).asnumpy() video = list(video) From 8f2a52ca2bdd94c6a00e457061bbfc89b449ab74 Mon Sep 17 00:00:00 2001 From: nateraw Date: Tue, 6 Dec 2022 16:12:39 -0500 Subject: [PATCH 12/17] :sparkles: updates from review --- .../pipelines/video_classification.py | 23 +++++++++++-------- .../test_pipelines_video_classification.py | 12 ++++++---- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 27d3e2c701e5..911ccdb48099 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -37,14 +37,17 @@ def __init__(self, *args, **kwargs): requires_backends(self, "decord") self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING) - self.frame_sampling_rate = kwargs.pop("frame_sample_rate", 4) - self.num_frames = self.model.config.num_frames + def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None): + preprocess_params = {} + if frame_sampling_rate is not None: + preprocess_params["frame_sampling_rate"] = frame_sampling_rate + if num_frames is not None: + preprocess_params["num_frames"] = num_frames - def _sanitize_parameters(self, top_k=None): postprocess_params = {} if top_k is not None: 
postprocess_params["top_k"] = top_k - return {}, {}, postprocess_params + return preprocess_params, {}, postprocess_params def __call__(self, videos: Union[str, List[str]], **kwargs): """ @@ -75,18 +78,20 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): """ return super().__call__(videos, **kwargs) - def preprocess(self, video): + def preprocess(self, video, num_frames=None, frame_sampling_rate=1): + + if num_frames is None: + num_frames = self.model.config.num_frames if video.startswith("http://") or video.startswith("https://"): video = BytesIO(requests.get(video).content) - videoreader = VideoReader(video, num_threads=1, ctx=cpu(0)) + videoreader = VideoReader(video) videoreader.seek(0) start_idx = 0 - end_idx = self.num_frames * self.frame_sampling_rate - 1 - indices = np.linspace(start_idx, end_idx, num=self.num_frames) - indices = np.clip(indices, start_idx, end_idx).astype(np.int64) + end_idx = num_frames * frame_sampling_rate - 1 + indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64) video = videoreader.get_batch(indices).asnumpy() video = list(video) diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 3d6798e31928..25ddcfaf2d33 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -62,16 +62,18 @@ def run_pipeline_test(self, video_classifier, examples): @require_torch def test_small_model_pt(self): small_model = "hf-internal-testing/tiny-random-VideoMAEForVideoClassification" - small_feature_extractor = VideoMAEFeatureExtractor(size=10, crop_size=dict(height=10, width=10)) + small_feature_extractor = VideoMAEFeatureExtractor( + size=dict(shortest_edge=10), crop_size=dict(height=10, width=10) + ) video_classifier = pipeline( - "video-classification", model=small_model, feature_extractor=small_feature_extractor + "video-classification", model=small_model, feature_extractor=small_feature_extractor, frame_sampling_rate=4 ) video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") outputs = video_classifier(video_file_path, top_k=2) self.assertEqual( nested_simplify(outputs, decimals=4), - [{"score": 0.5209, "label": "LABEL_0"}, {"score": 0.4791, "label": "LABEL_1"}], + [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}], ) outputs = video_classifier( @@ -84,8 +86,8 @@ def test_small_model_pt(self): self.assertEqual( nested_simplify(outputs, decimals=4), [ - [{"score": 0.5209, "label": "LABEL_0"}, {"score": 0.4791, "label": "LABEL_1"}], - [{"score": 0.5209, "label": "LABEL_0"}, {"score": 0.4791, "label": "LABEL_1"}], + [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}], + [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}], ], ) From 349b780a184f62d3354b7ab0b99c59fafb543004 Mon Sep 17 00:00:00 2001 From: nateraw Date: Tue, 6 Dec 2022 16:17:23 -0500 Subject: [PATCH 13/17] :memo: add video cls pipeline to docs --- docs/source/en/main_classes/pipelines.mdx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/en/main_classes/pipelines.mdx b/docs/source/en/main_classes/pipelines.mdx index a029139884a1..f0bd94809378 100644 --- a/docs/source/en/main_classes/pipelines.mdx +++ b/docs/source/en/main_classes/pipelines.mdx @@ -341,6 +341,12 @@ Pipelines available for computer vision tasks include the following. 
- __call__ - all +### VideoClassificationPipeline + +[[autodoc]] VideoClassificationPipeline + - __call__ + - all + ### ZeroShotImageClassificationPipeline [[autodoc]] ZeroShotImageClassificationPipeline From e51ebb6f6182c9d81b1cb8e1b57a0e1e03350448 Mon Sep 17 00:00:00 2001 From: nateraw Date: Tue, 6 Dec 2022 16:17:45 -0500 Subject: [PATCH 14/17] :memo: add docstring --- src/transformers/pipelines/video_classification.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 911ccdb48099..d06b0b307616 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -65,6 +65,12 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): top_k (`int`, *optional*, defaults to 5): The number of top labels that will be returned by the pipeline. If the provided number is higher than the number of labels available in the model configuration, it will default to the number of labels. + num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`): + The number of frames sampled from the video to run the classification on. If not provided, will default + to the number of frames specified in the model configuration. + frame_sampling_rate (`int`, *optional*, defaults to 1): + The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every + frame will be used. Return: A dictionary or a list of dictionaries containing result. If the input is a single video, will return a From 36f1fe190154edcae32305ed6258b1c92bb3bcbb Mon Sep 17 00:00:00 2001 From: nateraw Date: Tue, 6 Dec 2022 16:22:36 -0500 Subject: [PATCH 15/17] :fire: remove unused import --- src/transformers/pipelines/video_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index d06b0b307616..2b344e8adc56 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -10,7 +10,7 @@ if is_decord_available(): import numpy as np - from decord import VideoReader, cpu + from decord import VideoReader if is_torch_available(): From 623ceec0a90783901583be9137580b3c11d1a927 Mon Sep 17 00:00:00 2001 From: nateraw Date: Tue, 6 Dec 2022 16:33:35 -0500 Subject: [PATCH 16/17] :fire: remove some code --- docs/source/en/main_classes/pipelines.mdx | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/docs/source/en/main_classes/pipelines.mdx b/docs/source/en/main_classes/pipelines.mdx index f0bd94809378..e5ee3902028e 100644 --- a/docs/source/en/main_classes/pipelines.mdx +++ b/docs/source/en/main_classes/pipelines.mdx @@ -430,21 +430,6 @@ See [`TokenClassificationPipeline`] for all details. 
- __call__ - all -<<<<<<< HEAD -======= -### VideoClassificationPipeline - -[[autodoc]] VideoClassificationPipeline - - __call__ - - all - -### VisualQuestionAnsweringPipeline - -[[autodoc]] VisualQuestionAnsweringPipeline - - __call__ - - all - ->>>>>>> 75d3a65ea (:memo: add video classification pipe to docs) ### ZeroShotClassificationPipeline [[autodoc]] ZeroShotClassificationPipeline From 7c88d92e1f7e968c8e9f47cbffeaf6c204d0b066 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 8 Dec 2022 13:57:40 -0500 Subject: [PATCH 17/17] :memo: docfix --- src/transformers/pipelines/video_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 2b344e8adc56..8d53fb851b5a 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -58,7 +58,7 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): The pipeline handles three types of videos: - A string containing a http link pointing to a video - - A string containing a local path to an video + - A string containing a local path to a video The pipeline accepts either a single video or a batch of videos, which must then be passed as a string. Videos in a batch must all be in the same format: all as http links or all as local paths.
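
Taken together, the series leaves the pipeline callable as sketched below. The task name, the `top_k` and `frame_sampling_rate` call parameters, the decord requirement (`pip install decord`), and the default checkpoint (`MCG-NJU/videomae-base-finetuned-kinetics`, registered in PATCH 01) all come from the patches above; the demo URL is the clip used in the tests, and the printed prediction is illustrative rather than actual model output.

    from transformers import pipeline

    # With no model argument, the task default registered in PATCH 01 is used:
    # MCG-NJU/videomae-base-finetuned-kinetics (a VideoMAE checkpoint).
    video_classifier = pipeline("video-classification")

    predictions = video_classifier(
        "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4",
        top_k=3,                # clamped to config.num_labels in postprocess()
        frame_sampling_rate=4,  # keep every 4th frame; defaults to 1 after PATCH 12
    )
    print(predictions)
    # Illustrative output: [{'score': 0.97, 'label': 'archery'}, ...]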
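The frame selection that PATCHes 11 and 12 converge on is deterministic: `num_frames` evenly spaced indices over a window of `num_frames * frame_sampling_rate` frames anchored at frame 0, replacing the randomly placed window of PATCH 01. A self-contained sketch of just that arithmetic (the helper name is ours, not part of the patch):

    import numpy as np

    def sample_frame_indices(num_frames: int, frame_sampling_rate: int) -> np.ndarray:
        # Same arithmetic as preprocess() after PATCH 12: evenly spaced indices
        # from frame 0 through frame num_frames * frame_sampling_rate - 1.
        start_idx = 0
        end_idx = num_frames * frame_sampling_rate - 1
        return np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64)

    print(sample_frame_indices(num_frames=4, frame_sampling_rate=4))  # [ 0  5 10 15]
    print(sample_frame_indices(num_frames=8, frame_sampling_rate=1))  # [0 1 2 3 4 5 6 7]

Because `np.linspace` includes both endpoints, the effective stride is (num_frames * rate - 1) / (num_frames - 1), slightly larger than the nominal sampling rate.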
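Postprocessing is a plain softmax-plus-top-k over the logits, PyTorch only per the explicit framework check. A self-contained sketch; the two-label logits tensor and `id2label` mapping below are hypothetical stand-ins, and `logits.shape[-1]` stands in for the `config.num_labels` clamp used in the patch:

    import torch

    def postprocess_logits(logits: torch.Tensor, id2label: dict, top_k: int = 5):
        top_k = min(top_k, logits.shape[-1])  # clamp top_k to the label count
        probs = logits.softmax(-1)[0]         # single-video batch -> shape (num_labels,)
        scores, ids = probs.topk(top_k)
        return [
            {"score": score, "label": id2label[_id]}
            for score, _id in zip(scores.tolist(), ids.tolist())
        ]

    print(postprocess_logits(torch.tensor([[0.08, 0.00]]), {0: "LABEL_0", 1: "LABEL_1"}, top_k=2))
    # -> [{'score': ~0.52, 'label': 'LABEL_0'}, {'score': ~0.48, 'label': 'LABEL_1'}]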
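Finally, the backend gating added in PATCH 02 follows the library's standard two-step pattern: a module-spec lookup plus a distribution-metadata check, cached in a module-level flag (spelled `_decord_availale` in the diff). Condensed into a self-contained sketch, using the standard-library `importlib.metadata` rather than the `importlib_metadata` backport the file imports:

    import importlib.metadata
    import importlib.util

    def is_decord_available() -> bool:
        # Step 1: is there an importable module at all?
        if importlib.util.find_spec("decord") is None:
            return False
        # Step 2: is it an installed distribution with version metadata?
        try:
            importlib.metadata.version("decord")
            return True
        except importlib.metadata.PackageNotFoundError:
            return False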