Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/create_circleci_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def job_name(self):
install_steps=[
"sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
"pip install --upgrade pip",
"pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
"pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]",
],
pytest_options={"rA": None},
tests_to_run="tests/pipelines/"
Expand Down
6 changes: 6 additions & 0 deletions docs/source/en/main_classes/pipelines.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,12 @@ Pipelines available for computer vision tasks include the following.
- __call__
- all

### VideoClassificationPipeline

[[autodoc]] VideoClassificationPipeline
- __call__
- all

### ZeroShotImageClassificationPipeline

[[autodoc]] ZeroShotImageClassificationPipeline
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
"cookiecutter==1.7.3",
"dataclasses",
"datasets!=2.5.0",
"decord==0.6.0",
"deepspeed>=0.6.5",
"dill<0.3.5",
"evaluate>=0.2.0",
Expand Down Expand Up @@ -286,7 +287,7 @@ def run(self):
extras["timm"] = deps_list("timm")
extras["natten"] = deps_list("natten")
extras["codecarbon"] = deps_list("codecarbon")

extras["video"] = deps_list("decord")

extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
extras["testing"] = (
Expand Down Expand Up @@ -332,6 +333,7 @@ def run(self):
+ extras["timm"]
+ extras["codecarbon"]
+ extras["accelerate"]
+ extras["video"]
)

# Might need to add doc-builder and some specific deps in the future
Expand Down
4 changes: 4 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@
"TextGenerationPipeline",
"TokenClassificationPipeline",
"TranslationPipeline",
"VideoClassificationPipeline",
"VisualQuestionAnsweringPipeline",
"ZeroShotClassificationPipeline",
"ZeroShotImageClassificationPipeline",
Expand Down Expand Up @@ -534,6 +535,7 @@
"add_start_docstrings",
"is_apex_available",
"is_datasets_available",
"is_decord_available",
"is_faiss_available",
"is_flax_available",
"is_keras_nlp_available",
Expand Down Expand Up @@ -3724,6 +3726,7 @@
TextGenerationPipeline,
TokenClassificationPipeline,
TranslationPipeline,
VideoClassificationPipeline,
VisualQuestionAnsweringPipeline,
ZeroShotClassificationPipeline,
ZeroShotImageClassificationPipeline,
Expand Down Expand Up @@ -3774,6 +3777,7 @@
add_start_docstrings,
is_apex_available,
is_datasets_available,
is_decord_available,
is_faiss_available,
is_flax_available,
is_keras_nlp_available,
Expand Down
1 change: 1 addition & 0 deletions src/transformers/dependency_versions_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"cookiecutter": "cookiecutter==1.7.3",
"dataclasses": "dataclasses",
"datasets": "datasets!=2.5.0",
"decord": "decord==0.6.0",
"deepspeed": "deepspeed>=0.6.5",
"dill": "dill<0.3.5",
"evaluate": "evaluate>=0.2.0",
Expand Down
11 changes: 10 additions & 1 deletion src/transformers/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
TokenClassificationArgumentHandler,
TokenClassificationPipeline,
)
from .video_classification import VideoClassificationPipeline
from .visual_question_answering import VisualQuestionAnsweringPipeline
from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline
from .zero_shot_image_classification import ZeroShotImageClassificationPipeline
Expand Down Expand Up @@ -133,6 +134,7 @@
AutoModelForSpeechSeq2Seq,
AutoModelForTableQuestionAnswering,
AutoModelForTokenClassification,
AutoModelForVideoClassification,
AutoModelForVision2Seq,
AutoModelForVisualQuestionAnswering,
AutoModelForZeroShotObjectDetection,
Expand Down Expand Up @@ -361,6 +363,13 @@
"default": {"model": {"pt": ("Intel/dpt-large", "e93beec")}},
"type": "image",
},
"video-classification": {
"impl": VideoClassificationPipeline,
"tf": (),
"pt": (AutoModelForVideoClassification,) if is_torch_available() else (),
"default": {"model": {"pt": ("MCG-NJU/videomae-base-finetuned-kinetics", "4800870")}},
"type": "video",
},
}

NO_FEATURE_EXTRACTOR_TASKS = set()
Expand All @@ -373,7 +382,7 @@
for task, values in SUPPORTED_TASKS.items():
if values["type"] == "text":
NO_FEATURE_EXTRACTOR_TASKS.add(task)
elif values["type"] in {"audio", "image"}:
elif values["type"] in {"audio", "image", "video"}:
NO_TOKENIZER_TASKS.add(task)
elif values["type"] != "multimodal":
raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")
Expand Down
124 changes: 124 additions & 0 deletions src/transformers/pipelines/video_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
from io import BytesIO
from typing import List, Union

import requests

from ..utils import add_end_docstrings, is_decord_available, is_torch_available, logging, requires_backends
from .base import PIPELINE_INIT_ARGS, Pipeline


if is_decord_available():
import numpy as np

from decord import VideoReader


if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING

logger = logging.get_logger(__name__)


@add_end_docstrings(PIPELINE_INIT_ARGS)
class VideoClassificationPipeline(Pipeline):
    """
    Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a
    video.

    This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"video-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=video-classification).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # decord is needed to decode video files into frames; fail fast with an
        # actionable error message if it is not installed.
        requires_backends(self, "decord")
        self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING)

    def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None):
        """
        Route user kwargs to the pipeline stages: frame selection parameters go to
        `preprocess`, label selection (`top_k`) goes to `postprocess`; `_forward`
        takes no extra parameters.
        """
        preprocess_params = {}
        if frame_sampling_rate is not None:
            preprocess_params["frame_sampling_rate"] = frame_sampling_rate
        if num_frames is not None:
            preprocess_params["num_frames"] = num_frames

        postprocess_params = {}
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        return preprocess_params, {}, postprocess_params

    def __call__(self, videos: Union[str, List[str]], **kwargs):
        """
        Assign labels to the video(s) passed as inputs.

        Args:
            videos (`str`, `List[str]`):
                The pipeline handles two types of videos:

                - A string containing a http link pointing to a video
                - A string containing a local path to a video

                The pipeline accepts either a single video or a batch of videos, which must then be passed as a string.
                Videos in a batch must all be in the same format: all as http links or all as local paths.
            top_k (`int`, *optional*, defaults to 5):
                The number of top labels that will be returned by the pipeline. If the provided number is higher than
                the number of labels available in the model configuration, it will default to the number of labels.
            num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`):
                The number of frames sampled from the video to run the classification on. If not provided, will default
                to the number of frames specified in the model configuration.
            frame_sampling_rate (`int`, *optional*, defaults to 1):
                The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
                frame will be used.

        Return:
            A dictionary or a list of dictionaries containing result. If the input is a single video, will return a
            dictionary, if the input is a list of several videos, will return a list of dictionaries corresponding to
            the videos.

            The dictionaries contain the following keys:

            - **label** (`str`) -- The label identified by the model.
            - **score** (`int`) -- The score attributed by the model for that label.
        """
        return super().__call__(videos, **kwargs)

    def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
        """
        Decode `num_frames` frames from a video (URL or local path) with decord and
        run them through the feature extractor to build model inputs.
        """
        if num_frames is None:
            num_frames = self.model.config.num_frames

        # Remote videos are fetched into memory; decord's VideoReader accepts a
        # file-like object as well as a path.
        if video.startswith(("http://", "https://")):
            video = BytesIO(requests.get(video).content)

        videoreader = VideoReader(video)
        videoreader.seek(0)

        # Sample `num_frames` indices evenly over the first
        # `num_frames * frame_sampling_rate` frames of the video.
        start_idx = 0
        end_idx = num_frames * frame_sampling_rate - 1
        indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64)

        video = videoreader.get_batch(indices).asnumpy()
        # The feature extractor expects a list of frames (np arrays), not a single
        # stacked array, hence the cast to list.
        video = list(video)

        model_inputs = self.feature_extractor(video, return_tensors=self.framework)
        return model_inputs

    def _forward(self, model_inputs):
        # Pure model invocation; all pre/post handling lives in the other stages.
        model_outputs = self.model(**model_inputs)
        return model_outputs

    def postprocess(self, model_outputs, top_k=5):
        """
        Convert raw logits into the `top_k` best `{"score", "label"}` dictionaries.
        """
        # Clamp top_k so `topk` never asks for more classes than the model has.
        if top_k > self.model.config.num_labels:
            top_k = self.model.config.num_labels

        if self.framework == "pt":
            probs = model_outputs.logits.softmax(-1)[0]
            scores, ids = probs.topk(top_k)
        else:
            raise ValueError(f"Unsupported framework: {self.framework}")

        scores = scores.tolist()
        ids = ids.tolist()
        return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
8 changes: 8 additions & 0 deletions src/transformers/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
is_apex_available,
is_bitsandbytes_available,
is_bs4_available,
is_decord_available,
is_detectron2_available,
is_faiss_available,
is_flax_available,
Expand Down Expand Up @@ -446,6 +447,13 @@ def require_spacy(test_case):
return unittest.skipUnless(is_spacy_available(), "test requires spacy")(test_case)


def require_decord(test_case):
    """
    Decorator marking a test that requires decord. These tests are skipped when decord isn't installed.
    """
    skip_unless_decord = unittest.skipUnless(is_decord_available(), "test requires decord")
    return skip_unless_decord(test_case)


def require_torch_multi_gpu(test_case):
"""
Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without
Expand Down
1 change: 1 addition & 0 deletions src/transformers/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
is_bs4_available,
is_coloredlogs_available,
is_datasets_available,
is_decord_available,
is_detectron2_available,
is_faiss_available,
is_flax_available,
Expand Down
17 changes: 17 additions & 0 deletions src/transformers/utils/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,13 @@
except importlib_metadata.PackageNotFoundError:
_is_ccl_available = False

# Availability flag for the `decord` video-decoding library, mirroring the
# detection pattern used for the other optional backends in this module.
# NOTE(review): the flag name is misspelled ("availale"); it is read by
# `is_decord_available` below, so a rename must touch both sites at once.
_decord_availale = importlib.util.find_spec("decord") is not None
try:
    # Confirm the package metadata is resolvable; find_spec alone can succeed
    # for a broken install.
    _decord_version = importlib_metadata.version("decord")
    logger.debug(f"Successfully imported decord version {_decord_version}")
except importlib_metadata.PackageNotFoundError:
    _decord_availale = False

# This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs.
TORCH_FX_REQUIRED_VERSION = version.parse("1.10")
TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8")
Expand Down Expand Up @@ -706,6 +713,10 @@ def is_ccl_available():
return _is_ccl_available


def is_decord_available():
    """Return whether the `decord` video-decoding library is installed."""
    # NOTE(review): the module-level flag it reads is misspelled
    # ("_decord_availale"); renaming requires changing the definition site too.
    return _decord_availale


def is_sudachi_available():
return importlib.util.find_spec("sudachipy") is not None

Expand Down Expand Up @@ -953,6 +964,11 @@ def is_jumanpp_available():
Please note that you may need to restart your runtime after installation.
"""

# Error message surfaced by `requires_backends` (via BACKENDS_MAPPING) when a
# class needs decord but it is not installed; {0} is the requiring class name.
DECORD_IMPORT_ERROR = """
{0} requires the decord library but it was not found in your environment. You can install it with pip: `pip install
decord`. Please note that you may need to restart your runtime after installation.
"""

BACKENDS_MAPPING = OrderedDict(
[
("bs4", (is_bs4_available, BS4_IMPORT_ERROR)),
Expand Down Expand Up @@ -982,6 +998,7 @@ def is_jumanpp_available():
("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)),
("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)),
("decord", (is_decord_available, DECORD_IMPORT_ERROR)),
]
)

Expand Down
Loading