diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fee83bd..2483407 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,9 +64,6 @@ jobs:
         if: matrix.operating-system == 'macos-latest'
         run: pip install torch==${{ matrix.torch-version }}
 
-      - name: Install Pytorchvideo from main branch
-        run: pip install git+https://github.com/facebookresearch/pytorchvideo.git
-
       - name: Lint with flake8, black and isort
         run: |
           pip install .[dev]
@@ -77,6 +74,12 @@
           # exit-zero treats all errors as warnings. Allowed max line length is 120.
           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=120 --statistics
 
+      - name: Install Pytorchvideo from main branch
+        run: pip install git+https://github.com/facebookresearch/pytorchvideo.git
+
+      - name: Install HF/Transformers from main branch
+        run: pip install -U git+https://github.com/huggingface/transformers.git
+
       - name: Install video-transformers package from local setup.py
         run: >
           pip install .
diff --git a/.github/workflows/package_testing.yml b/.github/workflows/package_testing.yml
index 388ce2c..6102752 100644
--- a/.github/workflows/package_testing.yml
+++ b/.github/workflows/package_testing.yml
@@ -66,6 +66,9 @@ jobs:
       - name: Install Pytorchvideo from main branch
         run: pip install git+https://github.com/facebookresearch/pytorchvideo.git
 
+      - name: Install HF/Transformers from main branch
+        run: pip install -U git+https://github.com/huggingface/transformers.git
+
       - name: Install latest video-transformers package
         run: >
           pip install --upgrade --force-reinstall video-transformers[test]
diff --git a/README.md b/README.md
index 0564603..da29eb7 100644
--- a/README.md
+++ b/README.md
@@ -44,10 +44,11 @@ and supports:
 conda install pytorch=1.11.0 torchvision=0.12.0 cudatoolkit=11.3 -c pytorch
 ```
 
-- Install pytorchvideo from main branch:
+- Install pytorchvideo and transformers from main branch:
 
 ```bash
 pip install git+https://github.com/facebookresearch/pytorchvideo.git
+pip install git+https://github.com/huggingface/transformers.git
 ```
 
 - Install `video-transformers`:
@@ -83,7 +84,48 @@ val_root
     ...
 ```
 
-- Fine-tune CVT (from HuggingFace) + Transformer based video classifier:
+- Fine-tune Timesformer (from HuggingFace) video classifier:
+
+```python
+from torch.optim import AdamW
+from video_transformers import VideoModel
+from video_transformers.backbones.transformers import TransformersBackbone
+from video_transformers.data import VideoDataModule
+from video_transformers.heads import LinearHead
+from video_transformers.trainer import trainer_factory
+from video_transformers.utils.file import download_ucf6
+
+backbone = TransformersBackbone("facebook/timesformer-base-finetuned-k400", num_unfrozen_stages=1)
+
+download_ucf6("./")
+datamodule = VideoDataModule(
+    train_root="ucf6/train",
+    val_root="ucf6/val",
+    batch_size=4,
+    num_workers=4,
+    num_timesteps=8,
+    preprocess_input_size=224,
+    preprocess_clip_duration=1,
+    preprocess_means=backbone.mean,
+    preprocess_stds=backbone.std,
+    preprocess_min_short_side=256,
+    preprocess_max_short_side=320,
+    preprocess_horizontal_flip_p=0.5,
+)
+
+head = LinearHead(hidden_size=backbone.num_features, num_classes=datamodule.num_classes)
+model = VideoModel(backbone, head)
+
+optimizer = AdamW(model.parameters(), lr=1e-4)
+
+Trainer = trainer_factory("single_label_classification")
+trainer = Trainer(datamodule, model, optimizer=optimizer, max_epochs=8)
+
+trainer.fit()
+
+```
+
+- Fine-tune ConvNeXT (from HuggingFace) + Transformer based video classifier:
 
 ```python
 from torch.optim import AdamW
@@ -95,7 +137,7 @@ from video_transformers.necks import TransformerNeck
 from video_transformers.trainer import trainer_factory
 from video_transformers.utils.file import download_ucf6
 
-backbone = TimeDistributed(TransformersBackbone("microsoft/cvt-13", num_unfrozen_stages=0))
+backbone = TimeDistributed(TransformersBackbone("facebook/convnext-small-224", num_unfrozen_stages=1))
 neck = TransformerNeck(
     num_features=backbone.num_features,
     num_timesteps=8,
@@ -137,18 +179,18 @@
 trainer.fit()
 ```
 
-- Fine-tune MobileViT (from Timm) + GRU based video classifier:
+- Fine-tune Resnet18 (from HuggingFace) + GRU based video classifier:
 
 ```python
 from video_transformers import TimeDistributed, VideoModel
-from video_transformers.backbones.timm import TimmBackbone
+from video_transformers.backbones.transformers import TransformersBackbone
 from video_transformers.data import VideoDataModule
 from video_transformers.heads import LinearHead
 from video_transformers.necks import GRUNeck
 from video_transformers.trainer import trainer_factory
 from video_transformers.utils.file import download_ucf6
 
-backbone = TimeDistributed(TimmBackbone("mobilevitv2_100", num_unfrozen_stages=0))
+backbone = TimeDistributed(TransformersBackbone("microsoft/resnet-18", num_unfrozen_stages=1))
 neck = GRUNeck(num_features=backbone.num_features, hidden_size=128, num_layers=2, return_last=True)
 
 download_ucf6("./")
@@ -188,7 +230,7 @@ from video_transformers import VideoModel
 
 model = VideoModel.from_pretrained(model_name_or_path)
 
-model.predict(video_path="video.mp4")
+model.predict(video_or_folder_path="video.mp4")
 >> [{'filename': "video.mp4", 'predictions': {'class1': 0.98, 'class2': 0.02}}]
 ```
 
@@ -277,3 +319,20 @@ from video_transformers import VideoModel
 model = VideoModel.from_pretrained("runs/exp/checkpoint")
 model.to_gradio(examples=['video.mp4'], export_dir="runs/exports/", export_filename="app.py")
 ```
+
+
+## Contributing
+
+Before opening a PR:
+
+- Install required development packages:
+
+```bash
+pip install -e ."[dev]"
+```
+
+- Reformat with black and isort:
+
+```bash
+python -m tests.run_code_style format
+```
diff --git a/requirements.txt b/requirements.txt
index b8db1bc..50113e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 accelerate>=0.14.0,<0.15.0
 evaluate>=0.3.0,<0.4.0
-transformers>=4.24.0,<4.25.0
+transformers>=4.25.0
 timm>=0.6.12,<0.7.0
 click==8.0.4
 balanced-loss
diff --git a/tests/run_code_style.py b/tests/run_code_style.py
new file mode 100644
index 0000000..7b612b3
--- /dev/null
+++ b/tests/run_code_style.py
@@ -0,0 +1,16 @@
+import sys
+
+from tests.utils import shell, validate_and_exit
+
+if __name__ == "__main__":
+    arg = sys.argv[1]
+
+    if arg == "check":
+        sts_flake = shell("flake8 . --config setup.cfg --select=E9,F63,F7,F82")
+        sts_isort = shell("isort . --check --settings pyproject.toml")
+        sts_black = shell("black . --check --config pyproject.toml")
+        validate_and_exit(flake8=sts_flake, isort=sts_isort, black=sts_black)
+    elif arg == "format":
+        sts_isort = shell("isort . --settings pyproject.toml")
+        sts_black = shell("black . --config pyproject.toml")
+        validate_and_exit(isort=sts_isort, black=sts_black)
diff --git a/tests/test_auto_backbone.py b/tests/test_auto_backbone.py
index 4f4556d..85bb068 100644
--- a/tests/test_auto_backbone.py
+++ b/tests/test_auto_backbone.py
@@ -8,9 +8,9 @@ def test_transformers_backbone(self):
         from video_transformers import AutoBackbone
 
         config = {
-            "framework": {"name": "timm"},
+            "framework": {"name": "transformers"},
             "type": "2d_backbone",
-            "model_name": "mobilevitv2_100",
+            "model_name": "microsoft/resnet-18",
             "num_timesteps": 8,
         }
         batch_size = 2
@@ -20,23 +20,21 @@ def test_transformers_backbone(self):
         output = backbone(input)
         self.assertEqual(output.shape, (batch_size, config["num_timesteps"], backbone.num_features))
 
-    def test_timm_backbone(self):
-        import torch
-
+    def test_from_transformers(self):
         from video_transformers import AutoBackbone
 
-        config = {
-            "framework": {"name": "transformers"},
-            "type": "2d_backbone",
-            "model_name": "microsoft/cvt-13",
-            "num_timesteps": 8,
-        }
-        batch_size = 2
-
-        backbone = AutoBackbone.from_config(config)
-        input = torch.randn(batch_size, 3, config["num_timesteps"], 224, 224)
-        output = backbone(input)
-        self.assertEqual(output.shape, (batch_size, config["num_timesteps"], backbone.num_features))
+        backbone = AutoBackbone.from_transformers("facebook/timesformer-base-finetuned-k400")
+        assert backbone.model_name == "facebook/timesformer-base-finetuned-k400"
+        backbone = AutoBackbone.from_transformers("facebook/timesformer-base-finetuned-k600")
+        assert backbone.model_name == "facebook/timesformer-base-finetuned-k600"
+        backbone = AutoBackbone.from_transformers("facebook/timesformer-hr-finetuned-k400")
+        assert backbone.model_name == "facebook/timesformer-hr-finetuned-k400"
+        backbone = AutoBackbone.from_transformers("facebook/timesformer-hr-finetuned-k600")
+        assert backbone.model_name == "facebook/timesformer-hr-finetuned-k600"
+        backbone = AutoBackbone.from_transformers("facebook/timesformer-base-finetuned-ssv2")
+        assert backbone.model_name == "facebook/timesformer-base-finetuned-ssv2"
+        backbone = AutoBackbone.from_transformers("facebook/timesformer-hr-finetuned-ssv2")
+        assert backbone.model_name == "facebook/timesformer-hr-finetuned-ssv2"
 
 
 if __name__ == "__main__":
diff --git a/tests/test_auto_head.py b/tests/test_auto_head.py
index 983080e..5ac36ad 100644
--- a/tests/test_auto_head.py
+++ b/tests/test_auto_head.py
@@ -2,7 +2,7 @@
 
 
 class TestAutoHead(unittest.TestCase):
-    def test_liear_head(self):
+    def test_linear_head(self):
         import torch
 
         from video_transformers import AutoHead
@@ -20,6 +20,22 @@ def test_liear_head(self):
         output = head(input)
         self.assertEqual(output.shape, (batch_size, config["num_classes"]))
 
+    def test_from_transformers(self):
+        from video_transformers import AutoHead
+
+        linear_head = AutoHead.from_transformers("facebook/timesformer-base-finetuned-k400")
+        assert linear_head.num_classes == 400
+        linear_head = AutoHead.from_transformers("facebook/timesformer-base-finetuned-k600")
+        assert linear_head.num_classes == 600
+        linear_head = AutoHead.from_transformers("facebook/timesformer-hr-finetuned-k400")
+        assert linear_head.num_classes == 400
+        linear_head = AutoHead.from_transformers("facebook/timesformer-hr-finetuned-k600")
+        assert linear_head.num_classes == 600
+        linear_head = AutoHead.from_transformers("facebook/timesformer-base-finetuned-ssv2")
+        assert linear_head.num_classes == 174
+        linear_head = AutoHead.from_transformers("facebook/timesformer-hr-finetuned-ssv2")
+        assert linear_head.num_classes == 174
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..5b25767
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,41 @@
+import os
+import shutil
+import sys
+
+
+def shell(command, exit_status=0):
+    """
+    Run command through shell and return exit status if exit status of command run match with given exit status.
+
+    Args:
+        command: (str) Command string which runs through system shell.
+        exit_status: (int) Expected exit status of given command run.
+
+    Returns: actual_exit_status
+
+    """
+    actual_exit_status = os.system(command)
+    if actual_exit_status == exit_status:
+        return 0
+    return actual_exit_status
+
+
+def validate_and_exit(expected_out_status=0, **kwargs):
+    if all([arg == expected_out_status for arg in kwargs.values()]):
+        # Expected status, OK
+        sys.exit(0)
+    else:
+        # Failure
+        print_console_centered("Summary Results")
+        fail_count = 0
+        for component, exit_status in kwargs.items():
+            if exit_status != expected_out_status:
+                print(f"{component} failed.")
+                fail_count += 1
+        print_console_centered(f"{len(kwargs)-fail_count} success, {fail_count} failure")
+        sys.exit(1)
+
+
+def print_console_centered(text: str, fill_char="="):
+    w, _ = shutil.get_terminal_size((80, 20))
+    print(f" {text} ".center(w, fill_char))
diff --git a/video_transformers/__init__.py b/video_transformers/__init__.py
index 1429fd3..2a734c0 100644
--- a/video_transformers/__init__.py
+++ b/video_transformers/__init__.py
@@ -3,4 +3,4 @@
 from video_transformers.auto.neck import AutoNeck
 from video_transformers.modeling import TimeDistributed, VideoModel
 
-__version__ = "0.0.7"
+__version__ = "0.0.8"
diff --git a/video_transformers/auto/backbone.py b/video_transformers/auto/backbone.py
index 50490d2..4d2c213 100644
--- a/video_transformers/auto/backbone.py
+++ b/video_transformers/auto/backbone.py
@@ -15,19 +15,22 @@ def from_config(cls, config: Dict) -> Union[Backbone, TimeDistributed]:
         backbone_type = config.get("type")
         backbone_model_name = config.get("model_name")
 
-        if backbone_framework["name"] == "transformers":
-            from video_transformers.backbones.transformers import TransformersBackbone
+        from video_transformers.backbones.transformers import TransformersBackbone
 
-            backbone = TransformersBackbone(model_name=backbone_model_name)
-        elif backbone_framework["name"] == "timm":
-            from video_transformers.backbones.timm import TimmBackbone
-
-            backbone = TimmBackbone(model_name=backbone_model_name)
-        else:
-            raise ValueError(f"Unknown framework {backbone_framework}")
+        backbone = TransformersBackbone(model_name=backbone_model_name)
 
         if backbone_type == "2d_backbone":
             from video_transformers.modeling import TimeDistributed
 
             backbone = TimeDistributed(backbone)
         return backbone
+
+    @classmethod
+    def from_transformers(cls, name_or_path: str) -> Union[Backbone, TimeDistributed]:
+        from video_transformers.backbones.transformers import TransformersBackbone
+
+        backbone = TransformersBackbone(model_name=name_or_path)
+
+        if backbone.type == "2d_backbone":
+            raise ValueError("2D backbones are not supported for from_transformers method.")
+        return backbone
diff --git a/video_transformers/auto/head.py b/video_transformers/auto/head.py
index 7f91ce1..0c172cf 100644
--- a/video_transformers/auto/head.py
+++ b/video_transformers/auto/head.py
@@ -18,3 +18,15 @@ def from_config(cls, config: Dict):
             return LinearHead(hidden_size, num_classes, dropout_p)
         else:
             raise ValueError(f"Unsupported head class name: {head_class_name}")
+
+    @classmethod
+    def from_transformers(cls, name_or_path: str):
+        from transformers import AutoModelForVideoClassification
+
+        from video_transformers.heads import LinearHead
+
+        model = AutoModelForVideoClassification.from_pretrained(name_or_path)
+        linear_head = LinearHead(model.classifier.in_features, model.classifier.out_features)
+        linear_head.linear.weight = model.classifier.weight
+        linear_head.linear.bias = model.classifier.bias
+        return linear_head
diff --git a/video_transformers/backbones/transformers.py b/video_transformers/backbones/transformers.py
index 61ea08f..9019acc 100644
--- a/video_transformers/backbones/transformers.py
+++ b/video_transformers/backbones/transformers.py
@@ -3,8 +3,8 @@
 from video_transformers.backbones.base import Backbone
 from video_transformers.utils.torch import unfreeze_last_n_stages as unfreeze_last_n_stages_torch
 
-models_2d = ["convnext", "levit", "cvt", "clip", "swin", "vit", "deit", "beit"]
-models_3d = ["videomae"]
+models_2d = ["convnext", "levit", "cvt", "clip", "swin", "vit", "deit", "beit", "resnet"]
+models_3d = ["videomae", "timesformer"]
 
 
 class TransformersBackbone(Backbone):
@@ -24,7 +24,7 @@ def __init__(self, model_name: str, num_unfrozen_stages=0, **backbone_kwargs):
 
         if hasattr(backbone.config, "hidden_size"):  # vit, swin, deit
             num_features = backbone.config.hidden_size
-        elif hasattr(backbone.config, "hidden_sizes"):  # levit, convnext
+        elif hasattr(backbone.config, "hidden_sizes"):  # levit, convnext, resnet
             num_features = backbone.config.hidden_sizes[-1]
         elif hasattr(backbone.config, "embed_dim"):  # cvt
             num_features = backbone.config.embed_dim[-1]
@@ -142,5 +142,22 @@ def unfreeze_last_n_stages(self, n):
             # unfreeze last n stages
             stages.extend(list(self.model.base_model.encoder.layer.children()))
             unfreeze_last_n_stages_torch(stages, n)
+        elif self.model.base_model_prefix == "timesformer":
+            stages = []
+            # freeze embeddings
+            for param in self.model.base_model.embeddings.parameters():
+                param.requires_grad = False
+            # unfreeze last n stages
+            stages.extend(list(self.model.base_model.encoder.layer.children()))
+            unfreeze_last_n_stages_torch(stages, n)
+        elif self.model.base_model_prefix == "resnet":
+            stages = []
+            # stages.append(self.model.base_model.embeddings)
+            # freeze embeddings
+            for param in self.model.base_model.embedder.parameters():
+                param.requires_grad = False
+            # unfreeze last n stages
+            stages.extend(self.model.base_model.encoder.stages)
+            unfreeze_last_n_stages_torch(stages, n)
         else:
             raise NotImplementedError(f"Freezing not supported for Huggingface model: {self.model.base_model_prefix}")
diff --git a/video_transformers/modeling.py b/video_transformers/modeling.py
index 07ae79b..2e51553 100644
--- a/video_transformers/modeling.py
+++ b/video_transformers/modeling.py
@@ -135,6 +135,43 @@ def from_config(cls, config: Dict) -> "VideoModel":
             task=config["task"],
         )
 
+    @classmethod
+    def from_transformers(cls, name_of_path: str, clip_duration: int = 2) -> "VideoModel":
+        """
+        Loads a model from a hf/transformers models.
+        """
+        from transformers import AutoConfig, AutoProcessor
+
+        import video_transformers
+        import video_transformers.data
+
+        processor = AutoProcessor.from_pretrained(name_of_path)
+        model_config = AutoConfig.from_pretrained(name_of_path)
+        labels = list(model_config.id2label.values())
+
+        video_preprocessor_config = {
+            "num_timesteps": model_config.num_frames,
+            "input_size": model_config.image_size,
+            "means": [0.45, 0.45, 0.45],
+            "stds": [0.225, 0.225, 0.225],
+            "min_short_side": model_config.image_size,
+            "max_short_side": model_config.image_size,
+            "horizontal_flip_p": 0,
+            "clip_duration": clip_duration,
+        }
+
+        backbone = video_transformers.AutoBackbone.from_transformers(name_of_path)
+        head = video_transformers.AutoHead.from_transformers(name_of_path)
+
+        return cls(
+            backbone=backbone,
+            head=head,
+            neck=None,
+            preprocessor_config=video_preprocessor_config,
+            labels=labels,
+            task="single_label_classification",
+        )
+
     @classmethod
     def _from_pretrained(
         cls,
@@ -392,7 +429,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.head(x)
         return x
 
-    def predict(self, video_or_folder_path: Union[str, Path], mode) -> List[Dict]:
+    def predict(self, video_or_folder_path: Union[str, Path], mode: str = "first_clip") -> List[Dict]:
         """
         Predict the labels and probabilities of a video or folder of videos.
         Supports local file path/folder directory, S3 URI and https URL.
@@ -401,7 +438,7 @@ def predict(self, video_or_folder_path: Union[str, Path], mode) -> List[Dict]:
             model: The model to use for prediction.
             video_or_folder_path: The path to the video or folder of videos.
                 Supports local file path/folder directory, S3 URI and https URL.
-            mode: The mode to use for prediction. Can be "first_batch", "average_all", "random_batch", "uniform_batch".
+            mode: The mode to use for prediction. Can be "first_clip", "average_all", "random_clip", "uniform_clip".
         """
         result = video_transformers.predict.predict(
             self,
@@ -413,3 +450,7 @@ def predict(self, video_or_folder_path: Union[str, Path], mode) -> List[Dict]:
         )
 
         return result
+
+
+if __name__ == "__main__":
+    VideoModel.from_transformers("facebook/timesformer-base-finetuned-k600")
diff --git a/video_transformers/predict.py b/video_transformers/predict.py
index 377d671..78d324d 100644
--- a/video_transformers/predict.py
+++ b/video_transformers/predict.py
@@ -15,7 +15,7 @@ def predict(
     video_or_folder_path,
     preprocessor_config: dict,
     labels: List[str],
-    mode: str = "first_batch",
+    mode: str = "first_clip",
     device: str = None,
 ):
     """
@@ -28,7 +28,7 @@ def predict(
             Supports local file path/folder directory, S3 URI and https URL.
         preprocessor_config: The preprocessor config to use for prediction.
         labels: The labels to use for prediction.
-        mode: The mode to use for prediction. Can be "first_batch", "average_all", "random_batch", "uniform_batch".
+        mode: The mode to use for prediction. Can be "first_clip", "average_all", "random_clip", "uniform_clip".
         device: The device to use for prediction. Can be "cpu" or "cuda:0".
     """
 
@@ -38,22 +38,22 @@ def predict(
 
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if device is None else device
 
-    if mode == "first_batch":
+    if mode == "first_clip":
         clip_sampler = pytorchvideo.data.clip_sampling.UniformClipSamplerTruncateFromStart(
             clip_duration=preprocessor_config["clip_duration"],
             truncation_duration=preprocessor_config["clip_duration"],
         )
     elif mode == "average_all":
         clip_sampler = pytorchvideo.data.make_clip_sampler("uniform", preprocessor_config["clip_duration"])
-    elif mode == "random_batch":
+    elif mode == "random_clip":
         clip_sampler = pytorchvideo.data.make_clip_sampler("random", preprocessor_config["clip_duration"])
-    elif mode == "uniform_batch":
+    elif mode == "uniform_clip":
         clip_sampler = pytorchvideo.data.make_clip_sampler(
             "constant_clips_per_video", preprocessor_config["clip_duration"], 1
         )
     else:
         raise ValueError(
-            f"Unknown mode: {mode}. Should be one of 'first_batch', 'average_all', 'random_batch', 'uniform_batch'."
+            f"Unknown mode: {mode}. Should be one of 'first_clip', 'average_all', 'random_clip', 'uniform_clip'."
         )
 
     preprocessor = VideoPreprocessor(**preprocessor_config)
@@ -99,7 +99,10 @@ def predict(
         video_name = video_index_to_video_name[video_index]
         probabilities = torch.mean(torch.stack(clip_predictions), dim=1)
         video_predictions = zip(labels, probabilities.tolist()[0])
-        video_results = {"filename": video_name, "predictions": dict(video_predictions)}
+        # sort predictions by probability
+        video_predictions = dict(sorted(dict(video_predictions).items(), key=lambda item: item[1], reverse=True))
+        # add video name
+        video_results = {"filename": video_name, "predictions": video_predictions}
         results.append(video_results)
 
     if g_pathmgr.isfile(video_or_folder_path):