refactor dataloading and preprocessing #9

Merged · 2 commits · Aug 18, 2022
README.md (5 additions, 2 deletions)
@@ -105,6 +105,7 @@ datamodule = VideoDataModule(
num_workers=4,
num_timesteps=8,
preprocess_input_size=224,
preprocess_clip_duration=1,
preprocess_means=backbone.mean,
preprocess_stds=backbone.std,
preprocess_min_short_side_scale=256,
@@ -119,7 +120,8 @@ Trainer = trainer_factory("single_label_classification")
trainer = Trainer(
datamodule,
model,
optimizer=optimizer
optimizer=optimizer,
max_epochs=8
)

trainer.fit()
@@ -142,12 +144,12 @@ neck = GRUNeck(num_features=backbone.num_features, hidden_size=128, num_layers=2
datamodule = VideoDataModule(
train_root=".../ucf6/train",
val_root=".../ucf6/val",
clip_duration=2,
train_dataset_multiplier=1,
batch_size=4,
num_workers=4,
num_timesteps=8,
preprocess_input_size=224,
preprocess_clip_duration=1,
preprocess_means=backbone.mean,
preprocess_stds=backbone.std,
preprocess_min_short_side_scale=256,
@@ -162,6 +164,7 @@ Trainer = trainer_factory("single_label_classification")
trainer = Trainer(
datamodule,
model,
max_epochs=8
)

trainer.fit()
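Taken together, the README hunks above move the clip-duration setting out of the dataloader arguments (`clip_duration=2` is removed) into the preprocessing group (`preprocess_clip_duration=1`), and pass `max_epochs` to the trainer explicitly. One leftover worth noting: the README context lines still show `preprocess_min_short_side_scale=256`, while `VideoDataModule` in `video_transformers/data.py` (see its diff below) now spells this argument `preprocess_min_short_side`. A minimal sketch of a post-refactor call, following the new `data.py` signature and reusing the backbone/model/optimizer objects from the surrounding README example (the `.../ucf6` paths are placeholders taken from that example):

```python
datamodule = VideoDataModule(
    train_root=".../ucf6/train",   # placeholder path from the README example
    val_root=".../ucf6/val",
    train_dataset_multiplier=1,
    batch_size=4,
    num_workers=4,
    num_timesteps=8,
    preprocess_input_size=224,
    preprocess_clip_duration=1,    # was a top-level clip_duration argument before this PR
    preprocess_means=backbone.mean,
    preprocess_stds=backbone.std,
    preprocess_min_short_side=256,  # renamed from preprocess_min_short_side_scale in data.py
)

Trainer = trainer_factory("single_label_classification")
trainer = Trainer(
    datamodule,
    model,
    optimizer=optimizer,
    max_epochs=8,                  # now passed explicitly
)
trainer.fit()
```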
tests/test_onnx.py (16 additions, 10 deletions)
@@ -28,12 +28,15 @@ def test_onnx_export(self):
"transformer_enc_num_layers": 2,
"return_mean": True,
},
"preprocess_means": [0.485, 0.456, 0.406],
"preprocess_stds": [0.229, 0.224, 0.225],
"preprocess_min_short_side_scale": 256,
"preprocess_input_size": 224,
"num_timesteps": 8,
"preprocessor": {
"means": [0.485, 0.456, 0.406],
"stds": [0.229, 0.224, 0.225],
"min_short_side": 256,
"input_size": 224,
"num_timesteps": 8,
},
"labels": ["BodyWeightSquats", "JumpRope", "Lunges", "PullUps", "PushUps", "WallPushups"],
"task": "single_label_classification",
}

model = VideoClassificationModel.from_config(config)
@@ -66,12 +69,15 @@ def test_quantized_onnx_export(self):
"transformer_enc_num_layers": 2,
"return_mean": True,
},
"preprocess_means": [0.485, 0.456, 0.406],
"preprocess_stds": [0.229, 0.224, 0.225],
"preprocess_min_short_side_scale": 256,
"preprocess_input_size": 224,
"num_timesteps": 8,
"preprocessor": {
"means": [0.485, 0.456, 0.406],
"stds": [0.229, 0.224, 0.225],
"min_short_side": 256,
"input_size": 224,
"num_timesteps": 8,
},
"labels": ["BodyWeightSquats", "JumpRope", "Lunges", "PullUps", "PushUps", "WallPushups"],
"task": "single_label_classification",
}

model = VideoClassificationModel.from_config(config)
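Both tests change in the same way: the flat `preprocess_*` keys are replaced by a nested `preprocessor` dict with shorter key names (`min_short_side` instead of `preprocess_min_short_side_scale`, and `num_timesteps` moves inside the group). A small standalone sketch of the new shape, using only values visible in the test above:

```python
# Preprocessing settings are now grouped under a single "preprocessor" key
# instead of flat "preprocess_*" entries at the top level of the config.
preprocessor_config = {
    "means": [0.485, 0.456, 0.406],
    "stds": [0.229, 0.224, 0.225],
    "min_short_side": 256,
    "input_size": 224,
    "num_timesteps": 8,
}
config = {
    "preprocessor": preprocessor_config,
    "labels": ["BodyWeightSquats", "JumpRope", "Lunges", "PullUps", "PushUps", "WallPushups"],
    "task": "single_label_classification",
    # ... backbone / neck / head settings as in the test above ...
}

# Consumers now index the nested keys, e.g.:
num_timesteps = config["preprocessor"]["num_timesteps"]
```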
tests/test_video_classification_model.py (9 additions, 6 deletions)
@@ -30,18 +30,21 @@ def test_transformers_backbone(self):
"transformer_enc_num_layers": 2,
"return_mean": True,
},
"preprocess_means": [0.485, 0.456, 0.406],
"preprocess_tds": [0.229, 0.224, 0.225],
"preprocess_min_short_side_scale": 256,
"preprocess_input_size": 224,
"num_timesteps": 8,
"preprocessor": {
"means": [0.485, 0.456, 0.406],
"stds": [0.229, 0.224, 0.225],
"min_short_side": 256,
"input_size": 224,
"num_timesteps": 8,
},
"labels": ["BodyWeightSquats", "JumpRope", "Lunges", "PullUps", "PushUps", "WallPushups"],
"task": "single_label_classification",
}
batch_size = 2

model = VideoClassificationModel.from_config(config)

input = torch.randn(batch_size, 3, config["num_timesteps"], 224, 224)
input = torch.randn(batch_size, 3, config["preprocessor"]["num_timesteps"], 224, 224)
output = model(input)
self.assertEqual(output.shape, (batch_size, model.head.num_classes))

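Besides the same config nesting, this hunk drops the old flat keys (one of which carried a `preprocess_tds` typo) and reads the frame count for the dummy input from the nested dict. A standalone sketch of the clip layout the test feeds to the model:

```python
import torch

batch_size = 2
num_timesteps = 8  # config["preprocessor"]["num_timesteps"] in the test

# The test builds clips as (batch, channels, time, height, width).
dummy_clip = torch.randn(batch_size, 3, num_timesteps, 224, 224)
print(dummy_clip.shape)  # torch.Size([2, 3, 8, 224, 224])
```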
video_transformers/__init__.py (2 additions, 2 deletions)
@@ -1,6 +1,6 @@
from video_transformers.auto.backbone import AutoBackbone
from video_transformers.auto.head import AutoHead
from video_transformers.auto.neck import AutoNeck
from video_transformers.modules import TimeDistributed, VideoClassificationModel
from video_transformers.modeling import TimeDistributed, VideoClassificationModel

__version__ = "0.0.5"
__version__ = "0.0.6"
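The `modules` → `modeling` rename only affects direct imports; the re-exports in `video_transformers/__init__.py` keep the top-level API stable. A short sketch of both import styles after this change:

```python
# Top-level imports are unaffected by the rename:
from video_transformers import AutoBackbone, AutoHead, AutoNeck, VideoClassificationModel

# Direct imports must target the renamed module
# (before this PR: from video_transformers.modules import TimeDistributed):
from video_transformers.modeling import TimeDistributed
```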
video_transformers/auto/backbone.py (2 additions, 2 deletions)
@@ -1,7 +1,7 @@
from typing import Dict, Union

from video_transformers.backbones.base import Backbone
from video_transformers.modules import TimeDistributed
from video_transformers.modeling import TimeDistributed


class AutoBackbone:
@@ -27,7 +27,7 @@ def from_config(cls, config: Dict) -> Union[Backbone, TimeDistributed]:
raise ValueError(f"Unknown framework {backbone_framework}")

if backbone_type == "2d_backbone":
from video_transformers.modules import TimeDistributed
from video_transformers.modeling import TimeDistributed

backbone = TimeDistributed(backbone)
return backbone
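For 2D (per-frame) backbones, `AutoBackbone.from_config` wraps the backbone in `TimeDistributed`, now imported from `video_transformers.modeling`. The wrapper's implementation is not part of this diff; the sketch below illustrates the general time-distributed pattern such a wrapper typically uses (folding the time axis into the batch so a 2D backbone sees ordinary images) and is not the library's exact code:

```python
import torch
from torch import nn


class TimeDistributedSketch(nn.Module):
    """Illustrative only: apply a per-frame (2D) module to every frame of a clip."""

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, time, height, width)
        b, c, t, h, w = x.shape
        frames = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
        features = self.module(frames)     # expected shape: (b * t, feature_dim)
        return features.reshape(b, t, -1)  # (batch, time, feature_dim)
```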
video_transformers/backbones/timm.py (1 addition, 1 deletion)
@@ -3,7 +3,7 @@
from torch import nn

from video_transformers.backbones.base import Backbone
from video_transformers.modules import Identity
from video_transformers.modeling import Identity
from video_transformers.utils.torch import unfreeze_last_n_stages as unfreeze_last_n_stages_torch


video_transformers/data.py (42 additions, 32 deletions)
@@ -14,15 +14,15 @@
from torch.utils.data import DataLoader
from torchvision.transforms import CenterCrop, Compose, Lambda, RandomCrop, RandomHorizontalFlip

from video_transformers.utils.dataset import LabeledVideoDataset, LabeledVideoPaths
from video_transformers.pytorchvideo_wrapper.data.labeled_video_paths import LabeledVideoDataset, LabeledVideoPaths
from video_transformers.utils.extra import class_to_config

logger = get_logger(__name__)


class VideoPreprocess:
class VideoPreprocessor:
@classmethod
def from_config(cls, config: Dict, **kwargs) -> "VideoPreprocess":
def from_config(cls, config: Dict, **kwargs) -> "VideoPreprocessor":
"""
Creates an instance of the class from a config.

@@ -36,25 +36,27 @@ def from_config(cls, config: Dict, **kwargs) -> "VideoPreprocess":

def __init__(
self,
timesteps: int = 8,
num_timesteps: int = 8,
input_size: int = 224,
means: Tuple[float] = (0.45, 0.45, 0.45),
stds: Tuple[float] = (0.225, 0.225, 0.225),
min_short_side_scale: int = 256,
max_short_side_scale: int = 320,
min_short_side: int = 256,
max_short_side: int = 320,
horizontal_flip_p: float = 0.5,
clip_duration: int = 1,
):
"""
Creates preprocess transforms.

Args:
timesteps: number of frames in a video clip
num_timesteps: number of frames in a video clip
input_size: model input size
means: mean of the video clip
stds: standard deviation of the video clip
min_short_side_scale: minimum short side of the video clip
max_short_side_scale: maximum short side of the video clip
horizontal_flip_p: probability of horizontal flip
clip_duration: duration of each video clip

Properties:
train_transform: transforms for training
@@ -65,23 +67,24 @@ def __init__(
"""
super().__init__()

self.timesteps = timesteps
self.num_timesteps = num_timesteps
self.input_size = input_size
self.means = means
self.stds = stds
self.min_short_side_scale = min_short_side_scale
self.max_short_side_scale = max_short_side_scale
self.min_short_side = min_short_side
self.max_short_side = max_short_side
self.horizontal_flip_p = horizontal_flip_p
self.clip_duration = clip_duration

# Transforms applied to train dataset.
self.train_video_transform = Compose(
[
UniformTemporalSubsample(self.timesteps),
UniformTemporalSubsample(self.num_timesteps),
Lambda(lambda x: x / 255.0),
Normalize(self.means, self.stds),
RandomShortSideScale(
min_size=self.min_short_side_scale,
max_size=self.max_short_side_scale,
min_size=self.min_short_side,
max_size=self.max_short_side,
),
RandomCrop(self.input_size),
RandomHorizontalFlip(p=self.horizontal_flip_p),
@@ -93,10 +96,10 @@
# Transforms applied on val dataset or for inference.
self.val_video_transform = Compose(
[
UniformTemporalSubsample(self.timesteps),
UniformTemporalSubsample(self.num_timesteps),
Lambda(lambda x: x / 255.0),
Normalize(self.means, self.stds),
ShortSideScale(self.min_short_side_scale),
ShortSideScale(self.min_short_side),
CenterCrop(self.input_size),
]
)
@@ -109,16 +112,16 @@ def __init__(
train_root: str,
val_root: str,
test_root: str = None,
clip_duration: int = 2,
train_dataset_multiplier: int = 1,
batch_size: int = 4,
num_workers: int = 4,
num_timesteps: int = 8,
preprocess_input_size: int = 224,
preprocess_clip_duration: int = 1,
preprocess_means: Tuple[float] = (0.45, 0.45, 0.45),
preprocess_stds: Tuple[float] = (0.225, 0.225, 0.225),
preprocess_min_short_side_scale: int = 256,
preprocess_max_short_side_scale: int = 320,
preprocess_min_short_side: int = 256,
preprocess_max_short_side: int = 320,
preprocess_horizontal_flip_p: float = 0.5,
):
"""
@@ -169,25 +172,26 @@ def __init__(
Mean pixel value to be used during normalization.
preprocess_stds: Tuple[float]
Standard deviation pixel value to be used during normalization.
preprocess_min_short_side_scale: int
preprocess_min_short_side: int
Minimum value of the short side of the clip after resizing.
preprocess_max_short_side_scale: int
preprocess_max_short_side: int
Maximum value of the short side of the clip after resizing.
preprocess_horizontal_flip_p: float
Probability of horizontal flip.
"""
self.preprocess_config = {
"timesteps": num_timesteps,
self.preprocessor_config = {
"num_timesteps": num_timesteps,
"input_size": preprocess_input_size,
"means": preprocess_means,
"stds": preprocess_stds,
"min_short_side_scale": preprocess_min_short_side_scale,
"max_short_side_scale": preprocess_max_short_side_scale,
"min_short_side": preprocess_min_short_side,
"max_short_side": preprocess_max_short_side,
"horizontal_flip_p": preprocess_horizontal_flip_p,
"clip_duration": preprocess_clip_duration,
}
self.preprocess = VideoPreprocess.from_config(self.preprocess_config)
self.preprocessor = VideoPreprocessor.from_config(self.preprocessor_config)

self.dataloader_config = {"batch_size": batch_size, "num_workers": num_workers, "clip_duration": clip_duration}
self.dataloader_config = {"batch_size": batch_size, "num_workers": num_workers}

self.train_root = train_root
self.val_root = val_root
@@ -211,12 +215,12 @@ def _get_train_dataloader(self):
labeled_video_paths = LabeledVideoPaths.from_path(self.train_root)
labeled_video_paths.path_prefix = ""
video_sampler = torch.utils.data.RandomSampler
clip_sampler = pytorchvideo.data.make_clip_sampler("random", self.dataloader_config["clip_duration"])
clip_sampler = pytorchvideo.data.make_clip_sampler("random", self.preprocessor_config["clip_duration"])
dataset = LabeledVideoDataset(
labeled_video_paths,
clip_sampler,
video_sampler,
self.preprocess.train_transform,
self.preprocessor.train_transform,
decode_audio=False,
decoder="pyav",
dataset_multiplier=self.train_dataset_multiplier,
@@ -233,12 +237,15 @@ def _get_val_dataloader(self):
labeled_video_paths = LabeledVideoPaths.from_path(self.val_root)
labeled_video_paths.path_prefix = ""
video_sampler = torch.utils.data.SequentialSampler
clip_sampler = pytorchvideo.data.make_clip_sampler("uniform", self.dataloader_config["clip_duration"])
clip_sampler = pytorchvideo.data.clip_sampling.UniformClipSamplerTruncateFromStart(
clip_duration=self.preprocessor_config["clip_duration"],
truncation_duration=self.preprocessor_config["clip_duration"],
)
dataset = LabeledVideoDataset(
labeled_video_paths,
clip_sampler,
video_sampler,
self.preprocess.val_transform,
self.preprocessor.val_transform,
decode_audio=False,
decoder="pyav",
)
@@ -253,12 +260,15 @@ def _get_test_dataloader(self):
labeled_video_paths = LabeledVideoPaths.from_path(self.test_root)
labeled_video_paths.path_prefix = ""
video_sampler = torch.utils.data.SequentialSampler
clip_sampler = pytorchvideo.data.make_clip_sampler("uniform", self.dataloader_config["clip_duration"])
clip_sampler = pytorchvideo.data.clip_sampling.UniformClipSamplerTruncateFromStart(
clip_duration=self.preprocessor_config["clip_duration"],
truncation_duration=self.preprocessor_config["clip_duration"],
)
dataset = LabeledVideoDataset(
labeled_video_paths,
clip_sampler,
video_sampler,
self.preprocess.val_transform,
self.preprocessor.val_transform,
decode_audio=False,
decoder="pyav",
)
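In summary for `data.py`: `VideoPreprocess` becomes `VideoPreprocessor`, its parameters drop the `_scale` suffixes (`min_short_side`, `max_short_side`), `timesteps` becomes `num_timesteps`, and `clip_duration` moves from the dataloader config into the preprocessor config; validation and test splits now sample clips with `UniformClipSamplerTruncateFromStart` (clip and truncation duration both taken from that config) instead of `make_clip_sampler("uniform", ...)`. Note that the Args docstring above still lists `min_short_side_scale` / `max_short_side_scale` even though the parameters were renamed. A minimal sketch of building the renamed preprocessor directly, assuming it is imported from `video_transformers.data`, where this diff defines it:

```python
from video_transformers.data import VideoPreprocessor

# Keys mirror self.preprocessor_config as assembled in VideoDataModule.__init__ above.
preprocessor_config = {
    "num_timesteps": 8,
    "input_size": 224,
    "means": (0.45, 0.45, 0.45),
    "stds": (0.225, 0.225, 0.225),
    "min_short_side": 256,
    "max_short_side": 320,
    "horizontal_flip_p": 0.5,
    "clip_duration": 1,
}

preprocessor = VideoPreprocessor.from_config(preprocessor_config)

# The dataloaders pass these transforms to their LabeledVideoDataset instances:
train_transform = preprocessor.train_transform
val_transform = preprocessor.val_transform
```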
Empty file.
video_transformers/deployment/onnx.py (1 addition, 1 deletion)
@@ -45,7 +45,7 @@ def export(

from onnxruntime.quantization import quantize_dynamic

export_filename = Path(export_path).stem + f"_quantize.{Path(export_path).suffix}"
export_filename = Path(export_path).stem + f"_quantized.{Path(export_path).suffix}"

target_model_path = Path(export_path).parent / export_filename

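This one-line change renames the dynamically quantized export from `*_quantize.*` to `*_quantized.*`. One detail worth flagging, outside the scope of this PR: `Path.suffix` already includes the leading dot, so the f-string still yields a doubled dot in the filename. A small standalone sketch of the behavior and a possible follow-up tweak (the path below is hypothetical):

```python
from pathlib import Path

export_path = "exports/model.onnx"  # hypothetical path for illustration

# As in the diff: Path.suffix contains the leading dot, so the f-string doubles it.
filename_in_pr = Path(export_path).stem + f"_quantized.{Path(export_path).suffix}"
print(filename_in_pr)  # model_quantized..onnx

# A possible follow-up tweak (not part of this PR):
filename_alternative = Path(export_path).stem + "_quantized" + Path(export_path).suffix
print(filename_alternative)  # model_quantized.onnx
```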