Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/transformers/models/video_llava/processing_video_llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ class VideoLlavaProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
patch_size (`int`, *optional*):
patch_size (`int`, *optional*, defaults to 14):
Patch size from the vision tower.
vision_feature_select_strategy (`str`, *optional*):
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
The feature selection strategy used to select the vision feature from the vision backbone.
Should be the same as in the model's config.
image_token (`str`, *optional*, defaults to `"<image>"`):
Expand All @@ -51,7 +51,7 @@ class VideoLlavaProcessor(ProcessorMixin):
Special token used to denote video location.
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
num_additional_image_tokens (`int`, *optional*, defaults to 1):
Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
extra tokens appended, no need to set this arg.
"""
Expand All @@ -72,12 +72,12 @@ def __init__(
self,
image_processor=None,
tokenizer=None,
patch_size=None,
vision_feature_select_strategy=None,
patch_size=14,
vision_feature_select_strategy="default",
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
video_token="<video>",
chat_template=None,
num_additional_image_tokens=0,
num_additional_image_tokens=1,
**kwargs,
):
self.patch_size = patch_size
Expand Down