huggingface · zucchini-nlp · Nov 26, 2024 · Nov 25, 2024
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
@@ -40,9 +40,9 @@ class VideoLlavaProcessor(ProcessorMixin):
             The image processor is a required input.
         tokenizer ([`LlamaTokenizerFast`], *optional*):
             The tokenizer is a required input.
-        patch_size (`int`, *optional*):
+        patch_size (`int`, *optional*, defaults to 14):
             Patch size from the vision tower.
-        vision_feature_select_strategy (`str`, *optional*):
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Shoudl be same as in model's config
         image_token (`str`, *optional*, defaults to `"<image>"`):
@@ -51,7 +51,7 @@ class VideoLlavaProcessor(ProcessorMixin):
             Special token used to denote video location.
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
-        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+        num_additional_image_tokens (`int`, *optional*, defaults to 1):
             Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
             extra tokens appended, no need to set this arg.
     """
@@ -72,12 +72,12 @@ def __init__(
         self,
         image_processor=None,
         tokenizer=None,
-        patch_size=None,
-        vision_feature_select_strategy=None,
+        patch_size=14,
+        vision_feature_select_strategy="default",
         image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
         video_token="<video>",
         chat_template=None,
-        num_additional_image_tokens=0,
+        num_additional_image_tokens=1,
         **kwargs,
     ):
         self.patch_size = patch_size