vllm-project · yuanheng-zhao · Jun 5, 2026 · Jun 2, 2026 · Jun 2, 2026 · May 31, 2026
@@ -57,19 +57,22 @@ Adjust `devices` in the YAML to match your hardware.
 
 #### Command
 
-Thinker only (text output):
+Thinker + talker (text and/or audio output):
 
 ```bash
-vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091
+vllm serve Jonathan1909/Ming-flash-omni-2.0 \
+    --omni \
+    --port 8091 \
+    --log-stats
 ```
 
-Thinker + talker (text and/or audio output):
+Thinker only (text-only output):
 
 ```bash
 vllm serve Jonathan1909/Ming-flash-omni-2.0 \
     --omni \
-    --port 8091 \
-    --log-stats
+    --deploy-config vllm_omni/deploy/ming_flash_omni_thinker_only.yaml \
+    --port 8091
 ```
 
 `--log-stats` (shown in the thinker + talker command) is optional but recommended while validating the deployment.

@@ -72,6 +72,7 @@
     PLACEHOLDER_VIDEO_TOKEN_IN_TEXT,
     MingFlashOmniProcessor,
     MingWhisperFeatureExtractor,
+    raise_missing_video_processor,
 )
 
 from .audio_encoder import WhisperAudioEncoder
@@ -540,20 +541,20 @@ def _call_hf_processor(
         if images is not None:
             image_outputs = hf_processor.image_processor(
                 images=images,
-                videos=None,
                 return_tensors="pt",
             )
             data.update(image_outputs)
 
         videos = mm_data.get("videos", None)
         if videos is not None:
-            # TODO: ``videos=`` on image_processor is deprecated since
-            # transformers v4.57 (removed in v5); migrate to Qwen2VLVideoProcessor.
-            video_outputs = hf_processor.image_processor(
-                images=None,
-                videos=videos,
-                return_tensors="pt",
-            )
+            video_processor = getattr(hf_processor, "video_processor", None)
+            if video_processor is not None:
+                video_outputs = video_processor(
+                    videos=videos,
+                    return_tensors="pt",
+                )
+            else:
+                raise_missing_video_processor()
             # Rename keys to distinguish from images
             if "pixel_values" in video_outputs:
                 video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")

@@ -714,6 +714,9 @@ def __init__(
     def get_input_embeddings(self):
         return self.word_embeddings
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Return the embeddings of ``input_ids`` looked up in ``self.word_embeddings``."""
+        return self.word_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -794,6 +797,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.sampler = Sampler()
         self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
 
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Delegate token embedding to ``self.model.embed_input_ids``."""
+        return self.model.embed_input_ids(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -818,7 +824,7 @@ def compute_logits(
         hidden_states: torch.Tensor,
         sampling_metadata,
     ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def sample(

@@ -902,6 +902,7 @@ def duration_capped_steps(self, text_len: int, requested_max_steps: int) -> int:
         if self._audio_vae is None:
             return requested_max_steps
 
+        # Transformers >=5.x may expose these config values as 0-d tensors.
         sample_rate = float(self._audio_vae.config.sample_rate)
         vae_patch_size = float(getattr(self._audio_vae.config, "patch_size", 4))
         hop_size = float(getattr(self._audio_vae.decoder, "hop_length", 320))
@@ -1041,7 +1042,7 @@ def llm_step(
                 use_cache=True,
             )
         else:
-            past_seen_tokens = past_key_values.get_seq_length()
+            past_seen_tokens = int(past_key_values.get_seq_length())
             cache_position = torch.arange(
                 past_seen_tokens,
                 past_seen_tokens + inputs_embeds.shape[1],

@@ -27,6 +27,7 @@
 
 class BailingMoeV2Config(PretrainedConfig):
     model_type = "bailing_moe_v2"
+    ignore_keys_at_rope_validation = {"mrope_section"}
 
     def __init__(
         self,
@@ -237,6 +238,7 @@ def __init__(
 
 class BailingMM2Config(PretrainedConfig):
     model_type = "bailingmm_moe_v2_lite"
+    ignore_keys_at_rope_validation = {"mrope_section"}
     is_composition = True
     sub_configs: ClassVar = {"llm_config": AutoConfig}
 
@@ -352,9 +354,13 @@ def __init__(
         self.campplus_model = campplus_model
 
     def get_text_config(self, decoder: bool = False) -> PretrainedConfig:  # noqa: ARG002
-        if isinstance(self.llm_config, dict):
-            return PretrainedConfig.from_dict(self.llm_config)
-        return self.llm_config
+        """Return the LLM sub-config, or ``None`` if ``llm_config`` is not set yet.
+
+        A dict-valued ``llm_config`` is converted with ``PretrainedConfig.from_dict``;
+        any other value is returned as-is. ``decoder`` is accepted but not used.
+        """
+        # NOTE: transformers v5 runs validators (e.g. validate_token_ids -> get_text_config)
+        # during PretrainedConfig.__init__, before llm_config is assigned
+        llm_config = getattr(self, "llm_config", None)
+        if isinstance(llm_config, dict):
+            return PretrainedConfig.from_dict(llm_config)
+
+        return llm_config
 
 
 class MingFlashOmniConfig(PretrainedConfig):

@@ -22,6 +22,21 @@
 from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils import logging
+
+try:
+    from transformers import AutoVideoProcessor
+except ImportError:
+    AutoVideoProcessor = None
+
+_HAS_VIDEO_PROCESSOR = AutoVideoProcessor is not None
+
+logger = logging.get_logger(__name__)
+
+
+def raise_missing_video_processor():
+    """Raise the error reported when video inputs arrive without a video processor.
+
+    Raises:
+        ValueError: Always.
+    """
+    raise ValueError("Ming Flash Omni video inputs require a Transformers 5.x `video_processor`.")
+
 
 DEFAULT_IMAGE_PATCH_TOKEN = "<imagePatch>"
 DEFAULT_IM_START_TOKEN = "<image>"
@@ -156,6 +171,8 @@ class MingFlashOmniProcessor(ProcessorMixin):
 
     attributes = ["image_processor", "audio_processor", "tokenizer"]
     image_processor_class = "AutoImageProcessor"
+    if _HAS_VIDEO_PROCESSOR:
+        video_processor_class = "AutoVideoProcessor"
     audio_processor_class = "AutoFeatureExtractor"
     tokenizer_class = "AutoTokenizer"
 
@@ -167,6 +184,7 @@ def __init__(
         merge_size: int = 2,
         **kwargs,
     ):
+        video_processor = kwargs.pop("video_processor", None)
         # Enforce that all sub-processors exist
         # Keep None defaults in the signature for HF ProcessorMixin compatibility
         if image_processor is None:
@@ -180,16 +198,47 @@ def __init__(
         self.image_token = PLACEHOLDER_IMAGE_TOKEN_IN_TEXT
         self.video_token = PLACEHOLDER_VIDEO_TOKEN_IN_TEXT
         self.audio_token = PLACEHOLDER_AUDIO_TOKEN_IN_TEXT
+        if video_processor is not None and not _HAS_VIDEO_PROCESSOR:
+            raise ValueError("`video_processor` requires transformers with `AutoVideoProcessor` support.")
+
         super().__init__(
             image_processor=image_processor,
             audio_processor=audio_processor,
             tokenizer=tokenizer,
         )
+        self.video_processor = video_processor
 
         # Fall back to the tokenizer's own chat_template.
         if self.chat_template is None:
             self.chat_template = getattr(tokenizer, "chat_template", None)
 
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """Load the processor and attach a video processor when one is available.
+
+        The ``video_processor`` keyword is removed from ``kwargs`` before the
+        parent loader runs. If the caller supplied one, it is attached as-is.
+        Otherwise, when ``AutoVideoProcessor`` could be imported, it is loaded
+        from the same location with the same ``args``/``kwargs``; on failure
+        ``video_processor`` is set to ``None`` instead of raising.
+
+        Returns:
+            The processor instance produced by the parent ``from_pretrained``.
+        """
+        video_processor = kwargs.pop("video_processor", None)
+        processor = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        if video_processor is not None:
+            # An explicitly supplied video processor takes precedence over auto-loading.
+            processor.video_processor = video_processor
+        elif _HAS_VIDEO_PROCESSOR:
+            try:
+                processor.video_processor = AutoVideoProcessor.from_pretrained(
+                    pretrained_model_name_or_path,
+                    *args,
+                    **kwargs,
+                )
+            except OSError:
+                # No warning is logged on this path, unlike the ValueError/KeyError path below.
+                processor.video_processor = None
+            except (ValueError, KeyError) as exc:
+                logger.warning("Failed to load optional Ming video processor: %s", exc)
+                processor.video_processor = None
+        return processor
+
+    def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
+        """Save the processor and, if one is attached, its video processor.
+
+        The parent ``save_pretrained`` runs first; a non-``None``
+        ``video_processor`` is then saved into the same ``save_directory``.
+
+        Returns:
+            Whatever the parent ``save_pretrained`` returned.
+        """
+        output = super().save_pretrained(save_directory, push_to_hub=push_to_hub, **kwargs)
+        # getattr guards against instances that never had ``video_processor`` assigned.
+        video_processor = getattr(self, "video_processor", None)
+        if video_processor is not None:
+            # ``push_to_hub`` and ``kwargs`` are not forwarded to this call.
+            video_processor.save_pretrained(save_directory)
+        return output
+
     def __call__(
         self,
         text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
@@ -211,7 +260,6 @@ def __call__(
         if images is not None:
             image_outputs = self.image_processor(
                 images=images,
-                videos=None,
                 return_tensors="pt",
                 **kwargs.get("images_kwargs", {}),
             )
@@ -220,12 +268,15 @@ def __call__(
                 text = self._expand_image_tokens(text, image_outputs["image_grid_thw"])
 
         if videos is not None:
-            video_outputs = self.image_processor(
-                images=None,
-                videos=videos,
-                return_tensors="pt",
-                **kwargs.get("videos_kwargs", {}),
-            )
+            video_processor = getattr(self, "video_processor", None)
+            if video_processor is not None:
+                video_outputs = video_processor(
+                    videos=videos,
+                    return_tensors="pt",
+                    **kwargs.get("videos_kwargs", {}),
+                )
+            else:
+                raise_missing_video_processor()
             if "pixel_values" in video_outputs:
                 video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")
             if "image_grid_thw" in video_outputs:
@@ -423,6 +474,9 @@ def model_input_names(self):
             + self.image_processor.model_input_names
             + self.audio_processor.model_input_names
         )
+        video_processor = getattr(self, "video_processor", None)
+        if video_processor is not None:
+            names += video_processor.model_input_names
         return list(dict.fromkeys(names))