diff --git a/recipes/inclusionAI/Ming-flash-omni-2.0.md b/recipes/inclusionAI/Ming-flash-omni-2.0.md index 3dcda2cbe14..4a3a8666928 100644 --- a/recipes/inclusionAI/Ming-flash-omni-2.0.md +++ b/recipes/inclusionAI/Ming-flash-omni-2.0.md @@ -57,19 +57,22 @@ Adjust `devices` in the YAML to match your hardware. #### Command -Thinker only (text output): +Thinker + talker (text and/or audio output): ```bash -vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 +vllm serve Jonathan1909/Ming-flash-omni-2.0 \ + --omni \ + --port 8091 \ + --log-stats ``` -Thinker + talker (text and/or audio output): +Thinker only (text-only output): ```bash vllm serve Jonathan1909/Ming-flash-omni-2.0 \ --omni \ - --port 8091 \ - --log-stats + --deploy-config vllm_omni/deploy/ming_flash_omni_thinker_only.yaml \ + --port 8091 ``` `--log-stats` is optional but recommended while validating the deployment. diff --git a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py index 0a9ce0f5cc3..e0a49935df1 100644 --- a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py +++ b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py @@ -72,6 +72,7 @@ PLACEHOLDER_VIDEO_TOKEN_IN_TEXT, MingFlashOmniProcessor, MingWhisperFeatureExtractor, + raise_missing_video_processor, ) from .audio_encoder import WhisperAudioEncoder @@ -540,20 +541,20 @@ def _call_hf_processor( if images is not None: image_outputs = hf_processor.image_processor( images=images, - videos=None, return_tensors="pt", ) data.update(image_outputs) videos = mm_data.get("videos", None) if videos is not None: - # TODO: ``videos=`` on image_processor is deprecated since - # transformers v4.57 (removed in v5); migrate to Qwen2VLVideoProcessor. - video_outputs = hf_processor.image_processor( - images=None, - videos=videos, - return_tensors="pt", - ) + video_processor = getattr(hf_processor, "video_processor", None) + if video_processor is not None: + video_outputs = video_processor( + videos=videos, + return_tensors="pt", + ) + else: + raise_missing_video_processor() # Rename keys to distinguish from images if "pixel_values" in video_outputs: video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values") diff --git a/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py b/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py index ca7d00f5032..5a1af2fc23c 100644 --- a/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py +++ b/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py @@ -714,6 +714,9 @@ def __init__( def get_input_embeddings(self): return self.word_embeddings + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.word_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -794,6 +797,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.sampler = Sampler() self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -818,7 +824,7 @@ def compute_logits( hidden_states: torch.Tensor, sampling_metadata, ) -> torch.Tensor | None: - logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def sample( diff --git a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py index 6404b3fb891..6b2c6b2f200 100644 --- a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py +++ b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py @@ -902,6 +902,7 @@ def duration_capped_steps(self, text_len: int, requested_max_steps: int) -> int: if self._audio_vae is None: return requested_max_steps + # Transformers >=5.x may expose these config values as 0-d tensors. sample_rate = float(self._audio_vae.config.sample_rate) vae_patch_size = float(getattr(self._audio_vae.config, "patch_size", 4)) hop_size = float(getattr(self._audio_vae.decoder, "hop_length", 320)) @@ -1041,7 +1042,7 @@ def llm_step( use_cache=True, ) else: - past_seen_tokens = past_key_values.get_seq_length() + past_seen_tokens = int(past_key_values.get_seq_length()) cache_position = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], diff --git a/vllm_omni/transformers_utils/configs/ming_flash_omni.py b/vllm_omni/transformers_utils/configs/ming_flash_omni.py index 376d902e381..6799a411ec9 100644 --- a/vllm_omni/transformers_utils/configs/ming_flash_omni.py +++ b/vllm_omni/transformers_utils/configs/ming_flash_omni.py @@ -27,6 +27,7 @@ class BailingMoeV2Config(PretrainedConfig): model_type = "bailing_moe_v2" + ignore_keys_at_rope_validation = {"mrope_section"} def __init__( self, @@ -237,6 +238,7 @@ def __init__( class BailingMM2Config(PretrainedConfig): model_type = "bailingmm_moe_v2_lite" + ignore_keys_at_rope_validation = {"mrope_section"} is_composition = True sub_configs: ClassVar = {"llm_config": AutoConfig} @@ -352,9 +354,13 @@ def __init__( self.campplus_model = campplus_model def get_text_config(self, decoder: bool = False) -> PretrainedConfig: # noqa: ARG002 - if isinstance(self.llm_config, dict): - return PretrainedConfig.from_dict(self.llm_config) - return self.llm_config + # NOTE: transformers v5 runs validators (e.g. validate_token_ids -> get_text_config) + # during PretrainedConfig.__init__, before llm_config is assigned + llm_config = getattr(self, "llm_config", None) + if isinstance(llm_config, dict): + return PretrainedConfig.from_dict(llm_config) + + return llm_config class MingFlashOmniConfig(PretrainedConfig): diff --git a/vllm_omni/transformers_utils/processors/ming.py b/vllm_omni/transformers_utils/processors/ming.py index 7f414b7268c..685909029ad 100644 --- a/vllm_omni/transformers_utils/processors/ming.py +++ b/vllm_omni/transformers_utils/processors/ming.py @@ -22,6 +22,21 @@ from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin from transformers.processing_utils import ProcessorMixin from transformers.tokenization_utils_base import PreTokenizedInput, TextInput +from transformers.utils import logging + +try: + from transformers import AutoVideoProcessor +except ImportError: + AutoVideoProcessor = None + +_HAS_VIDEO_PROCESSOR = AutoVideoProcessor is not None + +logger = logging.get_logger(__name__) + + +def raise_missing_video_processor(): + raise ValueError("Ming Flash Omni video inputs require a Transformers 5.x `video_processor`.") + DEFAULT_IMAGE_PATCH_TOKEN = "" DEFAULT_IM_START_TOKEN = "" @@ -156,6 +171,8 @@ class MingFlashOmniProcessor(ProcessorMixin): attributes = ["image_processor", "audio_processor", "tokenizer"] image_processor_class = "AutoImageProcessor" + if _HAS_VIDEO_PROCESSOR: + video_processor_class = "AutoVideoProcessor" audio_processor_class = "AutoFeatureExtractor" tokenizer_class = "AutoTokenizer" @@ -167,6 +184,7 @@ def __init__( merge_size: int = 2, **kwargs, ): + video_processor = kwargs.pop("video_processor", None) # Enforce that all sub-processors exist # Keep None defaults in the signature for HF ProcessorMixin compatibility if image_processor is None: @@ -180,16 +198,47 @@ def __init__( self.image_token = PLACEHOLDER_IMAGE_TOKEN_IN_TEXT self.video_token = PLACEHOLDER_VIDEO_TOKEN_IN_TEXT self.audio_token = PLACEHOLDER_AUDIO_TOKEN_IN_TEXT + if video_processor is not None and not _HAS_VIDEO_PROCESSOR: + raise ValueError("`video_processor` requires transformers with `AutoVideoProcessor` support.") + super().__init__( image_processor=image_processor, audio_processor=audio_processor, tokenizer=tokenizer, ) + self.video_processor = video_processor # Fall back to the tokenizer's own chat_template. if self.chat_template is None: self.chat_template = getattr(tokenizer, "chat_template", None) + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + video_processor = kwargs.pop("video_processor", None) + processor = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + if video_processor is not None: + processor.video_processor = video_processor + elif _HAS_VIDEO_PROCESSOR: + try: + processor.video_processor = AutoVideoProcessor.from_pretrained( + pretrained_model_name_or_path, + *args, + **kwargs, + ) + except OSError: + processor.video_processor = None + except (ValueError, KeyError) as exc: + logger.warning("Failed to load optional Ming video processor: %s", exc) + processor.video_processor = None + return processor + + def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): + output = super().save_pretrained(save_directory, push_to_hub=push_to_hub, **kwargs) + video_processor = getattr(self, "video_processor", None) + if video_processor is not None: + video_processor.save_pretrained(save_directory) + return output + def __call__( self, text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput], @@ -211,7 +260,6 @@ def __call__( if images is not None: image_outputs = self.image_processor( images=images, - videos=None, return_tensors="pt", **kwargs.get("images_kwargs", {}), ) @@ -220,12 +268,15 @@ def __call__( text = self._expand_image_tokens(text, image_outputs["image_grid_thw"]) if videos is not None: - video_outputs = self.image_processor( - images=None, - videos=videos, - return_tensors="pt", - **kwargs.get("videos_kwargs", {}), - ) + video_processor = getattr(self, "video_processor", None) + if video_processor is not None: + video_outputs = video_processor( + videos=videos, + return_tensors="pt", + **kwargs.get("videos_kwargs", {}), + ) + else: + raise_missing_video_processor() if "pixel_values" in video_outputs: video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values") if "image_grid_thw" in video_outputs: @@ -423,6 +474,9 @@ def model_input_names(self): + self.image_processor.model_input_names + self.audio_processor.model_input_names ) + video_processor = getattr(self, "video_processor", None) + if video_processor is not None: + names += video_processor.model_input_names return list(dict.fromkeys(names))