Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions recipes/inclusionAI/Ming-flash-omni-2.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,22 @@ Adjust `devices` in the YAML to match your hardware.

#### Command

Thinker only (text output):
Thinker + talker (text and/or audio output):

```bash
vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091
vllm serve Jonathan1909/Ming-flash-omni-2.0 \
--omni \
--port 8091 \
--log-stats
```

Thinker + talker (text and/or audio output):
Thinker only (text-only output):

```bash
vllm serve Jonathan1909/Ming-flash-omni-2.0 \
--omni \
--port 8091 \
--log-stats
--deploy-config vllm_omni/deploy/ming_flash_omni_thinker_only.yaml \
--port 8091
```

`--log-stats` is optional but recommended while validating the deployment.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
PLACEHOLDER_VIDEO_TOKEN_IN_TEXT,
MingFlashOmniProcessor,
MingWhisperFeatureExtractor,
raise_missing_video_processor,
)

from .audio_encoder import WhisperAudioEncoder
Expand Down Expand Up @@ -540,20 +541,20 @@ def _call_hf_processor(
if images is not None:
image_outputs = hf_processor.image_processor(
images=images,
videos=None,
return_tensors="pt",
)
data.update(image_outputs)

videos = mm_data.get("videos", None)
if videos is not None:
# TODO: ``videos=`` on image_processor is deprecated since
# transformers v4.57 (removed in v5); migrate to Qwen2VLVideoProcessor.
video_outputs = hf_processor.image_processor(
images=None,
videos=videos,
return_tensors="pt",
)
video_processor = getattr(hf_processor, "video_processor", None)
if video_processor is not None:
video_outputs = video_processor(
videos=videos,
return_tensors="pt",
)
else:
raise_missing_video_processor()
# Rename keys to distinguish from images
if "pixel_values" in video_outputs:
video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,9 @@ def __init__(
def get_input_embeddings(self):
return self.word_embeddings

def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
    """Embed token ids with this module's ``word_embeddings``.

    Args:
        input_ids: Token ids to embed.

    Returns:
        Whatever ``self.word_embeddings`` returns for ``input_ids``.
    """
    return self.word_embeddings(input_ids)

def forward(
self,
input_ids: torch.Tensor,
Expand Down Expand Up @@ -794,6 +797,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.sampler = Sampler()
self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors

def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
    """Embed token ids by delegating to the wrapped ``self.model``.

    Args:
        input_ids: Token ids to embed.

    Returns:
        The result of ``self.model.embed_input_ids(input_ids)``.
    """
    return self.model.embed_input_ids(input_ids)

def forward(
self,
input_ids: torch.Tensor,
Expand All @@ -818,7 +824,7 @@ def compute_logits(
hidden_states: torch.Tensor,
sampling_metadata,
) -> torch.Tensor | None:
logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
logits = self.logits_processor(self.lm_head, hidden_states)
return logits

def sample(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,7 @@ def duration_capped_steps(self, text_len: int, requested_max_steps: int) -> int:
if self._audio_vae is None:
return requested_max_steps

# Transformers >=5.x may expose these config values as 0-d tensors.
sample_rate = float(self._audio_vae.config.sample_rate)
vae_patch_size = float(getattr(self._audio_vae.config, "patch_size", 4))
hop_size = float(getattr(self._audio_vae.decoder, "hop_length", 320))
Expand Down Expand Up @@ -1041,7 +1042,7 @@ def llm_step(
use_cache=True,
)
else:
past_seen_tokens = past_key_values.get_seq_length()
past_seen_tokens = int(past_key_values.get_seq_length())
Comment thread
yuanheng-zhao marked this conversation as resolved.
cache_position = torch.arange(
past_seen_tokens,
past_seen_tokens + inputs_embeds.shape[1],
Expand Down
12 changes: 9 additions & 3 deletions vllm_omni/transformers_utils/configs/ming_flash_omni.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

class BailingMoeV2Config(PretrainedConfig):
model_type = "bailing_moe_v2"
ignore_keys_at_rope_validation = {"mrope_section"}

def __init__(
self,
Expand Down Expand Up @@ -237,6 +238,7 @@ def __init__(

class BailingMM2Config(PretrainedConfig):
model_type = "bailingmm_moe_v2_lite"
ignore_keys_at_rope_validation = {"mrope_section"}
is_composition = True
sub_configs: ClassVar = {"llm_config": AutoConfig}

Expand Down Expand Up @@ -352,9 +354,13 @@ def __init__(
self.campplus_model = campplus_model

def get_text_config(self, decoder: bool = False) -> PretrainedConfig:  # noqa: ARG002
    """Return the language-model config stored in ``llm_config``.

    Args:
        decoder: Unused by this implementation.

    Returns:
        ``llm_config`` itself when it is not a dict, a ``PretrainedConfig``
        built via ``PretrainedConfig.from_dict`` when it is a dict, or
        ``None`` when ``llm_config`` has not been assigned yet.
    """
    # NOTE: transformers v5 runs validators (e.g. validate_token_ids -> get_text_config)
    # during PretrainedConfig.__init__, before llm_config is assigned
    llm_config = getattr(self, "llm_config", None)
    if isinstance(llm_config, dict):
        return PretrainedConfig.from_dict(llm_config)

    return llm_config


class MingFlashOmniConfig(PretrainedConfig):
Expand Down
68 changes: 61 additions & 7 deletions vllm_omni/transformers_utils/processors/ming.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,21 @@
from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils import logging

try:
from transformers import AutoVideoProcessor
except ImportError:
AutoVideoProcessor = None

_HAS_VIDEO_PROCESSOR = AutoVideoProcessor is not None

logger = logging.get_logger(__name__)


def raise_missing_video_processor():
    """Raise the error used when video inputs arrive without a video processor.

    Raises:
        ValueError: Always.
    """
    raise ValueError("Ming Flash Omni video inputs require a Transformers 5.x `video_processor`.")


DEFAULT_IMAGE_PATCH_TOKEN = "<imagePatch>"
DEFAULT_IM_START_TOKEN = "<image>"
Expand Down Expand Up @@ -156,6 +171,8 @@ class MingFlashOmniProcessor(ProcessorMixin):

attributes = ["image_processor", "audio_processor", "tokenizer"]

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[major] `video_processor` is set after `super().__init__()` and is not in `attributes`, so `ProcessorMixin.save_pretrained()` will not persist it. A user who does `processor.save_pretrained('/tmp/x')` and then loads from `/tmp/x` will silently lose the video processor (the new `from_pretrained` override only rescues loads from the original HF repo path, where the video config exists). Suggest gating `"video_processor"` into `attributes` on `_HAS_VIDEO_PROCESSOR` so the round-trip is symmetric. Otherwise this is a regression on save+reload workflows.

image_processor_class = "AutoImageProcessor"
if _HAS_VIDEO_PROCESSOR:
video_processor_class = "AutoVideoProcessor"
audio_processor_class = "AutoFeatureExtractor"
tokenizer_class = "AutoTokenizer"

Expand All @@ -167,6 +184,7 @@ def __init__(
merge_size: int = 2,
**kwargs,
):
video_processor = kwargs.pop("video_processor", None)
# Enforce that all sub-processors exist
# Keep None defaults in the signature for HF ProcessorMixin compatibility
if image_processor is None:
Expand All @@ -180,16 +198,47 @@ def __init__(
self.image_token = PLACEHOLDER_IMAGE_TOKEN_IN_TEXT
self.video_token = PLACEHOLDER_VIDEO_TOKEN_IN_TEXT
self.audio_token = PLACEHOLDER_AUDIO_TOKEN_IN_TEXT
if video_processor is not None and not _HAS_VIDEO_PROCESSOR:
raise ValueError("`video_processor` requires transformers with `AutoVideoProcessor` support.")

super().__init__(
image_processor=image_processor,
audio_processor=audio_processor,
tokenizer=tokenizer,
)
self.video_processor = video_processor

# Fall back to the tokenizer's own chat_template.
if self.chat_template is None:
self.chat_template = getattr(tokenizer, "chat_template", None)

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
video_processor = kwargs.pop("video_processor", None)
processor = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
if video_processor is not None:
processor.video_processor = video_processor
elif _HAS_VIDEO_PROCESSOR:
try:
processor.video_processor = AutoVideoProcessor.from_pretrained(
pretrained_model_name_or_path,
*args,
**kwargs,
)
except OSError:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[major] The narrow `except OSError` only catches missing-file cases. A malformed or partial `video_preprocessor_config.json` (e.g. unknown processor class, schema mismatch) raises `ValueError`/`KeyError` and will crash the whole processor load, defeating the fallback. Consider broadening to `(OSError, ValueError, KeyError)` or `Exception` with a `logger.warning(...)` and `video_processor = None`, matching the spirit of the `try/except TypeError` fallback in `__call__` below.

processor.video_processor = None
except (ValueError, KeyError) as exc:
logger.warning("Failed to load optional Ming video processor: %s", exc)
processor.video_processor = None
return processor

def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
    """Save the processor, then save its video processor to the same directory.

    Args:
        save_directory: Target directory, passed to both save calls.
        push_to_hub: Forwarded to the parent ``save_pretrained`` only.
        **kwargs: Forwarded to the parent ``save_pretrained`` only.

    Returns:
        The value returned by the parent ``save_pretrained``.
    """
    output = super().save_pretrained(save_directory, push_to_hub=push_to_hub, **kwargs)
    # The video processor is saved explicitly here; it is read with getattr
    # so an instance without the attribute is treated as having none.
    video_processor = getattr(self, "video_processor", None)
    if video_processor is not None:
        video_processor.save_pretrained(save_directory)
    return output

def __call__(
self,
text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
Expand All @@ -211,7 +260,6 @@ def __call__(
if images is not None:
image_outputs = self.image_processor(
images=images,
videos=None,
return_tensors="pt",
**kwargs.get("images_kwargs", {}),
)
Expand All @@ -220,12 +268,15 @@ def __call__(
text = self._expand_image_tokens(text, image_outputs["image_grid_thw"])

if videos is not None:
video_outputs = self.image_processor(
images=None,
videos=videos,
return_tensors="pt",
**kwargs.get("videos_kwargs", {}),
)
video_processor = getattr(self, "video_processor", None)
if video_processor is not None:
video_outputs = video_processor(
videos=videos,
return_tensors="pt",
**kwargs.get("videos_kwargs", {}),
)
else:
raise_missing_video_processor()
if "pixel_values" in video_outputs:
video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")
if "image_grid_thw" in video_outputs:
Expand Down Expand Up @@ -423,6 +474,9 @@ def model_input_names(self):
+ self.image_processor.model_input_names
+ self.audio_processor.model_input_names
)
video_processor = getattr(self, "video_processor", None)
if video_processor is not None:
names += video_processor.model_input_names
return list(dict.fromkeys(names))


Expand Down
Loading