diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index a6d0c5d12dd4..4bc2112c0aa1 100755 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -213,37 +213,6 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: ) -def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData: - """ - Phi-4-multimodal-instruct supports both image and audio inputs. Here, we - show how to process audio inputs. - """ - model_path = snapshot_download( - "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70" - ) - # Since the vision-lora and speech-lora co-exist with the base model, - # we have to manually specify the path of the lora weights. - speech_lora_path = os.path.join(model_path, "speech-lora") - placeholders = "<|audio|>" * audio_count - - prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" - - engine_args = EngineArgs( - model=model_path, - max_model_len=12800, - max_num_seqs=2, - enable_lora=True, - max_lora_rank=320, - limit_mm_per_prompt={"audio": audio_count}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompts, - lora_requests=[LoRARequest("speech", 1, speech_lora_path)], - ) - - # Qwen2-Audio def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: model_name = "Qwen/Qwen2-Audio-7B-Instruct" @@ -416,7 +385,6 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData: "midashenglm": run_midashenglm, "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, - "phi4_multimodal": run_phi4_multimodal, "qwen2_audio": run_qwen2_audio, "qwen2_5_omni": run_qwen2_5_omni, "ultravox": run_ultravox, diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index dd5b22ae9b0f..dfca7d5c9c9a 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1424,41 +1424,6 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: ) -# HF format Phi-4-multimodal-instruct -def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData: - """ - Phi-4-multimodal-instruct supports both image and audio inputs. Here, we - show how to process image inputs. - """ - assert modality == "image" - model_path = snapshot_download( - "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70" - ) - # Since the vision-lora and speech-lora co-exist with the base model, - # we have to manually specify the path of the lora weights. - vision_lora_path = os.path.join(model_path, "vision-lora") - prompts = [ - f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions - ] - engine_args = EngineArgs( - model=model_path, - max_model_len=5120, - max_num_seqs=2, - max_num_batched_tokens=12800, - enable_lora=True, - max_lora_rank=320, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={"dynamic_hd": 16}, - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - lora_requests=[LoRARequest("vision", 1, vision_lora_path)], - ) - - # Pixtral HF-format def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1904,7 +1869,6 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: "paligemma2": run_paligemma2, "phi3_v": run_phi3v, "phi4_mm": run_phi4mm, - "phi4_multimodal": run_phi4_multimodal, "pixtral_hf": run_pixtral_hf, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 3c01806baa20..2d7aece527ae 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -932,40 +932,6 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData: - """ - Phi-4-multimodal-instruct supports both image and audio inputs. Here, we - show how to process multi images inputs. - """ - - model_path = snapshot_download( - "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70" - ) - # Since the vision-lora and speech-lora co-exist with the base model, - # we have to manually specify the path of the lora weights. - vision_lora_path = os.path.join(model_path, "vision-lora") - engine_args = EngineArgs( - model=model_path, - max_model_len=4096, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - enable_lora=True, - max_lora_rank=320, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={"dynamic_hd": 4}, - ) - - placeholders = "<|image|>" * len(image_urls) - prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>" - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - lora_requests=[LoRARequest("vision", 1, vision_lora_path)], - ) - - def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" engine_args = EngineArgs( @@ -1363,7 +1329,6 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData: "paddleocr_vl": load_paddleocr_vl, "phi3_v": load_phi3v, "phi4_mm": load_phi4mm, - "phi4_multimodal": load_phi4_multimodal, "pixtral_hf": load_pixtral_hf, "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl,