Commit 4223a9a

[TRTLLM-7261][feat] Support phi-4 model in pytorch backend (#7371)
Signed-off-by: Wanli Jiang <[email protected]>
1 parent 572551b

File tree: 6 files changed, +34 −0 lines

docs/source/reference/support-matrix.md

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA
 | `MllamaForConditionalGeneration` | Llama 3.2 | `meta-llama/Llama-3.2-11B-Vision` | L |
 | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base` | L |
 | `NemotronNASForCausalLM` | NemotronNAS | `nvidia/Llama-3_3-Nemotron-Super-49B-v1` | L |
+| `Phi3ForCausalLM` | Phi-4 | `microsoft/Phi-4` | L |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | `microsoft/Phi-4-multimodal-instruct` | L + I + A |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/Qwen2-7B-Instruct` | L |
 | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B` | L |

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 5 additions & 0 deletions
@@ -189,6 +189,11 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 75.85
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
+microsoft/phi-4:
+  - accuracy: 90.30
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 90.64
 mistralai/Codestral-22B-v0.1:
   - accuracy: 67.10
 GPT-OSS/BF16:
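Each reference entry above maps a model key to a list of accuracy records, where quantized variants additionally carry `quant_algo`/`kv_cache_quant_algo` fields. A minimal sketch of how such entries can be consumed, using a hypothetical `lookup_accuracy` helper (not part of the repo) and the phi-4 GSM8K numbers from this diff:

```python
# Reference data mirroring the gsm8k.yaml entries added in this commit.
REFERENCES = {
    "microsoft/phi-4": [
        {"accuracy": 90.30},  # unquantized baseline
        {"quant_algo": "FP8", "kv_cache_quant_algo": "FP8", "accuracy": 90.64},
    ],
}


def lookup_accuracy(refs, model, quant_algo=None):
    """Return the reference accuracy matching the requested quantization.

    Hypothetical helper for illustration; the real lookup logic lives in
    the accuracy test harness under tests/integration/defs/accuracy.
    """
    for entry in refs[model]:
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")


print(lookup_accuracy(REFERENCES, "microsoft/phi-4"))         # 90.3
print(lookup_accuracy(REFERENCES, "microsoft/phi-4", "FP8"))  # 90.64
```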

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 5 additions & 0 deletions
@@ -293,6 +293,11 @@ microsoft/Phi-4-multimodal-instruct:
   - accuracy: 69.69
 microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 65.98
+microsoft/phi-4:
+  - accuracy: 79.73
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 79.36
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 78.52
 GPT-OSS/BF16:

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 19 additions & 0 deletions
@@ -2791,6 +2791,25 @@ def test_fp8(self):
         task.evaluate(llm)


+class TestPhi4(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "microsoft/phi-4"
+
+    def test_auto_dtype(self):
+        with LLM(f"{llm_models_root()}/Phi-4") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    def test_fp8(self):
+        with LLM(f"{llm_models_root()}/Phi-4-FP8") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestPhi4MM(LlmapiAccuracyTestHarness):
     # phi4-mm can also support text input.
     MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
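The new tests open the model with `with LLM(...)` so engine and GPU resources are released even if an evaluation raises. A minimal sketch of that context-manager pattern, using a hypothetical `FakeLLM` stand-in rather than the real `tensorrt_llm.LLM`:

```python
class FakeLLM:
    """Stand-in for tensorrt_llm.LLM, illustrating the with-statement pattern
    used by TestPhi4; the real class loads an engine from model_dir."""

    def __init__(self, model_dir):
        self.model_dir = model_dir
        self.closed = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Cleanup runs whether or not task.evaluate(llm) raised.
        self.closed = True
        return False  # do not swallow exceptions


llm_ref = None
with FakeLLM("/models/Phi-4") as llm:
    llm_ref = llm
    # task.evaluate(llm) would run here in the real test
print(llm_ref.closed)  # True: resources released on exit
```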

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 2 additions & 0 deletions
@@ -603,6 +603,8 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend

tests/integration/test_lists/qa/llm_function_sanity.txt

Lines changed: 2 additions & 0 deletions
@@ -133,6 +133,8 @@ accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
