diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 545b696d6737..cbcb0665eb30 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1610,7 +1610,7 @@ def test_generate_from_inputs_embeds(self, _, num_beams):
                 inputs_dict.pop("pixel_values_images", None)
             #   2.C - No easy fix, let's skip the check that compares the outputs from `input_ids` and `inputs_embeds`
             has_complex_embeds_computation = any(
-                model_name in model_class.__name__.lower() for model_name in ["moshi"]
+                model_name in model_class.__name__.lower() for model_name in ["moshi", "qwen2vl"]
             )
             # 3 - `inputs_dict` doesn't contain `attention_mask`. When `attention_mask` is not passed to generate,
             # we infer it from `input_ids`. The last test case will fail if there is a pad token in the original input.