diff --git a/test/srt/test_vision_chunked_prefill.py b/test/srt/test_vision_chunked_prefill.py
index 90fe213302bd..305cbe0af7a7 100644
--- a/test/srt/test_vision_chunked_prefill.py
+++ b/test/srt/test_vision_chunked_prefill.py
@@ -19,6 +19,7 @@
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
+    calculate_rouge_l,
     popen_launch_server,
 )
 
@@ -178,7 +179,18 @@ def _test_chunked_prefill(self, batches, num_frames):
             print(output_chunked)
             print("output without chunked prefill:")
             print(output_no_chunked)
-            self.assertEqual(output_chunked, output_no_chunked)
+            self.assertEqual(len(output_chunked), len(output_no_chunked))
+            rouge_scores = calculate_rouge_l(output_chunked, output_no_chunked)
+            avg_score = sum(rouge_scores) / len(rouge_scores)
+            print(f"ROUGE-L scores: {rouge_scores}")
+            print(f"Average ROUGE-L score: {avg_score:.4f}")
+            # One item may occasionally diverge; require only high average ROUGE-L similarity
+            self.assertGreater(
+                avg_score,
+                0.90,
+                f"Average ROUGE-L score too low: {avg_score:.4f}. "
+                f"Individual scores: {rouge_scores}",
+            )
 
     def test_chunked_prefill(self):
         self._test_chunked_prefill(batches=[False, True], num_frames=[1, [2, 6, 8, 10]])