@@ -19,7 +19,8 @@ def test_fully_cached_multimodal_tokens(self):
             past_seen_token_num=20,
             mm_token_lengths=[5, 8, 7],  # Total: 20 tokens
             mm_token_positions=[0, 5, 13],  # Positions: 0-5, 5-13, 13-20
-            chunk_end_pos=20)
+            chunk_end_pos=20,
+            special_token_offsets=[])
 
         # All tokens should be cached since past_seen_token_num (20) >= all positions + lengths
         assert runtime.num_unseen_mm_tokens == 20
@@ -32,7 +33,8 @@ def test_no_cached_multimodal_tokens(self):
             mm_token_lengths=[5, 8, 7],  # Total: 20 tokens
             mm_token_positions=[10, 18,
                                 30],  # All positions > past_seen_token_num
-            chunk_end_pos=40)
+            chunk_end_pos=40,
+            special_token_offsets=[])
 
         # No multimodal tokens should be cached
         assert runtime.num_unseen_mm_tokens == 0
@@ -44,7 +46,8 @@ def test_partial_caching_with_chunk_boundaries(self):
             past_seen_token_num=15,
             mm_token_lengths=[5, 8, 7],  # Total: 20 tokens
             mm_token_positions=[10, 18, 25],  # Positions: 10-15, 18-26, 25-32
-            chunk_end_pos=30)
+            chunk_end_pos=30,
+            special_token_offsets=[])
 
         # Expected caching:
         # Chunk 0: [10-15] - 5 tokens fully cached, 0 tokens in current chunk
@@ -59,7 +62,8 @@ def test_chunk_boundary_case1(self):
             past_seen_token_num=12,
             mm_token_lengths=[6, 4, 8],  # Total: 18 tokens
             mm_token_positions=[8, 16, 22],  # Positions: 8-14, 16-20, 22-30
-            chunk_end_pos=20)
+            chunk_end_pos=20,
+            special_token_offsets=[])
 
         # Expected caching:
         # Chunk 0: [8-14] - 4 tokens cached (8-12), 2 tokens in current chunk (12-14)
@@ -76,7 +80,8 @@ def test_chunk_boundary_case2(self):
             mm_token_positions=[
                 0, 5, 10, 15, 25, 35
             ],  # Positions: 0-3, 5-9, 10-15, 15-21, 25-32, 35-43
-            chunk_end_pos=100)
+            chunk_end_pos=100,
+            special_token_offsets=[])
 
         expected_cached = 3 + 4 + 5 + 6 + 5  # 23 tokens
         expected_current_chunk = 2 + 8  # 10 tokens
@@ -94,44 +99,55 @@ def test_validation_errors(self):
             MultimodalRuntimeData(past_seen_token_num=10,
                                   mm_token_lengths=[5, 8, 7],
                                   mm_token_positions=[0, 5],
-                                  chunk_end_pos=20)
+                                  chunk_end_pos=20,
+                                  special_token_offsets=[])
 
         # Test negative past_seen_token_num
         with pytest.raises(ValueError,
                            match="past_seen_token_num must be non-negative"):
             MultimodalRuntimeData(past_seen_token_num=-1,
                                   mm_token_lengths=[5],
                                   mm_token_positions=[0],
-                                  chunk_end_pos=10)
+                                  chunk_end_pos=10,
+                                  special_token_offsets=[])
 
         # Test non-positive token lengths
        with pytest.raises(ValueError,
                           match="All mm_token_lengths must be positive"):
             MultimodalRuntimeData(past_seen_token_num=10,
                                   mm_token_lengths=[5, 0, 7],
                                   mm_token_positions=[0, 5, 10],
-                                  chunk_end_pos=20)
+                                  chunk_end_pos=20,
+                                  special_token_offsets=[])
 
         # Test negative positions
         with pytest.raises(ValueError,
                            match="All mm_token_positions must be non-negative"):
             MultimodalRuntimeData(past_seen_token_num=10,
                                   mm_token_lengths=[5, 8, 7],
                                   mm_token_positions=[0, -5, 10],
-                                  chunk_end_pos=20)
+                                  chunk_end_pos=20,
+                                  special_token_offsets=[])
 
 
 class TestFindInputMmEmbed:
     """Focused test cases for find_input_mm_embeds function - testing both KV cache reuse and chunked prefill."""
 
-    def create_mock_runtime(self, num_unseen_mm_tokens: int,
+    def create_mock_runtime(self,
+                            num_unseen_mm_tokens: int,
                             num_mm_tokens_in_chunk: int,
-                            mm_token_lengths: List[int]):
+                            mm_token_lengths: List[int],
+                            num_unseen_special_tokens: int = 0,
+                            num_special_tokens_in_chunk: int = 0,
+                            total_special_tokens_in_request: int = 0):
         """Helper to create a mock MultimodalRuntimeData."""
         runtime = Mock(spec=MultimodalRuntimeData)
         runtime.num_unseen_mm_tokens = num_unseen_mm_tokens
         runtime.num_mm_tokens_in_chunk = num_mm_tokens_in_chunk
         runtime.total_mm_tokens_in_request = sum(mm_token_lengths)
+        runtime.num_unseen_special_tokens = num_unseen_special_tokens
+        runtime.num_special_tokens_in_chunk = num_special_tokens_in_chunk
+        runtime.total_special_tokens_in_request = total_special_tokens_in_request
 
         return runtime
 
@@ -365,22 +381,68 @@ def test_different_devices(self):
         result = find_input_mm_embeds(mm_embeds, multimodal_params)
         assert result[0].device == mm_embeds[0].device
 
+    def test_special_tokens_in_batched_mode(self):
+        """Test special token handling in batched mode."""
+        mm_embeds = [torch.randn(12, 512)
+                     ]  # Pre-concatenated: (8-2) + (10-4) = 6 + 6 = 12 tokens
+        multimodal_params = [
+            self.create_mock_runtime(num_unseen_mm_tokens=2,
+                                     num_mm_tokens_in_chunk=6,
+                                     mm_token_lengths=[8],
+                                     num_unseen_special_tokens=1,
+                                     num_special_tokens_in_chunk=1,
+                                     total_special_tokens_in_request=2),
+            self.create_mock_runtime(num_unseen_mm_tokens=4,
+                                     num_mm_tokens_in_chunk=6,
+                                     mm_token_lengths=[10],
+                                     num_unseen_special_tokens=2,
+                                     num_special_tokens_in_chunk=2,
+                                     total_special_tokens_in_request=4)
+        ]
+        multimodal_params = [
+            MultimodalParams(multimodal_runtime=runtime)
+            for runtime in multimodal_params
+        ]
+
+        result = find_input_mm_embeds(mm_embeds, multimodal_params)
+
+        # Expected slices accounting for special tokens:
+        # Batch 1: local_start = 2-1=1, local_end = 1+(6-1)=6, slice [1:6] = 5 tokens
+        # Batch 2: local_start = 4-2=2, local_end = 2+(6-2)=6, slice [6+2:6+6] = [8:12] = 4 tokens
+        # Total: 5 + 4 = 9 tokens
+        assert len(result) == 1
+        assert result[0].shape == (9, 512)
+
+        # Verify the slices are correct
+        expected = torch.cat(
+            [
+                mm_embeds[0][1:6],  # Batch 1: 5 tokens
+                mm_embeds[0][8:12]  # Batch 2: 4 tokens
+            ],
+            dim=0)
+        torch.testing.assert_close(result[0], expected)
+
 
 class TestGetMultimodalEmbeddings:
     """Test cases for get_multimodal_embeddings function - testing caching and encoder forward optimization."""
 
-    def create_mock_runtime(self, total_mm_tokens: int):
-        """Helper to create a mock MultimodalRuntimeData with total_mm_tokens."""
+    def create_mock_runtime(self,
+                            total_mm_tokens: int,
+                            total_special_tokens: int = 0):
+        """Helper to create a mock MultimodalRuntimeData with total_mm_tokens and special_tokens."""
         runtime = Mock(spec=MultimodalRuntimeData)
         runtime.total_mm_tokens_in_request = total_mm_tokens
+        runtime.total_special_tokens_in_request = total_special_tokens
         return runtime
 
     def create_multimodal_params_with_data(self,
                                            has_cached_embedding: bool = False,
                                            total_mm_tokens: int = 10,
+                                           total_special_tokens: int = 0,
                                            cached_embedding=None):
         """Helper to create MultimodalParams with optional cached embeddings."""
-        runtime = self.create_mock_runtime(total_mm_tokens)
+        runtime = self.create_mock_runtime(total_mm_tokens,
+                                           total_special_tokens)
 
         multimodal_data = {
             # Add some dummy multimodal data to ensure has_content() returns True
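
The slice boundaries asserted in test_special_tokens_in_batched_mode follow a simple offset calculation. The snippet below is a standalone sketch of that arithmetic, assuming each request's block in the pre-concatenated embedding tensor contains only its non-special rows; it is an illustration, not the actual find_input_mm_embeds implementation.

# Standalone sketch (illustration only, not find_input_mm_embeds itself) of the
# slice arithmetic the batched-mode test asserts: special tokens contribute no
# embedding rows, so both the start offset and the slice length shrink by the
# corresponding special-token counts.
def expected_slices(runtimes):
    slices, base = [], 0
    for rt in runtimes:
        local_start = rt["num_unseen_mm_tokens"] - rt["num_unseen_special_tokens"]
        local_end = local_start + (rt["num_mm_tokens_in_chunk"] -
                                   rt["num_special_tokens_in_chunk"])
        slices.append((base + local_start, base + local_end))
        # Each request contributes only its non-special rows to the fused tensor.
        base += (rt["total_mm_tokens_in_request"] -
                 rt["total_special_tokens_in_request"])
    return slices


# Reproduces the expectations asserted in the test above: [(1, 6), (8, 12)]
print(expected_slices([
    dict(num_unseen_mm_tokens=2, num_mm_tokens_in_chunk=6,
         num_unseen_special_tokens=1, num_special_tokens_in_chunk=1,
         total_mm_tokens_in_request=8, total_special_tokens_in_request=2),
    dict(num_unseen_mm_tokens=4, num_mm_tokens_in_chunk=6,
         num_unseen_special_tokens=2, num_special_tokens_in_chunk=2,
         total_mm_tokens_in_request=10, total_special_tokens_in_request=4),
]))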
@@ -663,6 +725,113 @@ def mock_encoder(params):
         assert multimodal_params[0].multimodal_data[
             "multimodal_embedding"].device.type == 'cuda'
 
+    def test_special_tokens_basic_caching(self):
+        """Test caching behavior with special tokens present."""
+
+        def mock_encoder(params):
+            # Return embeddings for non-special tokens only
+            # Total: (10-2) + (8-1) + (6-3) = 8 + 7 + 3 = 18 tokens
+            return [torch.randn(18, 512)]
+
+        multimodal_params = [
+            self.create_multimodal_params_with_data(
+                has_cached_embedding=False,
+                total_mm_tokens=10,
+                total_special_tokens=2),  # 8 actual embedding tokens
+            self.create_multimodal_params_with_data(
+                has_cached_embedding=False,
+                total_mm_tokens=8,
+                total_special_tokens=1),  # 7 actual embedding tokens
+            self.create_multimodal_params_with_data(
+                has_cached_embedding=False,
+                total_mm_tokens=6,
+                total_special_tokens=3)  # 3 actual embedding tokens
+        ]
+
+        result = get_multimodal_embeddings(mock_encoder, multimodal_params)
+
+        # Should return concatenated embeddings
+        assert len(result) == 1
+        assert result[0].shape == (18, 512)  # 8 + 7 + 3 = 18 tokens
+
+        # Check that embeddings were split correctly based on non-special token counts
+        assert multimodal_params[0].multimodal_data[
+            "multimodal_embedding"].shape == (8, 512)  # 10 - 2
+        assert multimodal_params[1].multimodal_data[
+            "multimodal_embedding"].shape == (7, 512)  # 8 - 1
+        assert multimodal_params[2].multimodal_data[
+            "multimodal_embedding"].shape == (3, 512)  # 6 - 3
+
+    def test_special_tokens_all_special(self):
+        """Test edge case where all tokens are special tokens."""
+
+        def mock_encoder(params):
+            # Should return empty tensor when no actual embedding tokens
+            return [torch.randn(0, 512)]
+
+        multimodal_params = [
+            self.create_multimodal_params_with_data(
+                has_cached_embedding=False,
+                total_mm_tokens=5,
+                total_special_tokens=5),  # All tokens are special
+            self.create_multimodal_params_with_data(
+                has_cached_embedding=False,
+                total_mm_tokens=3,
+                total_special_tokens=3)  # All tokens are special
+        ]
+
+        result = get_multimodal_embeddings(mock_encoder, multimodal_params)
+
+        # Should return empty embeddings
+        assert len(result) == 1
+        assert result[0].shape == (0, 512)
+
+        # Cached embeddings should also be empty
+        assert multimodal_params[0].multimodal_data[
+            "multimodal_embedding"].shape == (0, 512)
+        assert multimodal_params[1].multimodal_data[
+            "multimodal_embedding"].shape == (0, 512)
+
+    def test_special_tokens_mixed_with_cached(self):
+        """Test special tokens with mixed cached and uncached params."""
+        encoder_call_count = 0
+
+        def mock_encoder(params):
+            nonlocal encoder_call_count
+            encoder_call_count += 1
+            # Only process uncached param: 12 - 3 = 9 tokens
+            return [torch.randn(9, 512)]
+
+        # Mix: cached (with special tokens), uncached (with special tokens)
+        cached_emb = torch.randn(4, 512)  # 6 - 2 = 4 actual tokens
+        multimodal_params = [
+            self.create_multimodal_params_with_data(
+                has_cached_embedding=True,
+                total_mm_tokens=6,
+                total_special_tokens=2,
+                cached_embedding=cached_emb),
+            self.create_multimodal_params_with_data(
+                has_cached_embedding=False,
+                total_mm_tokens=12,
+                total_special_tokens=3)  # 9 actual embedding tokens
+        ]
+
+        result = get_multimodal_embeddings(mock_encoder, multimodal_params)
+
+        # Encoder should be called once for uncached param
+        assert encoder_call_count == 1
+
+        # Should return concatenated embeddings: 4 + 9 = 13 tokens
+        assert len(result) == 1
+        assert result[0].shape == (13, 512)
+
+        # Verify cached embedding is preserved and uncached is now cached
+        torch.testing.assert_close(
+            multimodal_params[0].multimodal_data["multimodal_embedding"],
+            cached_emb)
+        assert multimodal_params[1].multimodal_data[
+            "multimodal_embedding"].shape == (9, 512)
+
 
 if __name__ == "__main__":
     pytest.main([__file__])
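
Similarly, the splitting behavior asserted by test_special_tokens_basic_caching can be reproduced with a short standalone sketch, under the same assumption that the fused encoder output carries only non-special rows; this is an illustration, not the get_multimodal_embeddings implementation.

import torch


# Standalone sketch, not the library code: divide the fused encoder output into
# per-request cached embeddings of size total_mm_tokens - total_special_tokens.
def split_encoder_output(fused, total_mm_tokens, total_special_tokens):
    split_sizes = [m - s for m, s in zip(total_mm_tokens, total_special_tokens)]
    return list(torch.split(fused, split_sizes, dim=0))


chunks = split_encoder_output(torch.randn(18, 512), [10, 8, 6], [2, 1, 3])
print([c.shape[0] for c in chunks])  # [8, 7, 3], matching the shape assertions above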