Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions src/tests/test_prefixaware_router.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import asyncio
from dataclasses import dataclass
from typing import Dict

from vllm_router.routers.routing_logic import PrefixAwareRouter


@dataclass
class EndpointInfo:
    """Minimal test stand-in for the router's endpoint record.

    Presumably mirrors vllm_router's EndpointInfo — only the ``url``
    attribute is read by the routing logic under test.
    """

    # Base URL of the serving engine this endpoint represents.
    url: str


@dataclass
class RequestStats:
    """Minimal test stand-in for per-endpoint request statistics.

    Only ``qps`` is carried; the tests pass these objects through to the
    router unmodified.
    """

    # Observed queries-per-second for the endpoint.
    qps: float


@dataclass
class Request:
    """Minimal test stand-in for an incoming HTTP request.

    Only ``headers`` is carried; the tests construct it with an empty dict.
    """

    # Request headers, name -> value.
    headers: Dict[str, str]


class EngineStats:
    """Empty test stand-in for engine statistics (unused by these tests)."""

    def __init__(self) -> None:
        # Intentionally stateless; `pass` is the idiomatic no-op
        # (the original's bare `return` did the same thing less clearly).
        pass


def test_prefixaware_logic():
    """PrefixAwareRouter picks a known endpoint and is sticky per prefix."""
    backends = [
        EndpointInfo(url="http://engine1.com"),
        EndpointInfo(url="http://engine2.com"),
    ]
    stats = {
        "http://engine1.com": RequestStats(qps=10),
        "http://engine2.com": RequestStats(qps=5),
    }
    req = Request(headers={})
    valid_urls = {backend.url for backend in backends}

    router = PrefixAwareRouter()

    # One explicit loop for the whole test so router-internal async state
    # stays bound to a single event loop.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    run = loop.run_until_complete
    try:
        prompt_body = {"prompt": "Hello, how are you today?"}
        first = run(router.route_request(backends, None, stats, req, prompt_body))
        assert first in valid_urls

        # Same request should route to same endpoint
        second = run(router.route_request(backends, None, stats, req, prompt_body))
        assert first == second, "Same request should route to same endpoint"

        # chat messages should work
        chat_body = {
            "messages": [{"role": "user", "content": "Hello, how are you?"}]
        }
        third = run(router.route_request(backends, None, stats, req, chat_body))
        assert third in valid_urls

    finally:
        loop.close()


def test_hashtrie_eviction():
    """A zero-memory budget must evict the HashTrie down to empty."""
    from vllm_router.prefix.config import HashTrieConfig
    from vllm_router.prefix.hashtrie import HashTrie

    # Restrictive config: a 0 MB budget should force eviction as soon as
    # the periodic memory check runs.
    config = HashTrieConfig.from_defaults(
        chunk_size=4,  # Small chunk size
        max_memory_size=0.0,  # 0 MB - should trigger immediate eviction
        eviction_threshold=0.5,
        target_utilization=0.0,  # evict all of the nodes
        memory_check_request_batch_size=5,
    )
    hashtrie = HashTrie(config)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # The trie must be empty before any insertion.
        assert len(hashtrie.node_cache) == 0, "HashTrie should start empty"

        prompt = "Hello world, this is a test request"
        backend = "http://engine1.com"

        # Insert enough times to cross the batch size so the memory check
        # (and therefore eviction) is guaranteed to run.
        for _ in range(config.memory_check_request_batch_size):
            loop.run_until_complete(hashtrie.insert(prompt, backend))

        # With a 0 MB limit and target utilization 0, everything inserted
        # should have been evicted again.
        final_size = len(hashtrie.node_cache)
        assert (
            final_size == 0
        ), f"HashTrie should be small after eviction, got {final_size} nodes"

    finally:
        loop.close()
30 changes: 30 additions & 0 deletions src/vllm_router/prefix/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2024-2025 The vLLM Production Stack Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
vLLM Router Prefix Matching Module

This module provides HashTrie-based prefix matching with LRU eviction
for efficient request routing in vLLM production environments.
"""

from .config import HashTrieConfig
from .hashtrie import HashTrie, NodeMetadata, TrieNode

# Names re-exported as the public API of the prefix-matching package.
__all__ = [
    "HashTrie",
    "HashTrieConfig",
    "NodeMetadata",
    "TrieNode",
]
58 changes: 58 additions & 0 deletions src/vllm_router/prefix/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2024-2025 The vLLM Production Stack Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class HashTrieConfig:
    """Configuration for HashTrie with LRU eviction.

    The functional defaults live directly on the fields, so a bare
    ``HashTrieConfig()`` is a usable configuration.  (Previously the fields
    defaulted to 0, which produced a non-functional config unless callers
    went through ``from_defaults``.)
    """

    # Number of characters hashed per trie chunk.
    chunk_size: int = 128
    # Memory budget for the trie, in GB.
    max_memory_size: int = 2
    # Utilization fraction at which eviction starts.
    eviction_threshold: float = 0.9
    # Utilization fraction eviction shrinks the trie down to.
    target_utilization: float = 0.5
    # How many inserts happen between memory checks.
    memory_check_request_batch_size: int = 10

    @staticmethod
    def from_defaults(
        chunk_size: int = 128,
        max_memory_size: int = 2,
        eviction_threshold: float = 0.9,
        target_utilization: float = 0.5,
        memory_check_request_batch_size: int = 10,
    ) -> "HashTrieConfig":
        """Create a configuration, overriding any subset of the defaults.

        Kept for backward compatibility with existing call sites; it is now
        equivalent to calling ``HashTrieConfig(...)`` directly.
        """
        return HashTrieConfig(
            chunk_size=chunk_size,
            max_memory_size=max_memory_size,
            eviction_threshold=eviction_threshold,
            target_utilization=target_utilization,
            memory_check_request_batch_size=memory_check_request_batch_size,
        )
Comment on lines +23 to +44
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation of HashTrieConfig has default values of 0 in the dataclass definition, which can be misleading and lead to a non-functional configuration if instantiated with HashTrieConfig(). The actual default values are in a separate from_defaults static method.

To improve clarity and usability, I suggest moving the default values directly into the dataclass attributes and removing the from_defaults method. This makes the default configuration explicit and simplifies instantiation.

You will need to update the call sites:

  • HashTrieConfig.from_defaults() becomes HashTrieConfig()
  • HashTrieConfig.from_defaults(...) becomes HashTrieConfig(...)
@dataclass
class HashTrieConfig:
    """Configuration for HashTrie with LRU eviction"""

    chunk_size: int = 128
    max_memory_size: int = 2  # GB
    eviction_threshold: float = 0.9  # start evicting at
    target_utilization: float = 0.5  # evict down to
    memory_check_request_batch_size: int = 10

    @staticmethod
    def from_env() -> "HashTrieConfig":
        """Load configuration from environment variables"""
        return HashTrieConfig(
            chunk_size=int(os.getenv("HASHTRIE_CHUNK_SIZE", "128")),
            max_memory_size=int(os.getenv("PREFIXAWARE_MAX_MEMORY_SIZE_GB", "2")),
            eviction_threshold=float(os.getenv("HASHTRIE_EVICTION_THRESHOLD", "0.9")),
            target_utilization=float(os.getenv("HASHTRIE_TARGET_UTILIZATION", "0.5")),
            memory_check_request_batch_size=int(
                os.getenv("MEMORY_CHECK_REQUEST_BATCH_SIZE", "10")
            ),
        )


@staticmethod
def from_env() -> "HashTrieConfig":
"""Load configuration from environment variables"""
return HashTrieConfig(
chunk_size=int(os.getenv("HASHTRIE_CHUNK_SIZE", "128")),
max_memory_size=int(os.getenv("PREFIXAWARE_MAX_MEMORY_SIZE_GB", "2")),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Read default values from constant instead of hardcoding here

eviction_threshold=float(os.getenv("HASHTRIE_EVICTION_THRESHOLD", "0.9")),
target_utilization=float(os.getenv("HASHTRIE_TARGET_UTILIZATION", "0.5")),
memory_check_request_batch_size=int(
os.getenv("MEMORY_CHECK_REQUEST_BATCH_SIZE", "10")
),
)
Loading
Loading