Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions src/tests/test_prefixaware_router.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import asyncio
from dataclasses import dataclass
from typing import Dict

from vllm_router.routers.routing_logic import PrefixAwareRouter


@dataclass
class EndpointInfo:
    """Minimal test stand-in for the router's endpoint record.

    Presumably mirrors vllm_router's EndpointInfo — only the ``url``
    attribute is read by the routing logic under test.
    """

    # Base URL of the serving engine this endpoint represents.
    url: str


@dataclass
class RequestStats:
    """Minimal test stand-in for per-endpoint request statistics.

    Only ``qps`` is carried; the tests pass these objects through to the
    router unmodified.
    """

    # Observed queries-per-second for the endpoint.
    qps: float


@dataclass
class Request:
    """Minimal test stand-in for an incoming HTTP request.

    Only ``headers`` is carried; the tests construct it with an empty dict.
    """

    # Request headers, name -> value.
    headers: Dict[str, str]


class EngineStats:
    """Empty test stand-in for engine statistics (unused by these tests)."""

    def __init__(self) -> None:
        # Intentionally stateless; `pass` is the idiomatic no-op
        # (the original's bare `return` did the same thing less clearly).
        pass


def test_prefixaware_logic():
    """PrefixAwareRouter picks a known endpoint and is sticky per prefix."""
    backends = [
        EndpointInfo(url="http://engine1.com"),
        EndpointInfo(url="http://engine2.com"),
    ]
    stats = {
        "http://engine1.com": RequestStats(qps=10),
        "http://engine2.com": RequestStats(qps=5),
    }
    req = Request(headers={})
    valid_urls = {backend.url for backend in backends}

    router = PrefixAwareRouter()

    # One explicit loop for the whole test so router-internal async state
    # stays bound to a single event loop.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    run = loop.run_until_complete
    try:
        prompt_body = {"prompt": "Hello, how are you today?"}
        first = run(router.route_request(backends, None, stats, req, prompt_body))
        assert first in valid_urls

        # Same request should route to same endpoint
        second = run(router.route_request(backends, None, stats, req, prompt_body))
        assert first == second, "Same request should route to same endpoint"

        # chat messages should work
        chat_body = {
            "messages": [{"role": "user", "content": "Hello, how are you?"}]
        }
        third = run(router.route_request(backends, None, stats, req, chat_body))
        assert third in valid_urls

    finally:
        loop.close()


def test_hashtrie_eviction():
    """A zero-memory budget must evict the HashTrie down to empty."""
    from vllm_router.prefix.config import HashTrieConfig
    from vllm_router.prefix.hashtrie import HashTrie

    # Restrictive config: a 0 MB budget should force eviction as soon as
    # the periodic memory check runs.
    config = HashTrieConfig.from_defaults(
        chunk_size=4,  # Small chunk size
        max_memory_size=0.0,  # 0 MB - should trigger immediate eviction
        eviction_threshold=0.5,
        target_utilization=0.0,  # evict all of the nodes
        memory_check_request_batch_size=5,
    )
    hashtrie = HashTrie(config)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # The trie must be empty before any insertion.
        assert len(hashtrie.node_cache) == 0, "HashTrie should start empty"

        prompt = "Hello world, this is a test request"
        backend = "http://engine1.com"

        # Insert enough times to cross the batch size so the memory check
        # (and therefore eviction) is guaranteed to run.
        for _ in range(config.memory_check_request_batch_size):
            loop.run_until_complete(hashtrie.insert(prompt, backend))

        # With a 0 MB limit and target utilization 0, everything inserted
        # should have been evicted again.
        final_size = len(hashtrie.node_cache)
        assert (
            final_size == 0
        ), f"HashTrie should be small after eviction, got {final_size} nodes"

    finally:
        loop.close()
30 changes: 30 additions & 0 deletions src/vllm_router/prefix/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2024-2025 The vLLM Production Stack Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
vLLM Router Prefix Matching Module

This module provides HashTrie-based prefix matching with LRU eviction
for efficient request routing in vLLM production environments.
"""

from .config import HashTrieConfig
from .hashtrie import HashTrie, NodeMetadata, TrieNode

# Names re-exported as the public API of the prefix-matching package.
__all__ = [
    "HashTrie",
    "HashTrieConfig",
    "NodeMetadata",
    "TrieNode",
]
58 changes: 58 additions & 0 deletions src/vllm_router/prefix/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2024-2025 The vLLM Production Stack Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class HashTrieConfig:
    """Configuration for HashTrie with LRU eviction.

    The functional defaults live directly on the fields, so a bare
    ``HashTrieConfig()`` is a usable configuration.  (Previously the fields
    defaulted to 0, which produced a non-functional config unless callers
    went through ``from_defaults``.)
    """

    # Number of characters hashed per trie chunk.
    chunk_size: int = 128
    # Memory budget for the trie, in GB.
    max_memory_size: int = 2
    # Utilization fraction at which eviction starts.
    eviction_threshold: float = 0.9
    # Utilization fraction eviction shrinks the trie down to.
    target_utilization: float = 0.5
    # How many inserts happen between memory checks.
    memory_check_request_batch_size: int = 10

    @staticmethod
    def from_defaults(
        chunk_size: int = 128,
        max_memory_size: int = 2,
        eviction_threshold: float = 0.9,
        target_utilization: float = 0.5,
        memory_check_request_batch_size: int = 10,
    ) -> "HashTrieConfig":
        """Create a configuration, overriding any subset of the defaults.

        Kept for backward compatibility with existing call sites; it is now
        equivalent to calling ``HashTrieConfig(...)`` directly.
        """
        return HashTrieConfig(
            chunk_size=chunk_size,
            max_memory_size=max_memory_size,
            eviction_threshold=eviction_threshold,
            target_utilization=target_utilization,
            memory_check_request_batch_size=memory_check_request_batch_size,
        )
Comment on lines +23 to +44
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation of HashTrieConfig has default values of 0 in the dataclass definition, which can be misleading and lead to a non-functional configuration if instantiated with HashTrieConfig(). The actual default values are in a separate from_defaults static method.

To improve clarity and usability, I suggest moving the default values directly into the dataclass attributes and removing the from_defaults method. This makes the default configuration explicit and simplifies instantiation.

You will need to update the call sites:

  • HashTrieConfig.from_defaults() becomes HashTrieConfig()
  • HashTrieConfig.from_defaults(...) becomes HashTrieConfig(...)
@dataclass
class HashTrieConfig:
    """Configuration for HashTrie with LRU eviction"""

    chunk_size: int = 128
    max_memory_size: int = 2  # GB
    eviction_threshold: float = 0.9  # start evicting at
    target_utilization: float = 0.5  # evict down to
    memory_check_request_batch_size: int = 10

    @staticmethod
    def from_env() -> "HashTrieConfig":
        """Load configuration from environment variables"""
        return HashTrieConfig(
            chunk_size=int(os.getenv("HASHTRIE_CHUNK_SIZE", "128")),
            max_memory_size=int(os.getenv("PREFIXAWARE_MAX_MEMORY_SIZE_GB", "2")),
            eviction_threshold=float(os.getenv("HASHTRIE_EVICTION_THRESHOLD", "0.9")),
            target_utilization=float(os.getenv("HASHTRIE_TARGET_UTILIZATION", "0.5")),
            memory_check_request_batch_size=int(
                os.getenv("MEMORY_CHECK_REQUEST_BATCH_SIZE", "10")
            ),
        )


@staticmethod
def from_env() -> "HashTrieConfig":
"""Load configuration from environment variables"""
return HashTrieConfig(
chunk_size=int(os.getenv("HASHTRIE_CHUNK_SIZE", "128")),
max_memory_size=int(os.getenv("PREFIXAWARE_MAX_MEMORY_SIZE_GB", "2")),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Read default values from constant instead of hardcoding here

eviction_threshold=float(os.getenv("HASHTRIE_EVICTION_THRESHOLD", "0.9")),
target_utilization=float(os.getenv("HASHTRIE_TARGET_UTILIZATION", "0.5")),
memory_check_request_batch_size=int(
os.getenv("MEMORY_CHECK_REQUEST_BATCH_SIZE", "10")
),
)
Loading
Loading