Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
2361a17
add TtftRouter
chickeyton Sep 1, 2025
b567b1e
fix review issues
chickeyton Sep 2, 2025
9119123
update the way of finding computation amount
chickeyton Sep 5, 2025
2fcf917
TtftRouter.route_request() returns RequestStatsCacheInfo as well
chickeyton Sep 5, 2025
c65a9fc
fix review comments
chickeyton Sep 5, 2025
bfad8c9
update prefill computation amount formula
chickeyton Sep 9, 2025
12320f0
update prefill computation amount formula
chickeyton Sep 9, 2025
847a50d
fix typing
chickeyton Sep 11, 2025
18764f6
fix queue time estimation
chickeyton Sep 11, 2025
80a1010
fix queue time estimation
chickeyton Sep 11, 2025
4e4489e
add event_id to FullLookupMsg()
chickeyton Sep 12, 2025
bed03d6
fix ttft routing when there is no matched cache
chickeyton Sep 12, 2025
a5ab012
fix ttft routing when there is no matched cache
chickeyton Sep 12, 2025
ad60d95
fix ttft estimation when stats is incomplete
chickeyton Sep 12, 2025
5959c5e
fix ttft estimation when stats is incomplete
chickeyton Sep 12, 2025
07b10d4
fix ttft estimation when stats is incomplete
chickeyton Sep 12, 2025
7742066
fix ttft estimation when stats is incomplete
chickeyton Sep 12, 2025
0a96497
fix ttft estimation when stats is incomplete
chickeyton Sep 12, 2025
d49d380
fix ttft estimation when stats is incomplete
chickeyton Sep 12, 2025
623c92d
estimated workload instead of ttft
chickeyton Sep 17, 2025
d7f525a
estimated workload instead of ttft
chickeyton Sep 17, 2025
12c4c99
estimated workload instead of ttft
chickeyton Sep 17, 2025
10b67d9
estimated workload instead of ttft
chickeyton Sep 17, 2025
fab66b0
estimated workload instead of ttft
chickeyton Sep 17, 2025
5c187bb
code enhancement
chickeyton Sep 19, 2025
6dd39a6
code enhancement
chickeyton Sep 19, 2025
73a2cca
code enhancement
chickeyton Sep 19, 2025
74d2902
resolve conflicts
chickeyton Sep 25, 2025
906ee47
fix review issues
chickeyton Sep 25, 2025
70a607a
fix review issues
chickeyton Sep 25, 2025
0cd8049
fix review issues
chickeyton Sep 25, 2025
4a32696
update lmcache usage
chickeyton Sep 25, 2025
bab4c0b
update lmcache usage
chickeyton Sep 25, 2025
13d8a19
update lmcache usage
chickeyton Sep 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions benchmarks/multi-round-qa/multi-round-qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ class WorkloadConfig:
# Whether to include user id in request header
enable_user_id: bool

# Max number of unfinished queries allowed (None means no limit)
max_unfinished_queries: Optional[int]


@dataclass
class UserConfig:
Expand Down Expand Up @@ -419,6 +422,13 @@ def step(self, timestamp: float, executor: RequestExecutor):
if self.start_time is None:
self.start_time = timestamp

pending_queries = len([s for s in self.sessions if s.has_unfinished_request])
# Only check limit if max_unfinished_queries is set
if (self.workload_config.max_unfinished_queries is not None and
pending_queries > self.workload_config.max_unfinished_queries):
logger.info(f"unfinished queries >{self.workload_config.max_unfinished_queries}, waiting")
return

if timestamp - self.last_user_join > self.gap_between_users:
self._create_user_session()
self.last_user_join = timestamp
Expand Down Expand Up @@ -625,6 +635,12 @@ def parse_arguments() -> WorkloadConfig:
parser.add_argument(
"--sharegpt", action="store_true", help="Whether to use ShareGPT dataset"
)
parser.add_argument(
"--max-unfinished-queries",
type=int,
default=None,
help="Maximum number of unfinished queries allowed (default: no limit)",
)
args = parser.parse_args()
return args

Expand Down Expand Up @@ -675,6 +691,7 @@ def main():
qps=args.qps,
model=args.model,
enable_user_id=args.request_with_user_id,
max_unfinished_queries=args.max_unfinished_queries,
)

manager = UserSessionManager(
Expand Down
16 changes: 16 additions & 0 deletions src/vllm_router/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,18 @@ async def lifespan(app: FastAPI):
dyn_cfg_watcher.close()


def create_instance_id_to_url(lmcache_instances, static_backends):
    """
    Build a mapping from LMCache instance ids to backend URLs.

    Ids and URLs are paired by position: the first id maps to the first URL,
    and so on.

    Args:
        lmcache_instances: Comma-separated instance ids, or None.
        static_backends: Backend specification handed to
            ``parse_static_urls``, or None.

    Returns:
        A dict of instance id -> URL, or None when either argument is None
        or when either parsed list turns out to be empty.

    Raises:
        ValueError: If the number of ids differs from the number of URLs.
    """
    # Without both settings there is nothing to map.
    if static_backends is None or lmcache_instances is None:
        return None

    # Split on commas, trimming whitespace and dropping blank entries.
    ids = []
    for raw_id in lmcache_instances.split(','):
        cleaned = raw_id.strip()
        if cleaned:
            ids.append(cleaned)

    # NOTE(review): parse_static_urls is defined elsewhere; presumably it
    # returns a list of URL strings — confirm against its definition.
    backend_urls = parse_static_urls(static_backends)

    if not (ids and backend_urls):
        return None

    # Positional pairing only makes sense when both lists are the same size.
    if len(ids) != len(backend_urls):
        raise ValueError("length of lmcache-instances & static-backends mismatched")

    return {instance_id: url for instance_id, url in zip(ids, backend_urls)}


def initialize_all(app: FastAPI, args):
"""
Initialize all the components of the router with the given arguments.
Expand Down Expand Up @@ -206,6 +218,10 @@ def initialize_all(app: FastAPI, args):
prefill_model_labels=args.prefill_model_labels,
decode_model_labels=args.decode_model_labels,
kv_aware_threshold=args.kv_aware_threshold,
tokenizer=args.tokenizer,
enable_shared_cache=args.enable_shared_cache,
instance_id_to_url=create_instance_id_to_url(args.lmcache_instances,
args.static_backends),
)

# Initialize feature gates
Expand Down
27 changes: 20 additions & 7 deletions src/vllm_router/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from vllm_router.parsers.yaml_utils import (
read_and_process_yaml_config_file,
)
from vllm_router.routers.routing_logic import RoutingLogic
from vllm_router.version import __version__

try:
Expand Down Expand Up @@ -203,13 +204,7 @@ def parse_args():
parser.add_argument(
"--routing-logic",
type=str,
choices=[
"roundrobin",
"session",
"kvaware",
"prefixaware",
"disaggregated_prefill",
],
choices=[routing for routing in RoutingLogic],
help="The routing logic to use",
)
parser.add_argument(
Expand All @@ -218,12 +213,30 @@ def parse_args():
default=9000,
help="The port of the LMCache controller.",
)
parser.add_argument(
"--lmcache-instances",
type=str,
default=None,
help="The instance id in the lmcache config files, must be with the length of static-backends,"
" separated by commas. E.g., instance_0,instance_1",
)
parser.add_argument(
"--session-key",
type=str,
default=None,
help="The key (in the header) to identify a session.",
)
parser.add_argument(
"--tokenizer",
type=str,
default=None,
help="The tokenizer model.",
)
parser.add_argument(
"--enable-shared-cache",
action="store_true",
help="Enable shared KV Cache.",
)
parser.add_argument(
"--callbacks",
type=str,
Expand Down
Loading