diff --git a/components/src/dynamo/vllm/args.py b/components/src/dynamo/vllm/args.py
index 70b15eea27cd..4c420d255b30 100644
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -326,6 +326,24 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
     if not config.engine_args.enable_prefix_caching:
         return None
 
+    # KV event publishing is broken in vLLM when LoRA is enabled.
+    # The fix is in https://github.com/vllm-project/vllm/pull/27728 but is not released yet.
+    # TODO: remove this check once a vLLM release containing the fix is available.
+    if config.engine_args.enable_lora:
+        if config.engine_args.kv_events_config is None:
+            # No explicit KV events config from the user: disable KV events, since they are not supported with LoRA yet.
+            return None
+        else:
+            # The user explicitly provided a KV events config, which cannot work with LoRA: fail loudly.
+            message = (
+                "KV events do not work when LoRA is enabled due to an upstream vLLM bug. "
+                "See https://github.com/vllm-project/vllm/pull/27728 for details. "
+                "For now, either disable LoRA or do not use an explicit KV events config. "
+                "Do not set both --kv-events-config and --enable-lora in the vLLM command line args."
+            )
+            logger.error(message)
+            raise ValueError(message)
+
     # If user provided their own config, use that
     if c := getattr(config.engine_args, "kv_events_config"):
         logger.info(f"Using user-provided kv_events_config {c}")
diff --git a/components/src/dynamo/vllm/main.py b/components/src/dynamo/vllm/main.py
index 15513e7daaff..1c9ae1072ee8 100644
--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -153,6 +153,10 @@ def setup_kv_event_publisher(
         logger.info("Skipping KV event publisher setup for decode worker")
         return None
 
+    # No KV events config is set on the engine args, so there is nothing to publish.
+    if config.engine_args.kv_events_config is None:
+        return None
+
     # Get data_parallel_size to create publishers for all dp_ranks
     data_parallel_size = getattr(vllm_config.parallel_config, "data_parallel_size", 1)
     kv_publishers = []