From 9e714c09c9fce794edd7b6bafb385ac3b326c615 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 12 Nov 2025 11:00:17 -0800 Subject: [PATCH 01/27] [grpc] Add gRPC server Signed-off-by: Chang Su --- pyproject.toml | 3 + vllm/entrypoints/grpc_server.py | 519 ++++++++++++++++++++++++++++++ vllm/grpc/__init__.py | 17 + vllm/grpc/compile_protos.py | 85 +++++ vllm/grpc/grpc_request_manager.py | 319 ++++++++++++++++++ vllm/grpc/vllm_engine.proto | 218 +++++++++++++ vllm/grpc/vllm_engine_pb2.py | 81 +++++ vllm/grpc/vllm_engine_pb2_grpc.py | 354 ++++++++++++++++++++ 8 files changed, 1596 insertions(+) create mode 100755 vllm/entrypoints/grpc_server.py create mode 100644 vllm/grpc/__init__.py create mode 100755 vllm/grpc/compile_protos.py create mode 100644 vllm/grpc/grpc_request_manager.py create mode 100644 vllm/grpc/vllm_engine.proto create mode 100644 vllm/grpc/vllm_engine_pb2.py create mode 100644 vllm/grpc/vllm_engine_pb2_grpc.py diff --git a/pyproject.toml b/pyproject.toml index 773f832d650c..339b5adb3b53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,9 @@ include = ["vllm*"] "vllm/third_party/**" = ["ALL"] "vllm/version.py" = ["F401"] "vllm/_version.py" = ["ALL"] +# Exclude generated protobuf files +"vllm/grpc/*_pb2.py" = ["ALL"] +"vllm/grpc/*_pb2_grpc.py" = ["ALL"] [tool.ruff.lint] select = [ diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py new file mode 100755 index 000000000000..24c7ad031fff --- /dev/null +++ b/vllm/entrypoints/grpc_server.py @@ -0,0 +1,519 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# mypy: ignore-errors +""" +vLLM gRPC Server + +Starts a gRPC server for vLLM using the VllmEngine protocol. + +Usage: + python -m vllm.entrypoints.grpc_server --model + +Example: + python -m vllm.entrypoints.grpc_server \ + --model meta-llama/Llama-2-7b-hf \ + --host 0.0.0.0 \ + --port 50051 +""" + +import argparse +import asyncio +import signal +import sys +import time +from collections.abc import AsyncGenerator + +import grpc +from grpc_reflection.v1alpha import reflection + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc +from vllm.grpc.grpc_request_manager import ( + GrpcRequestManager, + create_sampling_params_from_proto, +) +from vllm.logger import init_logger +from vllm.usage.usage_lib import UsageContext +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.engine.async_llm import AsyncLLM + +logger = init_logger(__name__) + + +class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer): + """ + gRPC servicer implementing the VllmEngine service. + + Handles 6 RPCs: + - Generate: Streaming text generation + - Embed: Embeddings (TODO) + - HealthCheck: Health probe + - Abort: Cancel a request + - GetModelInfo: Model metadata + - GetServerInfo: Server state + """ + + def __init__(self, request_manager: GrpcRequestManager): + """ + Initialize the servicer. + + Args: + request_manager: The GrpcRequestManager instance + """ + self.request_manager = request_manager + logger.info("VllmEngineServicer initialized") + + async def Generate( + self, + request: vllm_engine_pb2.GenerateRequest, + context: grpc.aio.ServicerContext, + ) -> AsyncGenerator[vllm_engine_pb2.GenerateResponse, None]: + """ + Handle streaming generation requests. 
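+
+        Illustrative client-side sketch (not part of this change; the
+        address and the token IDs are assumptions) of how a caller might
+        consume this streaming RPC over grpc.aio:
+
+            async with grpc.aio.insecure_channel("localhost:50051") as ch:
+                stub = vllm_engine_pb2_grpc.VllmEngineStub(ch)
+                req = vllm_engine_pb2.GenerateRequest(
+                    request_id="req-1",
+                    tokenized=vllm_engine_pb2.TokenizedInput(input_ids=[1, 2, 3]),
+                    sampling_params=vllm_engine_pb2.SamplingParams(max_tokens=16),
+                    stream=True,
+                )
+                async for resp in stub.Generate(req):
+                    if resp.HasField("chunk"):
+                        ...  # incremental token IDs in resp.chunk.token_ids
+                    elif resp.HasField("complete"):
+                        break  # final counts and resp.complete.finish_reason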
+ + Args: + request: The GenerateRequest protobuf + context: gRPC context + + Yields: + GenerateResponse protobuf messages (streaming) + """ + request_id = request.request_id + logger.info("Generate request %s received.", request_id) + + try: + # Extract tokenized input + if not request.HasField("tokenized"): + yield self._error_response( + request_id, + "Missing tokenized input", + "400", + ) + return + + prompt_token_ids = list(request.tokenized.input_ids) + + # Build sampling params with detokenize=False + sampling_params = create_sampling_params_from_proto( + request.sampling_params, + stream=request.stream, + ) + + # Submit to request manager and stream outputs + arrival_time = time.time() + + async for output in self.request_manager.generate( + request_id=request_id, + prompt_token_ids=prompt_token_ids, + sampling_params=sampling_params, + arrival_time=arrival_time, + ): + # Check if client disconnected + if context.cancelled(): + logger.info("Client disconnected for %s.", request_id) + await self.request_manager.abort(request_id) + return + + # Convert vLLM output to protobuf + # For streaming, always send chunks + if request.stream: + yield self._chunk_response(request_id, output) + + # Send complete response when finished + if output.finished: + yield self._complete_response(request_id, output) + + except Exception as e: + logger.error("Error in Generate for %s: %s", request_id, e) + yield self._error_response( + request_id, + str(e), + "500", + ) + + async def Embed( + self, + request: vllm_engine_pb2.EmbedRequest, + context: grpc.aio.ServicerContext, + ) -> vllm_engine_pb2.EmbedResponse: + """ + Handle embedding requests. + + TODO: Implement in Phase 4 + + Args: + request: The EmbedRequest protobuf + context: gRPC context + + Returns: + EmbedResponse protobuf + """ + logger.warning("Embed RPC not yet implemented") + return vllm_engine_pb2.EmbedResponse( + request_id=request.request_id, + error=vllm_engine_pb2.EmbedError( + message="Embed RPC not yet implemented", + code="NOT_IMPLEMENTED", + ), + ) + + async def HealthCheck( + self, + request: vllm_engine_pb2.HealthCheckRequest, + context: grpc.aio.ServicerContext, + ) -> vllm_engine_pb2.HealthCheckResponse: + """ + Handle health check requests. + + Args: + request: The HealthCheckRequest protobuf + context: gRPC context + + Returns: + HealthCheckResponse protobuf + """ + is_healthy, message = await self.request_manager.health_check() + + logger.info("HealthCheck request: healthy=%s, message=%s", is_healthy, message) + + return vllm_engine_pb2.HealthCheckResponse( + healthy=is_healthy, + message=message, + ) + + async def Abort( + self, + request: vllm_engine_pb2.AbortRequest, + context: grpc.aio.ServicerContext, + ) -> vllm_engine_pb2.AbortResponse: + """ + Handle abort requests. + + Args: + request: The AbortRequest protobuf + context: gRPC context + + Returns: + AbortResponse protobuf + """ + request_id = request.request_id + logger.info("Abort request for %s.", request_id) + + success = await self.request_manager.abort(request_id) + + return vllm_engine_pb2.AbortResponse( + success=success, + message=f"Request {request_id} {'aborted' if success else 'not found'}", + ) + + async def GetModelInfo( + self, + request: vllm_engine_pb2.GetModelInfoRequest, + context: grpc.aio.ServicerContext, + ) -> vllm_engine_pb2.GetModelInfoResponse: + """ + Handle model info requests. 
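+
+        With server reflection enabled (see serve_grpc below), this RPC
+        can be smoke-tested from the command line, for example:
+
+            grpcurl -plaintext localhost:50051 vllm.grpc.engine.VllmEngine/GetModelInfo
+
+        (the port is just this server's default).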
+ + Args: + request: The GetModelInfoRequest protobuf + context: gRPC context + + Returns: + GetModelInfoResponse protobuf + """ + model_config = self.request_manager.get_model_config() + + return vllm_engine_pb2.GetModelInfoResponse( + model_path=model_config.get("model_path", ""), + is_generation=model_config.get("is_generation", True), + max_context_length=model_config.get("max_context_length", 0), + vocab_size=model_config.get("vocab_size", 0), + supports_vision=model_config.get("supports_vision", False), + ) + + async def GetServerInfo( + self, + request: vllm_engine_pb2.GetServerInfoRequest, + context: grpc.aio.ServicerContext, + ) -> vllm_engine_pb2.GetServerInfoResponse: + """ + Handle server info requests. + + Args: + request: The GetServerInfoRequest protobuf + context: gRPC context + + Returns: + GetServerInfoResponse protobuf + """ + num_requests = self.request_manager.get_num_unfinished_requests() + + return vllm_engine_pb2.GetServerInfoResponse( + active_requests=num_requests, + is_paused=False, + last_receive_timestamp=time.time(), + uptime_seconds=0.0, # TODO: track server start time + server_type="vllm-grpc", + ) + + # ========== Helper methods ========== + + def _chunk_response( + self, + request_id: str, + output, + ) -> vllm_engine_pb2.GenerateResponse: + """ + Build a streaming chunk response from vLLM output. + When output_kind=DELTA, vLLM returns only new tokens automatically. + + Args: + request_id: The request ID + output: vLLM RequestOutput (with delta tokens when output_kind=DELTA) + + Returns: + GenerateResponse with chunk field set + """ + # Get the completion output (first one if n > 1) + completion = output.outputs[0] if output.outputs else None + + if completion is None: + # Empty chunk + return vllm_engine_pb2.GenerateResponse( + request_id=request_id, + chunk=vllm_engine_pb2.GenerateStreamChunk( + token_ids=[], + prompt_tokens=0, + completion_tokens=0, + cached_tokens=0, + ), + ) + + # When output_kind=DELTA, completion.token_ids contains only new tokens + # vLLM handles the delta logic internally + # completion_tokens = delta count (client will accumulate) + return vllm_engine_pb2.GenerateResponse( + request_id=request_id, + chunk=vllm_engine_pb2.GenerateStreamChunk( + token_ids=completion.token_ids, + prompt_tokens=len(output.prompt_token_ids) + if output.prompt_token_ids + else 0, + completion_tokens=len(completion.token_ids), # Delta count + cached_tokens=output.num_cached_tokens, + ), + ) + + def _complete_response( + self, + request_id: str, + output, + ) -> vllm_engine_pb2.GenerateResponse: + """ + Build a final completion response from vLLM output. 
+ + Args: + request_id: The request ID + output: vLLM RequestOutput (finished=True) + + Returns: + GenerateResponse with complete field set + """ + # Get the completion output (first one if n > 1) + completion = output.outputs[0] if output.outputs else None + + if completion is None: + # Empty completion + return vllm_engine_pb2.GenerateResponse( + request_id=request_id, + complete=vllm_engine_pb2.GenerateComplete( + output_ids=[], + finish_reason="error", + prompt_tokens=0, + completion_tokens=0, + cached_tokens=0, + ), + ) + + # Build complete response + # When streaming (DELTA mode): completion.token_ids will be empty/last delta + # When non-streaming (CUMULATIVE mode): completion.token_ids has all tokens + # Client will accumulate token counts for streaming + return vllm_engine_pb2.GenerateResponse( + request_id=request_id, + complete=vllm_engine_pb2.GenerateComplete( + output_ids=completion.token_ids, + finish_reason=completion.finish_reason or "stop", + prompt_tokens=len(output.prompt_token_ids) + if output.prompt_token_ids + else 0, + completion_tokens=len(completion.token_ids), + cached_tokens=output.num_cached_tokens, + ), + ) + + def _error_response( + self, + request_id: str, + message: str, + status_code: str, + ) -> vllm_engine_pb2.GenerateResponse: + """ + Build an error response. + + Args: + request_id: The request ID + message: Error message + status_code: HTTP-style status code + + Returns: + GenerateResponse with error field set + """ + return vllm_engine_pb2.GenerateResponse( + request_id=request_id, + error=vllm_engine_pb2.GenerateError( + message=message, + http_status_code=status_code, + details="", + ), + ) + + +async def serve_grpc(args: argparse.Namespace): + """ + Main serving function. + + Args: + args: Parsed command line arguments + """ + logger.info("Initializing vLLM gRPC server...") + + # Create engine args + engine_args = AsyncEngineArgs.from_cli_args(args) + + # Build vLLM config + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.OPENAI_API_SERVER + ) + + # Create AsyncLLM + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=UsageContext.OPENAI_API_SERVER, + enable_log_requests=not args.disable_log_requests_server, + disable_log_stats=args.disable_log_stats_server, + ) + + logger.info("Model: %s", vllm_config.model_config.model) + logger.info("Max model len: %s", vllm_config.model_config.max_model_len) + logger.info("Vocab size: %s", vllm_config.model_config.get_vocab_size()) + + # Create request manager + request_manager = GrpcRequestManager(async_llm) + + # Create servicer + servicer = VllmEngineServicer(request_manager) + + # Create gRPC server + server = grpc.aio.server( + options=[ + ("grpc.max_send_message_length", -1), + ("grpc.max_receive_message_length", -1), + ], + ) + + # Add servicer to server + vllm_engine_pb2_grpc.add_VllmEngineServicer_to_server(servicer, server) + + # Enable reflection for grpcurl and other tools + service_names = ( + vllm_engine_pb2.DESCRIPTOR.services_by_name["VllmEngine"].full_name, + reflection.SERVICE_NAME, + ) + reflection.enable_server_reflection(service_names, server) + + # Bind to address + address = f"{args.host}:{args.port}" + server.add_insecure_port(address) + + # Start server + await server.start() + logger.info("vLLM gRPC server started on %s", address) + logger.info("Server is ready to accept requests") + + # Handle shutdown signals + loop = asyncio.get_running_loop() + stop_event = asyncio.Event() + + def signal_handler(): + logger.info("Received 
shutdown signal") + stop_event.set() + + for sig in (signal.SIGTERM, signal.SIGINT): + loop.add_signal_handler(sig, signal_handler) + + # Serve until shutdown signal + try: + await stop_event.wait() + except KeyboardInterrupt: + logger.info("Interrupted by user") + finally: + logger.info("Shutting down vLLM gRPC server...") + + # Stop gRPC server + await server.stop(grace=5.0) + logger.info("gRPC server stopped") + + # Shutdown AsyncLLM + await async_llm.shutdown() + logger.info("AsyncLLM engine stopped") + + logger.info("Shutdown complete") + + +def main(): + """Main entry point.""" + parser = FlexibleArgumentParser( + description="vLLM gRPC Server", + ) + + # Server args + parser.add_argument( + "--host", + type=str, + default="0.0.0.0", + help="Host to bind gRPC server to", + ) + parser.add_argument( + "--port", + type=int, + default=50051, + help="Port to bind gRPC server to", + ) + parser.add_argument( + "--disable-log-requests-server", + action="store_true", + help="Disable request logging on server side", + ) + parser.add_argument( + "--disable-log-stats-server", + action="store_true", + help="Disable stats logging on server side", + ) + + # Add vLLM engine args + parser = AsyncEngineArgs.add_cli_args(parser) + + args = parser.parse_args() + + # Run server + try: + asyncio.run(serve_grpc(args)) + except Exception as e: + logger.exception("Server failed: %s", e) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/vllm/grpc/__init__.py b/vllm/grpc/__init__.py new file mode 100644 index 000000000000..b59ee96fb986 --- /dev/null +++ b/vllm/grpc/__init__.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +vLLM gRPC protocol definitions. + +This module contains the protocol buffer definitions for vLLM's gRPC API. +The protobuf files are compiled into Python code using grpcio-tools. +""" + +# These imports will be available after protobuf compilation +# from vllm.grpc import vllm_engine_pb2 +# from vllm.grpc import vllm_engine_pb2_grpc + +__all__ = [ + "vllm_engine_pb2", + "vllm_engine_pb2_grpc", +] diff --git a/vllm/grpc/compile_protos.py b/vllm/grpc/compile_protos.py new file mode 100755 index 000000000000..62216983d88b --- /dev/null +++ b/vllm/grpc/compile_protos.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Compile vLLM protobuf definitions into Python code. + +This script uses grpcio-tools to generate *_pb2.py and *_pb2_grpc.py files +from the vllm_engine.proto definition. 
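+
+The invocation below is roughly equivalent to running grpc_tools.protoc by
+hand with --proto_path, --python_out and --grpc_python_out all pointing at
+the directory that contains the vllm package; the generated modules are
+written next to this script in vllm/grpc/.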
+ +Usage: + python vllm/grpc/compile_protos.py + +Requirements: + pip install grpcio-tools +""" + +import sys +from pathlib import Path + + +def compile_protos(): + """Compile protobuf definitions.""" + # Get the vllm package root directory + script_dir = Path(__file__).parent + vllm_package_root = script_dir.parent.parent # vllm/vllm/grpc -> vllm/ + + proto_file = script_dir / "vllm_engine.proto" + + if not proto_file.exists(): + print(f"Error: Proto file not found at {proto_file}") + return 1 + + print(f"Compiling protobuf: {proto_file}") + print(f"Output directory: {script_dir}") + + # Compile the proto file + # We use vllm/vllm as the proto_path so that the package is vllm.grpc.engine + try: + from grpc_tools import protoc + + result = protoc.main( + [ + "grpc_tools.protoc", + f"--proto_path={vllm_package_root}", + f"--python_out={vllm_package_root}", + f"--grpc_python_out={vllm_package_root}", + str(script_dir / "vllm_engine.proto"), + ] + ) + + if result == 0: + # Add SPDX headers to generated files + spdx_header = ( + "# SPDX-License-Identifier: Apache-2.0\n" + "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n" + ) + + for generated_file in [ + script_dir / "vllm_engine_pb2.py", + script_dir / "vllm_engine_pb2_grpc.py", + ]: + if generated_file.exists(): + content = generated_file.read_text() + if not content.startswith("# SPDX-License-Identifier"): + generated_file.write_text(spdx_header + content) + + print("✓ Protobuf compilation successful!") + print(f" Generated: {script_dir / 'vllm_engine_pb2.py'}") + print(f" Generated: {script_dir / 'vllm_engine_pb2_grpc.py'}") + return 0 + else: + print(f"Error: protoc returned {result}") + return result + + except ImportError: + print("Error: grpcio-tools not installed") + print("Install with: pip install grpcio-tools") + return 1 + except Exception as e: + print(f"Error during compilation: {e}") + return 1 + + +if __name__ == "__main__": + sys.exit(compile_protos()) diff --git a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py new file mode 100644 index 000000000000..8bfeefc4a43f --- /dev/null +++ b/vllm/grpc/grpc_request_manager.py @@ -0,0 +1,319 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# mypy: ignore-errors +""" +gRPC Request Manager for vLLM + +Manages request lifecycle for gRPC requests, converting between protobuf +and vLLM types. Much simpler than SGLang's implementation since we can +use AsyncLLM directly (no ZMQ needed). + +Key optimization: Sets detokenize=False in SamplingParams to skip +detokenization and return token IDs only. +""" + +import asyncio +from collections.abc import AsyncGenerator + +from vllm.inputs import TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import ( + RequestOutputKind, + SamplingParams, + StructuredOutputsParams, +) +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.output_processor import RequestOutputCollector + +logger = init_logger(__name__) + + +class GrpcRequestManager: + """ + Manages gRPC request lifecycle for vLLM. + + Responsibilities: + - Convert protobuf requests to vLLM EngineCoreRequest + - Set detokenize=False in SamplingParams (key optimization!) 
+ - Submit requests to AsyncLLM + - Stream token IDs (not text) back to gRPC clients + - Handle abort/cancel operations + """ + + def __init__(self, async_llm: AsyncLLM): + """ + Initialize the request manager. + + Args: + async_llm: The AsyncLLM engine instance to submit requests to + """ + self.async_llm = async_llm + self.rid_to_collector: dict[str, RequestOutputCollector] = {} + + logger.info("GrpcRequestManager initialized") + + async def generate( + self, + request_id: str, + prompt_token_ids: list[int], + sampling_params: SamplingParams, + arrival_time: float, + ) -> AsyncGenerator[RequestOutput, None]: + """ + Submit a generation request and stream outputs. + + Args: + request_id: Unique request identifier + prompt_token_ids: Pre-tokenized input from Rust router + sampling_params: Sampling parameters (with detokenize=False!) + arrival_time: Request arrival timestamp + + Yields: + RequestOutput objects containing token IDs (text will be empty) + """ + try: + # Use processor.process_inputs() with pre-tokenized input + prompt: TokensPrompt = {"prompt_token_ids": prompt_token_ids} + + engine_request = self.async_llm.processor.process_inputs( + request_id=request_id, + prompt=prompt, + params=sampling_params, + arrival_time=arrival_time, + ) + + collector = RequestOutputCollector(output_kind=sampling_params.output_kind) + self.rid_to_collector[request_id] = collector + + # Submit to AsyncLLM - it will call add_request internally + # and populate our collector + await self._submit_request(engine_request, collector) + + # Stream outputs from collector + while True: + try: + output = await collector.get() + yield output + + if output.finished: + break + + except asyncio.CancelledError: + logger.info("Request %s cancelled by client.", request_id) + # Clean up the request in output_processor and engine_core + await self.async_llm.abort([request_id]) + raise # Re-raise to let gRPC server handle cleanup + + except Exception as e: + logger.error("Error in generate for %s: %s", request_id, e) + raise + finally: + # Cleanup + self.rid_to_collector.pop(request_id, None) + + async def _submit_request( + self, + request: EngineCoreRequest, + collector: RequestOutputCollector, + ) -> None: + """ + Internal method to submit request to AsyncLLM. + + Args: + request: The EngineCoreRequest to submit + collector: The output collector for this request + """ + try: + # Add request to output processor + # Use None for prompt since we have pre-tokenized input + # TODO: Support sampling_params.n > 1 (parallel sampling) + # When n > 1, we need to: + # 1. Create a ParentRequest to track all child requests + # 2. Fan out multiple child EngineCoreRequests with different + # request_index values + # 3. Aggregate outputs from all children + # For now, we only support n=1, so parent_req=None and + # request_index=0 + self.async_llm.output_processor.add_request( + request=request, + prompt=None, + parent_req=None, + request_index=0, + queue=collector, + ) + + # Submit to engine core + await self.async_llm.engine_core.add_request_async(request) + + except Exception as e: + logger.error("Error submitting request %s: %s", request.request_id, e) + # Put error in collector + collector.put(e) + + async def abort(self, request_id: str) -> bool: + """ + Abort a running request. 
+ + Args: + request_id: The request ID to abort + + Returns: + True if request was found and aborted, False otherwise + """ + try: + # Check if request exists + collector = self.rid_to_collector.get(request_id) + + if collector is None: + logger.debug( + "Abort: request %s not found (may have already completed).", + request_id, + ) + return False + + # Abort in AsyncLLM (this handles both engine_core and output_processor) + await self.async_llm.abort([request_id]) + + # Remove from our tracking + self.rid_to_collector.pop(request_id, None) + + logger.info("Request %s aborted.", request_id) + return True + + except Exception as e: + logger.error("Error aborting request %s: %s", request_id, e) + self.rid_to_collector.pop(request_id, None) + return False + + async def health_check(self) -> tuple[bool, str]: + """ + Check if the engine is healthy. + + Returns: + Tuple of (is_healthy, message) + """ + try: + # Check if engine is running and not errored + if self.async_llm.errored: + return False, "Engine is not alive" + + return True, "Healthy" + + except Exception as e: + logger.error("Health check error: %s", e) + return False, f"Error: {e}" + + def get_model_config(self) -> dict: + """ + Get model configuration information. + + Returns: + Dictionary with model config details + """ + model_config = self.async_llm.model_config + + return { + "model_path": model_config.model, + "is_generation": model_config.runner_type == "generate", + "max_context_length": model_config.max_model_len, + "vocab_size": model_config.get_vocab_size(), + "supports_vision": model_config.is_multimodal_model, + } + + def get_num_unfinished_requests(self) -> int: + """ + Get the number of currently running requests. + + Returns: + Number of unfinished requests + """ + return len(self.rid_to_collector) + + +def create_sampling_params_from_proto( + proto_params, + stream: bool = True, +) -> SamplingParams: + """ + Convert protobuf SamplingParams to vLLM SamplingParams. + + Args: + proto_params: Protobuf SamplingParams message + stream: Whether streaming is enabled + + Returns: + vLLM SamplingParams with detokenize=False and structured_outputs + """ + # Build stop sequences + stop = list(proto_params.stop) if proto_params.stop else None + stop_token_ids = ( + list(proto_params.stop_token_ids) if proto_params.stop_token_ids else None + ) + + # Handle structured outputs constraints + structured_outputs = None + constraint_field = proto_params.WhichOneof("constraint") + if constraint_field: + if constraint_field == "json_schema": + structured_outputs = StructuredOutputsParams(json=proto_params.json_schema) + elif constraint_field == "regex": + structured_outputs = StructuredOutputsParams(regex=proto_params.regex) + elif constraint_field == "grammar": + structured_outputs = StructuredOutputsParams(grammar=proto_params.grammar) + elif constraint_field == "structural_tag": + structured_outputs = StructuredOutputsParams( + structural_tag=proto_params.structural_tag + ) + elif constraint_field == "json_object": + structured_outputs = StructuredOutputsParams( + json_object=proto_params.json_object + ) + elif constraint_field == "choice": + structured_outputs = StructuredOutputsParams( + choice=list(proto_params.choice.choices) + ) + + # Handle logit_bias + logit_bias = None + if proto_params.logit_bias: + logit_bias = dict(proto_params.logit_bias) + + # Create SamplingParams with detokenize=False and output_kind=DELTA + # detokenize=False: KEY OPTIMIZATION that skips detokenization! 
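+    # (With detokenize=False the engine never materializes output text:
+    #  CompletionOutput.text stays empty and the gRPC layer streams raw
+    #  token IDs for the caller to detokenize.)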
+ # output_kind=DELTA: Return only new tokens in each chunk (for streaming) + return SamplingParams( + temperature=proto_params.temperature if proto_params.temperature > 0 else 1.0, + top_p=proto_params.top_p if proto_params.top_p > 0 else 1.0, + top_k=proto_params.top_k if proto_params.top_k > 0 else -1, + min_p=proto_params.min_p if proto_params.min_p > 0 else 0.0, + frequency_penalty=proto_params.frequency_penalty, + presence_penalty=proto_params.presence_penalty, + repetition_penalty=proto_params.repetition_penalty + if proto_params.repetition_penalty > 0 + else 1.0, + max_tokens=proto_params.max_tokens + if proto_params.HasField("max_tokens") + else None, + min_tokens=proto_params.min_tokens if proto_params.min_tokens > 0 else 0, + stop=stop, + stop_token_ids=stop_token_ids, + skip_special_tokens=proto_params.skip_special_tokens, + spaces_between_special_tokens=proto_params.spaces_between_special_tokens, + ignore_eos=proto_params.ignore_eos, + n=proto_params.n if proto_params.n > 0 else 1, + logprobs=proto_params.logprobs if proto_params.HasField("logprobs") else None, + prompt_logprobs=proto_params.prompt_logprobs + if proto_params.HasField("prompt_logprobs") + else None, + seed=proto_params.seed if proto_params.HasField("seed") else None, + include_stop_str_in_output=proto_params.include_stop_str_in_output, + logit_bias=logit_bias, + truncate_prompt_tokens=proto_params.truncate_prompt_tokens + if proto_params.HasField("truncate_prompt_tokens") + else None, + structured_outputs=structured_outputs, + detokenize=False, + output_kind=RequestOutputKind.DELTA if stream else RequestOutputKind.CUMULATIVE, + ) diff --git a/vllm/grpc/vllm_engine.proto b/vllm/grpc/vllm_engine.proto new file mode 100644 index 000000000000..f4e7934baa9b --- /dev/null +++ b/vllm/grpc/vllm_engine.proto @@ -0,0 +1,218 @@ +syntax = "proto3"; + +package vllm.grpc.engine; + +// Service definition for vLLM engine communication +// This protocol is designed for efficient binary communication between +// the Rust router and vLLM Python engine (AsyncLLM). 
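+//
+// Typical Generate flow (informational): the client sends a single
+// GenerateRequest carrying pre-tokenized input; with stream=true the server
+// replies with zero or more GenerateStreamChunk messages (delta token IDs
+// only) followed by one GenerateComplete, and reports failures as a
+// GenerateError message inside the response stream.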
+service VllmEngine { + // Submit a generation request (supports streaming) + rpc Generate(GenerateRequest) returns (stream GenerateResponse); + + // Submit an embedding request + rpc Embed(EmbedRequest) returns (EmbedResponse); + + // Health check + rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse); + + // Abort a running request + rpc Abort(AbortRequest) returns (AbortResponse); + + // Get model information + rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse); + + // Get server information + rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse); +} + +// ===================== +// Common Types +// ===================== + +// Sampling parameters for text generation +message SamplingParams { + float temperature = 1; + float top_p = 2; + int32 top_k = 3; + float min_p = 4; + float frequency_penalty = 5; + float presence_penalty = 6; + float repetition_penalty = 7; + + optional int32 max_tokens = 8; + int32 min_tokens = 9; + + repeated string stop = 10; + repeated uint32 stop_token_ids = 11; + + bool skip_special_tokens = 12; + bool spaces_between_special_tokens = 13; + bool ignore_eos = 14; + + int32 n = 15; // Number of parallel samples + + // Logprobs configuration + optional int32 logprobs = 22; // Number of log probabilities per output token (-1 for all) + optional int32 prompt_logprobs = 23; // Number of log probabilities per prompt token (-1 for all) + + // Additional vLLM fields + optional int32 seed = 24; // Random seed for reproducibility + bool include_stop_str_in_output = 25; // Whether to include stop strings in output + map logit_bias = 26; // Token ID to bias mapping (-100 to 100) + optional int32 truncate_prompt_tokens = 27; // Prompt truncation (-1 for model max) + + // Structured outputs (one of) - matches vLLM's StructuredOutputsParams + oneof constraint { + string json_schema = 16; // JSON schema for structured output + string regex = 17; // Regex pattern + string grammar = 18; // Grammar/EBNF for structured output + string structural_tag = 19; // Structural tag (e.g., Harmony models) + bool json_object = 20; // Force JSON object output + ChoiceConstraint choice = 21; // List of allowed choices + } +} + +// Choice constraint for structured outputs +message ChoiceConstraint { + repeated string choices = 1; +} + +// Pre-tokenized input from Rust router +message TokenizedInput { + string original_text = 1; // For reference/debugging + repeated uint32 input_ids = 2; // Actual token IDs to process +} + +// ===================== +// Generate Request +// ===================== + +message GenerateRequest { + string request_id = 1; + + // Pre-tokenized input (required) + TokenizedInput tokenized = 2; + + // Generation parameters (includes logprobs config) + SamplingParams sampling_params = 3; + + // Streaming + bool stream = 4; +} + +// ===================== +// Generate Response +// ===================== + +message GenerateResponse { + string request_id = 1; + + oneof response { + GenerateStreamChunk chunk = 2; // For streaming + GenerateComplete complete = 3; // For final/non-streaming + GenerateError error = 4; // For errors + } +} + +message GenerateStreamChunk { + repeated uint32 token_ids = 1; // Incremental tokens + int32 prompt_tokens = 2; + int32 completion_tokens = 3; + int32 cached_tokens = 4; + + // Logprobs support (TODO: implement in Phase 4) + // OutputLogProbs output_logprobs = 5; + // InputLogProbs input_logprobs = 6; // Only in first chunk +} + +message GenerateComplete { + repeated uint32 output_ids = 1; // All 
output tokens + string finish_reason = 2; // "stop", "length", "abort" + int32 prompt_tokens = 3; + int32 completion_tokens = 4; + int32 cached_tokens = 5; + + // Logprobs support (TODO: implement in Phase 4) + // OutputLogProbs output_logprobs = 6; + // InputLogProbs input_logprobs = 7; +} + +message GenerateError { + string message = 1; + string http_status_code = 2; + string details = 3; +} + +// ===================== +// Embedding Request +// ===================== + +message EmbedRequest { + string request_id = 1; + TokenizedInput tokenized = 2; +} + +message EmbedResponse { + string request_id = 1; + + oneof response { + EmbedComplete complete = 2; + EmbedError error = 3; + } +} + +message EmbedComplete { + repeated float embedding = 1; + int32 prompt_tokens = 2; + int32 embedding_dim = 3; +} + +message EmbedError { + string message = 1; + string code = 2; +} + +// ===================== +// Management Operations +// ===================== + +message HealthCheckRequest {} + +message HealthCheckResponse { + bool healthy = 1; + string message = 2; +} + +message AbortRequest { + string request_id = 1; + string reason = 2; +} + +message AbortResponse { + bool success = 1; + string message = 2; +} + +// ===================== +// Model and Server Info +// ===================== + +message GetModelInfoRequest {} + +message GetModelInfoResponse { + string model_path = 1; + bool is_generation = 2; + int32 max_context_length = 3; + int32 vocab_size = 4; + bool supports_vision = 5; +} + +message GetServerInfoRequest {} + +message GetServerInfoResponse { + int32 active_requests = 1; + bool is_paused = 2; + double last_receive_timestamp = 3; + double uptime_seconds = 4; + string server_type = 5; // "vllm-grpc" +} diff --git a/vllm/grpc/vllm_engine_pb2.py b/vllm/grpc/vllm_engine_pb2.py new file mode 100644 index 000000000000..a12ff8b43300 --- /dev/null +++ b/vllm/grpc/vllm_engine_pb2.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: vllm/grpc/vllm_engine.proto +# Protobuf Python Version: 6.31.1 +"""Generated protocol buffer code.""" + +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, 6, 31, 1, "", "vllm/grpc/vllm_engine.proto" +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x1bvllm/grpc/vllm_engine.proto\x12\x10vllm.grpc.engine"\xe5\x06\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x17\n\nmax_tokens\x18\x08 \x01(\x05H\x01\x88\x01\x01\x12\x12\n\nmin_tokens\x18\t \x01(\x05\x12\x0c\n\x04stop\x18\n \x03(\t\x12\x16\n\x0estop_token_ids\x18\x0b \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x0c \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\r \x01(\x08\x12\x12\n\nignore_eos\x18\x0e \x01(\x08\x12\t\n\x01n\x18\x0f \x01(\x05\x12\x15\n\x08logprobs\x18\x16 \x01(\x05H\x02\x88\x01\x01\x12\x1c\n\x0fprompt_logprobs\x18\x17 \x01(\x05H\x03\x88\x01\x01\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x04\x88\x01\x01\x12"\n\x1ainclude_stop_str_in_output\x18\x19 \x01(\x08\x12\x43\n\nlogit_bias\x18\x1a \x03(\x0b\x32/.vllm.grpc.engine.SamplingParams.LogitBiasEntry\x12#\n\x16truncate_prompt_tokens\x18\x1b \x01(\x05H\x05\x88\x01\x01\x12\x15\n\x0bjson_schema\x18\x10 \x01(\tH\x00\x12\x0f\n\x05regex\x18\x11 \x01(\tH\x00\x12\x11\n\x07grammar\x18\x12 \x01(\tH\x00\x12\x18\n\x0estructural_tag\x18\x13 \x01(\tH\x00\x12\x15\n\x0bjson_object\x18\x14 \x01(\x08H\x00\x12\x34\n\x06\x63hoice\x18\x15 \x01(\x0b\x32".vllm.grpc.engine.ChoiceConstraintH\x00\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraintB\r\n\x0b_max_tokensB\x0b\n\t_logprobsB\x12\n\x10_prompt_logprobsB\x07\n\x05_seedB\x19\n\x17_truncate_prompt_tokens"#\n\x10\x43hoiceConstraint\x12\x0f\n\x07\x63hoices\x18\x01 \x03(\t":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\r"\xa5\x01\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\x12\x39\n\x0fsampling_params\x18\x03 \x01(\x0b\x32 .vllm.grpc.engine.SamplingParams\x12\x0e\n\x06stream\x18\x04 \x01(\x08"\xd4\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x36\n\x05\x63hunk\x18\x02 \x01(\x0b\x32%.vllm.grpc.engine.GenerateStreamChunkH\x00\x12\x36\n\x08\x63omplete\x18\x03 \x01(\x0b\x32".vllm.grpc.engine.GenerateCompleteH\x00\x12\x30\n\x05\x65rror\x18\x04 \x01(\x0b\x32\x1f.vllm.grpc.engine.GenerateErrorH\x00\x42\n\n\x08response"q\n\x13GenerateStreamChunk\x12\x11\n\ttoken_ids\x18\x01 \x03(\r\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x03 \x01(\x05\x12\x15\n\rcached_tokens\x18\x04 \x01(\x05"\x86\x01\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\r\x12\x15\n\rfinish_reason\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 
\x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t"W\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput"\x93\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\x08\x63omplete\x18\x02 \x01(\x0b\x32\x1f.vllm.grpc.engine.EmbedCompleteH\x00\x12-\n\x05\x65rror\x18\x03 \x01(\x0b\x32\x1c.vllm.grpc.engine.EmbedErrorH\x00\x42\n\n\x08response"P\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rembedding_dim\x18\x03 \x01(\x05"+\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t"\x14\n\x12HealthCheckRequest"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"\x15\n\x13GetModelInfoRequest"\x8a\x01\n\x14GetModelInfoResponse\x12\x12\n\nmodel_path\x18\x01 \x01(\t\x12\x15\n\ris_generation\x18\x02 \x01(\x08\x12\x1a\n\x12max_context_length\x18\x03 \x01(\x05\x12\x12\n\nvocab_size\x18\x04 \x01(\x05\x12\x17\n\x0fsupports_vision\x18\x05 \x01(\x08"\x16\n\x14GetServerInfoRequest"\x90\x01\n\x15GetServerInfoResponse\x12\x17\n\x0f\x61\x63tive_requests\x18\x01 \x01(\x05\x12\x11\n\tis_paused\x18\x02 \x01(\x08\x12\x1e\n\x16last_receive_timestamp\x18\x03 \x01(\x01\x12\x16\n\x0euptime_seconds\x18\x04 \x01(\x01\x12\x13\n\x0bserver_type\x18\x05 \x01(\t2\x92\x04\n\nVllmEngine\x12S\n\x08Generate\x12!.vllm.grpc.engine.GenerateRequest\x1a".vllm.grpc.engine.GenerateResponse0\x01\x12H\n\x05\x45mbed\x12\x1e.vllm.grpc.engine.EmbedRequest\x1a\x1f.vllm.grpc.engine.EmbedResponse\x12Z\n\x0bHealthCheck\x12$.vllm.grpc.engine.HealthCheckRequest\x1a%.vllm.grpc.engine.HealthCheckResponse\x12H\n\x05\x41\x62ort\x12\x1e.vllm.grpc.engine.AbortRequest\x1a\x1f.vllm.grpc.engine.AbortResponse\x12]\n\x0cGetModelInfo\x12%.vllm.grpc.engine.GetModelInfoRequest\x1a&.vllm.grpc.engine.GetModelInfoResponse\x12`\n\rGetServerInfo\x12&.vllm.grpc.engine.GetServerInfoRequest\x1a\'.vllm.grpc.engine.GetServerInfoResponseb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages( + DESCRIPTOR, "vllm.grpc.vllm_engine_pb2", _globals +) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._loaded_options = None + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_options = b"8\001" + _globals["_SAMPLINGPARAMS"]._serialized_start = 50 + _globals["_SAMPLINGPARAMS"]._serialized_end = 919 + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_start = 773 + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_end = 821 + _globals["_CHOICECONSTRAINT"]._serialized_start = 921 + _globals["_CHOICECONSTRAINT"]._serialized_end = 956 + _globals["_TOKENIZEDINPUT"]._serialized_start = 958 + _globals["_TOKENIZEDINPUT"]._serialized_end = 1016 + _globals["_GENERATEREQUEST"]._serialized_start = 1019 + _globals["_GENERATEREQUEST"]._serialized_end = 1184 + _globals["_GENERATERESPONSE"]._serialized_start = 1187 + _globals["_GENERATERESPONSE"]._serialized_end = 1399 + 
_globals["_GENERATESTREAMCHUNK"]._serialized_start = 1401 + _globals["_GENERATESTREAMCHUNK"]._serialized_end = 1514 + _globals["_GENERATECOMPLETE"]._serialized_start = 1517 + _globals["_GENERATECOMPLETE"]._serialized_end = 1651 + _globals["_GENERATEERROR"]._serialized_start = 1653 + _globals["_GENERATEERROR"]._serialized_end = 1728 + _globals["_EMBEDREQUEST"]._serialized_start = 1730 + _globals["_EMBEDREQUEST"]._serialized_end = 1817 + _globals["_EMBEDRESPONSE"]._serialized_start = 1820 + _globals["_EMBEDRESPONSE"]._serialized_end = 1967 + _globals["_EMBEDCOMPLETE"]._serialized_start = 1969 + _globals["_EMBEDCOMPLETE"]._serialized_end = 2049 + _globals["_EMBEDERROR"]._serialized_start = 2051 + _globals["_EMBEDERROR"]._serialized_end = 2094 + _globals["_HEALTHCHECKREQUEST"]._serialized_start = 2096 + _globals["_HEALTHCHECKREQUEST"]._serialized_end = 2116 + _globals["_HEALTHCHECKRESPONSE"]._serialized_start = 2118 + _globals["_HEALTHCHECKRESPONSE"]._serialized_end = 2173 + _globals["_ABORTREQUEST"]._serialized_start = 2175 + _globals["_ABORTREQUEST"]._serialized_end = 2225 + _globals["_ABORTRESPONSE"]._serialized_start = 2227 + _globals["_ABORTRESPONSE"]._serialized_end = 2276 + _globals["_GETMODELINFOREQUEST"]._serialized_start = 2278 + _globals["_GETMODELINFOREQUEST"]._serialized_end = 2299 + _globals["_GETMODELINFORESPONSE"]._serialized_start = 2302 + _globals["_GETMODELINFORESPONSE"]._serialized_end = 2440 + _globals["_GETSERVERINFOREQUEST"]._serialized_start = 2442 + _globals["_GETSERVERINFOREQUEST"]._serialized_end = 2464 + _globals["_GETSERVERINFORESPONSE"]._serialized_start = 2467 + _globals["_GETSERVERINFORESPONSE"]._serialized_end = 2611 + _globals["_VLLMENGINE"]._serialized_start = 2614 + _globals["_VLLMENGINE"]._serialized_end = 3144 +# @@protoc_insertion_point(module_scope) diff --git a/vllm/grpc/vllm_engine_pb2_grpc.py b/vllm/grpc/vllm_engine_pb2_grpc.py new file mode 100644 index 000000000000..2e3b0fbfbc88 --- /dev/null +++ b/vllm/grpc/vllm_engine_pb2_grpc.py @@ -0,0 +1,354 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" + +import grpc +import warnings + +from vllm.grpc import vllm_engine_pb2 as vllm_dot_grpc_dot_vllm__engine__pb2 + +GRPC_GENERATED_VERSION = "1.75.1" +GRPC_VERSION = grpc.__version__ +_version_not_supported = False + +try: + from grpc._utilities import first_version_is_lower + + _version_not_supported = first_version_is_lower( + GRPC_VERSION, GRPC_GENERATED_VERSION + ) +except ImportError: + _version_not_supported = True + +if _version_not_supported: + raise RuntimeError( + f"The grpc package installed is at version {GRPC_VERSION}," + + f" but the generated code in vllm/grpc/vllm_engine_pb2_grpc.py depends on" + + f" grpcio>={GRPC_GENERATED_VERSION}." + + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" + + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." + ) + + +class VllmEngineStub(object): + """Service definition for vLLM engine communication + This protocol is designed for efficient binary communication between + the Rust router and vLLM Python engine (AsyncLLM). + """ + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.Generate = channel.unary_stream( + "/vllm.grpc.engine.VllmEngine/Generate", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, + _registered_method=True, + ) + self.Embed = channel.unary_unary( + "/vllm.grpc.engine.VllmEngine/Embed", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, + _registered_method=True, + ) + self.HealthCheck = channel.unary_unary( + "/vllm.grpc.engine.VllmEngine/HealthCheck", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, + _registered_method=True, + ) + self.Abort = channel.unary_unary( + "/vllm.grpc.engine.VllmEngine/Abort", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, + _registered_method=True, + ) + self.GetModelInfo = channel.unary_unary( + "/vllm.grpc.engine.VllmEngine/GetModelInfo", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, + _registered_method=True, + ) + self.GetServerInfo = channel.unary_unary( + "/vllm.grpc.engine.VllmEngine/GetServerInfo", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, + _registered_method=True, + ) + + +class VllmEngineServicer(object): + """Service definition for vLLM engine communication + This protocol is designed for efficient binary communication between + the Rust router and vLLM Python engine (AsyncLLM). 
+ """ + + def Generate(self, request, context): + """Submit a generation request (supports streaming)""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def Embed(self, request, context): + """Submit an embedding request""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def HealthCheck(self, request, context): + """Health check""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def Abort(self, request, context): + """Abort a running request""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def GetModelInfo(self, request, context): + """Get model information""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def GetServerInfo(self, request, context): + """Get server information""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_VllmEngineServicer_to_server(servicer, server): + rpc_method_handlers = { + "Generate": grpc.unary_stream_rpc_method_handler( + servicer.Generate, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.SerializeToString, + ), + "Embed": grpc.unary_unary_rpc_method_handler( + servicer.Embed, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.SerializeToString, + ), + "HealthCheck": grpc.unary_unary_rpc_method_handler( + servicer.HealthCheck, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.SerializeToString, + ), + "Abort": grpc.unary_unary_rpc_method_handler( + servicer.Abort, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.SerializeToString, + ), + "GetModelInfo": grpc.unary_unary_rpc_method_handler( + servicer.GetModelInfo, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.SerializeToString, + ), + "GetServerInfo": grpc.unary_unary_rpc_method_handler( + servicer.GetServerInfo, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + "vllm.grpc.engine.VllmEngine", rpc_method_handlers + ) + server.add_generic_rpc_handlers((generic_handler,)) + server.add_registered_method_handlers( + "vllm.grpc.engine.VllmEngine", rpc_method_handlers + ) + + +# This class is part of an EXPERIMENTAL API. 
+class VllmEngine(object): + """Service definition for vLLM engine communication + This protocol is designed for efficient binary communication between + the Rust router and vLLM Python engine (AsyncLLM). + """ + + @staticmethod + def Generate( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_stream( + request, + target, + "/vllm.grpc.engine.VllmEngine/Generate", + vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, + vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True, + ) + + @staticmethod + def Embed( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/vllm.grpc.engine.VllmEngine/Embed", + vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, + vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True, + ) + + @staticmethod + def HealthCheck( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/vllm.grpc.engine.VllmEngine/HealthCheck", + vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, + vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True, + ) + + @staticmethod + def Abort( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/vllm.grpc.engine.VllmEngine/Abort", + vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, + vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True, + ) + + @staticmethod + def GetModelInfo( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/vllm.grpc.engine.VllmEngine/GetModelInfo", + vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, + vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True, + ) + + @staticmethod + def GetServerInfo( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return 
grpc.experimental.unary_unary( + request, + target, + "/vllm.grpc.engine.VllmEngine/GetServerInfo", + vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, + vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True, + ) From 5b0ae5acd5f4e35b171914ecd8ff0bbf3df3ccaf Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sat, 6 Dec 2025 11:28:18 -0800 Subject: [PATCH 02/27] Add grpc in CODEOWNERS Signed-off-by: Chang Su --- .github/CODEOWNERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 4d7a366f05e3..d74cc42eebc1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -159,3 +159,7 @@ mkdocs.yaml @hmellor /docs/usage/security.md @russellb /SECURITY.md @russellb /docs/contributing/vulnerability_management.md @russellb + +# gRPC server +/vllm/grpc @CatherineSue @slin1237 +/vllm/entrypoints/grpc_server.py @CatherineSue @slin1237 From a27403018ce8709e265fdde4cf0c8c8e2266f739 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sat, 6 Dec 2025 13:24:01 -0800 Subject: [PATCH 03/27] Add type stubs for proto files Signed-off-by: Chang Su --- vllm/entrypoints/grpc_server.py | 5 +- vllm/grpc/compile_protos.py | 7 +- vllm/grpc/grpc_request_manager.py | 3 +- vllm/grpc/vllm_engine_pb2.py | 113 +++++----- vllm/grpc/vllm_engine_pb2.pyi | 248 +++++++++++++++++++++ vllm/grpc/vllm_engine_pb2_grpc.py | 345 ++++++++++++++---------------- 6 files changed, 475 insertions(+), 246 deletions(-) create mode 100644 vllm/grpc/vllm_engine_pb2.pyi diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index 24c7ad031fff..f9ae1c510c72 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -34,6 +34,7 @@ create_sampling_params_from_proto, ) from vllm.logger import init_logger +from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.engine.async_llm import AsyncLLM @@ -262,7 +263,7 @@ async def GetServerInfo( def _chunk_response( self, request_id: str, - output, + output: RequestOutput, ) -> vllm_engine_pb2.GenerateResponse: """ Build a streaming chunk response from vLLM output. @@ -308,7 +309,7 @@ def _chunk_response( def _complete_response( self, request_id: str, - output, + output: RequestOutput, ) -> vllm_engine_pb2.GenerateResponse: """ Build a final completion response from vLLM output. diff --git a/vllm/grpc/compile_protos.py b/vllm/grpc/compile_protos.py index 62216983d88b..b32da4abcf8d 100755 --- a/vllm/grpc/compile_protos.py +++ b/vllm/grpc/compile_protos.py @@ -4,8 +4,8 @@ """ Compile vLLM protobuf definitions into Python code. -This script uses grpcio-tools to generate *_pb2.py and *_pb2_grpc.py files -from the vllm_engine.proto definition. +This script uses grpcio-tools to generate *_pb2.py, *_pb2_grpc.py, and +*_pb2.pyi (type stubs) files from the vllm_engine.proto definition. 
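+
+The .pyi stubs only carry type information: type checkers and IDEs read them
+to resolve the generated message classes and fields, while the runtime still
+imports the regular *_pb2.py / *_pb2_grpc.py modules.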
Usage: python vllm/grpc/compile_protos.py @@ -44,6 +44,7 @@ def compile_protos(): f"--proto_path={vllm_package_root}", f"--python_out={vllm_package_root}", f"--grpc_python_out={vllm_package_root}", + f"--pyi_out={vllm_package_root}", # Generate type stubs str(script_dir / "vllm_engine.proto"), ] ) @@ -58,6 +59,7 @@ def compile_protos(): for generated_file in [ script_dir / "vllm_engine_pb2.py", script_dir / "vllm_engine_pb2_grpc.py", + script_dir / "vllm_engine_pb2.pyi", ]: if generated_file.exists(): content = generated_file.read_text() @@ -67,6 +69,7 @@ def compile_protos(): print("✓ Protobuf compilation successful!") print(f" Generated: {script_dir / 'vllm_engine_pb2.py'}") print(f" Generated: {script_dir / 'vllm_engine_pb2_grpc.py'}") + print(f" Generated: {script_dir / 'vllm_engine_pb2.pyi'} (type stubs)") return 0 else: print(f"Error: protoc returned {result}") diff --git a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py index 8bfeefc4a43f..fe7024f65c77 100644 --- a/vllm/grpc/grpc_request_manager.py +++ b/vllm/grpc/grpc_request_manager.py @@ -15,6 +15,7 @@ import asyncio from collections.abc import AsyncGenerator +from vllm.grpc import vllm_engine_pb2 from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.outputs import RequestOutput @@ -233,7 +234,7 @@ def get_num_unfinished_requests(self) -> int: def create_sampling_params_from_proto( - proto_params, + proto_params: vllm_engine_pb2.SamplingParams, stream: bool = True, ) -> SamplingParams: """ diff --git a/vllm/grpc/vllm_engine_pb2.py b/vllm/grpc/vllm_engine_pb2.py index a12ff8b43300..1d7ec53061e3 100644 --- a/vllm/grpc/vllm_engine_pb2.py +++ b/vllm/grpc/vllm_engine_pb2.py @@ -6,76 +6,77 @@ # source: vllm/grpc/vllm_engine.proto # Protobuf Python Version: 6.31.1 """Generated protocol buffer code.""" - from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import runtime_version as _runtime_version from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - _runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, 6, 31, 1, "", "vllm/grpc/vllm_engine.proto" + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'vllm/grpc/vllm_engine.proto' ) # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1bvllm/grpc/vllm_engine.proto\x12\x10vllm.grpc.engine"\xe5\x06\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x17\n\nmax_tokens\x18\x08 \x01(\x05H\x01\x88\x01\x01\x12\x12\n\nmin_tokens\x18\t \x01(\x05\x12\x0c\n\x04stop\x18\n \x03(\t\x12\x16\n\x0estop_token_ids\x18\x0b \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x0c \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\r \x01(\x08\x12\x12\n\nignore_eos\x18\x0e \x01(\x08\x12\t\n\x01n\x18\x0f \x01(\x05\x12\x15\n\x08logprobs\x18\x16 \x01(\x05H\x02\x88\x01\x01\x12\x1c\n\x0fprompt_logprobs\x18\x17 \x01(\x05H\x03\x88\x01\x01\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x04\x88\x01\x01\x12"\n\x1ainclude_stop_str_in_output\x18\x19 \x01(\x08\x12\x43\n\nlogit_bias\x18\x1a 
\x03(\x0b\x32/.vllm.grpc.engine.SamplingParams.LogitBiasEntry\x12#\n\x16truncate_prompt_tokens\x18\x1b \x01(\x05H\x05\x88\x01\x01\x12\x15\n\x0bjson_schema\x18\x10 \x01(\tH\x00\x12\x0f\n\x05regex\x18\x11 \x01(\tH\x00\x12\x11\n\x07grammar\x18\x12 \x01(\tH\x00\x12\x18\n\x0estructural_tag\x18\x13 \x01(\tH\x00\x12\x15\n\x0bjson_object\x18\x14 \x01(\x08H\x00\x12\x34\n\x06\x63hoice\x18\x15 \x01(\x0b\x32".vllm.grpc.engine.ChoiceConstraintH\x00\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraintB\r\n\x0b_max_tokensB\x0b\n\t_logprobsB\x12\n\x10_prompt_logprobsB\x07\n\x05_seedB\x19\n\x17_truncate_prompt_tokens"#\n\x10\x43hoiceConstraint\x12\x0f\n\x07\x63hoices\x18\x01 \x03(\t":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\r"\xa5\x01\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\x12\x39\n\x0fsampling_params\x18\x03 \x01(\x0b\x32 .vllm.grpc.engine.SamplingParams\x12\x0e\n\x06stream\x18\x04 \x01(\x08"\xd4\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x36\n\x05\x63hunk\x18\x02 \x01(\x0b\x32%.vllm.grpc.engine.GenerateStreamChunkH\x00\x12\x36\n\x08\x63omplete\x18\x03 \x01(\x0b\x32".vllm.grpc.engine.GenerateCompleteH\x00\x12\x30\n\x05\x65rror\x18\x04 \x01(\x0b\x32\x1f.vllm.grpc.engine.GenerateErrorH\x00\x42\n\n\x08response"q\n\x13GenerateStreamChunk\x12\x11\n\ttoken_ids\x18\x01 \x03(\r\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x03 \x01(\x05\x12\x15\n\rcached_tokens\x18\x04 \x01(\x05"\x86\x01\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\r\x12\x15\n\rfinish_reason\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t"W\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput"\x93\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\x08\x63omplete\x18\x02 \x01(\x0b\x32\x1f.vllm.grpc.engine.EmbedCompleteH\x00\x12-\n\x05\x65rror\x18\x03 \x01(\x0b\x32\x1c.vllm.grpc.engine.EmbedErrorH\x00\x42\n\n\x08response"P\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rembedding_dim\x18\x03 \x01(\x05"+\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t"\x14\n\x12HealthCheckRequest"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"\x15\n\x13GetModelInfoRequest"\x8a\x01\n\x14GetModelInfoResponse\x12\x12\n\nmodel_path\x18\x01 \x01(\t\x12\x15\n\ris_generation\x18\x02 \x01(\x08\x12\x1a\n\x12max_context_length\x18\x03 \x01(\x05\x12\x12\n\nvocab_size\x18\x04 \x01(\x05\x12\x17\n\x0fsupports_vision\x18\x05 \x01(\x08"\x16\n\x14GetServerInfoRequest"\x90\x01\n\x15GetServerInfoResponse\x12\x17\n\x0f\x61\x63tive_requests\x18\x01 \x01(\x05\x12\x11\n\tis_paused\x18\x02 \x01(\x08\x12\x1e\n\x16last_receive_timestamp\x18\x03 \x01(\x01\x12\x16\n\x0euptime_seconds\x18\x04 
\x01(\x01\x12\x13\n\x0bserver_type\x18\x05 \x01(\t2\x92\x04\n\nVllmEngine\x12S\n\x08Generate\x12!.vllm.grpc.engine.GenerateRequest\x1a".vllm.grpc.engine.GenerateResponse0\x01\x12H\n\x05\x45mbed\x12\x1e.vllm.grpc.engine.EmbedRequest\x1a\x1f.vllm.grpc.engine.EmbedResponse\x12Z\n\x0bHealthCheck\x12$.vllm.grpc.engine.HealthCheckRequest\x1a%.vllm.grpc.engine.HealthCheckResponse\x12H\n\x05\x41\x62ort\x12\x1e.vllm.grpc.engine.AbortRequest\x1a\x1f.vllm.grpc.engine.AbortResponse\x12]\n\x0cGetModelInfo\x12%.vllm.grpc.engine.GetModelInfoRequest\x1a&.vllm.grpc.engine.GetModelInfoResponse\x12`\n\rGetServerInfo\x12&.vllm.grpc.engine.GetServerInfoRequest\x1a\'.vllm.grpc.engine.GetServerInfoResponseb\x06proto3' -) + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1bvllm/grpc/vllm_engine.proto\x12\x10vllm.grpc.engine\"\xe5\x06\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x17\n\nmax_tokens\x18\x08 \x01(\x05H\x01\x88\x01\x01\x12\x12\n\nmin_tokens\x18\t \x01(\x05\x12\x0c\n\x04stop\x18\n \x03(\t\x12\x16\n\x0estop_token_ids\x18\x0b \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x0c \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\r \x01(\x08\x12\x12\n\nignore_eos\x18\x0e \x01(\x08\x12\t\n\x01n\x18\x0f \x01(\x05\x12\x15\n\x08logprobs\x18\x16 \x01(\x05H\x02\x88\x01\x01\x12\x1c\n\x0fprompt_logprobs\x18\x17 \x01(\x05H\x03\x88\x01\x01\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x04\x88\x01\x01\x12\"\n\x1ainclude_stop_str_in_output\x18\x19 \x01(\x08\x12\x43\n\nlogit_bias\x18\x1a \x03(\x0b\x32/.vllm.grpc.engine.SamplingParams.LogitBiasEntry\x12#\n\x16truncate_prompt_tokens\x18\x1b \x01(\x05H\x05\x88\x01\x01\x12\x15\n\x0bjson_schema\x18\x10 \x01(\tH\x00\x12\x0f\n\x05regex\x18\x11 \x01(\tH\x00\x12\x11\n\x07grammar\x18\x12 \x01(\tH\x00\x12\x18\n\x0estructural_tag\x18\x13 \x01(\tH\x00\x12\x15\n\x0bjson_object\x18\x14 \x01(\x08H\x00\x12\x34\n\x06\x63hoice\x18\x15 \x01(\x0b\x32\".vllm.grpc.engine.ChoiceConstraintH\x00\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraintB\r\n\x0b_max_tokensB\x0b\n\t_logprobsB\x12\n\x10_prompt_logprobsB\x07\n\x05_seedB\x19\n\x17_truncate_prompt_tokens\"#\n\x10\x43hoiceConstraint\x12\x0f\n\x07\x63hoices\x18\x01 \x03(\t\":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\r\"\xa5\x01\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\x12\x39\n\x0fsampling_params\x18\x03 \x01(\x0b\x32 .vllm.grpc.engine.SamplingParams\x12\x0e\n\x06stream\x18\x04 \x01(\x08\"\xd4\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x36\n\x05\x63hunk\x18\x02 \x01(\x0b\x32%.vllm.grpc.engine.GenerateStreamChunkH\x00\x12\x36\n\x08\x63omplete\x18\x03 \x01(\x0b\x32\".vllm.grpc.engine.GenerateCompleteH\x00\x12\x30\n\x05\x65rror\x18\x04 \x01(\x0b\x32\x1f.vllm.grpc.engine.GenerateErrorH\x00\x42\n\n\x08response\"q\n\x13GenerateStreamChunk\x12\x11\n\ttoken_ids\x18\x01 \x03(\r\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x03 \x01(\x05\x12\x15\n\rcached_tokens\x18\x04 \x01(\x05\"\x86\x01\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\r\x12\x15\n\rfinish_reason\x18\x02 
\x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05\"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"W\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\"\x93\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\x08\x63omplete\x18\x02 \x01(\x0b\x32\x1f.vllm.grpc.engine.EmbedCompleteH\x00\x12-\n\x05\x65rror\x18\x03 \x01(\x0b\x32\x1c.vllm.grpc.engine.EmbedErrorH\x00\x42\n\n\x08response\"P\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rembedding_dim\x18\x03 \x01(\x05\"+\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t\"\x14\n\x12HealthCheckRequest\"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t\"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"\x15\n\x13GetModelInfoRequest\"\x8a\x01\n\x14GetModelInfoResponse\x12\x12\n\nmodel_path\x18\x01 \x01(\t\x12\x15\n\ris_generation\x18\x02 \x01(\x08\x12\x1a\n\x12max_context_length\x18\x03 \x01(\x05\x12\x12\n\nvocab_size\x18\x04 \x01(\x05\x12\x17\n\x0fsupports_vision\x18\x05 \x01(\x08\"\x16\n\x14GetServerInfoRequest\"\x90\x01\n\x15GetServerInfoResponse\x12\x17\n\x0f\x61\x63tive_requests\x18\x01 \x01(\x05\x12\x11\n\tis_paused\x18\x02 \x01(\x08\x12\x1e\n\x16last_receive_timestamp\x18\x03 \x01(\x01\x12\x16\n\x0euptime_seconds\x18\x04 \x01(\x01\x12\x13\n\x0bserver_type\x18\x05 \x01(\t2\x92\x04\n\nVllmEngine\x12S\n\x08Generate\x12!.vllm.grpc.engine.GenerateRequest\x1a\".vllm.grpc.engine.GenerateResponse0\x01\x12H\n\x05\x45mbed\x12\x1e.vllm.grpc.engine.EmbedRequest\x1a\x1f.vllm.grpc.engine.EmbedResponse\x12Z\n\x0bHealthCheck\x12$.vllm.grpc.engine.HealthCheckRequest\x1a%.vllm.grpc.engine.HealthCheckResponse\x12H\n\x05\x41\x62ort\x12\x1e.vllm.grpc.engine.AbortRequest\x1a\x1f.vllm.grpc.engine.AbortResponse\x12]\n\x0cGetModelInfo\x12%.vllm.grpc.engine.GetModelInfoRequest\x1a&.vllm.grpc.engine.GetModelInfoResponse\x12`\n\rGetServerInfo\x12&.vllm.grpc.engine.GetServerInfoRequest\x1a\'.vllm.grpc.engine.GetServerInfoResponseb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages( - DESCRIPTOR, "vllm.grpc.vllm_engine_pb2", _globals -) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'vllm.grpc.vllm_engine_pb2', _globals) if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._loaded_options = None - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_options = b"8\001" - _globals["_SAMPLINGPARAMS"]._serialized_start = 50 - _globals["_SAMPLINGPARAMS"]._serialized_end = 919 - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_start = 773 - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_end = 821 - _globals["_CHOICECONSTRAINT"]._serialized_start = 921 - _globals["_CHOICECONSTRAINT"]._serialized_end = 956 - _globals["_TOKENIZEDINPUT"]._serialized_start = 958 - _globals["_TOKENIZEDINPUT"]._serialized_end = 1016 - _globals["_GENERATEREQUEST"]._serialized_start = 1019 - _globals["_GENERATEREQUEST"]._serialized_end = 1184 - 
_globals["_GENERATERESPONSE"]._serialized_start = 1187 - _globals["_GENERATERESPONSE"]._serialized_end = 1399 - _globals["_GENERATESTREAMCHUNK"]._serialized_start = 1401 - _globals["_GENERATESTREAMCHUNK"]._serialized_end = 1514 - _globals["_GENERATECOMPLETE"]._serialized_start = 1517 - _globals["_GENERATECOMPLETE"]._serialized_end = 1651 - _globals["_GENERATEERROR"]._serialized_start = 1653 - _globals["_GENERATEERROR"]._serialized_end = 1728 - _globals["_EMBEDREQUEST"]._serialized_start = 1730 - _globals["_EMBEDREQUEST"]._serialized_end = 1817 - _globals["_EMBEDRESPONSE"]._serialized_start = 1820 - _globals["_EMBEDRESPONSE"]._serialized_end = 1967 - _globals["_EMBEDCOMPLETE"]._serialized_start = 1969 - _globals["_EMBEDCOMPLETE"]._serialized_end = 2049 - _globals["_EMBEDERROR"]._serialized_start = 2051 - _globals["_EMBEDERROR"]._serialized_end = 2094 - _globals["_HEALTHCHECKREQUEST"]._serialized_start = 2096 - _globals["_HEALTHCHECKREQUEST"]._serialized_end = 2116 - _globals["_HEALTHCHECKRESPONSE"]._serialized_start = 2118 - _globals["_HEALTHCHECKRESPONSE"]._serialized_end = 2173 - _globals["_ABORTREQUEST"]._serialized_start = 2175 - _globals["_ABORTREQUEST"]._serialized_end = 2225 - _globals["_ABORTRESPONSE"]._serialized_start = 2227 - _globals["_ABORTRESPONSE"]._serialized_end = 2276 - _globals["_GETMODELINFOREQUEST"]._serialized_start = 2278 - _globals["_GETMODELINFOREQUEST"]._serialized_end = 2299 - _globals["_GETMODELINFORESPONSE"]._serialized_start = 2302 - _globals["_GETMODELINFORESPONSE"]._serialized_end = 2440 - _globals["_GETSERVERINFOREQUEST"]._serialized_start = 2442 - _globals["_GETSERVERINFOREQUEST"]._serialized_end = 2464 - _globals["_GETSERVERINFORESPONSE"]._serialized_start = 2467 - _globals["_GETSERVERINFORESPONSE"]._serialized_end = 2611 - _globals["_VLLMENGINE"]._serialized_start = 2614 - _globals["_VLLMENGINE"]._serialized_end = 3144 + DESCRIPTOR._loaded_options = None + _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._loaded_options = None + _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_options = b'8\001' + _globals['_SAMPLINGPARAMS']._serialized_start=50 + _globals['_SAMPLINGPARAMS']._serialized_end=919 + _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_start=773 + _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_end=821 + _globals['_CHOICECONSTRAINT']._serialized_start=921 + _globals['_CHOICECONSTRAINT']._serialized_end=956 + _globals['_TOKENIZEDINPUT']._serialized_start=958 + _globals['_TOKENIZEDINPUT']._serialized_end=1016 + _globals['_GENERATEREQUEST']._serialized_start=1019 + _globals['_GENERATEREQUEST']._serialized_end=1184 + _globals['_GENERATERESPONSE']._serialized_start=1187 + _globals['_GENERATERESPONSE']._serialized_end=1399 + _globals['_GENERATESTREAMCHUNK']._serialized_start=1401 + _globals['_GENERATESTREAMCHUNK']._serialized_end=1514 + _globals['_GENERATECOMPLETE']._serialized_start=1517 + _globals['_GENERATECOMPLETE']._serialized_end=1651 + _globals['_GENERATEERROR']._serialized_start=1653 + _globals['_GENERATEERROR']._serialized_end=1728 + _globals['_EMBEDREQUEST']._serialized_start=1730 + _globals['_EMBEDREQUEST']._serialized_end=1817 + _globals['_EMBEDRESPONSE']._serialized_start=1820 + _globals['_EMBEDRESPONSE']._serialized_end=1967 + _globals['_EMBEDCOMPLETE']._serialized_start=1969 + _globals['_EMBEDCOMPLETE']._serialized_end=2049 + _globals['_EMBEDERROR']._serialized_start=2051 + _globals['_EMBEDERROR']._serialized_end=2094 + _globals['_HEALTHCHECKREQUEST']._serialized_start=2096 + 
_globals['_HEALTHCHECKREQUEST']._serialized_end=2116 + _globals['_HEALTHCHECKRESPONSE']._serialized_start=2118 + _globals['_HEALTHCHECKRESPONSE']._serialized_end=2173 + _globals['_ABORTREQUEST']._serialized_start=2175 + _globals['_ABORTREQUEST']._serialized_end=2225 + _globals['_ABORTRESPONSE']._serialized_start=2227 + _globals['_ABORTRESPONSE']._serialized_end=2276 + _globals['_GETMODELINFOREQUEST']._serialized_start=2278 + _globals['_GETMODELINFOREQUEST']._serialized_end=2299 + _globals['_GETMODELINFORESPONSE']._serialized_start=2302 + _globals['_GETMODELINFORESPONSE']._serialized_end=2440 + _globals['_GETSERVERINFOREQUEST']._serialized_start=2442 + _globals['_GETSERVERINFOREQUEST']._serialized_end=2464 + _globals['_GETSERVERINFORESPONSE']._serialized_start=2467 + _globals['_GETSERVERINFORESPONSE']._serialized_end=2611 + _globals['_VLLMENGINE']._serialized_start=2614 + _globals['_VLLMENGINE']._serialized_end=3144 # @@protoc_insertion_point(module_scope) diff --git a/vllm/grpc/vllm_engine_pb2.pyi b/vllm/grpc/vllm_engine_pb2.pyi new file mode 100644 index 000000000000..3b125d210d17 --- /dev/null +++ b/vllm/grpc/vllm_engine_pb2.pyi @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from google.protobuf.internal import containers as _containers +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from collections.abc import Iterable as _Iterable, Mapping as _Mapping +from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union + +DESCRIPTOR: _descriptor.FileDescriptor + +class SamplingParams(_message.Message): + __slots__ = ("temperature", "top_p", "top_k", "min_p", "frequency_penalty", "presence_penalty", "repetition_penalty", "max_tokens", "min_tokens", "stop", "stop_token_ids", "skip_special_tokens", "spaces_between_special_tokens", "ignore_eos", "n", "logprobs", "prompt_logprobs", "seed", "include_stop_str_in_output", "logit_bias", "truncate_prompt_tokens", "json_schema", "regex", "grammar", "structural_tag", "json_object", "choice") + class LogitBiasEntry(_message.Message): + __slots__ = ("key", "value") + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: int + value: float + def __init__(self, key: _Optional[int] = ..., value: _Optional[float] = ...) -> None: ... 
+ TEMPERATURE_FIELD_NUMBER: _ClassVar[int] + TOP_P_FIELD_NUMBER: _ClassVar[int] + TOP_K_FIELD_NUMBER: _ClassVar[int] + MIN_P_FIELD_NUMBER: _ClassVar[int] + FREQUENCY_PENALTY_FIELD_NUMBER: _ClassVar[int] + PRESENCE_PENALTY_FIELD_NUMBER: _ClassVar[int] + REPETITION_PENALTY_FIELD_NUMBER: _ClassVar[int] + MAX_TOKENS_FIELD_NUMBER: _ClassVar[int] + MIN_TOKENS_FIELD_NUMBER: _ClassVar[int] + STOP_FIELD_NUMBER: _ClassVar[int] + STOP_TOKEN_IDS_FIELD_NUMBER: _ClassVar[int] + SKIP_SPECIAL_TOKENS_FIELD_NUMBER: _ClassVar[int] + SPACES_BETWEEN_SPECIAL_TOKENS_FIELD_NUMBER: _ClassVar[int] + IGNORE_EOS_FIELD_NUMBER: _ClassVar[int] + N_FIELD_NUMBER: _ClassVar[int] + LOGPROBS_FIELD_NUMBER: _ClassVar[int] + PROMPT_LOGPROBS_FIELD_NUMBER: _ClassVar[int] + SEED_FIELD_NUMBER: _ClassVar[int] + INCLUDE_STOP_STR_IN_OUTPUT_FIELD_NUMBER: _ClassVar[int] + LOGIT_BIAS_FIELD_NUMBER: _ClassVar[int] + TRUNCATE_PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] + JSON_SCHEMA_FIELD_NUMBER: _ClassVar[int] + REGEX_FIELD_NUMBER: _ClassVar[int] + GRAMMAR_FIELD_NUMBER: _ClassVar[int] + STRUCTURAL_TAG_FIELD_NUMBER: _ClassVar[int] + JSON_OBJECT_FIELD_NUMBER: _ClassVar[int] + CHOICE_FIELD_NUMBER: _ClassVar[int] + temperature: float + top_p: float + top_k: int + min_p: float + frequency_penalty: float + presence_penalty: float + repetition_penalty: float + max_tokens: int + min_tokens: int + stop: _containers.RepeatedScalarFieldContainer[str] + stop_token_ids: _containers.RepeatedScalarFieldContainer[int] + skip_special_tokens: bool + spaces_between_special_tokens: bool + ignore_eos: bool + n: int + logprobs: int + prompt_logprobs: int + seed: int + include_stop_str_in_output: bool + logit_bias: _containers.ScalarMap[int, float] + truncate_prompt_tokens: int + json_schema: str + regex: str + grammar: str + structural_tag: str + json_object: bool + choice: ChoiceConstraint + def __init__(self, temperature: _Optional[float] = ..., top_p: _Optional[float] = ..., top_k: _Optional[int] = ..., min_p: _Optional[float] = ..., frequency_penalty: _Optional[float] = ..., presence_penalty: _Optional[float] = ..., repetition_penalty: _Optional[float] = ..., max_tokens: _Optional[int] = ..., min_tokens: _Optional[int] = ..., stop: _Optional[_Iterable[str]] = ..., stop_token_ids: _Optional[_Iterable[int]] = ..., skip_special_tokens: bool = ..., spaces_between_special_tokens: bool = ..., ignore_eos: bool = ..., n: _Optional[int] = ..., logprobs: _Optional[int] = ..., prompt_logprobs: _Optional[int] = ..., seed: _Optional[int] = ..., include_stop_str_in_output: bool = ..., logit_bias: _Optional[_Mapping[int, float]] = ..., truncate_prompt_tokens: _Optional[int] = ..., json_schema: _Optional[str] = ..., regex: _Optional[str] = ..., grammar: _Optional[str] = ..., structural_tag: _Optional[str] = ..., json_object: bool = ..., choice: _Optional[_Union[ChoiceConstraint, _Mapping]] = ...) -> None: ... + +class ChoiceConstraint(_message.Message): + __slots__ = ("choices",) + CHOICES_FIELD_NUMBER: _ClassVar[int] + choices: _containers.RepeatedScalarFieldContainer[str] + def __init__(self, choices: _Optional[_Iterable[str]] = ...) -> None: ... + +class TokenizedInput(_message.Message): + __slots__ = ("original_text", "input_ids") + ORIGINAL_TEXT_FIELD_NUMBER: _ClassVar[int] + INPUT_IDS_FIELD_NUMBER: _ClassVar[int] + original_text: str + input_ids: _containers.RepeatedScalarFieldContainer[int] + def __init__(self, original_text: _Optional[str] = ..., input_ids: _Optional[_Iterable[int]] = ...) -> None: ... 
+ +class GenerateRequest(_message.Message): + __slots__ = ("request_id", "tokenized", "sampling_params", "stream") + REQUEST_ID_FIELD_NUMBER: _ClassVar[int] + TOKENIZED_FIELD_NUMBER: _ClassVar[int] + SAMPLING_PARAMS_FIELD_NUMBER: _ClassVar[int] + STREAM_FIELD_NUMBER: _ClassVar[int] + request_id: str + tokenized: TokenizedInput + sampling_params: SamplingParams + stream: bool + def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ..., sampling_params: _Optional[_Union[SamplingParams, _Mapping]] = ..., stream: bool = ...) -> None: ... + +class GenerateResponse(_message.Message): + __slots__ = ("request_id", "chunk", "complete", "error") + REQUEST_ID_FIELD_NUMBER: _ClassVar[int] + CHUNK_FIELD_NUMBER: _ClassVar[int] + COMPLETE_FIELD_NUMBER: _ClassVar[int] + ERROR_FIELD_NUMBER: _ClassVar[int] + request_id: str + chunk: GenerateStreamChunk + complete: GenerateComplete + error: GenerateError + def __init__(self, request_id: _Optional[str] = ..., chunk: _Optional[_Union[GenerateStreamChunk, _Mapping]] = ..., complete: _Optional[_Union[GenerateComplete, _Mapping]] = ..., error: _Optional[_Union[GenerateError, _Mapping]] = ...) -> None: ... + +class GenerateStreamChunk(_message.Message): + __slots__ = ("token_ids", "prompt_tokens", "completion_tokens", "cached_tokens") + TOKEN_IDS_FIELD_NUMBER: _ClassVar[int] + PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] + COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int] + CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int] + token_ids: _containers.RepeatedScalarFieldContainer[int] + prompt_tokens: int + completion_tokens: int + cached_tokens: int + def __init__(self, token_ids: _Optional[_Iterable[int]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ...) -> None: ... + +class GenerateComplete(_message.Message): + __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens") + OUTPUT_IDS_FIELD_NUMBER: _ClassVar[int] + FINISH_REASON_FIELD_NUMBER: _ClassVar[int] + PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] + COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int] + CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int] + output_ids: _containers.RepeatedScalarFieldContainer[int] + finish_reason: str + prompt_tokens: int + completion_tokens: int + cached_tokens: int + def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ...) -> None: ... + +class GenerateError(_message.Message): + __slots__ = ("message", "http_status_code", "details") + MESSAGE_FIELD_NUMBER: _ClassVar[int] + HTTP_STATUS_CODE_FIELD_NUMBER: _ClassVar[int] + DETAILS_FIELD_NUMBER: _ClassVar[int] + message: str + http_status_code: str + details: str + def __init__(self, message: _Optional[str] = ..., http_status_code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ... + +class EmbedRequest(_message.Message): + __slots__ = ("request_id", "tokenized") + REQUEST_ID_FIELD_NUMBER: _ClassVar[int] + TOKENIZED_FIELD_NUMBER: _ClassVar[int] + request_id: str + tokenized: TokenizedInput + def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ...) -> None: ... 
+ +class EmbedResponse(_message.Message): + __slots__ = ("request_id", "complete", "error") + REQUEST_ID_FIELD_NUMBER: _ClassVar[int] + COMPLETE_FIELD_NUMBER: _ClassVar[int] + ERROR_FIELD_NUMBER: _ClassVar[int] + request_id: str + complete: EmbedComplete + error: EmbedError + def __init__(self, request_id: _Optional[str] = ..., complete: _Optional[_Union[EmbedComplete, _Mapping]] = ..., error: _Optional[_Union[EmbedError, _Mapping]] = ...) -> None: ... + +class EmbedComplete(_message.Message): + __slots__ = ("embedding", "prompt_tokens", "embedding_dim") + EMBEDDING_FIELD_NUMBER: _ClassVar[int] + PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] + EMBEDDING_DIM_FIELD_NUMBER: _ClassVar[int] + embedding: _containers.RepeatedScalarFieldContainer[float] + prompt_tokens: int + embedding_dim: int + def __init__(self, embedding: _Optional[_Iterable[float]] = ..., prompt_tokens: _Optional[int] = ..., embedding_dim: _Optional[int] = ...) -> None: ... + +class EmbedError(_message.Message): + __slots__ = ("message", "code") + MESSAGE_FIELD_NUMBER: _ClassVar[int] + CODE_FIELD_NUMBER: _ClassVar[int] + message: str + code: str + def __init__(self, message: _Optional[str] = ..., code: _Optional[str] = ...) -> None: ... + +class HealthCheckRequest(_message.Message): + __slots__ = () + def __init__(self) -> None: ... + +class HealthCheckResponse(_message.Message): + __slots__ = ("healthy", "message") + HEALTHY_FIELD_NUMBER: _ClassVar[int] + MESSAGE_FIELD_NUMBER: _ClassVar[int] + healthy: bool + message: str + def __init__(self, healthy: bool = ..., message: _Optional[str] = ...) -> None: ... + +class AbortRequest(_message.Message): + __slots__ = ("request_id", "reason") + REQUEST_ID_FIELD_NUMBER: _ClassVar[int] + REASON_FIELD_NUMBER: _ClassVar[int] + request_id: str + reason: str + def __init__(self, request_id: _Optional[str] = ..., reason: _Optional[str] = ...) -> None: ... + +class AbortResponse(_message.Message): + __slots__ = ("success", "message") + SUCCESS_FIELD_NUMBER: _ClassVar[int] + MESSAGE_FIELD_NUMBER: _ClassVar[int] + success: bool + message: str + def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ... + +class GetModelInfoRequest(_message.Message): + __slots__ = () + def __init__(self) -> None: ... + +class GetModelInfoResponse(_message.Message): + __slots__ = ("model_path", "is_generation", "max_context_length", "vocab_size", "supports_vision") + MODEL_PATH_FIELD_NUMBER: _ClassVar[int] + IS_GENERATION_FIELD_NUMBER: _ClassVar[int] + MAX_CONTEXT_LENGTH_FIELD_NUMBER: _ClassVar[int] + VOCAB_SIZE_FIELD_NUMBER: _ClassVar[int] + SUPPORTS_VISION_FIELD_NUMBER: _ClassVar[int] + model_path: str + is_generation: bool + max_context_length: int + vocab_size: int + supports_vision: bool + def __init__(self, model_path: _Optional[str] = ..., is_generation: bool = ..., max_context_length: _Optional[int] = ..., vocab_size: _Optional[int] = ..., supports_vision: bool = ...) -> None: ... + +class GetServerInfoRequest(_message.Message): + __slots__ = () + def __init__(self) -> None: ... 
+ +class GetServerInfoResponse(_message.Message): + __slots__ = ("active_requests", "is_paused", "last_receive_timestamp", "uptime_seconds", "server_type") + ACTIVE_REQUESTS_FIELD_NUMBER: _ClassVar[int] + IS_PAUSED_FIELD_NUMBER: _ClassVar[int] + LAST_RECEIVE_TIMESTAMP_FIELD_NUMBER: _ClassVar[int] + UPTIME_SECONDS_FIELD_NUMBER: _ClassVar[int] + SERVER_TYPE_FIELD_NUMBER: _ClassVar[int] + active_requests: int + is_paused: bool + last_receive_timestamp: float + uptime_seconds: float + server_type: str + def __init__(self, active_requests: _Optional[int] = ..., is_paused: bool = ..., last_receive_timestamp: _Optional[float] = ..., uptime_seconds: _Optional[float] = ..., server_type: _Optional[str] = ...) -> None: ... diff --git a/vllm/grpc/vllm_engine_pb2_grpc.py b/vllm/grpc/vllm_engine_pb2_grpc.py index 2e3b0fbfbc88..219f78ef7a62 100644 --- a/vllm/grpc/vllm_engine_pb2_grpc.py +++ b/vllm/grpc/vllm_engine_pb2_grpc.py @@ -2,32 +2,28 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" - import grpc import warnings from vllm.grpc import vllm_engine_pb2 as vllm_dot_grpc_dot_vllm__engine__pb2 -GRPC_GENERATED_VERSION = "1.75.1" +GRPC_GENERATED_VERSION = '1.75.1' GRPC_VERSION = grpc.__version__ _version_not_supported = False try: from grpc._utilities import first_version_is_lower - - _version_not_supported = first_version_is_lower( - GRPC_VERSION, GRPC_GENERATED_VERSION - ) + _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) except ImportError: _version_not_supported = True if _version_not_supported: raise RuntimeError( - f"The grpc package installed is at version {GRPC_VERSION}," - + f" but the generated code in vllm/grpc/vllm_engine_pb2_grpc.py depends on" - + f" grpcio>={GRPC_GENERATED_VERSION}." - + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" - + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." + f'The grpc package installed is at version {GRPC_VERSION},' + + f' but the generated code in vllm/grpc/vllm_engine_pb2_grpc.py depends on' + + f' grpcio>={GRPC_GENERATED_VERSION}.' + + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' + + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' ) @@ -44,41 +40,35 @@ def __init__(self, channel): channel: A grpc.Channel. 
""" self.Generate = channel.unary_stream( - "/vllm.grpc.engine.VllmEngine/Generate", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, - _registered_method=True, - ) + '/vllm.grpc.engine.VllmEngine/Generate', + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, + _registered_method=True) self.Embed = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/Embed", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, - _registered_method=True, - ) + '/vllm.grpc.engine.VllmEngine/Embed', + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, + _registered_method=True) self.HealthCheck = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/HealthCheck", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, - _registered_method=True, - ) + '/vllm.grpc.engine.VllmEngine/HealthCheck', + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, + _registered_method=True) self.Abort = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/Abort", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, - _registered_method=True, - ) + '/vllm.grpc.engine.VllmEngine/Abort', + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, + _registered_method=True) self.GetModelInfo = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/GetModelInfo", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, - _registered_method=True, - ) + '/vllm.grpc.engine.VllmEngine/GetModelInfo', + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, + _registered_method=True) self.GetServerInfo = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/GetServerInfo", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, - _registered_method=True, - ) + '/vllm.grpc.engine.VllmEngine/GetServerInfo', + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, + _registered_method=True) class VllmEngineServicer(object): @@ -88,85 +78,88 @@ class VllmEngineServicer(object): """ def Generate(self, request, context): - """Submit a generation request (supports streaming)""" + """Submit a generation request (supports streaming) + """ 
context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def Embed(self, request, context): - """Submit an embedding request""" + """Submit an embedding request + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def HealthCheck(self, request, context): - """Health check""" + """Health check + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def Abort(self, request, context): - """Abort a running request""" + """Abort a running request + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def GetModelInfo(self, request, context): - """Get model information""" + """Get model information + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def GetServerInfo(self, request, context): - """Get server information""" + """Get server information + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_VllmEngineServicer_to_server(servicer, server): rpc_method_handlers = { - "Generate": grpc.unary_stream_rpc_method_handler( - servicer.Generate, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.SerializeToString, - ), - "Embed": grpc.unary_unary_rpc_method_handler( - servicer.Embed, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.SerializeToString, - ), - "HealthCheck": grpc.unary_unary_rpc_method_handler( - servicer.HealthCheck, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.SerializeToString, - ), - "Abort": grpc.unary_unary_rpc_method_handler( - servicer.Abort, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.SerializeToString, - ), - "GetModelInfo": grpc.unary_unary_rpc_method_handler( - servicer.GetModelInfo, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.SerializeToString, - ), - "GetServerInfo": grpc.unary_unary_rpc_method_handler( - servicer.GetServerInfo, - 
request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.SerializeToString, - ), + 'Generate': grpc.unary_stream_rpc_method_handler( + servicer.Generate, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.SerializeToString, + ), + 'Embed': grpc.unary_unary_rpc_method_handler( + servicer.Embed, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.SerializeToString, + ), + 'HealthCheck': grpc.unary_unary_rpc_method_handler( + servicer.HealthCheck, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.SerializeToString, + ), + 'Abort': grpc.unary_unary_rpc_method_handler( + servicer.Abort, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.SerializeToString, + ), + 'GetModelInfo': grpc.unary_unary_rpc_method_handler( + servicer.GetModelInfo, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.SerializeToString, + ), + 'GetServerInfo': grpc.unary_unary_rpc_method_handler( + servicer.GetServerInfo, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - "vllm.grpc.engine.VllmEngine", rpc_method_handlers - ) + 'vllm.grpc.engine.VllmEngine', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers( - "vllm.grpc.engine.VllmEngine", rpc_method_handlers - ) + server.add_registered_method_handlers('vllm.grpc.engine.VllmEngine', rpc_method_handlers) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. 
class VllmEngine(object): """Service definition for vLLM engine communication This protocol is designed for efficient binary communication between @@ -174,22 +167,20 @@ class VllmEngine(object): """ @staticmethod - def Generate( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): + def Generate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): return grpc.experimental.unary_stream( request, target, - "/vllm.grpc.engine.VllmEngine/Generate", + '/vllm.grpc.engine.VllmEngine/Generate', vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, options, @@ -200,26 +191,23 @@ def Generate( wait_for_ready, timeout, metadata, - _registered_method=True, - ) + _registered_method=True) @staticmethod - def Embed( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): + def Embed(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): return grpc.experimental.unary_unary( request, target, - "/vllm.grpc.engine.VllmEngine/Embed", + '/vllm.grpc.engine.VllmEngine/Embed', vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, options, @@ -230,26 +218,23 @@ def Embed( wait_for_ready, timeout, metadata, - _registered_method=True, - ) + _registered_method=True) @staticmethod - def HealthCheck( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): + def HealthCheck(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): return grpc.experimental.unary_unary( request, target, - "/vllm.grpc.engine.VllmEngine/HealthCheck", + '/vllm.grpc.engine.VllmEngine/HealthCheck', vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, options, @@ -260,26 +245,23 @@ def HealthCheck( wait_for_ready, timeout, metadata, - _registered_method=True, - ) + _registered_method=True) @staticmethod - def Abort( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): + def Abort(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): return grpc.experimental.unary_unary( request, target, - "/vllm.grpc.engine.VllmEngine/Abort", + '/vllm.grpc.engine.VllmEngine/Abort', vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, options, @@ -290,26 +272,23 @@ def Abort( wait_for_ready, timeout, metadata, - _registered_method=True, - ) + _registered_method=True) @staticmethod - def GetModelInfo( - request, - target, - 
options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): + def GetModelInfo(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): return grpc.experimental.unary_unary( request, target, - "/vllm.grpc.engine.VllmEngine/GetModelInfo", + '/vllm.grpc.engine.VllmEngine/GetModelInfo', vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, options, @@ -320,26 +299,23 @@ def GetModelInfo( wait_for_ready, timeout, metadata, - _registered_method=True, - ) + _registered_method=True) @staticmethod - def GetServerInfo( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): + def GetServerInfo(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): return grpc.experimental.unary_unary( request, target, - "/vllm.grpc.engine.VllmEngine/GetServerInfo", + '/vllm.grpc.engine.VllmEngine/GetServerInfo', vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, options, @@ -350,5 +326,4 @@ def GetServerInfo( wait_for_ready, timeout, metadata, - _registered_method=True, - ) + _registered_method=True) From 64ff6d1dc55c8c44aaac6d3c63b582f3c946ed39 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sat, 6 Dec 2025 13:32:29 -0800 Subject: [PATCH 04/27] Exclude auto-generated gRPC stubs in mkdocs Signed-off-by: Chang Su --- mkdocs.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mkdocs.yaml b/mkdocs.yaml index 8fb8f0568c6e..2532584e3444 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -87,7 +87,8 @@ plugins: options: show_symbol_type_heading: true show_symbol_type_toc: true - filters: [] + filters: + - "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs summary: modules: true show_if_no_docstring: true From 41bc4f61474088d88fc1ef0819298a19e332a22e Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sat, 6 Dec 2025 13:34:31 -0800 Subject: [PATCH 05/27] Run precommit Signed-off-by: Chang Su --- vllm/grpc/vllm_engine_pb2.py | 113 +++++----- vllm/grpc/vllm_engine_pb2.pyi | 192 ++++++++++++++--- vllm/grpc/vllm_engine_pb2_grpc.py | 345 ++++++++++++++++-------------- 3 files changed, 408 insertions(+), 242 deletions(-) diff --git a/vllm/grpc/vllm_engine_pb2.py b/vllm/grpc/vllm_engine_pb2.py index 1d7ec53061e3..a12ff8b43300 100644 --- a/vllm/grpc/vllm_engine_pb2.py +++ b/vllm/grpc/vllm_engine_pb2.py @@ -6,77 +6,76 @@ # source: vllm/grpc/vllm_engine.proto # Protobuf Python Version: 6.31.1 """Generated protocol buffer code.""" + from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import runtime_version as _runtime_version from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + _runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, - 6, - 31, - 1, - '', - 'vllm/grpc/vllm_engine.proto' + _runtime_version.Domain.PUBLIC, 6, 31, 1, "", "vllm/grpc/vllm_engine.proto" ) # 
@@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1bvllm/grpc/vllm_engine.proto\x12\x10vllm.grpc.engine\"\xe5\x06\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x17\n\nmax_tokens\x18\x08 \x01(\x05H\x01\x88\x01\x01\x12\x12\n\nmin_tokens\x18\t \x01(\x05\x12\x0c\n\x04stop\x18\n \x03(\t\x12\x16\n\x0estop_token_ids\x18\x0b \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x0c \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\r \x01(\x08\x12\x12\n\nignore_eos\x18\x0e \x01(\x08\x12\t\n\x01n\x18\x0f \x01(\x05\x12\x15\n\x08logprobs\x18\x16 \x01(\x05H\x02\x88\x01\x01\x12\x1c\n\x0fprompt_logprobs\x18\x17 \x01(\x05H\x03\x88\x01\x01\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x04\x88\x01\x01\x12\"\n\x1ainclude_stop_str_in_output\x18\x19 \x01(\x08\x12\x43\n\nlogit_bias\x18\x1a \x03(\x0b\x32/.vllm.grpc.engine.SamplingParams.LogitBiasEntry\x12#\n\x16truncate_prompt_tokens\x18\x1b \x01(\x05H\x05\x88\x01\x01\x12\x15\n\x0bjson_schema\x18\x10 \x01(\tH\x00\x12\x0f\n\x05regex\x18\x11 \x01(\tH\x00\x12\x11\n\x07grammar\x18\x12 \x01(\tH\x00\x12\x18\n\x0estructural_tag\x18\x13 \x01(\tH\x00\x12\x15\n\x0bjson_object\x18\x14 \x01(\x08H\x00\x12\x34\n\x06\x63hoice\x18\x15 \x01(\x0b\x32\".vllm.grpc.engine.ChoiceConstraintH\x00\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraintB\r\n\x0b_max_tokensB\x0b\n\t_logprobsB\x12\n\x10_prompt_logprobsB\x07\n\x05_seedB\x19\n\x17_truncate_prompt_tokens\"#\n\x10\x43hoiceConstraint\x12\x0f\n\x07\x63hoices\x18\x01 \x03(\t\":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\r\"\xa5\x01\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\x12\x39\n\x0fsampling_params\x18\x03 \x01(\x0b\x32 .vllm.grpc.engine.SamplingParams\x12\x0e\n\x06stream\x18\x04 \x01(\x08\"\xd4\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x36\n\x05\x63hunk\x18\x02 \x01(\x0b\x32%.vllm.grpc.engine.GenerateStreamChunkH\x00\x12\x36\n\x08\x63omplete\x18\x03 \x01(\x0b\x32\".vllm.grpc.engine.GenerateCompleteH\x00\x12\x30\n\x05\x65rror\x18\x04 \x01(\x0b\x32\x1f.vllm.grpc.engine.GenerateErrorH\x00\x42\n\n\x08response\"q\n\x13GenerateStreamChunk\x12\x11\n\ttoken_ids\x18\x01 \x03(\r\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x03 \x01(\x05\x12\x15\n\rcached_tokens\x18\x04 \x01(\x05\"\x86\x01\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\r\x12\x15\n\rfinish_reason\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05\"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"W\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\"\x93\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\x08\x63omplete\x18\x02 \x01(\x0b\x32\x1f.vllm.grpc.engine.EmbedCompleteH\x00\x12-\n\x05\x65rror\x18\x03 
\x01(\x0b\x32\x1c.vllm.grpc.engine.EmbedErrorH\x00\x42\n\n\x08response\"P\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rembedding_dim\x18\x03 \x01(\x05\"+\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t\"\x14\n\x12HealthCheckRequest\"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t\"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"\x15\n\x13GetModelInfoRequest\"\x8a\x01\n\x14GetModelInfoResponse\x12\x12\n\nmodel_path\x18\x01 \x01(\t\x12\x15\n\ris_generation\x18\x02 \x01(\x08\x12\x1a\n\x12max_context_length\x18\x03 \x01(\x05\x12\x12\n\nvocab_size\x18\x04 \x01(\x05\x12\x17\n\x0fsupports_vision\x18\x05 \x01(\x08\"\x16\n\x14GetServerInfoRequest\"\x90\x01\n\x15GetServerInfoResponse\x12\x17\n\x0f\x61\x63tive_requests\x18\x01 \x01(\x05\x12\x11\n\tis_paused\x18\x02 \x01(\x08\x12\x1e\n\x16last_receive_timestamp\x18\x03 \x01(\x01\x12\x16\n\x0euptime_seconds\x18\x04 \x01(\x01\x12\x13\n\x0bserver_type\x18\x05 \x01(\t2\x92\x04\n\nVllmEngine\x12S\n\x08Generate\x12!.vllm.grpc.engine.GenerateRequest\x1a\".vllm.grpc.engine.GenerateResponse0\x01\x12H\n\x05\x45mbed\x12\x1e.vllm.grpc.engine.EmbedRequest\x1a\x1f.vllm.grpc.engine.EmbedResponse\x12Z\n\x0bHealthCheck\x12$.vllm.grpc.engine.HealthCheckRequest\x1a%.vllm.grpc.engine.HealthCheckResponse\x12H\n\x05\x41\x62ort\x12\x1e.vllm.grpc.engine.AbortRequest\x1a\x1f.vllm.grpc.engine.AbortResponse\x12]\n\x0cGetModelInfo\x12%.vllm.grpc.engine.GetModelInfoRequest\x1a&.vllm.grpc.engine.GetModelInfoResponse\x12`\n\rGetServerInfo\x12&.vllm.grpc.engine.GetServerInfoRequest\x1a\'.vllm.grpc.engine.GetServerInfoResponseb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x1bvllm/grpc/vllm_engine.proto\x12\x10vllm.grpc.engine"\xe5\x06\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x17\n\nmax_tokens\x18\x08 \x01(\x05H\x01\x88\x01\x01\x12\x12\n\nmin_tokens\x18\t \x01(\x05\x12\x0c\n\x04stop\x18\n \x03(\t\x12\x16\n\x0estop_token_ids\x18\x0b \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x0c \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\r \x01(\x08\x12\x12\n\nignore_eos\x18\x0e \x01(\x08\x12\t\n\x01n\x18\x0f \x01(\x05\x12\x15\n\x08logprobs\x18\x16 \x01(\x05H\x02\x88\x01\x01\x12\x1c\n\x0fprompt_logprobs\x18\x17 \x01(\x05H\x03\x88\x01\x01\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x04\x88\x01\x01\x12"\n\x1ainclude_stop_str_in_output\x18\x19 \x01(\x08\x12\x43\n\nlogit_bias\x18\x1a \x03(\x0b\x32/.vllm.grpc.engine.SamplingParams.LogitBiasEntry\x12#\n\x16truncate_prompt_tokens\x18\x1b \x01(\x05H\x05\x88\x01\x01\x12\x15\n\x0bjson_schema\x18\x10 \x01(\tH\x00\x12\x0f\n\x05regex\x18\x11 \x01(\tH\x00\x12\x11\n\x07grammar\x18\x12 \x01(\tH\x00\x12\x18\n\x0estructural_tag\x18\x13 \x01(\tH\x00\x12\x15\n\x0bjson_object\x18\x14 \x01(\x08H\x00\x12\x34\n\x06\x63hoice\x18\x15 \x01(\x0b\x32".vllm.grpc.engine.ChoiceConstraintH\x00\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 
\x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraintB\r\n\x0b_max_tokensB\x0b\n\t_logprobsB\x12\n\x10_prompt_logprobsB\x07\n\x05_seedB\x19\n\x17_truncate_prompt_tokens"#\n\x10\x43hoiceConstraint\x12\x0f\n\x07\x63hoices\x18\x01 \x03(\t":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\r"\xa5\x01\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\x12\x39\n\x0fsampling_params\x18\x03 \x01(\x0b\x32 .vllm.grpc.engine.SamplingParams\x12\x0e\n\x06stream\x18\x04 \x01(\x08"\xd4\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x36\n\x05\x63hunk\x18\x02 \x01(\x0b\x32%.vllm.grpc.engine.GenerateStreamChunkH\x00\x12\x36\n\x08\x63omplete\x18\x03 \x01(\x0b\x32".vllm.grpc.engine.GenerateCompleteH\x00\x12\x30\n\x05\x65rror\x18\x04 \x01(\x0b\x32\x1f.vllm.grpc.engine.GenerateErrorH\x00\x42\n\n\x08response"q\n\x13GenerateStreamChunk\x12\x11\n\ttoken_ids\x18\x01 \x03(\r\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x03 \x01(\x05\x12\x15\n\rcached_tokens\x18\x04 \x01(\x05"\x86\x01\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\r\x12\x15\n\rfinish_reason\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t"W\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput"\x93\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\x08\x63omplete\x18\x02 \x01(\x0b\x32\x1f.vllm.grpc.engine.EmbedCompleteH\x00\x12-\n\x05\x65rror\x18\x03 \x01(\x0b\x32\x1c.vllm.grpc.engine.EmbedErrorH\x00\x42\n\n\x08response"P\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rembedding_dim\x18\x03 \x01(\x05"+\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t"\x14\n\x12HealthCheckRequest"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"\x15\n\x13GetModelInfoRequest"\x8a\x01\n\x14GetModelInfoResponse\x12\x12\n\nmodel_path\x18\x01 \x01(\t\x12\x15\n\ris_generation\x18\x02 \x01(\x08\x12\x1a\n\x12max_context_length\x18\x03 \x01(\x05\x12\x12\n\nvocab_size\x18\x04 \x01(\x05\x12\x17\n\x0fsupports_vision\x18\x05 \x01(\x08"\x16\n\x14GetServerInfoRequest"\x90\x01\n\x15GetServerInfoResponse\x12\x17\n\x0f\x61\x63tive_requests\x18\x01 \x01(\x05\x12\x11\n\tis_paused\x18\x02 \x01(\x08\x12\x1e\n\x16last_receive_timestamp\x18\x03 \x01(\x01\x12\x16\n\x0euptime_seconds\x18\x04 \x01(\x01\x12\x13\n\x0bserver_type\x18\x05 
\x01(\t2\x92\x04\n\nVllmEngine\x12S\n\x08Generate\x12!.vllm.grpc.engine.GenerateRequest\x1a".vllm.grpc.engine.GenerateResponse0\x01\x12H\n\x05\x45mbed\x12\x1e.vllm.grpc.engine.EmbedRequest\x1a\x1f.vllm.grpc.engine.EmbedResponse\x12Z\n\x0bHealthCheck\x12$.vllm.grpc.engine.HealthCheckRequest\x1a%.vllm.grpc.engine.HealthCheckResponse\x12H\n\x05\x41\x62ort\x12\x1e.vllm.grpc.engine.AbortRequest\x1a\x1f.vllm.grpc.engine.AbortResponse\x12]\n\x0cGetModelInfo\x12%.vllm.grpc.engine.GetModelInfoRequest\x1a&.vllm.grpc.engine.GetModelInfoResponse\x12`\n\rGetServerInfo\x12&.vllm.grpc.engine.GetServerInfoRequest\x1a\'.vllm.grpc.engine.GetServerInfoResponseb\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'vllm.grpc.vllm_engine_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages( + DESCRIPTOR, "vllm.grpc.vllm_engine_pb2", _globals +) if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._loaded_options = None - _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_options = b'8\001' - _globals['_SAMPLINGPARAMS']._serialized_start=50 - _globals['_SAMPLINGPARAMS']._serialized_end=919 - _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_start=773 - _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_end=821 - _globals['_CHOICECONSTRAINT']._serialized_start=921 - _globals['_CHOICECONSTRAINT']._serialized_end=956 - _globals['_TOKENIZEDINPUT']._serialized_start=958 - _globals['_TOKENIZEDINPUT']._serialized_end=1016 - _globals['_GENERATEREQUEST']._serialized_start=1019 - _globals['_GENERATEREQUEST']._serialized_end=1184 - _globals['_GENERATERESPONSE']._serialized_start=1187 - _globals['_GENERATERESPONSE']._serialized_end=1399 - _globals['_GENERATESTREAMCHUNK']._serialized_start=1401 - _globals['_GENERATESTREAMCHUNK']._serialized_end=1514 - _globals['_GENERATECOMPLETE']._serialized_start=1517 - _globals['_GENERATECOMPLETE']._serialized_end=1651 - _globals['_GENERATEERROR']._serialized_start=1653 - _globals['_GENERATEERROR']._serialized_end=1728 - _globals['_EMBEDREQUEST']._serialized_start=1730 - _globals['_EMBEDREQUEST']._serialized_end=1817 - _globals['_EMBEDRESPONSE']._serialized_start=1820 - _globals['_EMBEDRESPONSE']._serialized_end=1967 - _globals['_EMBEDCOMPLETE']._serialized_start=1969 - _globals['_EMBEDCOMPLETE']._serialized_end=2049 - _globals['_EMBEDERROR']._serialized_start=2051 - _globals['_EMBEDERROR']._serialized_end=2094 - _globals['_HEALTHCHECKREQUEST']._serialized_start=2096 - _globals['_HEALTHCHECKREQUEST']._serialized_end=2116 - _globals['_HEALTHCHECKRESPONSE']._serialized_start=2118 - _globals['_HEALTHCHECKRESPONSE']._serialized_end=2173 - _globals['_ABORTREQUEST']._serialized_start=2175 - _globals['_ABORTREQUEST']._serialized_end=2225 - _globals['_ABORTRESPONSE']._serialized_start=2227 - _globals['_ABORTRESPONSE']._serialized_end=2276 - _globals['_GETMODELINFOREQUEST']._serialized_start=2278 - _globals['_GETMODELINFOREQUEST']._serialized_end=2299 - _globals['_GETMODELINFORESPONSE']._serialized_start=2302 - _globals['_GETMODELINFORESPONSE']._serialized_end=2440 - _globals['_GETSERVERINFOREQUEST']._serialized_start=2442 - _globals['_GETSERVERINFOREQUEST']._serialized_end=2464 - _globals['_GETSERVERINFORESPONSE']._serialized_start=2467 - _globals['_GETSERVERINFORESPONSE']._serialized_end=2611 - _globals['_VLLMENGINE']._serialized_start=2614 - _globals['_VLLMENGINE']._serialized_end=3144 + DESCRIPTOR._loaded_options = 
None + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._loaded_options = None + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_options = b"8\001" + _globals["_SAMPLINGPARAMS"]._serialized_start = 50 + _globals["_SAMPLINGPARAMS"]._serialized_end = 919 + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_start = 773 + _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_end = 821 + _globals["_CHOICECONSTRAINT"]._serialized_start = 921 + _globals["_CHOICECONSTRAINT"]._serialized_end = 956 + _globals["_TOKENIZEDINPUT"]._serialized_start = 958 + _globals["_TOKENIZEDINPUT"]._serialized_end = 1016 + _globals["_GENERATEREQUEST"]._serialized_start = 1019 + _globals["_GENERATEREQUEST"]._serialized_end = 1184 + _globals["_GENERATERESPONSE"]._serialized_start = 1187 + _globals["_GENERATERESPONSE"]._serialized_end = 1399 + _globals["_GENERATESTREAMCHUNK"]._serialized_start = 1401 + _globals["_GENERATESTREAMCHUNK"]._serialized_end = 1514 + _globals["_GENERATECOMPLETE"]._serialized_start = 1517 + _globals["_GENERATECOMPLETE"]._serialized_end = 1651 + _globals["_GENERATEERROR"]._serialized_start = 1653 + _globals["_GENERATEERROR"]._serialized_end = 1728 + _globals["_EMBEDREQUEST"]._serialized_start = 1730 + _globals["_EMBEDREQUEST"]._serialized_end = 1817 + _globals["_EMBEDRESPONSE"]._serialized_start = 1820 + _globals["_EMBEDRESPONSE"]._serialized_end = 1967 + _globals["_EMBEDCOMPLETE"]._serialized_start = 1969 + _globals["_EMBEDCOMPLETE"]._serialized_end = 2049 + _globals["_EMBEDERROR"]._serialized_start = 2051 + _globals["_EMBEDERROR"]._serialized_end = 2094 + _globals["_HEALTHCHECKREQUEST"]._serialized_start = 2096 + _globals["_HEALTHCHECKREQUEST"]._serialized_end = 2116 + _globals["_HEALTHCHECKRESPONSE"]._serialized_start = 2118 + _globals["_HEALTHCHECKRESPONSE"]._serialized_end = 2173 + _globals["_ABORTREQUEST"]._serialized_start = 2175 + _globals["_ABORTREQUEST"]._serialized_end = 2225 + _globals["_ABORTRESPONSE"]._serialized_start = 2227 + _globals["_ABORTRESPONSE"]._serialized_end = 2276 + _globals["_GETMODELINFOREQUEST"]._serialized_start = 2278 + _globals["_GETMODELINFOREQUEST"]._serialized_end = 2299 + _globals["_GETMODELINFORESPONSE"]._serialized_start = 2302 + _globals["_GETMODELINFORESPONSE"]._serialized_end = 2440 + _globals["_GETSERVERINFOREQUEST"]._serialized_start = 2442 + _globals["_GETSERVERINFOREQUEST"]._serialized_end = 2464 + _globals["_GETSERVERINFORESPONSE"]._serialized_start = 2467 + _globals["_GETSERVERINFORESPONSE"]._serialized_end = 2611 + _globals["_VLLMENGINE"]._serialized_start = 2614 + _globals["_VLLMENGINE"]._serialized_end = 3144 # @@protoc_insertion_point(module_scope) diff --git a/vllm/grpc/vllm_engine_pb2.pyi b/vllm/grpc/vllm_engine_pb2.pyi index 3b125d210d17..e7a0045ac785 100644 --- a/vllm/grpc/vllm_engine_pb2.pyi +++ b/vllm/grpc/vllm_engine_pb2.pyi @@ -1,22 +1,55 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from google.protobuf.internal import containers as _containers +from collections.abc import Iterable as _Iterable +from collections.abc import Mapping as _Mapping +from typing import ClassVar as _ClassVar + from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message -from collections.abc import Iterable as _Iterable, Mapping as _Mapping -from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union +from google.protobuf.internal import containers as _containers DESCRIPTOR: _descriptor.FileDescriptor class 
SamplingParams(_message.Message): - __slots__ = ("temperature", "top_p", "top_k", "min_p", "frequency_penalty", "presence_penalty", "repetition_penalty", "max_tokens", "min_tokens", "stop", "stop_token_ids", "skip_special_tokens", "spaces_between_special_tokens", "ignore_eos", "n", "logprobs", "prompt_logprobs", "seed", "include_stop_str_in_output", "logit_bias", "truncate_prompt_tokens", "json_schema", "regex", "grammar", "structural_tag", "json_object", "choice") + __slots__ = ( + "temperature", + "top_p", + "top_k", + "min_p", + "frequency_penalty", + "presence_penalty", + "repetition_penalty", + "max_tokens", + "min_tokens", + "stop", + "stop_token_ids", + "skip_special_tokens", + "spaces_between_special_tokens", + "ignore_eos", + "n", + "logprobs", + "prompt_logprobs", + "seed", + "include_stop_str_in_output", + "logit_bias", + "truncate_prompt_tokens", + "json_schema", + "regex", + "grammar", + "structural_tag", + "json_object", + "choice", + ) class LogitBiasEntry(_message.Message): __slots__ = ("key", "value") KEY_FIELD_NUMBER: _ClassVar[int] VALUE_FIELD_NUMBER: _ClassVar[int] key: int value: float - def __init__(self, key: _Optional[int] = ..., value: _Optional[float] = ...) -> None: ... + def __init__( + self, key: int | None = ..., value: float | None = ... + ) -> None: ... + TEMPERATURE_FIELD_NUMBER: _ClassVar[int] TOP_P_FIELD_NUMBER: _ClassVar[int] TOP_K_FIELD_NUMBER: _ClassVar[int] @@ -71,13 +104,42 @@ class SamplingParams(_message.Message): structural_tag: str json_object: bool choice: ChoiceConstraint - def __init__(self, temperature: _Optional[float] = ..., top_p: _Optional[float] = ..., top_k: _Optional[int] = ..., min_p: _Optional[float] = ..., frequency_penalty: _Optional[float] = ..., presence_penalty: _Optional[float] = ..., repetition_penalty: _Optional[float] = ..., max_tokens: _Optional[int] = ..., min_tokens: _Optional[int] = ..., stop: _Optional[_Iterable[str]] = ..., stop_token_ids: _Optional[_Iterable[int]] = ..., skip_special_tokens: bool = ..., spaces_between_special_tokens: bool = ..., ignore_eos: bool = ..., n: _Optional[int] = ..., logprobs: _Optional[int] = ..., prompt_logprobs: _Optional[int] = ..., seed: _Optional[int] = ..., include_stop_str_in_output: bool = ..., logit_bias: _Optional[_Mapping[int, float]] = ..., truncate_prompt_tokens: _Optional[int] = ..., json_schema: _Optional[str] = ..., regex: _Optional[str] = ..., grammar: _Optional[str] = ..., structural_tag: _Optional[str] = ..., json_object: bool = ..., choice: _Optional[_Union[ChoiceConstraint, _Mapping]] = ...) -> None: ... 
+ def __init__( + self, + temperature: float | None = ..., + top_p: float | None = ..., + top_k: int | None = ..., + min_p: float | None = ..., + frequency_penalty: float | None = ..., + presence_penalty: float | None = ..., + repetition_penalty: float | None = ..., + max_tokens: int | None = ..., + min_tokens: int | None = ..., + stop: _Iterable[str] | None = ..., + stop_token_ids: _Iterable[int] | None = ..., + skip_special_tokens: bool = ..., + spaces_between_special_tokens: bool = ..., + ignore_eos: bool = ..., + n: int | None = ..., + logprobs: int | None = ..., + prompt_logprobs: int | None = ..., + seed: int | None = ..., + include_stop_str_in_output: bool = ..., + logit_bias: _Mapping[int, float] | None = ..., + truncate_prompt_tokens: int | None = ..., + json_schema: str | None = ..., + regex: str | None = ..., + grammar: str | None = ..., + structural_tag: str | None = ..., + json_object: bool = ..., + choice: ChoiceConstraint | _Mapping | None = ..., + ) -> None: ... class ChoiceConstraint(_message.Message): __slots__ = ("choices",) CHOICES_FIELD_NUMBER: _ClassVar[int] choices: _containers.RepeatedScalarFieldContainer[str] - def __init__(self, choices: _Optional[_Iterable[str]] = ...) -> None: ... + def __init__(self, choices: _Iterable[str] | None = ...) -> None: ... class TokenizedInput(_message.Message): __slots__ = ("original_text", "input_ids") @@ -85,7 +147,9 @@ class TokenizedInput(_message.Message): INPUT_IDS_FIELD_NUMBER: _ClassVar[int] original_text: str input_ids: _containers.RepeatedScalarFieldContainer[int] - def __init__(self, original_text: _Optional[str] = ..., input_ids: _Optional[_Iterable[int]] = ...) -> None: ... + def __init__( + self, original_text: str | None = ..., input_ids: _Iterable[int] | None = ... + ) -> None: ... class GenerateRequest(_message.Message): __slots__ = ("request_id", "tokenized", "sampling_params", "stream") @@ -97,7 +161,13 @@ class GenerateRequest(_message.Message): tokenized: TokenizedInput sampling_params: SamplingParams stream: bool - def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ..., sampling_params: _Optional[_Union[SamplingParams, _Mapping]] = ..., stream: bool = ...) -> None: ... + def __init__( + self, + request_id: str | None = ..., + tokenized: TokenizedInput | _Mapping | None = ..., + sampling_params: SamplingParams | _Mapping | None = ..., + stream: bool = ..., + ) -> None: ... class GenerateResponse(_message.Message): __slots__ = ("request_id", "chunk", "complete", "error") @@ -109,7 +179,13 @@ class GenerateResponse(_message.Message): chunk: GenerateStreamChunk complete: GenerateComplete error: GenerateError - def __init__(self, request_id: _Optional[str] = ..., chunk: _Optional[_Union[GenerateStreamChunk, _Mapping]] = ..., complete: _Optional[_Union[GenerateComplete, _Mapping]] = ..., error: _Optional[_Union[GenerateError, _Mapping]] = ...) -> None: ... + def __init__( + self, + request_id: str | None = ..., + chunk: GenerateStreamChunk | _Mapping | None = ..., + complete: GenerateComplete | _Mapping | None = ..., + error: GenerateError | _Mapping | None = ..., + ) -> None: ... 
class GenerateStreamChunk(_message.Message): __slots__ = ("token_ids", "prompt_tokens", "completion_tokens", "cached_tokens") @@ -121,10 +197,22 @@ class GenerateStreamChunk(_message.Message): prompt_tokens: int completion_tokens: int cached_tokens: int - def __init__(self, token_ids: _Optional[_Iterable[int]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ...) -> None: ... + def __init__( + self, + token_ids: _Iterable[int] | None = ..., + prompt_tokens: int | None = ..., + completion_tokens: int | None = ..., + cached_tokens: int | None = ..., + ) -> None: ... class GenerateComplete(_message.Message): - __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens") + __slots__ = ( + "output_ids", + "finish_reason", + "prompt_tokens", + "completion_tokens", + "cached_tokens", + ) OUTPUT_IDS_FIELD_NUMBER: _ClassVar[int] FINISH_REASON_FIELD_NUMBER: _ClassVar[int] PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] @@ -135,7 +223,14 @@ class GenerateComplete(_message.Message): prompt_tokens: int completion_tokens: int cached_tokens: int - def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ...) -> None: ... + def __init__( + self, + output_ids: _Iterable[int] | None = ..., + finish_reason: str | None = ..., + prompt_tokens: int | None = ..., + completion_tokens: int | None = ..., + cached_tokens: int | None = ..., + ) -> None: ... class GenerateError(_message.Message): __slots__ = ("message", "http_status_code", "details") @@ -145,7 +240,12 @@ class GenerateError(_message.Message): message: str http_status_code: str details: str - def __init__(self, message: _Optional[str] = ..., http_status_code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ... + def __init__( + self, + message: str | None = ..., + http_status_code: str | None = ..., + details: str | None = ..., + ) -> None: ... class EmbedRequest(_message.Message): __slots__ = ("request_id", "tokenized") @@ -153,7 +253,11 @@ class EmbedRequest(_message.Message): TOKENIZED_FIELD_NUMBER: _ClassVar[int] request_id: str tokenized: TokenizedInput - def __init__(self, request_id: _Optional[str] = ..., tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ...) -> None: ... + def __init__( + self, + request_id: str | None = ..., + tokenized: TokenizedInput | _Mapping | None = ..., + ) -> None: ... class EmbedResponse(_message.Message): __slots__ = ("request_id", "complete", "error") @@ -163,7 +267,12 @@ class EmbedResponse(_message.Message): request_id: str complete: EmbedComplete error: EmbedError - def __init__(self, request_id: _Optional[str] = ..., complete: _Optional[_Union[EmbedComplete, _Mapping]] = ..., error: _Optional[_Union[EmbedError, _Mapping]] = ...) -> None: ... + def __init__( + self, + request_id: str | None = ..., + complete: EmbedComplete | _Mapping | None = ..., + error: EmbedError | _Mapping | None = ..., + ) -> None: ... class EmbedComplete(_message.Message): __slots__ = ("embedding", "prompt_tokens", "embedding_dim") @@ -173,7 +282,12 @@ class EmbedComplete(_message.Message): embedding: _containers.RepeatedScalarFieldContainer[float] prompt_tokens: int embedding_dim: int - def __init__(self, embedding: _Optional[_Iterable[float]] = ..., prompt_tokens: _Optional[int] = ..., embedding_dim: _Optional[int] = ...) -> None: ... 
+ def __init__( + self, + embedding: _Iterable[float] | None = ..., + prompt_tokens: int | None = ..., + embedding_dim: int | None = ..., + ) -> None: ... class EmbedError(_message.Message): __slots__ = ("message", "code") @@ -181,7 +295,7 @@ class EmbedError(_message.Message): CODE_FIELD_NUMBER: _ClassVar[int] message: str code: str - def __init__(self, message: _Optional[str] = ..., code: _Optional[str] = ...) -> None: ... + def __init__(self, message: str | None = ..., code: str | None = ...) -> None: ... class HealthCheckRequest(_message.Message): __slots__ = () @@ -193,7 +307,7 @@ class HealthCheckResponse(_message.Message): MESSAGE_FIELD_NUMBER: _ClassVar[int] healthy: bool message: str - def __init__(self, healthy: bool = ..., message: _Optional[str] = ...) -> None: ... + def __init__(self, healthy: bool = ..., message: str | None = ...) -> None: ... class AbortRequest(_message.Message): __slots__ = ("request_id", "reason") @@ -201,7 +315,9 @@ class AbortRequest(_message.Message): REASON_FIELD_NUMBER: _ClassVar[int] request_id: str reason: str - def __init__(self, request_id: _Optional[str] = ..., reason: _Optional[str] = ...) -> None: ... + def __init__( + self, request_id: str | None = ..., reason: str | None = ... + ) -> None: ... class AbortResponse(_message.Message): __slots__ = ("success", "message") @@ -209,14 +325,20 @@ class AbortResponse(_message.Message): MESSAGE_FIELD_NUMBER: _ClassVar[int] success: bool message: str - def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ... + def __init__(self, success: bool = ..., message: str | None = ...) -> None: ... class GetModelInfoRequest(_message.Message): __slots__ = () def __init__(self) -> None: ... class GetModelInfoResponse(_message.Message): - __slots__ = ("model_path", "is_generation", "max_context_length", "vocab_size", "supports_vision") + __slots__ = ( + "model_path", + "is_generation", + "max_context_length", + "vocab_size", + "supports_vision", + ) MODEL_PATH_FIELD_NUMBER: _ClassVar[int] IS_GENERATION_FIELD_NUMBER: _ClassVar[int] MAX_CONTEXT_LENGTH_FIELD_NUMBER: _ClassVar[int] @@ -227,14 +349,27 @@ class GetModelInfoResponse(_message.Message): max_context_length: int vocab_size: int supports_vision: bool - def __init__(self, model_path: _Optional[str] = ..., is_generation: bool = ..., max_context_length: _Optional[int] = ..., vocab_size: _Optional[int] = ..., supports_vision: bool = ...) -> None: ... + def __init__( + self, + model_path: str | None = ..., + is_generation: bool = ..., + max_context_length: int | None = ..., + vocab_size: int | None = ..., + supports_vision: bool = ..., + ) -> None: ... class GetServerInfoRequest(_message.Message): __slots__ = () def __init__(self) -> None: ... class GetServerInfoResponse(_message.Message): - __slots__ = ("active_requests", "is_paused", "last_receive_timestamp", "uptime_seconds", "server_type") + __slots__ = ( + "active_requests", + "is_paused", + "last_receive_timestamp", + "uptime_seconds", + "server_type", + ) ACTIVE_REQUESTS_FIELD_NUMBER: _ClassVar[int] IS_PAUSED_FIELD_NUMBER: _ClassVar[int] LAST_RECEIVE_TIMESTAMP_FIELD_NUMBER: _ClassVar[int] @@ -245,4 +380,11 @@ class GetServerInfoResponse(_message.Message): last_receive_timestamp: float uptime_seconds: float server_type: str - def __init__(self, active_requests: _Optional[int] = ..., is_paused: bool = ..., last_receive_timestamp: _Optional[float] = ..., uptime_seconds: _Optional[float] = ..., server_type: _Optional[str] = ...) -> None: ... 
+ def __init__( + self, + active_requests: int | None = ..., + is_paused: bool = ..., + last_receive_timestamp: float | None = ..., + uptime_seconds: float | None = ..., + server_type: str | None = ..., + ) -> None: ... diff --git a/vllm/grpc/vllm_engine_pb2_grpc.py b/vllm/grpc/vllm_engine_pb2_grpc.py index 219f78ef7a62..2e3b0fbfbc88 100644 --- a/vllm/grpc/vllm_engine_pb2_grpc.py +++ b/vllm/grpc/vllm_engine_pb2_grpc.py @@ -2,28 +2,32 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" + import grpc import warnings from vllm.grpc import vllm_engine_pb2 as vllm_dot_grpc_dot_vllm__engine__pb2 -GRPC_GENERATED_VERSION = '1.75.1' +GRPC_GENERATED_VERSION = "1.75.1" GRPC_VERSION = grpc.__version__ _version_not_supported = False try: from grpc._utilities import first_version_is_lower - _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) + + _version_not_supported = first_version_is_lower( + GRPC_VERSION, GRPC_GENERATED_VERSION + ) except ImportError: _version_not_supported = True if _version_not_supported: raise RuntimeError( - f'The grpc package installed is at version {GRPC_VERSION},' - + f' but the generated code in vllm/grpc/vllm_engine_pb2_grpc.py depends on' - + f' grpcio>={GRPC_GENERATED_VERSION}.' - + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' - + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' + f"The grpc package installed is at version {GRPC_VERSION}," + + f" but the generated code in vllm/grpc/vllm_engine_pb2_grpc.py depends on" + + f" grpcio>={GRPC_GENERATED_VERSION}." + + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" + + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." ) @@ -40,35 +44,41 @@ def __init__(self, channel): channel: A grpc.Channel. 
""" self.Generate = channel.unary_stream( - '/vllm.grpc.engine.VllmEngine/Generate', - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, - _registered_method=True) + "/vllm.grpc.engine.VllmEngine/Generate", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, + _registered_method=True, + ) self.Embed = channel.unary_unary( - '/vllm.grpc.engine.VllmEngine/Embed', - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, - _registered_method=True) + "/vllm.grpc.engine.VllmEngine/Embed", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, + _registered_method=True, + ) self.HealthCheck = channel.unary_unary( - '/vllm.grpc.engine.VllmEngine/HealthCheck', - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, - _registered_method=True) + "/vllm.grpc.engine.VllmEngine/HealthCheck", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, + _registered_method=True, + ) self.Abort = channel.unary_unary( - '/vllm.grpc.engine.VllmEngine/Abort', - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, - _registered_method=True) + "/vllm.grpc.engine.VllmEngine/Abort", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, + _registered_method=True, + ) self.GetModelInfo = channel.unary_unary( - '/vllm.grpc.engine.VllmEngine/GetModelInfo', - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, - _registered_method=True) + "/vllm.grpc.engine.VllmEngine/GetModelInfo", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, + _registered_method=True, + ) self.GetServerInfo = channel.unary_unary( - '/vllm.grpc.engine.VllmEngine/GetServerInfo', - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, - _registered_method=True) + "/vllm.grpc.engine.VllmEngine/GetServerInfo", + request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, + response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, + _registered_method=True, + ) class VllmEngineServicer(object): @@ -78,88 +88,85 @@ class VllmEngineServicer(object): """ def Generate(self, request, context): - """Submit a generation request (supports streaming) - """ + """Submit a generation request (supports streaming)""" 
context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def Embed(self, request, context): - """Submit an embedding request - """ + """Submit an embedding request""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def HealthCheck(self, request, context): - """Health check - """ + """Health check""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def Abort(self, request, context): - """Abort a running request - """ + """Abort a running request""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def GetModelInfo(self, request, context): - """Get model information - """ + """Get model information""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def GetServerInfo(self, request, context): - """Get server information - """ + """Get server information""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_VllmEngineServicer_to_server(servicer, server): rpc_method_handlers = { - 'Generate': grpc.unary_stream_rpc_method_handler( - servicer.Generate, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.SerializeToString, - ), - 'Embed': grpc.unary_unary_rpc_method_handler( - servicer.Embed, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.SerializeToString, - ), - 'HealthCheck': grpc.unary_unary_rpc_method_handler( - servicer.HealthCheck, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.SerializeToString, - ), - 'Abort': grpc.unary_unary_rpc_method_handler( - servicer.Abort, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.SerializeToString, - ), - 'GetModelInfo': grpc.unary_unary_rpc_method_handler( - servicer.GetModelInfo, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.SerializeToString, - ), - 'GetServerInfo': grpc.unary_unary_rpc_method_handler( - servicer.GetServerInfo, - 
request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.SerializeToString, - ), + "Generate": grpc.unary_stream_rpc_method_handler( + servicer.Generate, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.SerializeToString, + ), + "Embed": grpc.unary_unary_rpc_method_handler( + servicer.Embed, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.SerializeToString, + ), + "HealthCheck": grpc.unary_unary_rpc_method_handler( + servicer.HealthCheck, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.SerializeToString, + ), + "Abort": grpc.unary_unary_rpc_method_handler( + servicer.Abort, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.SerializeToString, + ), + "GetModelInfo": grpc.unary_unary_rpc_method_handler( + servicer.GetModelInfo, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.SerializeToString, + ), + "GetServerInfo": grpc.unary_unary_rpc_method_handler( + servicer.GetServerInfo, + request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.FromString, + response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'vllm.grpc.engine.VllmEngine', rpc_method_handlers) + "vllm.grpc.engine.VllmEngine", rpc_method_handlers + ) server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers('vllm.grpc.engine.VllmEngine', rpc_method_handlers) + server.add_registered_method_handlers( + "vllm.grpc.engine.VllmEngine", rpc_method_handlers + ) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class VllmEngine(object): """Service definition for vLLM engine communication This protocol is designed for efficient binary communication between @@ -167,20 +174,22 @@ class VllmEngine(object): """ @staticmethod - def Generate(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): + def Generate( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): return grpc.experimental.unary_stream( request, target, - '/vllm.grpc.engine.VllmEngine/Generate', + "/vllm.grpc.engine.VllmEngine/Generate", vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, options, @@ -191,23 +200,26 @@ def Generate(request, wait_for_ready, timeout, metadata, - _registered_method=True) + _registered_method=True, + ) @staticmethod - def Embed(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): + def Embed( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): return grpc.experimental.unary_unary( request, target, - '/vllm.grpc.engine.VllmEngine/Embed', + "/vllm.grpc.engine.VllmEngine/Embed", vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, options, @@ -218,23 +230,26 @@ def Embed(request, wait_for_ready, timeout, metadata, - _registered_method=True) + _registered_method=True, + ) @staticmethod - def HealthCheck(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): + def HealthCheck( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): return grpc.experimental.unary_unary( request, target, - '/vllm.grpc.engine.VllmEngine/HealthCheck', + "/vllm.grpc.engine.VllmEngine/HealthCheck", vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, options, @@ -245,23 +260,26 @@ def HealthCheck(request, wait_for_ready, timeout, metadata, - _registered_method=True) + _registered_method=True, + ) @staticmethod - def Abort(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): + def Abort( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): return grpc.experimental.unary_unary( request, target, - '/vllm.grpc.engine.VllmEngine/Abort', + "/vllm.grpc.engine.VllmEngine/Abort", vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, options, @@ -272,23 +290,26 @@ def Abort(request, wait_for_ready, timeout, metadata, - _registered_method=True) + _registered_method=True, + ) @staticmethod - def 
GetModelInfo(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): + def GetModelInfo( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): return grpc.experimental.unary_unary( request, target, - '/vllm.grpc.engine.VllmEngine/GetModelInfo', + "/vllm.grpc.engine.VllmEngine/GetModelInfo", vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, options, @@ -299,23 +320,26 @@ def GetModelInfo(request, wait_for_ready, timeout, metadata, - _registered_method=True) + _registered_method=True, + ) @staticmethod - def GetServerInfo(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): + def GetServerInfo( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): return grpc.experimental.unary_unary( request, target, - '/vllm.grpc.engine.VllmEngine/GetServerInfo', + "/vllm.grpc.engine.VllmEngine/GetServerInfo", vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, options, @@ -326,4 +350,5 @@ def GetServerInfo(request, wait_for_ready, timeout, metadata, - _registered_method=True) + _registered_method=True, + ) From d5b274120aa66eb78ee93121bfa3c5f41394b8e7 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sat, 6 Dec 2025 13:35:01 -0800 Subject: [PATCH 06/27] Add pyi in pyproject.toml Signed-off-by: Chang Su --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 339b5adb3b53..20061604ea2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ include = ["vllm*"] # Exclude generated protobuf files "vllm/grpc/*_pb2.py" = ["ALL"] "vllm/grpc/*_pb2_grpc.py" = ["ALL"] +"vllm/grpc/*_pb2.pyi" = ["ALL"] [tool.ruff.lint] select = [ From 8421d59988323843fcc331cba7705b4c34131bb5 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sat, 6 Dec 2025 13:40:44 -0800 Subject: [PATCH 07/27] Add mypy ignores to all generated grpc stubs Signed-off-by: Chang Su --- vllm/grpc/compile_protos.py | 4 +++- vllm/grpc/vllm_engine_pb2.py | 1 + vllm/grpc/vllm_engine_pb2.pyi | 1 + vllm/grpc/vllm_engine_pb2_grpc.py | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/grpc/compile_protos.py b/vllm/grpc/compile_protos.py index b32da4abcf8d..0cf6e7e76e10 100755 --- a/vllm/grpc/compile_protos.py +++ b/vllm/grpc/compile_protos.py @@ -64,7 +64,9 @@ def compile_protos(): if generated_file.exists(): content = generated_file.read_text() if not content.startswith("# SPDX-License-Identifier"): - generated_file.write_text(spdx_header + content) + # Add mypy ignore-errors comment for all generated files + header = spdx_header + "# mypy: ignore-errors\n" + generated_file.write_text(header + content) print("✓ Protobuf compilation successful!") print(f" Generated: {script_dir / 'vllm_engine_pb2.py'}") diff --git a/vllm/grpc/vllm_engine_pb2.py b/vllm/grpc/vllm_engine_pb2.py index a12ff8b43300..487f39876733 100644 --- a/vllm/grpc/vllm_engine_pb2.py +++ b/vllm/grpc/vllm_engine_pb2.py @@ -1,5 +1,6 
@@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# mypy: ignore-errors # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # NO CHECKED-IN PROTOBUF GENCODE diff --git a/vllm/grpc/vllm_engine_pb2.pyi b/vllm/grpc/vllm_engine_pb2.pyi index e7a0045ac785..51cdd7a799cd 100644 --- a/vllm/grpc/vllm_engine_pb2.pyi +++ b/vllm/grpc/vllm_engine_pb2.pyi @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# mypy: ignore-errors from collections.abc import Iterable as _Iterable from collections.abc import Mapping as _Mapping from typing import ClassVar as _ClassVar diff --git a/vllm/grpc/vllm_engine_pb2_grpc.py b/vllm/grpc/vllm_engine_pb2_grpc.py index 2e3b0fbfbc88..9f6a39ecea15 100644 --- a/vllm/grpc/vllm_engine_pb2_grpc.py +++ b/vllm/grpc/vllm_engine_pb2_grpc.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# mypy: ignore-errors # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" From e11e45c78ac99b689eb639acb427d0c410dc87ab Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sat, 6 Dec 2025 13:56:07 -0800 Subject: [PATCH 08/27] Exclude grpc in api-autonav Signed-off-by: Chang Su --- mkdocs.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yaml b/mkdocs.yaml index 2532584e3444..c5501e7db0f0 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -80,6 +80,7 @@ plugins: - "re:vllm\\._.*" # Internal modules - "vllm.third_party" - "vllm.vllm_flash_attn" + - "re:vllm\\.grpc\\..*_pb2.*" # Auto-generated protobuf files - !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default - mkdocstrings: handlers: From 81db261a53b0ae613d5773687a394148115daf61 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 2 Jan 2026 22:30:07 -0800 Subject: [PATCH 09/27] remove code owner, replace logger.error with logger.exception Signed-off-by: Chang Su --- .github/CODEOWNERS | 4 ---- requirements/common.txt | 2 ++ vllm/entrypoints/grpc_server.py | 2 +- vllm/grpc/grpc_request_manager.py | 12 ++++++------ 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d74cc42eebc1..4d7a366f05e3 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -159,7 +159,3 @@ mkdocs.yaml @hmellor /docs/usage/security.md @russellb /SECURITY.md @russellb /docs/contributing/vulnerability_management.md @russellb - -# gRPC server -/vllm/grpc @CatherineSue @slin1237 -/vllm/entrypoints/grpc_server.py @CatherineSue @slin1237 diff --git a/requirements/common.txt b/requirements/common.txt index 43f4a8676d79..977d9f5165b5 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -52,3 +52,5 @@ openai-harmony >= 0.0.3 # Required for gpt-oss anthropic == 0.71.0 model-hosting-container-standards >= 0.1.10, < 1.0.0 mcp +grpcio>=1.75.1 +grpcio-reflection>=1.75.1 diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index f9ae1c510c72..808358e19450 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -126,7 +126,7 @@ async def Generate( yield self._complete_response(request_id, output) except Exception as e: - logger.error("Error in Generate for %s: %s", request_id, e) + logger.exception("Error in Generate for %s", request_id) yield self._error_response( request_id, str(e), diff --git 
a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py index fe7024f65c77..35997f2ee657 100644 --- a/vllm/grpc/grpc_request_manager.py +++ b/vllm/grpc/grpc_request_manager.py @@ -107,8 +107,8 @@ async def generate( await self.async_llm.abort([request_id]) raise # Re-raise to let gRPC server handle cleanup - except Exception as e: - logger.error("Error in generate for %s: %s", request_id, e) + except Exception: + logger.exception("Error in generate for %s", request_id) raise finally: # Cleanup @@ -149,7 +149,7 @@ async def _submit_request( await self.async_llm.engine_core.add_request_async(request) except Exception as e: - logger.error("Error submitting request %s: %s", request.request_id, e) + logger.exception("Error submitting request %s", request.request_id) # Put error in collector collector.put(e) @@ -183,8 +183,8 @@ async def abort(self, request_id: str) -> bool: logger.info("Request %s aborted.", request_id) return True - except Exception as e: - logger.error("Error aborting request %s: %s", request_id, e) + except Exception: + logger.exception("Error aborting request %s", request_id) self.rid_to_collector.pop(request_id, None) return False @@ -203,7 +203,7 @@ async def health_check(self) -> tuple[bool, str]: return True, "Healthy" except Exception as e: - logger.error("Health check error: %s", e) + logger.exception("Health check error") return False, f"Error: {e}" def get_model_config(self) -> dict: From df1898cd66d38724f399b2d0262473042dce27d0 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 2 Jan 2026 22:53:54 -0800 Subject: [PATCH 10/27] move grpc protobuf compilation to setup Signed-off-by: Chang Su --- .gitignore | 5 + pyproject.toml | 1 + requirements/build.txt | 1 + setup.py | 73 +++++- vllm/grpc/compile_protos.py | 4 + vllm/grpc/vllm_engine_pb2.py | 82 ------- vllm/grpc/vllm_engine_pb2.pyi | 391 ------------------------------ vllm/grpc/vllm_engine_pb2_grpc.py | 355 --------------------------- 8 files changed, 82 insertions(+), 830 deletions(-) delete mode 100644 vllm/grpc/vllm_engine_pb2.py delete mode 100644 vllm/grpc/vllm_engine_pb2.pyi delete mode 100644 vllm/grpc/vllm_engine_pb2_grpc.py diff --git a/.gitignore b/.gitignore index 7cda86478664..864542128c05 100644 --- a/.gitignore +++ b/.gitignore @@ -227,3 +227,8 @@ ep_kernels_workspace/ # Allow tracked library source folders under submodules (e.g., benchmarks/lib) !vllm/benchmarks/lib/ + +# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto) +vllm/grpc/vllm_engine_pb2.py +vllm/grpc/vllm_engine_pb2_grpc.py +vllm/grpc/vllm_engine_pb2.pyi diff --git a/pyproject.toml b/pyproject.toml index 20061604ea2c..ad6fdd1fd14c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ requires = [ "torch == 2.9.1", "wheel", "jinja2", + "grpcio-tools>=1.75.1", ] build-backend = "setuptools.build_meta" diff --git a/requirements/build.txt b/requirements/build.txt index 3756371638ba..8c1dad4a20f4 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -9,3 +9,4 @@ wheel jinja2>=3.1.6 regex build +grpcio-tools>=1.75.1 diff --git a/setup.py b/setup.py index 595397264283..34bce769359a 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ from packaging.version import Version, parse from setuptools import Extension, setup from setuptools.command.build_ext import build_ext +from setuptools.command.build_py import build_py from setuptools_scm import get_version from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME @@ -79,6 +80,73 @@ def is_freethreaded(): return 
bool(sysconfig.get_config_var("Py_GIL_DISABLED")) +def compile_grpc_protos(): + """Compile gRPC protobuf definitions during build. + + This generates *_pb2.py, *_pb2_grpc.py, and *_pb2.pyi files from + the vllm_engine.proto definition. + """ + try: + from grpc_tools import protoc + except ImportError: + logger.warning( + "grpcio-tools not installed, skipping gRPC proto compilation. " + "gRPC server functionality will not be available." + ) + return False + + proto_file = ROOT_DIR / "vllm" / "grpc" / "vllm_engine.proto" + if not proto_file.exists(): + logger.warning("Proto file not found at %s, skipping compilation", proto_file) + return False + + logger.info("Compiling gRPC protobuf: %s", proto_file) + + result = protoc.main( + [ + "grpc_tools.protoc", + f"--proto_path={ROOT_DIR}", + f"--python_out={ROOT_DIR}", + f"--grpc_python_out={ROOT_DIR}", + f"--pyi_out={ROOT_DIR}", + str(proto_file), + ] + ) + + if result != 0: + logger.error("protoc failed with exit code %s", result) + return False + + # Add SPDX headers and mypy ignore to generated files + spdx_header = ( + "# SPDX-License-Identifier: Apache-2.0\n" + "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n" + "# mypy: ignore-errors\n" + ) + + grpc_dir = ROOT_DIR / "vllm" / "grpc" + for generated_file in [ + grpc_dir / "vllm_engine_pb2.py", + grpc_dir / "vllm_engine_pb2_grpc.py", + grpc_dir / "vllm_engine_pb2.pyi", + ]: + if generated_file.exists(): + content = generated_file.read_text() + if not content.startswith("# SPDX-License-Identifier"): + generated_file.write_text(spdx_header + content) + + logger.info("gRPC protobuf compilation successful") + return True + + +class BuildPyAndGenerateGrpc(build_py): + """Build Python modules and generate gRPC stubs from proto files.""" + + def run(self): + compile_grpc_protos() + super().run() + + class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None: super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa) @@ -882,12 +950,13 @@ def _read_requirements(filename: str) -> list[str]: ext_modules = [] if not ext_modules: - cmdclass = {} + cmdclass = {"build_py": BuildPyAndGenerateGrpc} else: cmdclass = { "build_ext": precompiled_build_ext if envs.VLLM_USE_PRECOMPILED - else cmake_build_ext + else cmake_build_ext, + "build_py": BuildPyAndGenerateGrpc, } setup( diff --git a/vllm/grpc/compile_protos.py b/vllm/grpc/compile_protos.py index 0cf6e7e76e10..92ad46e160a5 100755 --- a/vllm/grpc/compile_protos.py +++ b/vllm/grpc/compile_protos.py @@ -7,6 +7,10 @@ This script uses grpcio-tools to generate *_pb2.py, *_pb2_grpc.py, and *_pb2.pyi (type stubs) files from the vllm_engine.proto definition. +NOTE: Proto compilation happens automatically during package build (via setup.py). +This script is provided for developers who want to regenerate protos manually, +e.g., after modifying vllm_engine.proto. + Usage: python vllm/grpc/compile_protos.py diff --git a/vllm/grpc/vllm_engine_pb2.py b/vllm/grpc/vllm_engine_pb2.py deleted file mode 100644 index 487f39876733..000000000000 --- a/vllm/grpc/vllm_engine_pb2.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# mypy: ignore-errors -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# NO CHECKED-IN PROTOBUF GENCODE -# source: vllm/grpc/vllm_engine.proto -# Protobuf Python Version: 6.31.1 -"""Generated protocol buffer code.""" - -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import runtime_version as _runtime_version -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -_runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, 6, 31, 1, "", "vllm/grpc/vllm_engine.proto" -) -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1bvllm/grpc/vllm_engine.proto\x12\x10vllm.grpc.engine"\xe5\x06\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x17\n\nmax_tokens\x18\x08 \x01(\x05H\x01\x88\x01\x01\x12\x12\n\nmin_tokens\x18\t \x01(\x05\x12\x0c\n\x04stop\x18\n \x03(\t\x12\x16\n\x0estop_token_ids\x18\x0b \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x0c \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\r \x01(\x08\x12\x12\n\nignore_eos\x18\x0e \x01(\x08\x12\t\n\x01n\x18\x0f \x01(\x05\x12\x15\n\x08logprobs\x18\x16 \x01(\x05H\x02\x88\x01\x01\x12\x1c\n\x0fprompt_logprobs\x18\x17 \x01(\x05H\x03\x88\x01\x01\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x04\x88\x01\x01\x12"\n\x1ainclude_stop_str_in_output\x18\x19 \x01(\x08\x12\x43\n\nlogit_bias\x18\x1a \x03(\x0b\x32/.vllm.grpc.engine.SamplingParams.LogitBiasEntry\x12#\n\x16truncate_prompt_tokens\x18\x1b \x01(\x05H\x05\x88\x01\x01\x12\x15\n\x0bjson_schema\x18\x10 \x01(\tH\x00\x12\x0f\n\x05regex\x18\x11 \x01(\tH\x00\x12\x11\n\x07grammar\x18\x12 \x01(\tH\x00\x12\x18\n\x0estructural_tag\x18\x13 \x01(\tH\x00\x12\x15\n\x0bjson_object\x18\x14 \x01(\x08H\x00\x12\x34\n\x06\x63hoice\x18\x15 \x01(\x0b\x32".vllm.grpc.engine.ChoiceConstraintH\x00\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraintB\r\n\x0b_max_tokensB\x0b\n\t_logprobsB\x12\n\x10_prompt_logprobsB\x07\n\x05_seedB\x19\n\x17_truncate_prompt_tokens"#\n\x10\x43hoiceConstraint\x12\x0f\n\x07\x63hoices\x18\x01 \x03(\t":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\r"\xa5\x01\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput\x12\x39\n\x0fsampling_params\x18\x03 \x01(\x0b\x32 .vllm.grpc.engine.SamplingParams\x12\x0e\n\x06stream\x18\x04 \x01(\x08"\xd4\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x36\n\x05\x63hunk\x18\x02 \x01(\x0b\x32%.vllm.grpc.engine.GenerateStreamChunkH\x00\x12\x36\n\x08\x63omplete\x18\x03 \x01(\x0b\x32".vllm.grpc.engine.GenerateCompleteH\x00\x12\x30\n\x05\x65rror\x18\x04 \x01(\x0b\x32\x1f.vllm.grpc.engine.GenerateErrorH\x00\x42\n\n\x08response"q\n\x13GenerateStreamChunk\x12\x11\n\ttoken_ids\x18\x01 \x03(\r\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x03 \x01(\x05\x12\x15\n\rcached_tokens\x18\x04 \x01(\x05"\x86\x01\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\r\x12\x15\n\rfinish_reason\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 
\x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t"W\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\ttokenized\x18\x02 \x01(\x0b\x32 .vllm.grpc.engine.TokenizedInput"\x93\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x33\n\x08\x63omplete\x18\x02 \x01(\x0b\x32\x1f.vllm.grpc.engine.EmbedCompleteH\x00\x12-\n\x05\x65rror\x18\x03 \x01(\x0b\x32\x1c.vllm.grpc.engine.EmbedErrorH\x00\x42\n\n\x08response"P\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rembedding_dim\x18\x03 \x01(\x05"+\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t"\x14\n\x12HealthCheckRequest"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"\x15\n\x13GetModelInfoRequest"\x8a\x01\n\x14GetModelInfoResponse\x12\x12\n\nmodel_path\x18\x01 \x01(\t\x12\x15\n\ris_generation\x18\x02 \x01(\x08\x12\x1a\n\x12max_context_length\x18\x03 \x01(\x05\x12\x12\n\nvocab_size\x18\x04 \x01(\x05\x12\x17\n\x0fsupports_vision\x18\x05 \x01(\x08"\x16\n\x14GetServerInfoRequest"\x90\x01\n\x15GetServerInfoResponse\x12\x17\n\x0f\x61\x63tive_requests\x18\x01 \x01(\x05\x12\x11\n\tis_paused\x18\x02 \x01(\x08\x12\x1e\n\x16last_receive_timestamp\x18\x03 \x01(\x01\x12\x16\n\x0euptime_seconds\x18\x04 \x01(\x01\x12\x13\n\x0bserver_type\x18\x05 \x01(\t2\x92\x04\n\nVllmEngine\x12S\n\x08Generate\x12!.vllm.grpc.engine.GenerateRequest\x1a".vllm.grpc.engine.GenerateResponse0\x01\x12H\n\x05\x45mbed\x12\x1e.vllm.grpc.engine.EmbedRequest\x1a\x1f.vllm.grpc.engine.EmbedResponse\x12Z\n\x0bHealthCheck\x12$.vllm.grpc.engine.HealthCheckRequest\x1a%.vllm.grpc.engine.HealthCheckResponse\x12H\n\x05\x41\x62ort\x12\x1e.vllm.grpc.engine.AbortRequest\x1a\x1f.vllm.grpc.engine.AbortResponse\x12]\n\x0cGetModelInfo\x12%.vllm.grpc.engine.GetModelInfoRequest\x1a&.vllm.grpc.engine.GetModelInfoResponse\x12`\n\rGetServerInfo\x12&.vllm.grpc.engine.GetServerInfoRequest\x1a\'.vllm.grpc.engine.GetServerInfoResponseb\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages( - DESCRIPTOR, "vllm.grpc.vllm_engine_pb2", _globals -) -if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._loaded_options = None - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_options = b"8\001" - _globals["_SAMPLINGPARAMS"]._serialized_start = 50 - _globals["_SAMPLINGPARAMS"]._serialized_end = 919 - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_start = 773 - _globals["_SAMPLINGPARAMS_LOGITBIASENTRY"]._serialized_end = 821 - _globals["_CHOICECONSTRAINT"]._serialized_start = 921 - _globals["_CHOICECONSTRAINT"]._serialized_end = 956 - _globals["_TOKENIZEDINPUT"]._serialized_start = 958 - _globals["_TOKENIZEDINPUT"]._serialized_end = 1016 - _globals["_GENERATEREQUEST"]._serialized_start = 1019 - _globals["_GENERATEREQUEST"]._serialized_end = 1184 - _globals["_GENERATERESPONSE"]._serialized_start = 1187 - _globals["_GENERATERESPONSE"]._serialized_end = 1399 - 
_globals["_GENERATESTREAMCHUNK"]._serialized_start = 1401 - _globals["_GENERATESTREAMCHUNK"]._serialized_end = 1514 - _globals["_GENERATECOMPLETE"]._serialized_start = 1517 - _globals["_GENERATECOMPLETE"]._serialized_end = 1651 - _globals["_GENERATEERROR"]._serialized_start = 1653 - _globals["_GENERATEERROR"]._serialized_end = 1728 - _globals["_EMBEDREQUEST"]._serialized_start = 1730 - _globals["_EMBEDREQUEST"]._serialized_end = 1817 - _globals["_EMBEDRESPONSE"]._serialized_start = 1820 - _globals["_EMBEDRESPONSE"]._serialized_end = 1967 - _globals["_EMBEDCOMPLETE"]._serialized_start = 1969 - _globals["_EMBEDCOMPLETE"]._serialized_end = 2049 - _globals["_EMBEDERROR"]._serialized_start = 2051 - _globals["_EMBEDERROR"]._serialized_end = 2094 - _globals["_HEALTHCHECKREQUEST"]._serialized_start = 2096 - _globals["_HEALTHCHECKREQUEST"]._serialized_end = 2116 - _globals["_HEALTHCHECKRESPONSE"]._serialized_start = 2118 - _globals["_HEALTHCHECKRESPONSE"]._serialized_end = 2173 - _globals["_ABORTREQUEST"]._serialized_start = 2175 - _globals["_ABORTREQUEST"]._serialized_end = 2225 - _globals["_ABORTRESPONSE"]._serialized_start = 2227 - _globals["_ABORTRESPONSE"]._serialized_end = 2276 - _globals["_GETMODELINFOREQUEST"]._serialized_start = 2278 - _globals["_GETMODELINFOREQUEST"]._serialized_end = 2299 - _globals["_GETMODELINFORESPONSE"]._serialized_start = 2302 - _globals["_GETMODELINFORESPONSE"]._serialized_end = 2440 - _globals["_GETSERVERINFOREQUEST"]._serialized_start = 2442 - _globals["_GETSERVERINFOREQUEST"]._serialized_end = 2464 - _globals["_GETSERVERINFORESPONSE"]._serialized_start = 2467 - _globals["_GETSERVERINFORESPONSE"]._serialized_end = 2611 - _globals["_VLLMENGINE"]._serialized_start = 2614 - _globals["_VLLMENGINE"]._serialized_end = 3144 -# @@protoc_insertion_point(module_scope) diff --git a/vllm/grpc/vllm_engine_pb2.pyi b/vllm/grpc/vllm_engine_pb2.pyi deleted file mode 100644 index 51cdd7a799cd..000000000000 --- a/vllm/grpc/vllm_engine_pb2.pyi +++ /dev/null @@ -1,391 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# mypy: ignore-errors -from collections.abc import Iterable as _Iterable -from collections.abc import Mapping as _Mapping -from typing import ClassVar as _ClassVar - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf.internal import containers as _containers - -DESCRIPTOR: _descriptor.FileDescriptor - -class SamplingParams(_message.Message): - __slots__ = ( - "temperature", - "top_p", - "top_k", - "min_p", - "frequency_penalty", - "presence_penalty", - "repetition_penalty", - "max_tokens", - "min_tokens", - "stop", - "stop_token_ids", - "skip_special_tokens", - "spaces_between_special_tokens", - "ignore_eos", - "n", - "logprobs", - "prompt_logprobs", - "seed", - "include_stop_str_in_output", - "logit_bias", - "truncate_prompt_tokens", - "json_schema", - "regex", - "grammar", - "structural_tag", - "json_object", - "choice", - ) - class LogitBiasEntry(_message.Message): - __slots__ = ("key", "value") - KEY_FIELD_NUMBER: _ClassVar[int] - VALUE_FIELD_NUMBER: _ClassVar[int] - key: int - value: float - def __init__( - self, key: int | None = ..., value: float | None = ... - ) -> None: ... 
- - TEMPERATURE_FIELD_NUMBER: _ClassVar[int] - TOP_P_FIELD_NUMBER: _ClassVar[int] - TOP_K_FIELD_NUMBER: _ClassVar[int] - MIN_P_FIELD_NUMBER: _ClassVar[int] - FREQUENCY_PENALTY_FIELD_NUMBER: _ClassVar[int] - PRESENCE_PENALTY_FIELD_NUMBER: _ClassVar[int] - REPETITION_PENALTY_FIELD_NUMBER: _ClassVar[int] - MAX_TOKENS_FIELD_NUMBER: _ClassVar[int] - MIN_TOKENS_FIELD_NUMBER: _ClassVar[int] - STOP_FIELD_NUMBER: _ClassVar[int] - STOP_TOKEN_IDS_FIELD_NUMBER: _ClassVar[int] - SKIP_SPECIAL_TOKENS_FIELD_NUMBER: _ClassVar[int] - SPACES_BETWEEN_SPECIAL_TOKENS_FIELD_NUMBER: _ClassVar[int] - IGNORE_EOS_FIELD_NUMBER: _ClassVar[int] - N_FIELD_NUMBER: _ClassVar[int] - LOGPROBS_FIELD_NUMBER: _ClassVar[int] - PROMPT_LOGPROBS_FIELD_NUMBER: _ClassVar[int] - SEED_FIELD_NUMBER: _ClassVar[int] - INCLUDE_STOP_STR_IN_OUTPUT_FIELD_NUMBER: _ClassVar[int] - LOGIT_BIAS_FIELD_NUMBER: _ClassVar[int] - TRUNCATE_PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] - JSON_SCHEMA_FIELD_NUMBER: _ClassVar[int] - REGEX_FIELD_NUMBER: _ClassVar[int] - GRAMMAR_FIELD_NUMBER: _ClassVar[int] - STRUCTURAL_TAG_FIELD_NUMBER: _ClassVar[int] - JSON_OBJECT_FIELD_NUMBER: _ClassVar[int] - CHOICE_FIELD_NUMBER: _ClassVar[int] - temperature: float - top_p: float - top_k: int - min_p: float - frequency_penalty: float - presence_penalty: float - repetition_penalty: float - max_tokens: int - min_tokens: int - stop: _containers.RepeatedScalarFieldContainer[str] - stop_token_ids: _containers.RepeatedScalarFieldContainer[int] - skip_special_tokens: bool - spaces_between_special_tokens: bool - ignore_eos: bool - n: int - logprobs: int - prompt_logprobs: int - seed: int - include_stop_str_in_output: bool - logit_bias: _containers.ScalarMap[int, float] - truncate_prompt_tokens: int - json_schema: str - regex: str - grammar: str - structural_tag: str - json_object: bool - choice: ChoiceConstraint - def __init__( - self, - temperature: float | None = ..., - top_p: float | None = ..., - top_k: int | None = ..., - min_p: float | None = ..., - frequency_penalty: float | None = ..., - presence_penalty: float | None = ..., - repetition_penalty: float | None = ..., - max_tokens: int | None = ..., - min_tokens: int | None = ..., - stop: _Iterable[str] | None = ..., - stop_token_ids: _Iterable[int] | None = ..., - skip_special_tokens: bool = ..., - spaces_between_special_tokens: bool = ..., - ignore_eos: bool = ..., - n: int | None = ..., - logprobs: int | None = ..., - prompt_logprobs: int | None = ..., - seed: int | None = ..., - include_stop_str_in_output: bool = ..., - logit_bias: _Mapping[int, float] | None = ..., - truncate_prompt_tokens: int | None = ..., - json_schema: str | None = ..., - regex: str | None = ..., - grammar: str | None = ..., - structural_tag: str | None = ..., - json_object: bool = ..., - choice: ChoiceConstraint | _Mapping | None = ..., - ) -> None: ... - -class ChoiceConstraint(_message.Message): - __slots__ = ("choices",) - CHOICES_FIELD_NUMBER: _ClassVar[int] - choices: _containers.RepeatedScalarFieldContainer[str] - def __init__(self, choices: _Iterable[str] | None = ...) -> None: ... - -class TokenizedInput(_message.Message): - __slots__ = ("original_text", "input_ids") - ORIGINAL_TEXT_FIELD_NUMBER: _ClassVar[int] - INPUT_IDS_FIELD_NUMBER: _ClassVar[int] - original_text: str - input_ids: _containers.RepeatedScalarFieldContainer[int] - def __init__( - self, original_text: str | None = ..., input_ids: _Iterable[int] | None = ... - ) -> None: ... 
- -class GenerateRequest(_message.Message): - __slots__ = ("request_id", "tokenized", "sampling_params", "stream") - REQUEST_ID_FIELD_NUMBER: _ClassVar[int] - TOKENIZED_FIELD_NUMBER: _ClassVar[int] - SAMPLING_PARAMS_FIELD_NUMBER: _ClassVar[int] - STREAM_FIELD_NUMBER: _ClassVar[int] - request_id: str - tokenized: TokenizedInput - sampling_params: SamplingParams - stream: bool - def __init__( - self, - request_id: str | None = ..., - tokenized: TokenizedInput | _Mapping | None = ..., - sampling_params: SamplingParams | _Mapping | None = ..., - stream: bool = ..., - ) -> None: ... - -class GenerateResponse(_message.Message): - __slots__ = ("request_id", "chunk", "complete", "error") - REQUEST_ID_FIELD_NUMBER: _ClassVar[int] - CHUNK_FIELD_NUMBER: _ClassVar[int] - COMPLETE_FIELD_NUMBER: _ClassVar[int] - ERROR_FIELD_NUMBER: _ClassVar[int] - request_id: str - chunk: GenerateStreamChunk - complete: GenerateComplete - error: GenerateError - def __init__( - self, - request_id: str | None = ..., - chunk: GenerateStreamChunk | _Mapping | None = ..., - complete: GenerateComplete | _Mapping | None = ..., - error: GenerateError | _Mapping | None = ..., - ) -> None: ... - -class GenerateStreamChunk(_message.Message): - __slots__ = ("token_ids", "prompt_tokens", "completion_tokens", "cached_tokens") - TOKEN_IDS_FIELD_NUMBER: _ClassVar[int] - PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] - COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int] - CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int] - token_ids: _containers.RepeatedScalarFieldContainer[int] - prompt_tokens: int - completion_tokens: int - cached_tokens: int - def __init__( - self, - token_ids: _Iterable[int] | None = ..., - prompt_tokens: int | None = ..., - completion_tokens: int | None = ..., - cached_tokens: int | None = ..., - ) -> None: ... - -class GenerateComplete(_message.Message): - __slots__ = ( - "output_ids", - "finish_reason", - "prompt_tokens", - "completion_tokens", - "cached_tokens", - ) - OUTPUT_IDS_FIELD_NUMBER: _ClassVar[int] - FINISH_REASON_FIELD_NUMBER: _ClassVar[int] - PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] - COMPLETION_TOKENS_FIELD_NUMBER: _ClassVar[int] - CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int] - output_ids: _containers.RepeatedScalarFieldContainer[int] - finish_reason: str - prompt_tokens: int - completion_tokens: int - cached_tokens: int - def __init__( - self, - output_ids: _Iterable[int] | None = ..., - finish_reason: str | None = ..., - prompt_tokens: int | None = ..., - completion_tokens: int | None = ..., - cached_tokens: int | None = ..., - ) -> None: ... - -class GenerateError(_message.Message): - __slots__ = ("message", "http_status_code", "details") - MESSAGE_FIELD_NUMBER: _ClassVar[int] - HTTP_STATUS_CODE_FIELD_NUMBER: _ClassVar[int] - DETAILS_FIELD_NUMBER: _ClassVar[int] - message: str - http_status_code: str - details: str - def __init__( - self, - message: str | None = ..., - http_status_code: str | None = ..., - details: str | None = ..., - ) -> None: ... - -class EmbedRequest(_message.Message): - __slots__ = ("request_id", "tokenized") - REQUEST_ID_FIELD_NUMBER: _ClassVar[int] - TOKENIZED_FIELD_NUMBER: _ClassVar[int] - request_id: str - tokenized: TokenizedInput - def __init__( - self, - request_id: str | None = ..., - tokenized: TokenizedInput | _Mapping | None = ..., - ) -> None: ... 
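Taken together, these messages support a streaming client along the following lines. This is only a sketch: it assumes a vLLM gRPC server already listening on localhost:50051, and the token IDs are placeholders rather than the output of any particular tokenizer.

    import asyncio

    import grpc

    from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc

    async def stream_generate() -> None:
        channel = grpc.aio.insecure_channel("localhost:50051")
        stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)
        request = vllm_engine_pb2.GenerateRequest(
            request_id="demo-1",
            tokenized=vllm_engine_pb2.TokenizedInput(
                original_text="Hello, my name is",
                input_ids=[15496, 11, 616, 1438, 318],  # placeholder token IDs
            ),
            sampling_params=vllm_engine_pb2.SamplingParams(
                temperature=0.0, max_tokens=16
            ),
            stream=True,
        )
        async for response in stub.Generate(request):
            if response.HasField("chunk"):
                print(list(response.chunk.token_ids))  # incremental token IDs
            elif response.HasField("complete"):
                print(response.complete.finish_reason)
            elif response.HasField("error"):
                raise RuntimeError(response.error.message)
        await channel.close()

    if __name__ == "__main__":
        asyncio.run(stream_generate())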
- -class EmbedResponse(_message.Message): - __slots__ = ("request_id", "complete", "error") - REQUEST_ID_FIELD_NUMBER: _ClassVar[int] - COMPLETE_FIELD_NUMBER: _ClassVar[int] - ERROR_FIELD_NUMBER: _ClassVar[int] - request_id: str - complete: EmbedComplete - error: EmbedError - def __init__( - self, - request_id: str | None = ..., - complete: EmbedComplete | _Mapping | None = ..., - error: EmbedError | _Mapping | None = ..., - ) -> None: ... - -class EmbedComplete(_message.Message): - __slots__ = ("embedding", "prompt_tokens", "embedding_dim") - EMBEDDING_FIELD_NUMBER: _ClassVar[int] - PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int] - EMBEDDING_DIM_FIELD_NUMBER: _ClassVar[int] - embedding: _containers.RepeatedScalarFieldContainer[float] - prompt_tokens: int - embedding_dim: int - def __init__( - self, - embedding: _Iterable[float] | None = ..., - prompt_tokens: int | None = ..., - embedding_dim: int | None = ..., - ) -> None: ... - -class EmbedError(_message.Message): - __slots__ = ("message", "code") - MESSAGE_FIELD_NUMBER: _ClassVar[int] - CODE_FIELD_NUMBER: _ClassVar[int] - message: str - code: str - def __init__(self, message: str | None = ..., code: str | None = ...) -> None: ... - -class HealthCheckRequest(_message.Message): - __slots__ = () - def __init__(self) -> None: ... - -class HealthCheckResponse(_message.Message): - __slots__ = ("healthy", "message") - HEALTHY_FIELD_NUMBER: _ClassVar[int] - MESSAGE_FIELD_NUMBER: _ClassVar[int] - healthy: bool - message: str - def __init__(self, healthy: bool = ..., message: str | None = ...) -> None: ... - -class AbortRequest(_message.Message): - __slots__ = ("request_id", "reason") - REQUEST_ID_FIELD_NUMBER: _ClassVar[int] - REASON_FIELD_NUMBER: _ClassVar[int] - request_id: str - reason: str - def __init__( - self, request_id: str | None = ..., reason: str | None = ... - ) -> None: ... - -class AbortResponse(_message.Message): - __slots__ = ("success", "message") - SUCCESS_FIELD_NUMBER: _ClassVar[int] - MESSAGE_FIELD_NUMBER: _ClassVar[int] - success: bool - message: str - def __init__(self, success: bool = ..., message: str | None = ...) -> None: ... - -class GetModelInfoRequest(_message.Message): - __slots__ = () - def __init__(self) -> None: ... - -class GetModelInfoResponse(_message.Message): - __slots__ = ( - "model_path", - "is_generation", - "max_context_length", - "vocab_size", - "supports_vision", - ) - MODEL_PATH_FIELD_NUMBER: _ClassVar[int] - IS_GENERATION_FIELD_NUMBER: _ClassVar[int] - MAX_CONTEXT_LENGTH_FIELD_NUMBER: _ClassVar[int] - VOCAB_SIZE_FIELD_NUMBER: _ClassVar[int] - SUPPORTS_VISION_FIELD_NUMBER: _ClassVar[int] - model_path: str - is_generation: bool - max_context_length: int - vocab_size: int - supports_vision: bool - def __init__( - self, - model_path: str | None = ..., - is_generation: bool = ..., - max_context_length: int | None = ..., - vocab_size: int | None = ..., - supports_vision: bool = ..., - ) -> None: ... - -class GetServerInfoRequest(_message.Message): - __slots__ = () - def __init__(self) -> None: ... 
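The unary RPCs pair with these messages in the same way; a hedged sketch of a simple readiness probe built on HealthCheck and GetModelInfo, where the target address is an assumption:

    import grpc

    from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc

    async def probe(address: str = "localhost:50051") -> bool:
        channel = grpc.aio.insecure_channel(address)
        stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)
        health = await stub.HealthCheck(
            vllm_engine_pb2.HealthCheckRequest(), timeout=5.0
        )
        if health.healthy:
            info = await stub.GetModelInfo(vllm_engine_pb2.GetModelInfoRequest())
            print(info.model_path, info.max_context_length, info.vocab_size)
        await channel.close()
        return health.healthy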
- -class GetServerInfoResponse(_message.Message): - __slots__ = ( - "active_requests", - "is_paused", - "last_receive_timestamp", - "uptime_seconds", - "server_type", - ) - ACTIVE_REQUESTS_FIELD_NUMBER: _ClassVar[int] - IS_PAUSED_FIELD_NUMBER: _ClassVar[int] - LAST_RECEIVE_TIMESTAMP_FIELD_NUMBER: _ClassVar[int] - UPTIME_SECONDS_FIELD_NUMBER: _ClassVar[int] - SERVER_TYPE_FIELD_NUMBER: _ClassVar[int] - active_requests: int - is_paused: bool - last_receive_timestamp: float - uptime_seconds: float - server_type: str - def __init__( - self, - active_requests: int | None = ..., - is_paused: bool = ..., - last_receive_timestamp: float | None = ..., - uptime_seconds: float | None = ..., - server_type: str | None = ..., - ) -> None: ... diff --git a/vllm/grpc/vllm_engine_pb2_grpc.py b/vllm/grpc/vllm_engine_pb2_grpc.py deleted file mode 100644 index 9f6a39ecea15..000000000000 --- a/vllm/grpc/vllm_engine_pb2_grpc.py +++ /dev/null @@ -1,355 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# mypy: ignore-errors -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" - -import grpc -import warnings - -from vllm.grpc import vllm_engine_pb2 as vllm_dot_grpc_dot_vllm__engine__pb2 - -GRPC_GENERATED_VERSION = "1.75.1" -GRPC_VERSION = grpc.__version__ -_version_not_supported = False - -try: - from grpc._utilities import first_version_is_lower - - _version_not_supported = first_version_is_lower( - GRPC_VERSION, GRPC_GENERATED_VERSION - ) -except ImportError: - _version_not_supported = True - -if _version_not_supported: - raise RuntimeError( - f"The grpc package installed is at version {GRPC_VERSION}," - + f" but the generated code in vllm/grpc/vllm_engine_pb2_grpc.py depends on" - + f" grpcio>={GRPC_GENERATED_VERSION}." - + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" - + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." - ) - - -class VllmEngineStub(object): - """Service definition for vLLM engine communication - This protocol is designed for efficient binary communication between - the Rust router and vLLM Python engine (AsyncLLM). - """ - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.Generate = channel.unary_stream( - "/vllm.grpc.engine.VllmEngine/Generate", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, - _registered_method=True, - ) - self.Embed = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/Embed", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, - _registered_method=True, - ) - self.HealthCheck = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/HealthCheck", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, - _registered_method=True, - ) - self.Abort = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/Abort", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, - _registered_method=True, - ) - self.GetModelInfo = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/GetModelInfo", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, - _registered_method=True, - ) - self.GetServerInfo = channel.unary_unary( - "/vllm.grpc.engine.VllmEngine/GetServerInfo", - request_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, - response_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, - _registered_method=True, - ) - - -class VllmEngineServicer(object): - """Service definition for vLLM engine communication - This protocol is designed for efficient binary communication between - the Rust router and vLLM Python engine (AsyncLLM). 
- """ - - def Generate(self, request, context): - """Submit a generation request (supports streaming)""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def Embed(self, request, context): - """Submit an embedding request""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def HealthCheck(self, request, context): - """Health check""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def Abort(self, request, context): - """Abort a running request""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def GetModelInfo(self, request, context): - """Get model information""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def GetServerInfo(self, request, context): - """Get server information""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_VllmEngineServicer_to_server(servicer, server): - rpc_method_handlers = { - "Generate": grpc.unary_stream_rpc_method_handler( - servicer.Generate, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.SerializeToString, - ), - "Embed": grpc.unary_unary_rpc_method_handler( - servicer.Embed, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.SerializeToString, - ), - "HealthCheck": grpc.unary_unary_rpc_method_handler( - servicer.HealthCheck, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.SerializeToString, - ), - "Abort": grpc.unary_unary_rpc_method_handler( - servicer.Abort, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.SerializeToString, - ), - "GetModelInfo": grpc.unary_unary_rpc_method_handler( - servicer.GetModelInfo, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.SerializeToString, - ), - "GetServerInfo": grpc.unary_unary_rpc_method_handler( - servicer.GetServerInfo, - request_deserializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.FromString, - response_serializer=vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "vllm.grpc.engine.VllmEngine", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers( - "vllm.grpc.engine.VllmEngine", rpc_method_handlers - ) - - -# This class is part of an EXPERIMENTAL API. 
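The add_VllmEngineServicer_to_server hook defined above is how a servicer gets attached to a grpc.aio server; a minimal standalone sketch, with host and port as placeholder values:

    import grpc

    from vllm.grpc import vllm_engine_pb2_grpc

    async def run_server(servicer, host: str = "0.0.0.0", port: int = 50051) -> None:
        server = grpc.aio.server()
        vllm_engine_pb2_grpc.add_VllmEngineServicer_to_server(servicer, server)
        server.add_insecure_port(f"{host}:{port}")
        await server.start()
        await server.wait_for_termination()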
-class VllmEngine(object): - """Service definition for vLLM engine communication - This protocol is designed for efficient binary communication between - the Rust router and vLLM Python engine (AsyncLLM). - """ - - @staticmethod - def Generate( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_stream( - request, - target, - "/vllm.grpc.engine.VllmEngine/Generate", - vllm_dot_grpc_dot_vllm__engine__pb2.GenerateRequest.SerializeToString, - vllm_dot_grpc_dot_vllm__engine__pb2.GenerateResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def Embed( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/vllm.grpc.engine.VllmEngine/Embed", - vllm_dot_grpc_dot_vllm__engine__pb2.EmbedRequest.SerializeToString, - vllm_dot_grpc_dot_vllm__engine__pb2.EmbedResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def HealthCheck( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/vllm.grpc.engine.VllmEngine/HealthCheck", - vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckRequest.SerializeToString, - vllm_dot_grpc_dot_vllm__engine__pb2.HealthCheckResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def Abort( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/vllm.grpc.engine.VllmEngine/Abort", - vllm_dot_grpc_dot_vllm__engine__pb2.AbortRequest.SerializeToString, - vllm_dot_grpc_dot_vllm__engine__pb2.AbortResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def GetModelInfo( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/vllm.grpc.engine.VllmEngine/GetModelInfo", - vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoRequest.SerializeToString, - vllm_dot_grpc_dot_vllm__engine__pb2.GetModelInfoResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def GetServerInfo( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return 
grpc.experimental.unary_unary( - request, - target, - "/vllm.grpc.engine.VllmEngine/GetServerInfo", - vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoRequest.SerializeToString, - vllm_dot_grpc_dot_vllm__engine__pb2.GetServerInfoResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) From 0b306ce6437032463d01276cea2c9966b418963c Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 2 Jan 2026 23:05:27 -0800 Subject: [PATCH 11/27] remove disable log request server arg Signed-off-by: Chang Su --- vllm/entrypoints/grpc_server.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index 808358e19450..5d7c9c3e6a75 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -402,7 +402,7 @@ async def serve_grpc(args: argparse.Namespace): async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=UsageContext.OPENAI_API_SERVER, - enable_log_requests=not args.disable_log_requests_server, + enable_log_requests=args.enable_log_requests, disable_log_stats=args.disable_log_stats_server, ) @@ -492,11 +492,6 @@ def main(): default=50051, help="Port to bind gRPC server to", ) - parser.add_argument( - "--disable-log-requests-server", - action="store_true", - help="Disable request logging on server side", - ) parser.add_argument( "--disable-log-stats-server", action="store_true", From 22f08a79584998c8dfef26181efe2d1e7e476c2f Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 2 Jan 2026 23:15:29 -0800 Subject: [PATCH 12/27] =?UTF-8?q?Stop=20strings=20fix:=20Changed=20detoken?= =?UTF-8?q?ize=3DFalse=20=E2=86=92=20detokenize=3Dbool(stop)=20and=20Reque?= =?UTF-8?q?stOutputKind=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chang Su --- vllm/grpc/grpc_request_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py index 35997f2ee657..31914a4576c6 100644 --- a/vllm/grpc/grpc_request_manager.py +++ b/vllm/grpc/grpc_request_manager.py @@ -315,6 +315,7 @@ def create_sampling_params_from_proto( if proto_params.HasField("truncate_prompt_tokens") else None, structured_outputs=structured_outputs, - detokenize=False, - output_kind=RequestOutputKind.DELTA if stream else RequestOutputKind.CUMULATIVE, + # detokenize must be True if stop strings are used (SamplingParams validation) + detokenize=bool(stop), + output_kind=RequestOutputKind.DELTA if stream else RequestOutputKind.FINAL_ONLY, ) From c25a5d4dc4b00a185ca9f987bc2f0ce6708e23ef Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 2 Jan 2026 23:18:53 -0800 Subject: [PATCH 13/27] unify logging content with http server startup Signed-off-by: Chang Su --- vllm/entrypoints/grpc_server.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index 5d7c9c3e6a75..cf895b6dd7f9 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -38,6 +38,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.engine.async_llm import AsyncLLM +from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -388,7 +389,8 @@ async def serve_grpc(args: argparse.Namespace): Args: args: 
Parsed command line arguments """ - logger.info("Initializing vLLM gRPC server...") + logger.info("vLLM gRPC server version %s", VLLM_VERSION) + logger.info("args: %s", args) # Create engine args engine_args = AsyncEngineArgs.from_cli_args(args) @@ -406,10 +408,6 @@ async def serve_grpc(args: argparse.Namespace): disable_log_stats=args.disable_log_stats_server, ) - logger.info("Model: %s", vllm_config.model_config.model) - logger.info("Max model len: %s", vllm_config.model_config.max_model_len) - logger.info("Vocab size: %s", vllm_config.model_config.get_vocab_size()) - # Create request manager request_manager = GrpcRequestManager(async_llm) From 7bf5e78cc65e15008baf5a290ce6d85e90343845 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 2 Jan 2026 23:41:54 -0800 Subject: [PATCH 14/27] update input processor api call Signed-off-by: Chang Su --- vllm/grpc/grpc_request_manager.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py index 31914a4576c6..76d0d5df3cf2 100644 --- a/vllm/grpc/grpc_request_manager.py +++ b/vllm/grpc/grpc_request_manager.py @@ -78,14 +78,16 @@ async def generate( # Use processor.process_inputs() with pre-tokenized input prompt: TokensPrompt = {"prompt_token_ids": prompt_token_ids} - engine_request = self.async_llm.processor.process_inputs( + engine_request = self.async_llm.input_processor.process_inputs( request_id=request_id, prompt=prompt, params=sampling_params, arrival_time=arrival_time, ) - collector = RequestOutputCollector(output_kind=sampling_params.output_kind) + collector = RequestOutputCollector( + output_kind=sampling_params.output_kind, request_id=request_id + ) self.rid_to_collector[request_id] = collector # Submit to AsyncLLM - it will call add_request internally From af3ba361f1faedd6a1ba24226a1ff410bca84149 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Fri, 2 Jan 2026 23:51:54 -0800 Subject: [PATCH 15/27] assign request internal id Signed-off-by: Chang Su --- vllm/grpc/grpc_request_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py index 76d0d5df3cf2..a67edb87d816 100644 --- a/vllm/grpc/grpc_request_manager.py +++ b/vllm/grpc/grpc_request_manager.py @@ -85,6 +85,8 @@ async def generate( arrival_time=arrival_time, ) + self.async_llm.input_processor.assign_request_id(engine_request) + collector = RequestOutputCollector( output_kind=sampling_params.output_kind, request_id=request_id ) From d23b707b3a5cf90c4895b5924427509a18b13fb3 Mon Sep 17 00:00:00 2001 From: njhill Date: Sun, 4 Jan 2026 11:33:41 -0800 Subject: [PATCH 16/27] streamline Signed-off-by: njhill --- vllm/entrypoints/grpc_server.py | 98 +++++--------- vllm/grpc/grpc_request_manager.py | 214 ------------------------------ 2 files changed, 36 insertions(+), 276 deletions(-) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index cf895b6dd7f9..dcd00f924977 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -27,10 +27,10 @@ import grpc from grpc_reflection.v1alpha import reflection +from vllm import TokensPrompt from vllm.engine.arg_utils import AsyncEngineArgs from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc from vllm.grpc.grpc_request_manager import ( - GrpcRequestManager, create_sampling_params_from_proto, ) from vllm.logger import init_logger @@ -56,14 +56,15 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer): - GetServerInfo: Server 
state """ - def __init__(self, request_manager: GrpcRequestManager): + def __init__(self, async_llm: AsyncLLM, start_time: float): """ Initialize the servicer. Args: request_manager: The GrpcRequestManager instance """ - self.request_manager = request_manager + self.async_llm = async_llm + self.start_time = start_time logger.info("VllmEngineServicer initialized") async def Generate( @@ -87,14 +88,10 @@ async def Generate( try: # Extract tokenized input if not request.HasField("tokenized"): - yield self._error_response( - request_id, - "Missing tokenized input", - "400", - ) - return + raise ValueError("Missing tokenized input") prompt_token_ids = list(request.tokenized.input_ids) + prompt: TokensPrompt = {"prompt_token_ids": prompt_token_ids} # Build sampling params with detokenize=False sampling_params = create_sampling_params_from_proto( @@ -102,21 +99,11 @@ async def Generate( stream=request.stream, ) - # Submit to request manager and stream outputs - arrival_time = time.time() - - async for output in self.request_manager.generate( - request_id=request_id, - prompt_token_ids=prompt_token_ids, + async for output in self.async_llm.generate( + prompt=prompt, sampling_params=sampling_params, - arrival_time=arrival_time, + request_id=request_id, ): - # Check if client disconnected - if context.cancelled(): - logger.info("Client disconnected for %s.", request_id) - await self.request_manager.abort(request_id) - return - # Convert vLLM output to protobuf # For streaming, always send chunks if request.stream: @@ -128,11 +115,7 @@ async def Generate( except Exception as e: logger.exception("Error in Generate for %s", request_id) - yield self._error_response( - request_id, - str(e), - "500", - ) + yield self._error_response(request_id, e) async def Embed( self, @@ -175,14 +158,12 @@ async def HealthCheck( Returns: HealthCheckResponse protobuf """ - is_healthy, message = await self.request_manager.health_check() + is_healthy = not self.async_llm.errored + message = "Health" if is_healthy else "Engine is not alive" logger.info("HealthCheck request: healthy=%s, message=%s", is_healthy, message) - return vllm_engine_pb2.HealthCheckResponse( - healthy=is_healthy, - message=message, - ) + return vllm_engine_pb2.HealthCheckResponse(healthy=is_healthy, message=message) async def Abort( self, @@ -202,11 +183,10 @@ async def Abort( request_id = request.request_id logger.info("Abort request for %s.", request_id) - success = await self.request_manager.abort(request_id) + await self.async_llm.abort(request_id) return vllm_engine_pb2.AbortResponse( - success=success, - message=f"Request {request_id} {'aborted' if success else 'not found'}", + success=True, message=f"Request {request_id} aborted" ) async def GetModelInfo( @@ -224,14 +204,14 @@ async def GetModelInfo( Returns: GetModelInfoResponse protobuf """ - model_config = self.request_manager.get_model_config() + model_config = self.async_llm.model_config return vllm_engine_pb2.GetModelInfoResponse( - model_path=model_config.get("model_path", ""), - is_generation=model_config.get("is_generation", True), - max_context_length=model_config.get("max_context_length", 0), - vocab_size=model_config.get("vocab_size", 0), - supports_vision=model_config.get("supports_vision", False), + model_path=model_config.model, + is_generation=model_config.runner_type == "generate", + max_context_length=model_config.max_model_len, + vocab_size=model_config.get_vocab_size(), + supports_vision=model_config.is_multimodal_model, ) async def GetServerInfo( @@ -249,22 +229,20 @@ async 
def GetServerInfo( Returns: GetServerInfoResponse protobuf """ - num_requests = self.request_manager.get_num_unfinished_requests() + num_requests = self.async_llm.output_processor.get_num_unfinished_requests() return vllm_engine_pb2.GetServerInfoResponse( active_requests=num_requests, - is_paused=False, - last_receive_timestamp=time.time(), - uptime_seconds=0.0, # TODO: track server start time + is_paused=False, # TODO + last_receive_timestamp=time.time(), # TODO looks wrong? + uptime_seconds=time.time() - self.start_time, server_type="vllm-grpc", ) # ========== Helper methods ========== def _chunk_response( - self, - request_id: str, - output: RequestOutput, + self, request_id: str, output: RequestOutput ) -> vllm_engine_pb2.GenerateResponse: """ Build a streaming chunk response from vLLM output. @@ -308,9 +286,7 @@ def _chunk_response( ) def _complete_response( - self, - request_id: str, - output: RequestOutput, + self, request_id: str, output: RequestOutput ) -> vllm_engine_pb2.GenerateResponse: """ Build a final completion response from vLLM output. @@ -340,7 +316,7 @@ def _complete_response( # Build complete response # When streaming (DELTA mode): completion.token_ids will be empty/last delta - # When non-streaming (CUMULATIVE mode): completion.token_ids has all tokens + # When non-streaming (FINAL_ONLY mode): completion.token_ids has all tokens # Client will accumulate token counts for streaming return vllm_engine_pb2.GenerateResponse( request_id=request_id, @@ -356,22 +332,21 @@ def _complete_response( ) def _error_response( - self, - request_id: str, - message: str, - status_code: str, + self, request_id: str, e: Exception ) -> vllm_engine_pb2.GenerateResponse: """ Build an error response. Args: request_id: The request ID - message: Error message - status_code: HTTP-style status code + e: The exception from vLLM Returns: GenerateResponse with error field set """ + status_code = "400" if isinstance(e, ValueError) else "500" + message = str(e) + return vllm_engine_pb2.GenerateResponse( request_id=request_id, error=vllm_engine_pb2.GenerateError( @@ -392,6 +367,8 @@ async def serve_grpc(args: argparse.Namespace): logger.info("vLLM gRPC server version %s", VLLM_VERSION) logger.info("args: %s", args) + start_time = time.time() + # Create engine args engine_args = AsyncEngineArgs.from_cli_args(args) @@ -408,11 +385,8 @@ async def serve_grpc(args: argparse.Namespace): disable_log_stats=args.disable_log_stats_server, ) - # Create request manager - request_manager = GrpcRequestManager(async_llm) - # Create servicer - servicer = VllmEngineServicer(request_manager) + servicer = VllmEngineServicer(async_llm, start_time) # Create gRPC server server = grpc.aio.server( diff --git a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py index a67edb87d816..1945cb4e7093 100644 --- a/vllm/grpc/grpc_request_manager.py +++ b/vllm/grpc/grpc_request_manager.py @@ -12,231 +12,17 @@ detokenization and return token IDs only. 
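To make that optimization concrete, a small sketch of the vLLM-side SamplingParams behavior this relies on; the stop string is a placeholder:

    from vllm import SamplingParams

    # Token-ID-only output: detokenization is skipped and text fields stay empty.
    params = SamplingParams(max_tokens=16, detokenize=False)

    # Stop strings need detokenized text to match against, so SamplingParams
    # validation rejects stop + detokenize=False; hence detokenize=bool(stop).
    stop = ["###"]
    params_with_stop = SamplingParams(max_tokens=16, stop=stop, detokenize=bool(stop))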
""" -import asyncio -from collections.abc import AsyncGenerator - from vllm.grpc import vllm_engine_pb2 -from vllm.inputs import TokensPrompt from vllm.logger import init_logger -from vllm.outputs import RequestOutput from vllm.sampling_params import ( RequestOutputKind, SamplingParams, StructuredOutputsParams, ) -from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.async_llm import AsyncLLM -from vllm.v1.engine.output_processor import RequestOutputCollector logger = init_logger(__name__) -class GrpcRequestManager: - """ - Manages gRPC request lifecycle for vLLM. - - Responsibilities: - - Convert protobuf requests to vLLM EngineCoreRequest - - Set detokenize=False in SamplingParams (key optimization!) - - Submit requests to AsyncLLM - - Stream token IDs (not text) back to gRPC clients - - Handle abort/cancel operations - """ - - def __init__(self, async_llm: AsyncLLM): - """ - Initialize the request manager. - - Args: - async_llm: The AsyncLLM engine instance to submit requests to - """ - self.async_llm = async_llm - self.rid_to_collector: dict[str, RequestOutputCollector] = {} - - logger.info("GrpcRequestManager initialized") - - async def generate( - self, - request_id: str, - prompt_token_ids: list[int], - sampling_params: SamplingParams, - arrival_time: float, - ) -> AsyncGenerator[RequestOutput, None]: - """ - Submit a generation request and stream outputs. - - Args: - request_id: Unique request identifier - prompt_token_ids: Pre-tokenized input from Rust router - sampling_params: Sampling parameters (with detokenize=False!) - arrival_time: Request arrival timestamp - - Yields: - RequestOutput objects containing token IDs (text will be empty) - """ - try: - # Use processor.process_inputs() with pre-tokenized input - prompt: TokensPrompt = {"prompt_token_ids": prompt_token_ids} - - engine_request = self.async_llm.input_processor.process_inputs( - request_id=request_id, - prompt=prompt, - params=sampling_params, - arrival_time=arrival_time, - ) - - self.async_llm.input_processor.assign_request_id(engine_request) - - collector = RequestOutputCollector( - output_kind=sampling_params.output_kind, request_id=request_id - ) - self.rid_to_collector[request_id] = collector - - # Submit to AsyncLLM - it will call add_request internally - # and populate our collector - await self._submit_request(engine_request, collector) - - # Stream outputs from collector - while True: - try: - output = await collector.get() - yield output - - if output.finished: - break - - except asyncio.CancelledError: - logger.info("Request %s cancelled by client.", request_id) - # Clean up the request in output_processor and engine_core - await self.async_llm.abort([request_id]) - raise # Re-raise to let gRPC server handle cleanup - - except Exception: - logger.exception("Error in generate for %s", request_id) - raise - finally: - # Cleanup - self.rid_to_collector.pop(request_id, None) - - async def _submit_request( - self, - request: EngineCoreRequest, - collector: RequestOutputCollector, - ) -> None: - """ - Internal method to submit request to AsyncLLM. - - Args: - request: The EngineCoreRequest to submit - collector: The output collector for this request - """ - try: - # Add request to output processor - # Use None for prompt since we have pre-tokenized input - # TODO: Support sampling_params.n > 1 (parallel sampling) - # When n > 1, we need to: - # 1. Create a ParentRequest to track all child requests - # 2. Fan out multiple child EngineCoreRequests with different - # request_index values - # 3. 
Aggregate outputs from all children - # For now, we only support n=1, so parent_req=None and - # request_index=0 - self.async_llm.output_processor.add_request( - request=request, - prompt=None, - parent_req=None, - request_index=0, - queue=collector, - ) - - # Submit to engine core - await self.async_llm.engine_core.add_request_async(request) - - except Exception as e: - logger.exception("Error submitting request %s", request.request_id) - # Put error in collector - collector.put(e) - - async def abort(self, request_id: str) -> bool: - """ - Abort a running request. - - Args: - request_id: The request ID to abort - - Returns: - True if request was found and aborted, False otherwise - """ - try: - # Check if request exists - collector = self.rid_to_collector.get(request_id) - - if collector is None: - logger.debug( - "Abort: request %s not found (may have already completed).", - request_id, - ) - return False - - # Abort in AsyncLLM (this handles both engine_core and output_processor) - await self.async_llm.abort([request_id]) - - # Remove from our tracking - self.rid_to_collector.pop(request_id, None) - - logger.info("Request %s aborted.", request_id) - return True - - except Exception: - logger.exception("Error aborting request %s", request_id) - self.rid_to_collector.pop(request_id, None) - return False - - async def health_check(self) -> tuple[bool, str]: - """ - Check if the engine is healthy. - - Returns: - Tuple of (is_healthy, message) - """ - try: - # Check if engine is running and not errored - if self.async_llm.errored: - return False, "Engine is not alive" - - return True, "Healthy" - - except Exception as e: - logger.exception("Health check error") - return False, f"Error: {e}" - - def get_model_config(self) -> dict: - """ - Get model configuration information. - - Returns: - Dictionary with model config details - """ - model_config = self.async_llm.model_config - - return { - "model_path": model_config.model, - "is_generation": model_config.runner_type == "generate", - "max_context_length": model_config.max_model_len, - "vocab_size": model_config.get_vocab_size(), - "supports_vision": model_config.is_multimodal_model, - } - - def get_num_unfinished_requests(self) -> int: - """ - Get the number of currently running requests. 
- - Returns: - Number of unfinished requests - """ - return len(self.rid_to_collector) - - def create_sampling_params_from_proto( proto_params: vllm_engine_pb2.SamplingParams, stream: bool = True, From 231bc0341f53ef56ae8ed1581509096180eca24e Mon Sep 17 00:00:00 2001 From: njhill Date: Mon, 5 Jan 2026 12:56:25 -0800 Subject: [PATCH 17/27] remove grpc_request_manager.py for now Signed-off-by: njhill --- vllm/entrypoints/grpc_server.py | 102 +++++++++++++++++++++++---- vllm/grpc/grpc_request_manager.py | 111 ------------------------------ 2 files changed, 90 insertions(+), 123 deletions(-) delete mode 100644 vllm/grpc/grpc_request_manager.py diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index dcd00f924977..a12ae844db65 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -27,14 +27,12 @@ import grpc from grpc_reflection.v1alpha import reflection -from vllm import TokensPrompt +from vllm import SamplingParams, TokensPrompt from vllm.engine.arg_utils import AsyncEngineArgs from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc -from vllm.grpc.grpc_request_manager import ( - create_sampling_params_from_proto, -) from vllm.logger import init_logger from vllm.outputs import RequestOutput +from vllm.sampling_params import RequestOutputKind, StructuredOutputsParams from vllm.usage.usage_lib import UsageContext from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.engine.async_llm import AsyncLLM @@ -94,9 +92,8 @@ async def Generate( prompt: TokensPrompt = {"prompt_token_ids": prompt_token_ids} # Build sampling params with detokenize=False - sampling_params = create_sampling_params_from_proto( - request.sampling_params, - stream=request.stream, + sampling_params = self._sampling_params_from_proto( + request.sampling_params, stream=request.stream ) async for output in self.async_llm.generate( @@ -241,8 +238,88 @@ async def GetServerInfo( # ========== Helper methods ========== + @staticmethod + def _sampling_params_from_proto( + params: vllm_engine_pb2.SamplingParams, stream: bool = True + ) -> SamplingParams: + """ + Convert protobuf SamplingParams to vLLM SamplingParams. 
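A quick sketch of how a sparse proto message maps through this helper: unset numeric fields fall back to vLLM defaults, stop strings flip detokenization on, and streaming selects delta outputs. The stop string and token budget below are placeholders:

    from vllm.entrypoints.grpc_server import VllmEngineServicer
    from vllm.grpc import vllm_engine_pb2

    proto = vllm_engine_pb2.SamplingParams(stop=["\n\n"], max_tokens=64)
    sampling_params = VllmEngineServicer._sampling_params_from_proto(proto, stream=True)
    assert sampling_params.temperature == 1.0  # unset proto float -> default
    assert sampling_params.detokenize is True  # forced on by stop strings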
+ + Args: + params: Protobuf SamplingParams message + stream: Whether streaming is enabled + + Returns: + vLLM SamplingParams with detokenize=False and structured_outputs + """ + # Build stop sequences + stop = list(params.stop) if params.stop else None + stop_token_ids = list(params.stop_token_ids) if params.stop_token_ids else None + + # Handle structured outputs constraints + structured_outputs = None + constraint_field = params.WhichOneof("constraint") + if constraint_field: + if constraint_field == "json_schema": + structured_outputs = StructuredOutputsParams(json=params.json_schema) + elif constraint_field == "regex": + structured_outputs = StructuredOutputsParams(regex=params.regex) + elif constraint_field == "grammar": + structured_outputs = StructuredOutputsParams(grammar=params.grammar) + elif constraint_field == "structural_tag": + structured_outputs = StructuredOutputsParams( + structural_tag=params.structural_tag + ) + elif constraint_field == "json_object": + structured_outputs = StructuredOutputsParams( + json_object=params.json_object + ) + elif constraint_field == "choice": + structured_outputs = StructuredOutputsParams( + choice=list(params.choice.choices) + ) + + # Create SamplingParams + # output_kind=DELTA: Return only new tokens in each chunk (for streaming) + return SamplingParams( + temperature=params.temperature if params.temperature > 0 else 1.0, + top_p=params.top_p if params.top_p > 0 else 1.0, + top_k=params.top_k if params.top_k > 0 else -1, + min_p=params.min_p if params.min_p > 0 else 0.0, + frequency_penalty=params.frequency_penalty, + presence_penalty=params.presence_penalty, + repetition_penalty=params.repetition_penalty + if params.repetition_penalty > 0 + else 1.0, + max_tokens=params.max_tokens if params.HasField("max_tokens") else None, + min_tokens=params.min_tokens if params.min_tokens > 0 else 0, + stop=stop, + stop_token_ids=stop_token_ids, + skip_special_tokens=params.skip_special_tokens, + spaces_between_special_tokens=params.spaces_between_special_tokens, + ignore_eos=params.ignore_eos, + n=params.n if params.n > 0 else 1, + logprobs=params.logprobs if params.HasField("logprobs") else None, + prompt_logprobs=params.prompt_logprobs + if params.HasField("prompt_logprobs") + else None, + seed=params.seed if params.HasField("seed") else None, + include_stop_str_in_output=params.include_stop_str_in_output, + logit_bias=dict(params.logit_bias) if params.logit_bias else None, + truncate_prompt_tokens=params.truncate_prompt_tokens + if params.HasField("truncate_prompt_tokens") + else None, + structured_outputs=structured_outputs, + # detokenize must be True if stop strings are used + detokenize=bool(stop), + output_kind=RequestOutputKind.DELTA + if stream + else RequestOutputKind.FINAL_ONLY, + ) + + @staticmethod def _chunk_response( - self, request_id: str, output: RequestOutput + request_id: str, output: RequestOutput ) -> vllm_engine_pb2.GenerateResponse: """ Build a streaming chunk response from vLLM output. @@ -285,8 +362,9 @@ def _chunk_response( ), ) + @staticmethod def _complete_response( - self, request_id: str, output: RequestOutput + request_id: str, output: RequestOutput ) -> vllm_engine_pb2.GenerateResponse: """ Build a final completion response from vLLM output. @@ -331,8 +409,9 @@ def _complete_response( ), ) + @staticmethod def _error_response( - self, request_id: str, e: Exception + request_id: str, e: Exception ) -> vllm_engine_pb2.GenerateResponse: """ Build an error response. 
@@ -345,12 +424,11 @@ def _error_response( GenerateResponse with error field set """ status_code = "400" if isinstance(e, ValueError) else "500" - message = str(e) return vllm_engine_pb2.GenerateResponse( request_id=request_id, error=vllm_engine_pb2.GenerateError( - message=message, + message=str(e), http_status_code=status_code, details="", ), diff --git a/vllm/grpc/grpc_request_manager.py b/vllm/grpc/grpc_request_manager.py deleted file mode 100644 index 1945cb4e7093..000000000000 --- a/vllm/grpc/grpc_request_manager.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# mypy: ignore-errors -""" -gRPC Request Manager for vLLM - -Manages request lifecycle for gRPC requests, converting between protobuf -and vLLM types. Much simpler than SGLang's implementation since we can -use AsyncLLM directly (no ZMQ needed). - -Key optimization: Sets detokenize=False in SamplingParams to skip -detokenization and return token IDs only. -""" - -from vllm.grpc import vllm_engine_pb2 -from vllm.logger import init_logger -from vllm.sampling_params import ( - RequestOutputKind, - SamplingParams, - StructuredOutputsParams, -) - -logger = init_logger(__name__) - - -def create_sampling_params_from_proto( - proto_params: vllm_engine_pb2.SamplingParams, - stream: bool = True, -) -> SamplingParams: - """ - Convert protobuf SamplingParams to vLLM SamplingParams. - - Args: - proto_params: Protobuf SamplingParams message - stream: Whether streaming is enabled - - Returns: - vLLM SamplingParams with detokenize=False and structured_outputs - """ - # Build stop sequences - stop = list(proto_params.stop) if proto_params.stop else None - stop_token_ids = ( - list(proto_params.stop_token_ids) if proto_params.stop_token_ids else None - ) - - # Handle structured outputs constraints - structured_outputs = None - constraint_field = proto_params.WhichOneof("constraint") - if constraint_field: - if constraint_field == "json_schema": - structured_outputs = StructuredOutputsParams(json=proto_params.json_schema) - elif constraint_field == "regex": - structured_outputs = StructuredOutputsParams(regex=proto_params.regex) - elif constraint_field == "grammar": - structured_outputs = StructuredOutputsParams(grammar=proto_params.grammar) - elif constraint_field == "structural_tag": - structured_outputs = StructuredOutputsParams( - structural_tag=proto_params.structural_tag - ) - elif constraint_field == "json_object": - structured_outputs = StructuredOutputsParams( - json_object=proto_params.json_object - ) - elif constraint_field == "choice": - structured_outputs = StructuredOutputsParams( - choice=list(proto_params.choice.choices) - ) - - # Handle logit_bias - logit_bias = None - if proto_params.logit_bias: - logit_bias = dict(proto_params.logit_bias) - - # Create SamplingParams with detokenize=False and output_kind=DELTA - # detokenize=False: KEY OPTIMIZATION that skips detokenization! 
- # output_kind=DELTA: Return only new tokens in each chunk (for streaming) - return SamplingParams( - temperature=proto_params.temperature if proto_params.temperature > 0 else 1.0, - top_p=proto_params.top_p if proto_params.top_p > 0 else 1.0, - top_k=proto_params.top_k if proto_params.top_k > 0 else -1, - min_p=proto_params.min_p if proto_params.min_p > 0 else 0.0, - frequency_penalty=proto_params.frequency_penalty, - presence_penalty=proto_params.presence_penalty, - repetition_penalty=proto_params.repetition_penalty - if proto_params.repetition_penalty > 0 - else 1.0, - max_tokens=proto_params.max_tokens - if proto_params.HasField("max_tokens") - else None, - min_tokens=proto_params.min_tokens if proto_params.min_tokens > 0 else 0, - stop=stop, - stop_token_ids=stop_token_ids, - skip_special_tokens=proto_params.skip_special_tokens, - spaces_between_special_tokens=proto_params.spaces_between_special_tokens, - ignore_eos=proto_params.ignore_eos, - n=proto_params.n if proto_params.n > 0 else 1, - logprobs=proto_params.logprobs if proto_params.HasField("logprobs") else None, - prompt_logprobs=proto_params.prompt_logprobs - if proto_params.HasField("prompt_logprobs") - else None, - seed=proto_params.seed if proto_params.HasField("seed") else None, - include_stop_str_in_output=proto_params.include_stop_str_in_output, - logit_bias=logit_bias, - truncate_prompt_tokens=proto_params.truncate_prompt_tokens - if proto_params.HasField("truncate_prompt_tokens") - else None, - structured_outputs=structured_outputs, - # detokenize must be True if stop strings are used (SamplingParams validation) - detokenize=bool(stop), - output_kind=RequestOutputKind.DELTA if stream else RequestOutputKind.FINAL_ONLY, - ) From 1164f88cb55b18e09f274a603b39e8994f992d9a Mon Sep 17 00:00:00 2001 From: njhill Date: Mon, 5 Jan 2026 13:07:00 -0800 Subject: [PATCH 18/27] remove out-of-band Abort rbc Signed-off-by: njhill --- vllm/entrypoints/grpc_server.py | 25 ------------------------- vllm/grpc/vllm_engine.proto | 13 ------------- 2 files changed, 38 deletions(-) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index a12ae844db65..58598bb45eb0 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -49,7 +49,6 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer): - Generate: Streaming text generation - Embed: Embeddings (TODO) - HealthCheck: Health probe - - Abort: Cancel a request - GetModelInfo: Model metadata - GetServerInfo: Server state """ @@ -162,30 +161,6 @@ async def HealthCheck( return vllm_engine_pb2.HealthCheckResponse(healthy=is_healthy, message=message) - async def Abort( - self, - request: vllm_engine_pb2.AbortRequest, - context: grpc.aio.ServicerContext, - ) -> vllm_engine_pb2.AbortResponse: - """ - Handle abort requests. 
- - Args: - request: The AbortRequest protobuf - context: gRPC context - - Returns: - AbortResponse protobuf - """ - request_id = request.request_id - logger.info("Abort request for %s.", request_id) - - await self.async_llm.abort(request_id) - - return vllm_engine_pb2.AbortResponse( - success=True, message=f"Request {request_id} aborted" - ) - async def GetModelInfo( self, request: vllm_engine_pb2.GetModelInfoRequest, diff --git a/vllm/grpc/vllm_engine.proto b/vllm/grpc/vllm_engine.proto index f4e7934baa9b..853c09789d92 100644 --- a/vllm/grpc/vllm_engine.proto +++ b/vllm/grpc/vllm_engine.proto @@ -15,9 +15,6 @@ service VllmEngine { // Health check rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse); - // Abort a running request - rpc Abort(AbortRequest) returns (AbortResponse); - // Get model information rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse); @@ -183,16 +180,6 @@ message HealthCheckResponse { string message = 2; } -message AbortRequest { - string request_id = 1; - string reason = 2; -} - -message AbortResponse { - bool success = 1; - string message = 2; -} - // ===================== // Model and Server Info // ===================== From da8e8777687227c4038b810057c965c040019eaf Mon Sep 17 00:00:00 2001 From: njhill Date: Mon, 5 Jan 2026 13:47:51 -0800 Subject: [PATCH 19/27] also support text input Signed-off-by: njhill --- vllm/entrypoints/grpc_server.py | 15 +++++++++------ vllm/grpc/vllm_engine.proto | 11 +++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index 58598bb45eb0..65a4551fbb94 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -27,7 +27,7 @@ import grpc from grpc_reflection.v1alpha import reflection -from vllm import SamplingParams, TokensPrompt +from vllm import SamplingParams, TextPrompt, TokensPrompt from vllm.engine.arg_utils import AsyncEngineArgs from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc from vllm.logger import init_logger @@ -84,11 +84,14 @@ async def Generate( try: # Extract tokenized input - if not request.HasField("tokenized"): - raise ValueError("Missing tokenized input") - - prompt_token_ids = list(request.tokenized.input_ids) - prompt: TokensPrompt = {"prompt_token_ids": prompt_token_ids} + if request.WhichOneof("input") == "tokenized": + prompt: TokensPrompt = { + "prompt_token_ids": list(request.tokenized.input_ids) + } + if request.tokenized.original_text: + prompt["prompt"] = request.tokenized.original_text + else: + prompt: TextPrompt = {"prompt": request.text} # Build sampling params with detokenize=False sampling_params = self._sampling_params_from_proto( diff --git a/vllm/grpc/vllm_engine.proto b/vllm/grpc/vllm_engine.proto index 853c09789d92..a6ac012f65b7 100644 --- a/vllm/grpc/vllm_engine.proto +++ b/vllm/grpc/vllm_engine.proto @@ -87,14 +87,17 @@ message TokenizedInput { message GenerateRequest { string request_id = 1; - // Pre-tokenized input (required) - TokenizedInput tokenized = 2; + // Prompt input + oneof input { + TokenizedInput tokenized = 2; + string text = 3; + } // Generation parameters (includes logprobs config) - SamplingParams sampling_params = 3; + SamplingParams sampling_params = 4; // Streaming - bool stream = 4; + bool stream = 5; } // ===================== From aba398467c31d1ff2855a7e0d0dfe2214b2ab535 Mon Sep 17 00:00:00 2001 From: njhill Date: Mon, 5 Jan 2026 13:56:41 -0800 Subject: [PATCH 20/27] fix doc warnings Signed-off-by: njhill --- 
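With the input oneof introduced above, a request may carry raw text and let the server tokenize it instead of sending token IDs; a client-side sketch with placeholder values:

    from vllm.grpc import vllm_engine_pb2

    request = vllm_engine_pb2.GenerateRequest(
        request_id="demo-text-1",
        text="The capital of France is",
        sampling_params=vllm_engine_pb2.SamplingParams(temperature=0.0, max_tokens=8),
        stream=False,
    )
    assert request.WhichOneof("input") == "text"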
vllm/entrypoints/grpc_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index 65a4551fbb94..233b3eaec08c 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -58,7 +58,8 @@ def __init__(self, async_llm: AsyncLLM, start_time: float): Initialize the servicer. Args: - request_manager: The GrpcRequestManager instance + async_llm: The AsyncLLM instance + start_time: The server start time, in seconds since epoch """ self.async_llm = async_llm self.start_time = start_time From 0cdf3ea62f75d4fcdd9dbba8c08052c9c0b36fa7 Mon Sep 17 00:00:00 2001 From: njhill Date: Mon, 5 Jan 2026 15:18:18 -0800 Subject: [PATCH 21/27] param type and validation fixes Signed-off-by: njhill --- vllm/entrypoints/grpc_server.py | 22 ++++++++++++---------- vllm/grpc/vllm_engine.proto | 32 ++++++++++++++++---------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index 233b3eaec08c..700f4a0e871c 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -81,7 +81,7 @@ async def Generate( GenerateResponse protobuf messages (streaming) """ request_id = request.request_id - logger.info("Generate request %s received.", request_id) + logger.debug("Generate request %s received.", request_id) try: # Extract tokenized input @@ -113,8 +113,10 @@ async def Generate( if output.finished: yield self._complete_response(request_id, output) + except ValueError as e: + yield self._error_response(request_id, e) except Exception as e: - logger.exception("Error in Generate for %s", request_id) + logger.exception("Error in Generate for request %s", request_id) yield self._error_response(request_id, e) async def Embed( @@ -161,7 +163,7 @@ async def HealthCheck( is_healthy = not self.async_llm.errored message = "Health" if is_healthy else "Engine is not alive" - logger.info("HealthCheck request: healthy=%s, message=%s", is_healthy, message) + logger.debug("HealthCheck request: healthy=%s, message=%s", is_healthy, message) return vllm_engine_pb2.HealthCheckResponse(healthy=is_healthy, message=message) @@ -261,17 +263,17 @@ def _sampling_params_from_proto( # Create SamplingParams # output_kind=DELTA: Return only new tokens in each chunk (for streaming) return SamplingParams( - temperature=params.temperature if params.temperature > 0 else 1.0, - top_p=params.top_p if params.top_p > 0 else 1.0, - top_k=params.top_k if params.top_k > 0 else -1, - min_p=params.min_p if params.min_p > 0 else 0.0, + temperature=params.temperature if params.HasField("temperature") else 1.0, + top_p=params.top_p if params.top_p != 0.0 else 1.0, + top_k=params.top_k, + min_p=params.min_p, frequency_penalty=params.frequency_penalty, presence_penalty=params.presence_penalty, repetition_penalty=params.repetition_penalty - if params.repetition_penalty > 0 + if params.repetition_penalty != 0.0 else 1.0, max_tokens=params.max_tokens if params.HasField("max_tokens") else None, - min_tokens=params.min_tokens if params.min_tokens > 0 else 0, + min_tokens=params.min_tokens, stop=stop, stop_token_ids=stop_token_ids, skip_special_tokens=params.skip_special_tokens, @@ -496,7 +498,7 @@ def signal_handler(): logger.info("gRPC server stopped") # Shutdown AsyncLLM - await async_llm.shutdown() + async_llm.shutdown() logger.info("AsyncLLM engine stopped") logger.info("Shutdown complete") diff --git a/vllm/grpc/vllm_engine.proto 
b/vllm/grpc/vllm_engine.proto index a6ac012f65b7..7e17714c52b3 100644 --- a/vllm/grpc/vllm_engine.proto +++ b/vllm/grpc/vllm_engine.proto @@ -28,16 +28,16 @@ service VllmEngine { // Sampling parameters for text generation message SamplingParams { - float temperature = 1; + optional float temperature = 1; float top_p = 2; - int32 top_k = 3; + uint32 top_k = 3; float min_p = 4; float frequency_penalty = 5; float presence_penalty = 6; float repetition_penalty = 7; - optional int32 max_tokens = 8; - int32 min_tokens = 9; + optional uint32 max_tokens = 8; + uint32 min_tokens = 9; repeated string stop = 10; repeated uint32 stop_token_ids = 11; @@ -46,7 +46,7 @@ message SamplingParams { bool spaces_between_special_tokens = 13; bool ignore_eos = 14; - int32 n = 15; // Number of parallel samples + uint32 n = 15; // Number of parallel samples // Logprobs configuration optional int32 logprobs = 22; // Number of log probabilities per output token (-1 for all) @@ -116,9 +116,9 @@ message GenerateResponse { message GenerateStreamChunk { repeated uint32 token_ids = 1; // Incremental tokens - int32 prompt_tokens = 2; - int32 completion_tokens = 3; - int32 cached_tokens = 4; + uint32 prompt_tokens = 2; + uint32 completion_tokens = 3; + uint32 cached_tokens = 4; // Logprobs support (TODO: implement in Phase 4) // OutputLogProbs output_logprobs = 5; @@ -128,9 +128,9 @@ message GenerateStreamChunk { message GenerateComplete { repeated uint32 output_ids = 1; // All output tokens string finish_reason = 2; // "stop", "length", "abort" - int32 prompt_tokens = 3; - int32 completion_tokens = 4; - int32 cached_tokens = 5; + uint32 prompt_tokens = 3; + uint32 completion_tokens = 4; + uint32 cached_tokens = 5; // Logprobs support (TODO: implement in Phase 4) // OutputLogProbs output_logprobs = 6; @@ -163,8 +163,8 @@ message EmbedResponse { message EmbedComplete { repeated float embedding = 1; - int32 prompt_tokens = 2; - int32 embedding_dim = 3; + uint32 prompt_tokens = 2; + uint32 embedding_dim = 3; } message EmbedError { @@ -192,15 +192,15 @@ message GetModelInfoRequest {} message GetModelInfoResponse { string model_path = 1; bool is_generation = 2; - int32 max_context_length = 3; - int32 vocab_size = 4; + uint32 max_context_length = 3; + uint32 vocab_size = 4; bool supports_vision = 5; } message GetServerInfoRequest {} message GetServerInfoResponse { - int32 active_requests = 1; + uint32 active_requests = 1; bool is_paused = 2; double last_receive_timestamp = 3; double uptime_seconds = 4; From 079ffb0c6496576b22e02d52119e47aa975b570f Mon Sep 17 00:00:00 2001 From: njhill Date: Mon, 5 Jan 2026 15:18:53 -0800 Subject: [PATCH 22/27] add ci test Signed-off-by: njhill --- tests/entrypoints/test_grpc_server.py | 393 ++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 tests/entrypoints/test_grpc_server.py diff --git a/tests/entrypoints/test_grpc_server.py b/tests/entrypoints/test_grpc_server.py new file mode 100644 index 000000000000..76c6eea43e53 --- /dev/null +++ b/tests/entrypoints/test_grpc_server.py @@ -0,0 +1,393 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +End-to-end tests for the vLLM gRPC server. 
+""" + +import asyncio +import socket +import subprocess +import sys +import time + +import grpc +import pytest +import pytest_asyncio + +from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc + +# Use a small model for fast testing +MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" + + +def find_free_port() -> int: + """Find a free port on localhost.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + return port + + +async def wait_for_server(port: int, timeout: float = 30.0) -> bool: + """Wait for the gRPC server to be ready by trying health checks.""" + start_time = time.time() + print("waiting for server to start...") + while time.time() - start_time < timeout: + try: + channel = grpc.aio.insecure_channel(f"localhost:{port}") + stub = vllm_engine_pb2_grpc.VllmEngineStub(channel) + request = vllm_engine_pb2.HealthCheckRequest() + response = await stub.HealthCheck(request, timeout=5.0) + await channel.close() + if response.healthy: + print("server returned healthy=True") + return True + except Exception: + await asyncio.sleep(0.5) + return False + + +class GrpcServerProcess: + """Manages a gRPC server running in a subprocess.""" + + def __init__(self): + self.process: subprocess.Popen | None = None + self.port: int | None = None + + async def start(self): + """Start the gRPC server process.""" + self.port = find_free_port() + + # Start the server as a subprocess + self.process = subprocess.Popen( + [ + sys.executable, + "-m", + "vllm.entrypoints.grpc_server", + "--model", + MODEL_NAME, + "--host", + "localhost", + "--port", + str(self.port), + "--max-num-batched-tokens", + "512", + "--disable-log-stats-server", + ], + ) + + # Wait for server to be ready + if not await wait_for_server(self.port): + self.stop() + raise RuntimeError("gRPC server failed to start within timeout") + + def stop(self): + """Stop the gRPC server process.""" + if self.process: + self.process.terminate() + try: + self.process.wait(timeout=10) + except subprocess.TimeoutExpired: + self.process.kill() + self.process.wait() + + +@pytest_asyncio.fixture(scope="module") +async def grpc_server(): + """Fixture providing a running gRPC server in a subprocess.""" + server = GrpcServerProcess() + await server.start() + + yield server + + server.stop() + + +@pytest_asyncio.fixture +async def grpc_client(grpc_server): + """Fixture providing a gRPC client connected to the server.""" + channel = grpc.aio.insecure_channel(f"localhost:{grpc_server.port}") + stub = vllm_engine_pb2_grpc.VllmEngineStub(channel) + + yield stub + + await channel.close() + + +@pytest.mark.asyncio +async def test_health_check(grpc_client): + """Test the HealthCheck RPC.""" + request = vllm_engine_pb2.HealthCheckRequest() + response = await grpc_client.HealthCheck(request) + + assert response.healthy is True + assert response.message == "Health" + + +@pytest.mark.asyncio +async def test_get_model_info(grpc_client): + """Test the GetModelInfo RPC.""" + request = vllm_engine_pb2.GetModelInfoRequest() + response = await grpc_client.GetModelInfo(request) + + assert response.model_path == MODEL_NAME + assert response.is_generation is True + assert response.max_context_length > 0 + assert response.vocab_size > 0 + assert response.supports_vision is False + + +@pytest.mark.asyncio +async def test_get_server_info(grpc_client): + """Test the GetServerInfo RPC.""" + request = vllm_engine_pb2.GetServerInfoRequest() + response = await grpc_client.GetServerInfo(request) + + assert 
response.active_requests >= 0 + assert response.is_paused is False + assert response.uptime_seconds >= 0 + assert response.server_type == "vllm-grpc" + assert response.last_receive_timestamp > 0 + + +@pytest.mark.asyncio +async def test_generate_non_streaming(grpc_client): + """Test the Generate RPC in non-streaming mode.""" + # Create a simple request + request = vllm_engine_pb2.GenerateRequest( + request_id="test-non-streaming-1", + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="Hello, my name is", + input_ids=[15496, 11, 616, 1438, 318], # GPT-2 tokens for the prompt + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=0.0, + max_tokens=10, + n=1, + ), + stream=False, + ) + + # Collect all responses + responses = [] + async for response in grpc_client.Generate(request): + responses.append(response) + + # Should have exactly one response (complete) + assert len(responses) == 1 + + # Check the response + final_response = responses[0] + assert final_response.request_id == "test-non-streaming-1" + assert final_response.HasField("complete") + + complete = final_response.complete + assert len(complete.output_ids) > 0 + assert complete.finish_reason in ["stop", "length"] + assert complete.prompt_tokens > 0 + assert complete.completion_tokens > 0 + + +@pytest.mark.asyncio +async def test_generate_streaming(grpc_client): + """Test the Generate RPC in streaming mode.""" + request = vllm_engine_pb2.GenerateRequest( + request_id="test-streaming-1", + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="The capital of France is", + input_ids=[464, 3139, 286, 4881, 318], # GPT-2 tokens + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=0.0, + max_tokens=10, + n=1, + ), + stream=True, + ) + + # Collect all responses + chunks = [] + complete_response = None + + async for response in grpc_client.Generate(request): + assert response.request_id == "test-streaming-1" + + if response.HasField("chunk"): + chunks.append(response.chunk) + elif response.HasField("complete"): + complete_response = response.complete + elif response.HasField("error"): + pytest.fail(f"Unexpected error: {response.error.message}") + + # Should have received some chunks + assert len(chunks) >= 0 # May have 0 chunks if generation is very fast + + # Should have a final complete response + assert complete_response is not None + assert complete_response.finish_reason in ["stop", "length"] + assert complete_response.prompt_tokens > 0 + + # Verify chunk structure + for chunk in chunks: + assert chunk.prompt_tokens > 0 + assert chunk.completion_tokens >= 0 + + +@pytest.mark.asyncio +async def test_generate_with_different_sampling_params(grpc_client): + """Test Generate with various sampling parameters.""" + # Test with temperature + request = vllm_engine_pb2.GenerateRequest( + request_id="test-sampling-temp", + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="Hello", + input_ids=[15496], + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=0.8, + top_p=0.95, + max_tokens=5, + ), + stream=False, + ) + + responses = [r async for r in grpc_client.Generate(request)] + assert len(responses) == 1 + assert responses[0].HasField("complete") + + # Test with top_k + request = vllm_engine_pb2.GenerateRequest( + request_id="test-sampling-topk", + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="Hello", + input_ids=[15496], + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=1.0, + top_k=50, + max_tokens=5, + ), + stream=False, + ) + + responses = [r 
async for r in grpc_client.Generate(request)] + assert len(responses) == 1 + assert responses[0].HasField("complete") + + +@pytest.mark.asyncio +async def test_generate_with_stop_strings(grpc_client): + """Test Generate with stop strings.""" + request = vllm_engine_pb2.GenerateRequest( + request_id="test-stop-strings", + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="Hello", + input_ids=[15496], + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=0.0, + max_tokens=20, + stop=["\n", "END"], + ), + stream=False, + ) + + responses = [r async for r in grpc_client.Generate(request)] + assert len(responses) == 1 + assert responses[0].HasField("complete") + + complete = responses[0].complete + assert complete.finish_reason in ["stop", "length"] + + +@pytest.mark.asyncio +async def test_generate_multiple_requests(grpc_client): + """Test handling multiple concurrent Generate requests.""" + + async def make_request(request_id: str): + request = vllm_engine_pb2.GenerateRequest( + request_id=request_id, + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="Hello", + input_ids=[15496], + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=0.0, + max_tokens=5, + ), + stream=False, + ) + + responses = [r async for r in grpc_client.Generate(request)] + return responses[0] + + # Send multiple requests concurrently + tasks = [make_request(f"test-concurrent-{i}") for i in range(3)] + responses = await asyncio.gather(*tasks) + + # Verify all requests completed successfully + assert len(responses) == 3 + for i, response in enumerate(responses): + assert response.request_id == f"test-concurrent-{i}" + assert response.HasField("complete") + + +@pytest.mark.asyncio +async def test_generate_with_seed(grpc_client): + """Test Generate with a fixed seed for reproducibility.""" + + def make_request(request_id: str, seed: int): + return vllm_engine_pb2.GenerateRequest( + request_id=request_id, + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="The future of AI is", + input_ids=[464, 2003, 286, 9552, 318], + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=1.0, + max_tokens=10, + seed=seed, + ), + stream=False, + ) + + # Make two requests with the same seed + request1 = make_request("test-seed-1", 42) + request2 = make_request("test-seed-2", 42) + + response_list1 = [r async for r in grpc_client.Generate(request1)] + response_list2 = [r async for r in grpc_client.Generate(request2)] + + # Both should complete successfully + assert len(response_list1) == 1 + assert len(response_list2) == 1 + assert response_list1[0].HasField("complete") + assert response_list2[0].HasField("complete") + + # With the same seed, outputs should be identical + output_ids1 = list(response_list1[0].complete.output_ids) + output_ids2 = list(response_list2[0].complete.output_ids) + assert output_ids1 == output_ids2 + + +@pytest.mark.asyncio +async def test_generate_error_handling(grpc_client): + """Test error handling in Generate RPC.""" + # Request with missing tokenized input + request = vllm_engine_pb2.GenerateRequest( + request_id="test-error-missing-input", + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=0.0, + max_tokens=10, + top_p=-33, + ), + stream=False, + ) + + responses = [r async for r in grpc_client.Generate(request)] + + # Should receive an error response + assert len(responses) == 1 + assert responses[0].HasField("error") + assert "top_p must be in (0, 1], got -33.0" in responses[0].error.message From 
6efd639d1717f26623a57133cbc914d85b3cb18e Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 7 Jan 2026 09:19:49 -0800 Subject: [PATCH 23/27] add back oob abort rpc Signed-off-by: Nick Hill --- tests/entrypoints/test_grpc_server.py | 53 +++++++++++++++++++++++++++ vllm/entrypoints/grpc_server.py | 22 +++++++++++ vllm/grpc/vllm_engine.proto | 10 +++++ 3 files changed, 85 insertions(+) diff --git a/tests/entrypoints/test_grpc_server.py b/tests/entrypoints/test_grpc_server.py index 76c6eea43e53..7228dcc01fd3 100644 --- a/tests/entrypoints/test_grpc_server.py +++ b/tests/entrypoints/test_grpc_server.py @@ -391,3 +391,56 @@ async def test_generate_error_handling(grpc_client): assert len(responses) == 1 assert responses[0].HasField("error") assert "top_p must be in (0, 1], got -33.0" in responses[0].error.message + + +@pytest.mark.asyncio +async def test_abort_request(grpc_client): + """Test the out-of-band Abort RPC.""" + request_id = "test-abort-1" + + # Start a long-running streaming generate request + generate_request = vllm_engine_pb2.GenerateRequest( + request_id=request_id, + tokenized=vllm_engine_pb2.TokenizedInput( + original_text="Hello", + input_ids=[15496], + ), + sampling_params=vllm_engine_pb2.SamplingParams( + temperature=0.0, + min_tokens=500, + max_tokens=500, # Request many tokens to ensure it runs long enough + ), + stream=True, + ) + + # Track whether we were aborted + was_aborted = False + received_chunks = 0 + + async def run_generate(): + nonlocal was_aborted, received_chunks + async for response in grpc_client.Generate(generate_request): + if response.HasField("chunk"): + received_chunks += 1 + + if response.HasField("complete"): + complete = response.complete + was_aborted = complete.finish_reason == "abort" + else: + was_aborted = False + print(response) + + async def abort_after_delay(): + # Small delay to ensure generate has started + await asyncio.sleep(0.1) + abort_request = vllm_engine_pb2.AbortRequest(request_ids=[request_id]) + await grpc_client.Abort(abort_request) + + # Run generate and abort concurrently + await asyncio.gather(run_generate(), abort_after_delay()) + + # The request should have been aborted (received error with "aborted" message) + # and finished early due to the abort + assert was_aborted and received_chunks < 500, ( + "Request should have been aborted before generating all 500 tokens" + ) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index 700f4a0e871c..fa14b85ff084 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -49,6 +49,7 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer): - Generate: Streaming text generation - Embed: Embeddings (TODO) - HealthCheck: Health probe + - Abort: Cancel requests out-of-band - GetModelInfo: Model metadata - GetServerInfo: Server state """ @@ -167,6 +168,27 @@ async def HealthCheck( return vllm_engine_pb2.HealthCheckResponse(healthy=is_healthy, message=message) + async def Abort( + self, + request: vllm_engine_pb2.AbortRequest, + context: grpc.aio.ServicerContext, + ) -> vllm_engine_pb2.AbortResponse: + """ + Out-of-band abort requests. 
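+
+        Cancels one or more in-flight requests by id from a separate RPC,
+        rather than relying on the client cancelling its Generate stream.
+        A client might invoke it roughly as:
+
+            await stub.Abort(
+                vllm_engine_pb2.AbortRequest(request_ids=["req-1"])
+            )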
+ + Args: + request: The AbortRequest protobuf + context: gRPC context + + Returns: + AbortResponse protobuf + """ + request_ids = request.request_ids + logger.debug("Abort requests: %s", request_ids) + + await self.async_llm.abort(request_ids) + return vllm_engine_pb2.AbortResponse() + async def GetModelInfo( self, request: vllm_engine_pb2.GetModelInfoRequest, diff --git a/vllm/grpc/vllm_engine.proto b/vllm/grpc/vllm_engine.proto index 7e17714c52b3..ab0fd2544966 100644 --- a/vllm/grpc/vllm_engine.proto +++ b/vllm/grpc/vllm_engine.proto @@ -15,6 +15,9 @@ service VllmEngine { // Health check rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse); + // Abort a running request + rpc Abort(AbortRequest) returns (AbortResponse); + // Get model information rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse); @@ -183,6 +186,13 @@ message HealthCheckResponse { string message = 2; } +message AbortRequest { + repeated string request_ids = 1; +} + +message AbortResponse { +} + // ===================== // Model and Server Info // ===================== From 4165ba8b29873b738f2a083f3ce1c781aa0bb8ae Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 7 Jan 2026 12:23:14 -0800 Subject: [PATCH 24/27] streamline proto: remove custom err messages and req_id in responses Signed-off-by: Nick Hill --- tests/entrypoints/test_grpc_server.py | 47 +++++++--------------- vllm/entrypoints/grpc_server.py | 56 +++++---------------------- vllm/grpc/vllm_engine.proto | 27 +------------ 3 files changed, 26 insertions(+), 104 deletions(-) diff --git a/tests/entrypoints/test_grpc_server.py b/tests/entrypoints/test_grpc_server.py index 7228dcc01fd3..ada31707d482 100644 --- a/tests/entrypoints/test_grpc_server.py +++ b/tests/entrypoints/test_grpc_server.py @@ -179,7 +179,6 @@ async def test_generate_non_streaming(grpc_client): # Check the response final_response = responses[0] - assert final_response.request_id == "test-non-streaming-1" assert final_response.HasField("complete") complete = final_response.complete @@ -199,9 +198,7 @@ async def test_generate_streaming(grpc_client): input_ids=[464, 3139, 286, 4881, 318], # GPT-2 tokens ), sampling_params=vllm_engine_pb2.SamplingParams( - temperature=0.0, - max_tokens=10, - n=1, + temperature=0.0, max_tokens=10, n=1 ), stream=True, ) @@ -211,14 +208,10 @@ async def test_generate_streaming(grpc_client): complete_response = None async for response in grpc_client.Generate(request): - assert response.request_id == "test-streaming-1" - if response.HasField("chunk"): chunks.append(response.chunk) elif response.HasField("complete"): complete_response = response.complete - elif response.HasField("error"): - pytest.fail(f"Unexpected error: {response.error.message}") # Should have received some chunks assert len(chunks) >= 0 # May have 0 chunks if generation is very fast @@ -245,9 +238,7 @@ async def test_generate_with_different_sampling_params(grpc_client): input_ids=[15496], ), sampling_params=vllm_engine_pb2.SamplingParams( - temperature=0.8, - top_p=0.95, - max_tokens=5, + temperature=0.8, top_p=0.95, max_tokens=5 ), stream=False, ) @@ -264,9 +255,7 @@ async def test_generate_with_different_sampling_params(grpc_client): input_ids=[15496], ), sampling_params=vllm_engine_pb2.SamplingParams( - temperature=1.0, - top_k=50, - max_tokens=5, + temperature=1.0, top_k=50, max_tokens=5 ), stream=False, ) @@ -313,8 +302,7 @@ async def make_request(request_id: str): input_ids=[15496], ), sampling_params=vllm_engine_pb2.SamplingParams( - temperature=0.0, - max_tokens=5, 
+ temperature=0.0, max_tokens=5 ), stream=False, ) @@ -329,7 +317,6 @@ async def make_request(request_id: str): # Verify all requests completed successfully assert len(responses) == 3 for i, response in enumerate(responses): - assert response.request_id == f"test-concurrent-{i}" assert response.HasField("complete") @@ -345,9 +332,7 @@ def make_request(request_id: str, seed: int): input_ids=[464, 2003, 286, 9552, 318], ), sampling_params=vllm_engine_pb2.SamplingParams( - temperature=1.0, - max_tokens=10, - seed=seed, + temperature=1.0, max_tokens=10, seed=seed ), stream=False, ) @@ -374,23 +359,21 @@ def make_request(request_id: str, seed: int): @pytest.mark.asyncio async def test_generate_error_handling(grpc_client): """Test error handling in Generate RPC.""" - # Request with missing tokenized input + # Request with invalid top_p value (-33) request = vllm_engine_pb2.GenerateRequest( - request_id="test-error-missing-input", + request_id="test-error-invalid-topp", sampling_params=vllm_engine_pb2.SamplingParams( - temperature=0.0, - max_tokens=10, - top_p=-33, + temperature=0.0, max_tokens=10, top_p=-33 ), stream=False, ) - responses = [r async for r in grpc_client.Generate(request)] + # Should raise an error response + with pytest.raises(grpc.RpcError) as exc_info: + _ = [r async for r in grpc_client.Generate(request)] - # Should receive an error response - assert len(responses) == 1 - assert responses[0].HasField("error") - assert "top_p must be in (0, 1], got -33.0" in responses[0].error.message + assert exc_info.value.code() == grpc.StatusCode.INVALID_ARGUMENT + assert "top_p must be in (0, 1], got -33.0" in exc_info.value.details() @pytest.mark.asyncio @@ -439,8 +422,8 @@ async def abort_after_delay(): # Run generate and abort concurrently await asyncio.gather(run_generate(), abort_after_delay()) - # The request should have been aborted (received error with "aborted" message) - # and finished early due to the abort + # The request should have been aborted (received final chunk with + # "abort" finish reason) and finished early due to the abort. assert was_aborted and received_chunks < 500, ( "Request should have been aborted before generating all 500 tokens" ) diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py index fa14b85ff084..5e055e31e910 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -108,17 +108,18 @@ async def Generate( # Convert vLLM output to protobuf # For streaming, always send chunks if request.stream: - yield self._chunk_response(request_id, output) + yield self._chunk_response(output) # Send complete response when finished if output.finished: - yield self._complete_response(request_id, output) + yield self._complete_response(output) except ValueError as e: - yield self._error_response(request_id, e) + # Invalid request error (equiv to 400). 
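+            # (context.abort raises after sending the status, so the RPC
+            # terminates with INVALID_ARGUMENT instead of an in-band error)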
+ await context.abort(grpc.StatusCode.INVALID_ARGUMENT, str(e)) except Exception as e: logger.exception("Error in Generate for request %s", request_id) - yield self._error_response(request_id, e) + await context.abort(grpc.StatusCode.INTERNAL, str(e)) async def Embed( self, @@ -138,12 +139,8 @@ async def Embed( EmbedResponse protobuf """ logger.warning("Embed RPC not yet implemented") - return vllm_engine_pb2.EmbedResponse( - request_id=request.request_id, - error=vllm_engine_pb2.EmbedError( - message="Embed RPC not yet implemented", - code="NOT_IMPLEMENTED", - ), + await context.abort( + grpc.StatusCode.UNIMPLEMENTED, "Embed RPC not yet implemented" ) async def HealthCheck( @@ -321,15 +318,12 @@ def _sampling_params_from_proto( ) @staticmethod - def _chunk_response( - request_id: str, output: RequestOutput - ) -> vllm_engine_pb2.GenerateResponse: + def _chunk_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse: """ Build a streaming chunk response from vLLM output. When output_kind=DELTA, vLLM returns only new tokens automatically. Args: - request_id: The request ID output: vLLM RequestOutput (with delta tokens when output_kind=DELTA) Returns: @@ -341,7 +335,6 @@ def _chunk_response( if completion is None: # Empty chunk return vllm_engine_pb2.GenerateResponse( - request_id=request_id, chunk=vllm_engine_pb2.GenerateStreamChunk( token_ids=[], prompt_tokens=0, @@ -354,7 +347,6 @@ def _chunk_response( # vLLM handles the delta logic internally # completion_tokens = delta count (client will accumulate) return vllm_engine_pb2.GenerateResponse( - request_id=request_id, chunk=vllm_engine_pb2.GenerateStreamChunk( token_ids=completion.token_ids, prompt_tokens=len(output.prompt_token_ids) @@ -366,14 +358,11 @@ def _chunk_response( ) @staticmethod - def _complete_response( - request_id: str, output: RequestOutput - ) -> vllm_engine_pb2.GenerateResponse: + def _complete_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse: """ Build a final completion response from vLLM output. Args: - request_id: The request ID output: vLLM RequestOutput (finished=True) Returns: @@ -385,7 +374,6 @@ def _complete_response( if completion is None: # Empty completion return vllm_engine_pb2.GenerateResponse( - request_id=request_id, complete=vllm_engine_pb2.GenerateComplete( output_ids=[], finish_reason="error", @@ -400,7 +388,6 @@ def _complete_response( # When non-streaming (FINAL_ONLY mode): completion.token_ids has all tokens # Client will accumulate token counts for streaming return vllm_engine_pb2.GenerateResponse( - request_id=request_id, complete=vllm_engine_pb2.GenerateComplete( output_ids=completion.token_ids, finish_reason=completion.finish_reason or "stop", @@ -412,31 +399,6 @@ def _complete_response( ), ) - @staticmethod - def _error_response( - request_id: str, e: Exception - ) -> vllm_engine_pb2.GenerateResponse: - """ - Build an error response. 
- - Args: - request_id: The request ID - e: The exception from vLLM - - Returns: - GenerateResponse with error field set - """ - status_code = "400" if isinstance(e, ValueError) else "500" - - return vllm_engine_pb2.GenerateResponse( - request_id=request_id, - error=vllm_engine_pb2.GenerateError( - message=str(e), - http_status_code=status_code, - details="", - ), - ) - async def serve_grpc(args: argparse.Namespace): """ diff --git a/vllm/grpc/vllm_engine.proto b/vllm/grpc/vllm_engine.proto index ab0fd2544966..bbb1b9b00370 100644 --- a/vllm/grpc/vllm_engine.proto +++ b/vllm/grpc/vllm_engine.proto @@ -108,12 +108,9 @@ message GenerateRequest { // ===================== message GenerateResponse { - string request_id = 1; - oneof response { - GenerateStreamChunk chunk = 2; // For streaming - GenerateComplete complete = 3; // For final/non-streaming - GenerateError error = 4; // For errors + GenerateStreamChunk chunk = 1; // For streaming + GenerateComplete complete = 2; // For final/non-streaming } } @@ -140,12 +137,6 @@ message GenerateComplete { // InputLogProbs input_logprobs = 7; } -message GenerateError { - string message = 1; - string http_status_code = 2; - string details = 3; -} - // ===================== // Embedding Request // ===================== @@ -156,25 +147,11 @@ message EmbedRequest { } message EmbedResponse { - string request_id = 1; - - oneof response { - EmbedComplete complete = 2; - EmbedError error = 3; - } -} - -message EmbedComplete { repeated float embedding = 1; uint32 prompt_tokens = 2; uint32 embedding_dim = 3; } -message EmbedError { - string message = 1; - string code = 2; -} - // ===================== // Management Operations // ===================== From be3845ea9e89a7489ad5b67f31e4715effcb17d4 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 7 Jan 2026 12:42:12 -0800 Subject: [PATCH 25/27] update grpc version; use uvloop Signed-off-by: Nick Hill --- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/common.txt | 4 ++-- tests/entrypoints/test_grpc_server.py | 1 - vllm/entrypoints/grpc_server.py | 3 ++- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ad6fdd1fd14c..97651afeec82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ requires = [ "torch == 2.9.1", "wheel", "jinja2", - "grpcio-tools>=1.75.1", + "grpcio-tools>=1.76.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements/build.txt b/requirements/build.txt index 8c1dad4a20f4..893cf69c013f 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -9,4 +9,4 @@ wheel jinja2>=3.1.6 regex build -grpcio-tools>=1.75.1 +grpcio-tools>=1.76.0 diff --git a/requirements/common.txt b/requirements/common.txt index 977d9f5165b5..73e0edf03a24 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -52,5 +52,5 @@ openai-harmony >= 0.0.3 # Required for gpt-oss anthropic == 0.71.0 model-hosting-container-standards >= 0.1.10, < 1.0.0 mcp -grpcio>=1.75.1 -grpcio-reflection>=1.75.1 +grpcio>=1.76.0 +grpcio-reflection>=1.76.0 diff --git a/tests/entrypoints/test_grpc_server.py b/tests/entrypoints/test_grpc_server.py index ada31707d482..5fb55843f750 100644 --- a/tests/entrypoints/test_grpc_server.py +++ b/tests/entrypoints/test_grpc_server.py @@ -411,7 +411,6 @@ async def run_generate(): was_aborted = complete.finish_reason == "abort" else: was_aborted = False - print(response) async def abort_after_delay(): # Small delay to ensure generate has started diff --git a/vllm/entrypoints/grpc_server.py 
b/vllm/entrypoints/grpc_server.py index 5e055e31e910..2778385c9998 100755 --- a/vllm/entrypoints/grpc_server.py +++ b/vllm/entrypoints/grpc_server.py @@ -25,6 +25,7 @@ from collections.abc import AsyncGenerator import grpc +import uvloop from grpc_reflection.v1alpha import reflection from vllm import SamplingParams, TextPrompt, TokensPrompt @@ -520,7 +521,7 @@ def main(): # Run server try: - asyncio.run(serve_grpc(args)) + uvloop.run(serve_grpc(args)) except Exception as e: logger.exception("Server failed: %s", e) sys.exit(1) From 40cea773366e79b2fd2bd8d1327019691149e124 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 7 Jan 2026 17:42:58 -0800 Subject: [PATCH 26/27] update protobuf version requirements Signed-off-by: Nick Hill --- requirements/build.txt | 1 + requirements/common.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/build.txt b/requirements/build.txt index 893cf69c013f..b3ef0a71038f 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -9,4 +9,5 @@ wheel jinja2>=3.1.6 regex build +protobuf>=6.33.2 grpcio-tools>=1.76.0 diff --git a/requirements/common.txt b/requirements/common.txt index 73e0edf03a24..29d59c8db8df 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -9,7 +9,7 @@ blake3 py-cpuinfo transformers >= 4.56.0, < 5 tokenizers >= 0.21.1 # Required for fast incremental detokenization. -protobuf # Required by LlamaTokenizer. +protobuf >= 6.30.0 # Required by LlamaTokenizer, gRPC. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp openai >= 1.99.1 # For Responses API with reasoning content From 370c046f31db3c04e961640c8c767b2880874ee2 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 7 Jan 2026 19:10:08 -0800 Subject: [PATCH 27/27] also update text.txt version of protobuf Signed-off-by: Nick Hill --- requirements/test.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index 41882da9d31f..e78431ab39a4 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -297,7 +297,7 @@ graphql-relay==3.2.0 # via graphene greenlet==3.2.3 # via sqlalchemy -grpcio==1.71.0 +grpcio==1.76.0 # via ray gunicorn==23.0.0 # via mlflow @@ -758,7 +758,7 @@ propcache==0.2.0 # yarl proto-plus==1.26.1 # via google-api-core -protobuf==5.28.3 +protobuf==6.33.2 # via # google-api-core # googleapis-common-protos @@ -1249,6 +1249,7 @@ typing-extensions==4.15.0 # chz # fastapi # graphene + # grpcio # huggingface-hub # librosa # lightning