diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md
index baaf21f01f06..3a8f2e39b83e 100644
--- a/docs/configuration/serve_args.md
+++ b/docs/configuration/serve_args.md
@@ -21,6 +21,11 @@ model: meta-llama/Llama-3.1-8B-Instruct
 host: "127.0.0.1"
 port: 6379
 uvicorn-log-level: "info"
+# Optional: log request metadata at INFO (see docs/usage/troubleshooting.md)
+# enable-log-requests: true
+# Optional: truncated prompt snippets at INFO; requires enable-log-requests.
+# Off by default for security (see docs/usage/security.md).
+# enable-log-request-prompts: true
 ```
 
 To use the above config file:
diff --git a/docs/usage/security.md b/docs/usage/security.md
index 4879ddbf64ef..d3b991edf9b8 100644
--- a/docs/usage/security.md
+++ b/docs/usage/security.md
@@ -85,6 +85,20 @@ significantly reduce the attack surface for these types of abuse.
 
 Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP redirects from being followed to bypass domain restrictions.
 
+## Logging and client payloads
+
+For `vllm serve`, `--enable-log-requests` records request metadata at INFO (for
+example sampling parameters). By design it does **not** include raw prompt
+content at INFO: prompts can contain secrets or PII, and logs are often copied
+to less-controlled systems.
+
+To debug prompts without raising the global log level, you can:
+
+- Set `VLLM_LOGGING_LEVEL=DEBUG` for full prompt details in logs, **or**
+- Explicitly pass `--enable-log-request-prompts` **in addition to**
+  `--enable-log-requests` for **truncated** prompt summaries at INFO (still
+  treat log sinks as sensitive if you enable this).
+
 ## Security and Firewalls: Protecting Exposed vLLM Systems
 
 While vLLM is designed to allow unsafe network services to be isolated to
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index dc1cd89f8209..e42c25d66c3b 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -35,6 +35,8 @@ You can check if this is happening by trying the old defaults with `--generation
 If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:
 
 - `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging.
+- For the OpenAI-compatible server, `--enable-log-requests` logs request IDs and sampling parameters at INFO. **Full** prompt inputs (text, token IDs, shapes) appear only at DEBUG unless you also opt in to bounded INFO previews (see below).
+- `--enable-log-request-prompts` (requires `--enable-log-requests`) adds **truncated** prompt summaries to INFO lines. It is **off by default** because prompts can contain sensitive data that may end up in log aggregation systems. See [Security](security.md#logging-and-client-payloads).
 - `export VLLM_LOG_STATS_INTERVAL=1.` to get log statistics more frequently for tracking running queue, waiting queue and cache hit states.
 - `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
 - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
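The documentation above describes the intended behavior of the two flags. As a quick reviewer-side illustration (not part of the patch), the sketch below drives `RequestLogger` directly with and without the opt-in, mirroring the unit test added in `tests/test_logger.py` further down; it assumes a build that includes this change and vLLM's default minimum log level of INFO.

```python
# Illustrative sketch only, not part of this patch. Assumes the
# `log_prompts_at_info` parameter added to RequestLogger by this change
# and the default minimum log level of INFO.
from vllm.entrypoints.logger import RequestLogger

# Default behavior: the INFO line carries request ID, params and LoRA info,
# but no prompt text (the preview slot stays empty).
RequestLogger(max_log_len=None).log_inputs(
    request_id="req-1",
    prompt="my secret prompt",
    prompt_token_ids=None,
    prompt_embeds=None,
    params=None,
    lora_request=None,
)

# Opt-in behavior: a truncated prompt preview is appended to the same INFO line.
RequestLogger(max_log_len=None, log_prompts_at_info=True).log_inputs(
    request_id="req-2",
    prompt="my secret prompt",
    prompt_token_ids=None,
    prompt_embeds=None,
    params=None,
    lora_request=None,
)
```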
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 58dd328b325a..ac1260c8c5f7 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -206,6 +206,19 @@ def test_chat_template_validation_for_sad_paths(serve_parser):
         validate_parsed_serve_args(args)
 
 
+def test_enable_log_request_prompts_requires_enable_log_requests(serve_parser):
+    args = serve_parser.parse_args(args=["--enable-log-request-prompts"])
+    with pytest.raises(TypeError, match="--enable-log-request-prompts"):
+        validate_parsed_serve_args(args)
+
+
+def test_enable_log_request_prompts_passes_with_log_requests(serve_parser):
+    args = serve_parser.parse_args(
+        args=["--enable-log-requests", "--enable-log-request-prompts"]
+    )
+    validate_parsed_serve_args(args)
+
+
 @pytest.mark.parametrize(
     "cli_args, expected_middleware",
     [
diff --git a/tests/test_logger.py b/tests/test_logger.py
index b4f44f52d4df..e444336fe9ba 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -472,11 +472,35 @@ def test_request_logger_log_outputs_integration():
     assert "Received request %s" in input_call[0]
     assert input_call[1] == "test-integration"
 
+    # Prompts at INFO require explicit --enable-log-request-prompts (security).
+    assert input_call[4] == ""
+
     assert "Generated response %s%s" in output_call[0]
     assert output_call[1] == "test-integration"
 
 
+def test_request_logger_log_inputs_prompt_at_info_when_opt_in():
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(
+            max_log_len=None, log_prompts_at_info=True
+        )
+        request_logger.log_inputs(
+            request_id="test-prompt-info",
+            prompt="Hello",
+            prompt_token_ids=None,
+            prompt_embeds=None,
+            params=None,
+            lora_request=None,
+        )
+
+    mock_logger.info.assert_called_once()
+    input_call = mock_logger.info.call_args[0]
+    assert input_call[1] == "test-prompt-info"
+    assert "Hello" in input_call[4]
+
+
 def test_streaming_complete_logs_full_text_content():
     """Test that streaming complete logging includes full accumulated text,
     not just token count."""
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0c9cf2ae9b14..6013d0cf37ee 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2280,8 +2280,9 @@ def add_cli_args(
             action=argparse.BooleanOptionalAction,
             default=AsyncEngineArgs.enable_log_requests,
             help="Enable logging request information, dependent on log level:\n"
-            "- INFO: Request ID, parameters and LoRA request.\n"
-            "- DEBUG: Prompt inputs (e.g: text, token IDs).\n"
+            "- INFO: Request ID, parameters and LoRA request (and truncated "
+            "prompt inputs if `--enable-log-request-prompts` is set).\n"
+            "- DEBUG: Full prompt inputs (e.g. text, token IDs).\n"
             "You can set the minimum log level via `VLLM_LOGGING_LEVEL`.",
         )
         current_platform.pre_register_and_update(parser)
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index c2a77fbb4e56..233270df2a9f 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -13,10 +13,22 @@
 
 logger = init_logger(__name__)
 
+# With `--enable-log-request-prompts`, INFO logs may include a bounded prompt
+# preview when `--max-log-len` is unset. Full inputs remain at DEBUG.
+# See github.com/vllm-project/vllm/issues/38537.
+_DEFAULT_INFO_PROMPT_STR_LEN = 4096
+_DEFAULT_INFO_PROMPT_TOKEN_IDS = 512
+
 
 class RequestLogger:
-    def __init__(self, *, max_log_len: int | None) -> None:
+    def __init__(
+        self,
+        *,
+        max_log_len: int | None,
+        log_prompts_at_info: bool = False,
+    ) -> None:
         self.max_log_len = max_log_len
+        self.log_prompts_at_info = log_prompts_at_info
 
         if not logger.isEnabledFor(logging.INFO):
             logger.warning_once(
@@ -24,6 +36,13 @@ def __init__(self, *, max_log_len: int | None) -> None:
                 "the minimum log level is higher than INFO. "
                 "No request information will be logged."
             )
+        elif self.log_prompts_at_info and not logger.isEnabledFor(logging.DEBUG):
+            logger.info_once(
+                "`--enable-log-request-prompts` is set but "
+                "the minimum log level is higher than DEBUG. "
+                "Prompt details at INFO are truncated when long; "
+                "set `VLLM_LOGGING_LEVEL=DEBUG` for full details."
+            )
         elif not logger.isEnabledFor(logging.DEBUG):
             logger.info_once(
                 "`--enable-log-requests` is set but "
@@ -32,6 +51,35 @@ def __init__(self, *, max_log_len: int | None) -> None:
                 "To view more details, set `VLLM_LOGGING_LEVEL=DEBUG`."
             )
 
+    def _prompt_summary_for_info(
+        self,
+        prompt: str | None,
+        prompt_token_ids: list[int] | None,
+        prompt_embeds: torch.Tensor | None,
+    ) -> str:
+        if not self.log_prompts_at_info or not logger.isEnabledFor(logging.INFO):
+            return ""
+
+        max_chars = (
+            self.max_log_len
+            if self.max_log_len is not None
+            else _DEFAULT_INFO_PROMPT_STR_LEN
+        )
+        max_ids = (
+            self.max_log_len
+            if self.max_log_len is not None
+            else _DEFAULT_INFO_PROMPT_TOKEN_IDS
+        )
+        if prompt is not None:
+            preview = prompt[:max_chars]
+            return f", prompt: {preview!r}"
+        if prompt_token_ids is not None:
+            preview_ids = prompt_token_ids[:max_ids]
+            return f", prompt_token_ids: {preview_ids}"
+        if prompt_embeds is not None:
+            return f", prompt_embeds: shape={prompt_embeds.shape}"
+        return ""
+
     def log_inputs(
         self,
         request_id: str,
@@ -60,11 +108,15 @@ def log_inputs(
             prompt_embeds.shape if prompt_embeds is not None else None,
         )
 
+        prompt_summary = self._prompt_summary_for_info(
+            prompt, prompt_token_ids, prompt_embeds
+        )
         logger.info(
-            "Received request %s: params: %s, lora_request: %s.",
+            "Received request %s: params: %s, lora_request: %s%s.",
             request_id,
             params,
             lora_request,
+            prompt_summary,
         )
 
     def log_outputs(
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 95e831b51ec0..2f56565314df 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -331,7 +331,10 @@ async def init_app_state(
     served_model_names = [args.model]
 
     if args.enable_log_requests:
-        request_logger = RequestLogger(max_log_len=args.max_log_len)
+        request_logger = RequestLogger(
+            max_log_len=args.max_log_len,
+            log_prompts_at_info=args.enable_log_request_prompts,
+        )
     else:
         request_logger = None
 
@@ -445,7 +448,10 @@ async def init_render_app_state(
     )
 
     if args.enable_log_requests:
-        request_logger = RequestLogger(max_log_len=args.max_log_len)
+        request_logger = RequestLogger(
+            max_log_len=args.max_log_len,
+            log_prompts_at_info=args.enable_log_request_prompts,
+        )
     else:
         request_logger = None
 
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 2bd991b0010e..57f717fe632e 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -145,6 +145,13 @@ class BaseFrontendArgs:
     """If set to True, log model outputs (generations).
     Requires `--enable-log-requests`.
     As with `--enable-log-requests`, information is only logged at INFO level at maximum."""
+    enable_log_request_prompts: bool = False
+    """If set to True, include truncated prompt inputs (text, token ids, or
+    embedding tensor shape) in INFO-level request logs when
+    `--enable-log-requests` is set. **Off by default:** logging client
+    payloads can expose sensitive data in log sinks. Requires
+    `--enable-log-requests`. Use `VLLM_LOGGING_LEVEL=DEBUG` for full prompt
+    details without this flag."""
     enable_log_deltas: bool = True
     """If set to False, output deltas will not be logged.
     Relevant only if --enable-log-outputs is set.
@@ -364,6 +371,13 @@ def validate_parsed_serve_args(args: argparse.Namespace):
         raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser")
     if args.enable_log_outputs and not args.enable_log_requests:
         raise TypeError("Error: --enable-log-outputs requires --enable-log-requests")
+    if (
+        getattr(args, "enable_log_request_prompts", False)
+        and not args.enable_log_requests
+    ):
+        raise TypeError(
+            "Error: --enable-log-request-prompts requires --enable-log-requests"
+        )
 
 
 def create_parser_for_docs() -> FlexibleArgumentParser:
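One last note on sizing (again, not part of the patch): with `--max-log-len` unset, the INFO preview falls back to the module-level defaults introduced in `vllm/entrypoints/logger.py` (4096 characters or 512 token IDs), while an explicit `--max-log-len` caps both. The sketch below pokes at the private helper purely to illustrate that bound; it assumes a build with this change applied and the default INFO log level.

```python
# Illustrative sketch only, not part of this patch. `_prompt_summary_for_info`
# is a private helper introduced above; production code reaches it through
# log_inputs(), so this is for demonstrating the truncation bound only.
from vllm.entrypoints.logger import RequestLogger

long_prompt = "x" * 10_000

# No --max-log-len: the prompt preview is capped at
# _DEFAULT_INFO_PROMPT_STR_LEN (4096 characters).
default_bound = RequestLogger(max_log_len=None, log_prompts_at_info=True)
print(len(default_bound._prompt_summary_for_info(long_prompt, None, None)))

# --max-log-len 64: the same limit also caps the INFO preview.
explicit_bound = RequestLogger(max_log_len=64, log_prompts_at_info=True)
print(explicit_bound._prompt_summary_for_info(long_prompt, None, None))
```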