From d8b059924f6d420f043d1927612a7702d3c51d8e Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 15 Jul 2025 11:53:55 +0100
Subject: [PATCH 1/4] Add full serve CLI reference back to docs

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/cli/README.md                     |  8 +++++++
 docs/mkdocs/hooks/generate_argparse.py | 23 ++++++++++++++++---
 requirements/docs.txt                  |  1 +
 vllm/entrypoints/cli/serve.py          | 31 --------------------------
 vllm/entrypoints/openai/cli_args.py    | 28 +++++++++++++++++++++++
 5 files changed, 57 insertions(+), 34 deletions(-)

diff --git a/docs/cli/README.md b/docs/cli/README.md
index 1d951747a7ac..fe43de50d534 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -1,3 +1,7 @@
+---
+toc_depth: 3
+---
+
 # vLLM CLI Guide
 
 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@@ -42,6 +46,10 @@ Start the vLLM OpenAI Compatible API server.
     vllm serve --help=page
     ```
 
+### help
+
+--8<-- "docs/argparse/serve.md"
+
 ## chat
 
 Generate chat completions via the running API server.
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index 64120f2d1513..22cf41e6041d 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -16,6 +16,7 @@
 sys.modules["vllm._C"] = MagicMock()
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
+from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
 from vllm.utils import FlexibleArgumentParser  # noqa: E402
 
 logger = logging.getLogger("mkdocs")
@@ -24,15 +25,18 @@
 class MarkdownFormatter(HelpFormatter):
     """Custom formatter that generates markdown for argument groups."""
 
-    def __init__(self, prog):
+    def __init__(self, prog, starting_heading_level=3):
         super().__init__(prog,
                          max_help_position=float('inf'),
                          width=float('inf'))
+        self._section_heading_prefix = "#" * starting_heading_level
+        self._argument_heading_prefix = "#" * (starting_heading_level + 1)
         self._markdown_output = []
 
     def start_section(self, heading):
         if heading not in {"positional arguments", "options"}:
-            self._markdown_output.append(f"\n### {heading}\n\n")
+            heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
+            self._markdown_output.append(heading_md)
 
     def end_section(self):
         pass
@@ -46,9 +50,13 @@ def add_usage(self, usage, actions, groups, prefix=None):
 
     def add_arguments(self, actions):
         for action in actions:
+            if (len(action.option_strings) == 0
+                    or "--help" in action.option_strings):
+                continue
 
             option_strings = f'`{"`, `".join(action.option_strings)}`'
-            self._markdown_output.append(f"#### {option_strings}\n\n")
+            heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
+            self._markdown_output.append(heading_md)
 
             if choices := action.choices:
                 choices = f'`{"`, `".join(str(c) for c in choices)}`'
@@ -81,6 +89,14 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
         return cls.add_cli_args(parser, **kwargs)
 
 
+def create_serve_parser() -> FlexibleArgumentParser:
+    """Create the markdown `serve` parser; section headings start at h4."""
+    parser = FlexibleArgumentParser()
+    parser.formatter_class = lambda prog: MarkdownFormatter(
+        prog, starting_heading_level=4)
+    return make_arg_parser(parser)
+
+
 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     logger.info("Generating argparse documentation")
     logger.debug("Root directory: %s", ROOT_DIR.resolve())
@@ -95,6 +111,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         "engine_args": create_parser(EngineArgs),
         "async_engine_args": create_parser(AsyncEngineArgs,
                                            async_args_only=True),
+        "serve": create_serve_parser(),
     }
 
     # Generate documentation for each parser
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 7ea768b99093..1ddc825a9cdd 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -17,6 +17,7 @@ cloudpickle
 fastapi
 msgspec
 openai
+partial-json-parser
 pillow
 psutil
 pybase64
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index d25105cbb789..1204ccc1c679 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -67,37 +67,6 @@ def subparser_init(
             help="Start the vLLM OpenAI Compatible API server.",
             description="Start the vLLM OpenAI Compatible API server.",
             usage="vllm serve [model_tag] [options]")
-        serve_parser.add_argument("model_tag",
-                                  type=str,
-                                  nargs='?',
-                                  help="The model tag to serve "
-                                  "(optional if specified in config)")
-        serve_parser.add_argument(
-            "--headless",
-            action='store_true',
-            default=False,
-            help="Run in headless mode. See multi-node data parallel "
-            "documentation for more details.")
-        serve_parser.add_argument(
-            '--data-parallel-start-rank',
-            '-dpr',
-            type=int,
-            default=0,
-            help="Starting data parallel rank for secondary nodes. "
-            "Requires --headless.")
-        serve_parser.add_argument('--api-server-count',
-                                  '-asc',
-                                  type=int,
-                                  default=1,
-                                  help='How many API server processes to run.')
-        serve_parser.add_argument(
-            "--config",
-            type=str,
-            default='',
-            required=False,
-            help="Read CLI options from a config file. "
-            "Must be a YAML with the following options: "
-            "https://docs.vllm.ai/en/latest/configuration/serve_args.html")
 
         serve_parser = make_arg_parser(serve_parser)
         show_filtered_argument_or_group_from_help(serve_parser, ["serve"])
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 9a7f04cd9b26..6688bf2656f6 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -248,6 +248,34 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     register all arguments instead of manually enumerating them here. This
     avoids code duplication and keeps the argument definitions in one place.
     """
+    parser.add_argument("model_tag",
+                        type=str,
+                        nargs="?",
+                        help="The model tag to serve "
+                        "(optional if specified in config)")
+    parser.add_argument(
+        "--headless",
+        action="store_true",
+        default=False,
+        help="Run in headless mode. See multi-node data parallel "
+        "documentation for more details.")
+    parser.add_argument(
+        "--data-parallel-start-rank",
+        "-dpr",
+        type=int,
+        default=0,
+        help="Starting data parallel rank for secondary nodes. "
+        "Requires --headless.")
+    parser.add_argument("--api-server-count",
+                        "-asc",
+                        type=int,
+                        default=1,
+                        help="How many API server processes to run.")
+    parser.add_argument(
+        "--config",
+        help="Read CLI options from a config file. "
+        "Must be a YAML with the following options: "
+        "https://docs.vllm.ai/en/latest/configuration/serve_args.html")
     parser = FrontendArgs.add_cli_args(parser)
     parser = AsyncEngineArgs.add_cli_args(parser)
 

From 62c43d4df262a8b32c229255d75fa08a70c1d066 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:25:40 +0200
Subject: [PATCH 2/4] Show argument groups in TOC

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/cli/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cli/README.md b/docs/cli/README.md
index fe43de50d534..1f86a26bddc7 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -1,5 +1,5 @@
 ---
-toc_depth: 3
+toc_depth: 4
 ---
 
 # vLLM CLI Guide

From ce82cb5d0b2d5ac82794d9a507c7a7e3f3f0141c Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:25:55 +0200
Subject: [PATCH 3/4] Rename `help` -> `Options`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/cli/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cli/README.md b/docs/cli/README.md
index 1f86a26bddc7..dfb6051a8c8a 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -46,7 +46,7 @@ Start the vLLM OpenAI Compatible API server.
     vllm serve --help=page
     ```
 
-### help
+### Options
 
 --8<-- "docs/argparse/serve.md"
 

From 0fa773b58fc0df121239c6e904aca977f2406cb6 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:26:07 +0200
Subject: [PATCH 4/4] Link to CLI reference from `serve_args.md`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/configuration/serve_args.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md
index 142d4b8af898..c1cc5577bc7a 100644
--- a/docs/configuration/serve_args.md
+++ b/docs/configuration/serve_args.md
@@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
 ## CLI Arguments
 
 The `vllm serve` command is used to launch the OpenAI-compatible server.
-To see the available CLI arguments, run `vllm serve --help`!
+To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
 
 ## Configuration file