Merged
4 changes: 3 additions & 1 deletion docs/guides/server.md
@@ -2,6 +2,8 @@

vllm-mlx provides a FastAPI server with full OpenAI API compatibility.

+By default the server binds only to `127.0.0.1`. Use `--host 0.0.0.0` only when you intentionally want to expose it beyond the local machine.

## Starting the Server

### Simple Mode (Default)
@@ -33,7 +35,7 @@ vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous
| Option | Description | Default |
|--------|-------------|---------|
| `--port` | Server port | 8000 |
-| `--host` | Server host | 0.0.0.0 |
+| `--host` | Server host | 127.0.0.1 |
| `--api-key` | API key for authentication | None |
| `--rate-limit` | Requests per minute per client (0 = disabled) | 0 |
| `--timeout` | Request timeout in seconds | 300 |
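What the documented default actually means at the network level can be illustrated with a small socket sketch. This is not vllm-mlx code — just an assumption-free demonstration of the difference between binding the loopback address and the wildcard address:

```python
import socket

def bind_listener(host: str, port: int = 0) -> socket.socket:
    """Bind a TCP listening socket; port 0 lets the OS pick a free port."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((host, port))
    sock.listen(1)
    return sock

# --host 127.0.0.1 attaches the listener to the loopback interface only,
# so connections can originate from this machine alone.
local_only = bind_listener("127.0.0.1")
print(local_only.getsockname()[0])  # -> 127.0.0.1
local_only.close()

# --host 0.0.0.0 would instead bind the wildcard address: every interface,
# including any externally reachable ones.
```

Hence the new default is the conservative choice, and exposing the server is an explicit opt-in.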
2 changes: 1 addition & 1 deletion docs/reference/cli.md
@@ -24,7 +24,7 @@ vllm-mlx serve <model> [options]
|--------|-------------|---------|
| `--served-model-name` | Custom model name exposed through the OpenAI API. If not set, the model path is used as the name. | None |
| `--port` | Server port | 8000 |
-| `--host` | Server host | 0.0.0.0 |
+| `--host` | Server host | 127.0.0.1 |
| `--api-key` | API key for authentication | None |
| `--rate-limit` | Requests per minute per client (0 = disabled) | 0 |
| `--timeout` | Request timeout in seconds | 300 |
2 changes: 1 addition & 1 deletion docs/reference/configuration.md
@@ -6,7 +6,7 @@

| Option | Description | Default |
|--------|-------------|---------|
-| `--host` | Server host address | `0.0.0.0` |
+| `--host` | Server host address | `127.0.0.1` |
| `--port` | Server port | `8000` |
| `--max-tokens` | Default max tokens | `32768` |
| `--default-temperature` | Default temperature when not specified in request | None |
17 changes: 17 additions & 0 deletions tests/test_server.py
@@ -190,6 +190,23 @@ def test_trust_remote_code_flag_defaults_false(self):
)
assert args.trust_remote_code is True

def test_host_defaults_to_localhost(self):
"""Serve parsers should bind only to localhost unless overridden."""
from vllm_mlx.cli import create_parser as create_cli_parser
from vllm_mlx.server import create_parser as create_server_parser

cli_parser = create_cli_parser()
cli_args = cli_parser.parse_args(
["serve", "mlx-community/Llama-3.2-3B-Instruct-4bit"]
)
assert cli_args.host == "127.0.0.1"

server_parser = create_server_parser()
server_args = server_parser.parse_args(
["--model", "mlx-community/Llama-3.2-3B-Instruct-4bit"]
)
assert server_args.host == "127.0.0.1"

def test_tool_call_parser_accepts_harmony_aliases(self):
"""GPT-OSS/Harmony parsers should be selectable from the serve CLI."""
from vllm_mlx.cli import create_parser
5 changes: 4 additions & 1 deletion vllm_mlx/cli.py
@@ -671,7 +671,10 @@ def create_parser() -> argparse.ArgumentParser:
help="The model name used in the API. If not specified, the model argument is used.",
)
serve_parser.add_argument(
"--host", type=str, default="0.0.0.0", help="Host to bind"
"--host",
type=str,
default="127.0.0.1",
help="Host to bind (default: localhost; use 0.0.0.0 to expose externally)",
)
serve_parser.add_argument("--port", type=int, default=8000, help="Port to bind")
serve_parser.add_argument(
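The argparse change above can be sketched in isolation. This is a minimal stand-in parser mirroring only the options touched by this change (names and help text copied from the diff), not the project's actual `create_parser`:

```python
import argparse

def make_parser() -> argparse.ArgumentParser:
    """Minimal stand-in mirroring the serve options touched by this change."""
    parser = argparse.ArgumentParser(prog="serve-sketch")
    parser.add_argument(
        "--host",
        type=str,
        default="127.0.0.1",
        help="Host to bind (default: localhost; use 0.0.0.0 to expose externally)",
    )
    parser.add_argument("--port", type=int, default=8000, help="Port to bind")
    return parser

# The default is loopback; an explicit --host 0.0.0.0 opts in to exposure.
print(make_parser().parse_args([]).host)                     # -> 127.0.0.1
print(make_parser().parse_args(["--host", "0.0.0.0"]).host)  # -> 0.0.0.0
```

Because only the `default=` value changes, existing deployments that already pass `--host 0.0.0.0` explicitly are unaffected.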
4 changes: 2 additions & 2 deletions vllm_mlx/server.py
@@ -4335,8 +4335,8 @@ def create_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--host",
type=str,
default="0.0.0.0",
help="Host to bind to",
default="127.0.0.1",
help="Host to bind to (default: localhost; use 0.0.0.0 to expose externally)",
)
parser.add_argument(
"--port",