From d59a5419a59a48b9c7ca6e722aadbe388275c80d Mon Sep 17 00:00:00 2001
From: Frost Ming <me@frostming.com>
Date: Fri, 12 Sep 2025 08:47:00 +0800
Subject: [PATCH 1/4] feat: implement reverse proxy for custom command services

Signed-off-by: Frost Ming <me@frostming.com>
---
 docs/source/build-with-bentoml/services.rst   |  23 ++-
 src/_bentoml_impl/server/app.py               |  29 +++-
 src/_bentoml_impl/server/proxy.py             | 129 ++++++++++++++
 src/_bentoml_impl/server/serving.py           | 161 +++++++-----------
 src/_bentoml_sdk/service/config.py            |   1 +
 .../_internal/configuration/v2/__init__.py    |   1 +
 .../v2/default_configuration.yaml             |   1 +
 src/bentoml/_internal/server/base_app.py      |  15 ++
 .../_internal/server/http/instruments.py      |  22 +--
 9 files changed, 257 insertions(+), 125 deletions(-)
 create mode 100644 src/_bentoml_impl/server/proxy.py

diff --git a/docs/source/build-with-bentoml/services.rst b/docs/source/build-with-bentoml/services.rst
index 822e136e79f..3defd206716 100644
--- a/docs/source/build-with-bentoml/services.rst
+++ b/docs/source/build-with-bentoml/services.rst
@@ -306,7 +306,28 @@ Alternatively, compute the command at runtime:
 
 Use this method when there are parameters whose values can only be determined at runtime.
 
-When a custom command is provided, BentoML will launch a single process for that Service using your command. It will set the ``PORT`` environment variable (and ``BENTOML_HOST``/``BENTOML_PORT`` for the entry Service). Your process must listen on the provided ``PORT`` and serve HTTP endpoints. Server-level options like CORS/SSL/timeouts defined in BentoML won't apply automatically—configure them in your own server if needed.
+BentoML operates by establishing a proxy service that directs all requests to the HTTP server initiated by the custom command. The default proxy port is ``8000``, specify a different one if the custom command is listening on another port:
+
+.. code-block:: python
+
+    @bentoml.service(cmd=["myserver", "--port", "$PORT"], http={"proxy_port": 9000})
+    class ExternalServer:
+        pass
+
+Metrics Rewriting
+-----------------
+
+When starting a server with a custom command, it can be helpful to include metrics from that server. Alternatively, you can modify the metrics provided by the Prometheus exporter.
+To achieve this, you can implement the ``__metrics__`` method in your Service class. This method takes the original metrics text as input and returns the modified metrics text:
+
+.. code-block:: python
+
+    @bentoml.service(cmd=["myserver", "--port", "$PORT"])
+    class ExternalServer:
+        def __metrics__(self, original_metrics: str) -> str:
+            # Modify the original metrics as needed
+            modified_metrics = original_metrics.replace('sglang', 'vllm')
+            return modified_metrics
 
 .. _bentoml-tasks:
 
diff --git a/src/_bentoml_impl/server/app.py b/src/_bentoml_impl/server/app.py
index 100267ffdbe..43dfb2a0b37 100644
--- a/src/_bentoml_impl/server/app.py
+++ b/src/_bentoml_impl/server/app.py
@@ -19,6 +19,7 @@
 from starlette.responses import Response
 from starlette.staticfiles import StaticFiles
 
+from _bentoml_impl.server.proxy import create_proxy_app
 from _bentoml_sdk import Service
 from _bentoml_sdk.service import set_current_service
 from bentoml._internal.container import BentoMLContainer
@@ -26,6 +27,7 @@
 from bentoml._internal.resource import system_resources
 from bentoml._internal.server.base_app import BaseAppFactory
 from bentoml._internal.server.http_app import log_exception
+from bentoml._internal.utils import is_async_callable
 from bentoml._internal.utils.metrics import exponential_buckets
 from bentoml.exceptions import BentoMLException
 from bentoml.exceptions import ServiceUnavailable
@@ -178,6 +180,10 @@ def __call__(self) -> Starlette:
         app = super().__call__()
         app.add_route("/schema.json", self.schema_view, name="schema")
 
+        if self.service.has_custom_command():
+            # This may obscure all the routes behind, but this is expected.
+            self.service.mount_asgi_app(create_proxy_app(self.service), name="proxy")
+
         for mount_app, path, name in self.service.mount_apps:
             app.router.routes.append(PassiveMount(path, mount_app, name=name))
 
@@ -418,6 +424,22 @@ async def readyz(self, _: Request) -> Response:
 
         return PlainTextResponse("\n", status_code=200)
 
+    async def metrics(self, _: Request) -> Response:  # type: ignore[override]
+        metrics_client = BentoMLContainer.metrics_client.get()
+        metrics_content = await anyio.to_thread.run_sync(metrics_client.generate_latest)
+        if hasattr(self.service.inner, "__metrics__"):
+            func = getattr(self._service_instance, "__metrics__")
+            if not is_async_callable(func):
+                func = functools.partial(anyio.to_thread.run_sync, func)
+            metrics_content = (await func(metrics_content.decode("utf-8"))).encode(
+                "utf-8"
+            )
+        return Response(
+            metrics_content,
+            status_code=200,
+            media_type=metrics_client.CONTENT_TYPE_LATEST,
+        )
+
     @contextlib.asynccontextmanager
     async def lifespan(self, app: Starlette) -> t.AsyncGenerator[None, None]:
         from starlette.applications import Starlette
@@ -430,12 +452,11 @@ async def lifespan(self, app: Starlette) -> t.AsyncGenerator[None, None]:
 
             for mount_app, *_ in self.service.mount_apps:
                 if isinstance(mount_app, Starlette):
-                    maybe_state = await stack.enter_async_context(
+                    _ = await stack.enter_async_context(
                         mount_app.router.lifespan_context(mount_app)
                     )
-                    if maybe_state is not None:
-                        mount_app.state.update(maybe_state)
-                # TODO: support other ASGI apps
+                else:
+                    pass  # TODO: support other ASGI apps
             yield
 
     async def schema_view(self, request: Request) -> Response:
diff --git a/src/_bentoml_impl/server/proxy.py b/src/_bentoml_impl/server/proxy.py
new file mode 100644
index 00000000000..bf4ad8c9342
--- /dev/null
+++ b/src/_bentoml_impl/server/proxy.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import contextlib
+import logging
+import typing as t
+
+import anyio
+import httpx
+from pyparsing import cast
+from starlette.requests import Request
+
+from _bentoml_sdk import Service
+from bentoml import get_current_service
+from bentoml._internal.utils import expand_envs
+from bentoml.exceptions import BentoMLConfigException
+
+if t.TYPE_CHECKING:
+    from starlette.applications import Starlette
+
+logger = logging.getLogger("bentoml.server")
+
+
+async def _check_health(client: httpx.AsyncClient, health_endpoint: str) -> bool:
+    try:
+        response = await client.get(health_endpoint, timeout=5.0)
+        if response.status_code == 404:
+            raise BentoMLConfigException(
+                f"Health endpoint {health_endpoint} not found (404). Please make sure the health "
+                "endpoint is correctly configured in the service config."
+            )
+        return response.is_success
+    except (httpx.HTTPError, httpx.RequestError):
+        return False
+
+
+def create_proxy_app(service: Service[t.Any]) -> Starlette:
+    """A reverse-proxy that forwards all requests to the HTTP server started
+    by the custom command.
+    """
+    import fastapi
+    from fastapi.responses import StreamingResponse
+
+    health_endpoint = service.config.get("endpoints", {}).get("livez", "/health")
+
+    @contextlib.asynccontextmanager
+    async def lifespan(
+        app: fastapi.FastAPI,
+    ) -> t.AsyncGenerator[dict[str, t.Any], None]:
+        server_instance = get_current_service()
+        assert server_instance is not None, "Current service is not initialized"
+        async with contextlib.AsyncExitStack() as stack:
+            if cmd_getter := getattr(server_instance, "__command__", None):
+                if not callable(cmd_getter):
+                    raise TypeError(
+                        f"__command__ must be a callable that returns a list of strings, got {type(cmd_getter)}"
+                    )
+                cmd = cast("list[str]", cmd_getter())
+            else:
+                cmd = service.cmd
+            assert cmd is not None, "must have a command"
+            cmd = [expand_envs(c) for c in cmd]
+            logger.info("Running service with command: %s", " ".join(cmd))
+            if (
+                instance_client := getattr(server_instance, "client", None)
+            ) is not None and isinstance(instance_client, httpx.AsyncClient):
+                # TODO: support aiohttp client
+                client = instance_client
+            else:
+                proxy_port = service.config.get("http", {}).get("proxy_port")
+                if proxy_port is None:
+                    raise BentoMLConfigException(
+                        "proxy_port must be set in service config to use custom command"
+                    )
+                proxy_url = f"http://localhost:{proxy_port}"
+                client = await stack.enter_async_context(
+                    httpx.AsyncClient(base_url=proxy_url, timeout=None)
+                )
+            proc = await anyio.open_process(cmd, stdout=None, stderr=None)
+            while proc.returncode is None:
+                if await _check_health(client, health_endpoint):
+                    break
+                await anyio.sleep(0.5)
+            else:
+                raise RuntimeError(
+                    "Service process exited before becoming healthy, see the error above"
+                )
+
+            app.state.client = client
+            try:
+                state = {"proc": proc, "client": client}
+                service.context.state.update(state)
+                yield state
+            finally:
+                proc.terminate()
+                await proc.wait()
+
+    assert service.has_custom_command(), "Service does not have custom command"
+    app = fastapi.FastAPI(lifespan=lifespan)
+
+    # TODO: support websocket endpoints
+    @app.api_route(
+        "/{path:path}",
+        methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"],
+    )
+    async def reverse_proxy(request: Request, path: str):
+        url = httpx.URL(
+            path=f"/{path}", query=request.url.query.encode("utf-8") or None
+        )
+        client = t.cast(httpx.AsyncClient, app.state.client)
+        headers = dict(request.headers)
+        headers.pop("host", None)
+        req = client.build_request(
+            method=request.method, url=url, headers=headers, content=request.stream()
+        )
+        try:
+            resp = await client.send(req, stream=True)
+        except httpx.ConnectError:
+            return fastapi.Response(503)
+        except httpx.RequestError:
+            return fastapi.Response(500)
+
+        return StreamingResponse(
+            resp.aiter_raw(),
+            status_code=resp.status_code,
+            headers=resp.headers,
+            background=resp.aclose,
+        )
+
+    return app
diff --git a/src/_bentoml_impl/server/serving.py b/src/_bentoml_impl/server/serving.py
index a13b1ee0883..1db129cca94 100644
--- a/src/_bentoml_impl/server/serving.py
+++ b/src/_bentoml_impl/server/serving.py
@@ -68,7 +68,6 @@ def _get_server_socket(
 
 
 _SERVICE_WORKER_SCRIPT = "_bentoml_impl.worker.service"
-_RUNNER_WORKER_SCRIPT = "_bentoml_impl.worker.runner"
 
 
 @inject
@@ -87,39 +86,21 @@ def create_dependency_watcher(
 
     num_workers, worker_envs = scheduler.get_worker_env(svc)
     env = env or {}
-    if svc.has_custom_command():
-        svc_port = port_stack.enter_context(reserve_free_port())
-        env = {**os.environ, **env, "PORT": str(svc_port)}
-        uri = f"http://127.0.0.1:{svc_port}"
-        socket = None
-        cmd = sys.executable
-        args = [
-            "-m",
-            _RUNNER_WORKER_SCRIPT,
-            bento_identifier,
-            "--service-name",
-            svc.name,
-            "--worker-id",
-            "$(CIRCUS.WID)",
-            "--args",
-            json.dumps(bento_args),
-        ]
-    else:
-        uri, socket = _get_server_socket(svc, uds_path, port_stack, backlog)
-        args = [
-            "-m",
-            _SERVICE_WORKER_SCRIPT,
-            bento_identifier,
-            "--service-name",
-            svc.name,
-            "--fd",
-            f"$(circus.sockets.{svc.name})",
-            "--worker-id",
-            "$(CIRCUS.WID)",
-            "--args",
-            json.dumps(bento_args),
-        ]
-        cmd = sys.executable
+    uri, socket = _get_server_socket(svc, uds_path, port_stack, backlog)
+    args = [
+        "-m",
+        _SERVICE_WORKER_SCRIPT,
+        bento_identifier,
+        "--service-name",
+        svc.name,
+        "--fd",
+        f"$(circus.sockets.{svc.name})",
+        "--worker-id",
+        "$(CIRCUS.WID)",
+        "--args",
+        json.dumps(bento_args),
+    ]
+    cmd = sys.executable
 
     env.update(worker_envs)
     watcher = create_watcher(
@@ -288,74 +269,54 @@ def serve_http(
             raise BentoMLConfigException(f"Invalid host IP address: {host}") from e
         bento_args = BentoMLContainer.bento_arguments.get()
         env.update(worker_env)
-        if svc.has_custom_command():
-            env.update(os.environ)
-            env.update(
-                {"PORT": str(port), "BENTOML_HOST": host, "BENTOML_PORT": str(port)}
-            )
-            server_cmd = sys.executable
-            server_args = [
-                "-m",
-                _RUNNER_WORKER_SCRIPT,
-                bento_identifier,
-                "--service-name",
-                svc.name,
-                "--worker-id",
-                "$(CIRCUS.WID)",
-                "--args",
-                json.dumps(bento_args),
-            ]
-        else:
-            sockets.append(
-                CircusSocket(
-                    name=API_SERVER_NAME,
-                    host=host,
-                    port=port,
-                    family=family,
-                    backlog=backlog,
-                )
-            )
-            if BentoMLContainer.ssl.enabled.get() and not ssl_certfile:
-                raise BentoMLConfigException(
-                    "ssl_certfile is required when ssl is enabled"
-                )
-
-            ssl_args = construct_ssl_args(
-                ssl_certfile=ssl_certfile,
-                ssl_keyfile=ssl_keyfile,
-                ssl_keyfile_password=ssl_keyfile_password,
-                ssl_version=ssl_version,
-                ssl_cert_reqs=ssl_cert_reqs,
-                ssl_ca_certs=ssl_ca_certs,
-                ssl_ciphers=ssl_ciphers,
+        sockets.append(
+            CircusSocket(
+                name=API_SERVER_NAME,
+                host=host,
+                port=port,
+                family=family,
+                backlog=backlog,
             )
-            timeouts_args = construct_timeouts_args(
-                timeout_keep_alive=timeout_keep_alive,
-                timeout_graceful_shutdown=timeout_graceful_shutdown,
-            )
-            timeout_args = ["--timeout", str(timeout)] if timeout else []
+        )
+        if BentoMLContainer.ssl.enabled.get() and not ssl_certfile:
+            raise BentoMLConfigException("ssl_certfile is required when ssl is enabled")
+
+        ssl_args = construct_ssl_args(
+            ssl_certfile=ssl_certfile,
+            ssl_keyfile=ssl_keyfile,
+            ssl_keyfile_password=ssl_keyfile_password,
+            ssl_version=ssl_version,
+            ssl_cert_reqs=ssl_cert_reqs,
+            ssl_ca_certs=ssl_ca_certs,
+            ssl_ciphers=ssl_ciphers,
+        )
+        timeouts_args = construct_timeouts_args(
+            timeout_keep_alive=timeout_keep_alive,
+            timeout_graceful_shutdown=timeout_graceful_shutdown,
+        )
+        timeout_args = ["--timeout", str(timeout)] if timeout else []
 
-            server_cmd = sys.executable
-            server_args = [
-                "-m",
-                _SERVICE_WORKER_SCRIPT,
-                bento_identifier,
-                "--fd",
-                f"$(circus.sockets.{API_SERVER_NAME})",
-                "--service-name",
-                svc.name,
-                "--backlog",
-                str(backlog),
-                "--worker-id",
-                "$(CIRCUS.WID)",
-                "--args",
-                json.dumps(bento_args),
-                *ssl_args,
-                *timeouts_args,
-                *timeout_args,
-            ]
-            if development_mode:
-                server_args.append("--development-mode")
+        server_cmd = sys.executable
+        server_args = [
+            "-m",
+            _SERVICE_WORKER_SCRIPT,
+            bento_identifier,
+            "--fd",
+            f"$(circus.sockets.{API_SERVER_NAME})",
+            "--service-name",
+            svc.name,
+            "--backlog",
+            str(backlog),
+            "--worker-id",
+            "$(CIRCUS.WID)",
+            "--args",
+            json.dumps(bento_args),
+            *ssl_args,
+            *timeouts_args,
+            *timeout_args,
+        ]
+        if development_mode:
+            server_args.append("--development-mode")
 
         scheme = "https" if BentoMLContainer.ssl.enabled.get() else "http"
         watchers.append(
diff --git a/src/_bentoml_sdk/service/config.py b/src/_bentoml_sdk/service/config.py
index 4d75df823b6..641abe94610 100644
--- a/src/_bentoml_sdk/service/config.py
+++ b/src/_bentoml_sdk/service/config.py
@@ -153,6 +153,7 @@ class HTTPCorsSchema(TypedDict, total=False):
 class HTTPSchema(TypedDict, total=False):
     host: IPvAnyAddress
     port: int
+    proxy_port: int
     cors: HTTPCorsSchema
     response: TypedDict("HTTPResponseSchema", {"trace_id": bool}, total=False)  # type: ignore
 
diff --git a/src/bentoml/_internal/configuration/v2/__init__.py b/src/bentoml/_internal/configuration/v2/__init__.py
index db2d10a67f2..c9f7b29de08 100644
--- a/src/bentoml/_internal/configuration/v2/__init__.py
+++ b/src/bentoml/_internal/configuration/v2/__init__.py
@@ -121,6 +121,7 @@
     s.Optional("http"): {
         "host": s.And(str, is_valid_ip_address),
         "port": s.And(int, ensure_larger_than_zero),
+        "proxy_port": s.And(int, ensure_larger_than_zero),
         "cors": {
             "enabled": bool,
             "access_control_allow_origins": s.Or([str], str, None),
diff --git a/src/bentoml/_internal/configuration/v2/default_configuration.yaml b/src/bentoml/_internal/configuration/v2/default_configuration.yaml
index af0d62ee962..f9744c7a701 100644
--- a/src/bentoml/_internal/configuration/v2/default_configuration.yaml
+++ b/src/bentoml/_internal/configuration/v2/default_configuration.yaml
@@ -46,6 +46,7 @@ services:
   http:
     host: 0.0.0.0
     port: 3000
+    proxy_port: 8000
     cors:
       enabled: false
       access_control_allow_origins: ~
diff --git a/src/bentoml/_internal/server/base_app.py b/src/bentoml/_internal/server/base_app.py
index d3eb07df416..f3a77d30f08 100644
--- a/src/bentoml/_internal/server/base_app.py
+++ b/src/bentoml/_internal/server/base_app.py
@@ -75,6 +75,17 @@ async def readyz(self, _: Request) -> Response:
             return PlainTextResponse("\n", status_code=200)
         raise HTTPException(500)
 
+    def metrics(self, _: Request) -> Response:
+        """The Prometheus metrics endpoint."""
+        from ..configuration.containers import BentoMLContainer
+
+        metrics_client = BentoMLContainer.metrics_client.get()
+        return Response(
+            metrics_client.generate_latest(),
+            status_code=200,
+            media_type=metrics_client.CONTENT_TYPE_LATEST,
+        )
+
     def __call__(self) -> Starlette:
         from starlette.applications import Starlette
 
@@ -91,10 +102,14 @@ def __call__(self) -> Starlette:
     def routes(self) -> list[BaseRoute]:
         from starlette.routing import Route
 
+        from ..configuration.containers import BentoMLContainer
+
         routes: list[BaseRoute] = []
         routes.append(Route(path="/livez", name="livez", endpoint=self.livez))
         routes.append(Route(path="/healthz", name="healthz", endpoint=self.livez))
         routes.append(Route(path="/readyz", name="readyz", endpoint=self.readyz))
+        if BentoMLContainer.api_server_config.metrics.enabled.get():
+            routes.append(Route(path="/metrics", name="metrics", endpoint=self.metrics))
         return routes
 
     @property
diff --git a/src/bentoml/_internal/server/http/instruments.py b/src/bentoml/_internal/server/http/instruments.py
index 9a5daa00288..9fb2cd5556d 100644
--- a/src/bentoml/_internal/server/http/instruments.py
+++ b/src/bentoml/_internal/server/http/instruments.py
@@ -89,21 +89,10 @@ async def __call__(
     ) -> None:
         if not self._is_setup:
             self._setup()
-        if not scope["type"].startswith("http"):
+        if not scope["type"].startswith("http") or scope["path"] == "/metrics":
             await self.app(scope, receive, send)
             return
 
-        if scope["path"] == "/metrics":
-            from starlette.responses import Response
-
-            response = Response(
-                self.metrics_client.generate_latest(),
-                status_code=200,
-                media_type=self.metrics_client.CONTENT_TYPE_LATEST,
-            )
-            await response(scope, receive, send)
-            return
-
         endpoint = scope["path"]
         if not endpoint.startswith(tuple(self.skip_paths)):
             self.metrics_last_request_timestamp.labels(
@@ -261,14 +250,7 @@ async def __call__(
             return
 
         if scope["type"].startswith("http") and scope["path"] == "/metrics":
-            from starlette.responses import Response
-
-            response = Response(
-                self.metrics_client.generate_latest(),
-                status_code=200,
-                media_type=self.metrics_client.CONTENT_TYPE_LATEST,
-            )
-            await response(scope, receive, send)
+            await self.app(scope, receive, send)
             return
 
         endpoint = scope["path"]

From 0a26bfad9bcadb8c9684aa9e10380e0cc79f77a4 Mon Sep 17 00:00:00 2001
From: Frost Ming <me@frostming.com>
Date: Fri, 12 Sep 2025 09:39:04 +0800
Subject: [PATCH 2/4] fix: add tests

Signed-off-by: Frost Ming <me@frostming.com>
---
 src/_bentoml_impl/server/proxy.py             |  6 +---
 src/bentoml/_internal/server/base_app.py      |  1 +
 .../_internal/utils/circus/__init__.py        | 24 +++++++------
 .../e2e/bento_new_sdk/test_custom_command.py  | 27 ++++++++++++++
 tests/e2e/fixtures/static_http/service.py     | 36 +++++++++++++++++++
 tests/e2e/fixtures/static_http/test.txt       |  1 +
 6 files changed, 79 insertions(+), 16 deletions(-)
 create mode 100644 tests/e2e/bento_new_sdk/test_custom_command.py
 create mode 100644 tests/e2e/fixtures/static_http/service.py
 create mode 100644 tests/e2e/fixtures/static_http/test.txt

diff --git a/src/_bentoml_impl/server/proxy.py b/src/_bentoml_impl/server/proxy.py
index bf4ad8c9342..21f79347514 100644
--- a/src/_bentoml_impl/server/proxy.py
+++ b/src/_bentoml_impl/server/proxy.py
@@ -66,11 +66,7 @@ async def lifespan(
                 # TODO: support aiohttp client
                 client = instance_client
             else:
-                proxy_port = service.config.get("http", {}).get("proxy_port")
-                if proxy_port is None:
-                    raise BentoMLConfigException(
-                        "proxy_port must be set in service config to use custom command"
-                    )
+                proxy_port = service.config.get("http", {}).get("proxy_port", 8000)
                 proxy_url = f"http://localhost:{proxy_port}"
                 client = await stack.enter_async_context(
                     httpx.AsyncClient(base_url=proxy_url, timeout=None)
diff --git a/src/bentoml/_internal/server/base_app.py b/src/bentoml/_internal/server/base_app.py
index f3a77d30f08..453fb81e594 100644
--- a/src/bentoml/_internal/server/base_app.py
+++ b/src/bentoml/_internal/server/base_app.py
@@ -10,6 +10,7 @@
 from starlette.exceptions import HTTPException
 from starlette.middleware import Middleware
 from starlette.responses import PlainTextResponse
+from starlette.responses import Response
 
 from ..utils import with_app_arg
 
diff --git a/src/bentoml/_internal/utils/circus/__init__.py b/src/bentoml/_internal/utils/circus/__init__.py
index 71ede772eb0..ef9be2537a9 100644
--- a/src/bentoml/_internal/utils/circus/__init__.py
+++ b/src/bentoml/_internal/utils/circus/__init__.py
@@ -35,25 +35,27 @@ def stop(self) -> None:
         return super().stop()
 
 
-class ThreadedArbiter(Arbiter, Thread):
+class ThreadedArbiter(Arbiter):
     def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
-        Arbiter.__init__(self, *args, **kwargs)
-        Thread.__init__(self, daemon=True)
-        self.__cb: t.Optional[t.Callable[[t.Any], t.Any]] = None
+        super().__init__(*args, **kwargs)
+        self._thread: Thread | None = None
 
     def start(self, cb: t.Callable[[t.Any], t.Any] | None = None) -> None:
-        self.__cb = cb
-        Thread.start(self)
+        self._thread = Thread(target=self._worker, args=(cb,), daemon=True)
+        self._thread.start()
 
-    def run(self) -> None:
+    def _worker(self, cb: t.Callable[[t.Any], t.Any] | None = None) -> None:
         # reset the loop in thread
         self.loop = None
-        self.ctrl.loop = self._ensure_ioloop()  # type: ignore[union-attr]
-        Arbiter.start(self, self.__cb)
+        self._ensure_ioloop()
+        self.ctrl.loop = self.loop  # type: ignore[union-attr]
+        super().start(cb)
 
     def stop(self) -> None:
-        self.loop.add_callback(Arbiter.stop, self)  # type: ignore[union-attr]
-        Thread.join(self)
+        if self.loop is not None:
+            self.loop.add_callback(super().stop)  # type: ignore[union-attr]
+        if self._thread is not None:
+            self._thread.join()
 
 
 def create_circus_socket_from_uri(
diff --git a/tests/e2e/bento_new_sdk/test_custom_command.py b/tests/e2e/bento_new_sdk/test_custom_command.py
new file mode 100644
index 00000000000..f372d4f71da
--- /dev/null
+++ b/tests/e2e/bento_new_sdk/test_custom_command.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+
+import bentoml
+
+
+def test_command_in_service_argument(examples: Path) -> None:
+    with bentoml.serve(
+        "service:StaticHTTP1", working_dir=str(examples / "static_http"), port=35679
+    ) as server:
+        with bentoml.SyncHTTPClient(server.url, server_ready_timeout=100) as client:
+            resp = client.request("GET", "/test.txt")
+            assert resp.status_code == 200
+            assert resp.text.strip() == "Hello world!"
+
+
+def test_command_in_method(examples: Path) -> None:
+    with bentoml.serve(
+        "service:StaticHTTP2", working_dir=str(examples / "static_http"), port=35680
+    ) as server:
+        with bentoml.SyncHTTPClient(server.url, server_ready_timeout=100) as client:
+            resp = client.request("GET", "/test.txt")
+            assert resp.status_code == 200
+            assert resp.text.strip() == "Hello world!"
+
+            resp = client.request("GET", "/metrics")
+            assert resp.status_code == 200
+            assert "# HELLO from custom metrics" in resp.text
diff --git a/tests/e2e/fixtures/static_http/service.py b/tests/e2e/fixtures/static_http/service.py
new file mode 100644
index 00000000000..01b64d21e64
--- /dev/null
+++ b/tests/e2e/fixtures/static_http/service.py
@@ -0,0 +1,36 @@
+import sys
+from pathlib import Path
+
+import bentoml
+
+THIS_DIR = Path(__file__).parent
+
+
+StaticHTTP1 = bentoml.Service(
+    "StaticHTTP1",
+    cmd=[
+        sys.executable,
+        "-m",
+        "http.server",
+        "8000",
+        "--directory",
+        str(THIS_DIR),
+    ],
+    config={"endpoints": {"livez": "/"}},
+)
+
+
+@bentoml.service(endpoints={"livez": "/"})
+class StaticHTTP2:
+    def __command__(self) -> list[str]:
+        return [
+            sys.executable,
+            "-m",
+            "http.server",
+            "8000",
+            "--directory",
+            str(THIS_DIR),
+        ]
+
+    def __metrics__(self, content: str) -> str:
+        return f"{content}\n# HELLO from custom metrics\n"
diff --git a/tests/e2e/fixtures/static_http/test.txt b/tests/e2e/fixtures/static_http/test.txt
new file mode 100644
index 00000000000..cd0875583aa
--- /dev/null
+++ b/tests/e2e/fixtures/static_http/test.txt
@@ -0,0 +1 @@
+Hello world!

From 4ba214985ce85544f8b09edc3fd6c2c01f106260 Mon Sep 17 00:00:00 2001
From: Frost Ming <me@frostming.com>
Date: Fri, 12 Sep 2025 10:32:36 +0800
Subject: [PATCH 3/4] fix: clear module from sys.modules before importing
 service

Signed-off-by: Frost Ming <me@frostming.com>
---
 tests/e2e/bento_new_sdk/conftest.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/e2e/bento_new_sdk/conftest.py b/tests/e2e/bento_new_sdk/conftest.py
index 9992a2ec3df..afeea55abc2 100644
--- a/tests/e2e/bento_new_sdk/conftest.py
+++ b/tests/e2e/bento_new_sdk/conftest.py
@@ -23,3 +23,8 @@ def prepare_models() -> None:
 @pytest.fixture
 def examples() -> Path:
     return EXAMPLE_DIR
+
+
+@pytest.fixture(autouse=True)
+def clear_import_cache() -> None:
+    sys.modules.pop("service", None)

From 8182ff9d75a7ca6d19d586a096b8382f9ae176a7 Mon Sep 17 00:00:00 2001
From: Frost Ming <me@frostming.com>
Date: Tue, 16 Sep 2025 10:54:16 +0800
Subject: [PATCH 4/4] feat: add support for Quart lifespan hooks

Signed-off-by: Frost Ming <me@frostming.com>
---
 src/_bentoml_impl/server/app.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/_bentoml_impl/server/app.py b/src/_bentoml_impl/server/app.py
index 43dfb2a0b37..6e4ed84715b 100644
--- a/src/_bentoml_impl/server/app.py
+++ b/src/_bentoml_impl/server/app.py
@@ -27,6 +27,7 @@
 from bentoml._internal.resource import system_resources
 from bentoml._internal.server.base_app import BaseAppFactory
 from bentoml._internal.server.http_app import log_exception
+from bentoml._internal.types import LazyType
 from bentoml._internal.utils import is_async_callable
 from bentoml._internal.utils.metrics import exponential_buckets
 from bentoml.exceptions import BentoMLException
@@ -452,9 +453,12 @@ async def lifespan(self, app: Starlette) -> t.AsyncGenerator[None, None]:
 
             for mount_app, *_ in self.service.mount_apps:
                 if isinstance(mount_app, Starlette):
-                    _ = await stack.enter_async_context(
+                    await stack.enter_async_context(
                         mount_app.router.lifespan_context(mount_app)
                     )
+                elif LazyType("quart.Quart").isinstance(mount_app):
+                    await mount_app.startup()  # type: ignore[attr-defined]
+                    stack.push_async_callback(mount_app.shutdown)  # type: ignore[attr-defined]
                 else:
                     pass  # TODO: support other ASGI apps
             yield