vllm-project · YuhanLiu11 · Aug 5, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -95,7 +95,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None:
     "prometheus_client",
     "uhashring",
     "lmcache",
-    "httpx",
+    "aiohttp",
     "transformers",
     "os",
 ]

diff --git a/pyproject.toml b/pyproject.toml
@@ -15,12 +15,12 @@ dependencies = [
     "aiofiles==24.1.0",
     "black>=25.1.0",
     "fastapi==0.115.8",
-    "httpx==0.28.1",
+    "aiohttp==3.9.5",
     "kubernetes==32.0.0",
     "numpy==1.26.4",
     "prometheus-client==0.21.1",
     "python-multipart==0.0.20",
-    "sentry-sdk[fastapi,httpx]==2.27.0",
+    "sentry-sdk[fastapi]==2.27.0",
     "uhashring==2.3",
     "uvicorn==0.34.0",
     "xxhash==3.5.0",

diff --git a/src/examples/example_file_upload.py b/src/examples/example_file_upload.py
@@ -1,22 +1,31 @@
 import argparse
 
-import httpx
+import aiohttp
 
 
-def upload_file(server_url: str, file_path: str):
+async def upload_file(server_url: str, file_path: str):
     """Uploads a file to the production stack."""
     try:
         with open(file_path, "rb") as file:
             files = {"file": (file_path, file, "application/octet-stream")}
             data = {"purpose": "unknown"}
 
-            with httpx.Client() as client:
-                response = client.post(server_url, files=files, data=data)
-
-                if response.status_code == 200:
-                    print("File uploaded successfully:", response.json())
-                else:
-                    print("Failed to upload file:", response.text)
+            # aiohttp has no ``files=`` keyword (that is the httpx/requests API),
+            # so the multipart body is assembled explicitly with FormData.
+            form = aiohttp.FormData(data)
+            for field, (filename, fileobj, content_type) in files.items():
+                form.add_field(
+                    field, fileobj, filename=filename, content_type=content_type
+                )
+
+            async with aiohttp.ClientSession() as client:
+                async with client.post(server_url, data=form) as response:
+                    if response.status == 200:
+                        result = await response.json()
+                        print("File uploaded successfully:", result)
+                    else:
+                        text = await response.text()
+                        print("Failed to upload file:", text)
     except Exception as e:
         print(f"Error: {e}")
 
@@ -31,7 +40,9 @@ def parse_args():
 
 
 if __name__ == "__main__":
+    import asyncio
+
     args = parse_args()
     endpoint = args.url
     file_to_upload = args.path
-    upload_file(endpoint, file_to_upload)
+    asyncio.run(upload_file(endpoint, file_to_upload))
diff --git a/src/tests/requirements.txt b/src/tests/requirements.txt
@@ -1,5 +1,5 @@
+aiohttp
 fastapi
-httpx
 openai
 uvicorn
 vllm
diff --git a/src/vllm_router/httpx_client.py → src/vllm_router/aiohttp_client.py b/src/vllm_router/httpx_client.py → src/vllm_router/aiohttp_client.py
@@ -11,39 +11,42 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import httpx
+import aiohttp
 
 from vllm_router.log import init_logger
 
 logger = init_logger(__name__)
 
 
-class HTTPXClientWrapper:
+class AiohttpClientWrapper:
+    """Owns the aiohttp ClientSession that is returned by calling the wrapper."""
 
     async_client = None
 
     def start(self):
         """Instantiate the client. Call from the FastAPI startup hook."""
         # To fully leverage the router's concurrency capabilities,
         # we set the maximum number of connections to be unlimited.
-        limits = httpx.Limits(max_connections=None)
-        self.async_client = httpx.AsyncClient(limits=limits)
-        logger.info(f"httpx AsyncClient instantiated. Id {id(self.async_client)}")
+        # aiohttp's default connector caps the pool at 100 connections, so
+        # the cap is lifted explicitly with limit=0.
+        connector = aiohttp.TCPConnector(limit=0)
+        self.async_client = aiohttp.ClientSession(connector=connector)
+        logger.info(f"aiohttp ClientSession instantiated. Id {id(self.async_client)}")
 
     async def stop(self):
         """Gracefully shutdown. Call from FastAPI shutdown hook."""
         logger.info(
-            f"httpx async_client.is_closed(): {self.async_client.is_closed} - Now close it. Id (will be unchanged): {id(self.async_client)}"
+            f"aiohttp async_client.closed: {self.async_client.closed} - Now close it. Id (will be unchanged): {id(self.async_client)}"
         )
-        await self.async_client.aclose()
+        await self.async_client.close()
         logger.info(
-            f"httpx async_client.is_closed(): {self.async_client.is_closed}. Id (will be unchanged): {id(self.async_client)}"
+            f"aiohttp async_client.closed: {self.async_client.closed}. Id (will be unchanged): {id(self.async_client)}"
         )
         self.async_client = None
-        logger.info("httpx AsyncClient closed")
+        logger.info("aiohttp ClientSession closed")
 
     def __call__(self):
-        """Calling the instantiated HTTPXClientWrapper returns the wrapped singleton."""
+        """Calling the instantiated AiohttpClientWrapper returns the wrapped singleton."""
         # Ensure we don't use it if not started / running
         assert self.async_client is not None
         return self.async_client
diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py
@@ -19,13 +19,13 @@
 import uvicorn
 from fastapi import FastAPI
 
+from vllm_router.aiohttp_client import AiohttpClientWrapper
 from vllm_router.dynamic_config import (
     DynamicRouterConfig,
     get_dynamic_config_watcher,
     initialize_dynamic_config_watcher,
 )
 from vllm_router.experimental import get_feature_gates, initialize_feature_gates
-from vllm_router.httpx_client import HTTPXClientWrapper
 from vllm_router.parsers.parser import parse_args
 from vllm_router.routers.batches_router import batches_router
 from vllm_router.routers.files_router import files_router
@@ -82,11 +82,16 @@
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    app.state.httpx_client_wrapper.start()
+    app.state.aiohttp_client_wrapper.start()
     if hasattr(app.state, "batch_processor"):
         await app.state.batch_processor.initialize()
+
+    service_discovery = get_service_discovery()
+    if hasattr(service_discovery, "initialize_client_sessions"):
+        await service_discovery.initialize_client_sessions()
+
     yield
-    await app.state.httpx_client_wrapper.stop()
+    await app.state.aiohttp_client_wrapper.stop()
 
     # Close the threaded-components
     logger.info("Closing engine stats scraper")
@@ -265,7 +270,7 @@ def initialize_all(app: FastAPI, args):
 app.include_router(files_router)
 app.include_router(batches_router)
 app.include_router(metrics_router)
-app.state.httpx_client_wrapper = HTTPXClientWrapper()
+app.state.aiohttp_client_wrapper = AiohttpClientWrapper()
 app.state.semantic_cache_available = semantic_cache_available
 
 

diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt
@@ -1,12 +1,12 @@
 aiofiles==24.1.0
+aiohttp==3.9.5
 fastapi==0.115.8
-httpx==0.28.1
 kubernetes==32.0.0
 numpy==1.26.4
 prometheus_client==0.21.1
 psutil==7.0.0
 python-multipart==0.0.20
-sentry-sdk[fastapi,httpx]==2.27.0
+sentry-sdk[fastapi]==2.27.0
 uhashring==2.3
 uvicorn==0.34.0
 xxhash==3.5.0
diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py
@@ -22,7 +22,7 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional
 
-import httpx
+import aiohttp
 import requests
 from kubernetes import client, config, watch
 
@@ -308,22 +308,29 @@ def get_endpoint_info(self) -> List[EndpointInfo]:
                 model_info=self._get_model_info(model),
             )
             endpoint_infos.append(endpoint_info)
+        return endpoint_infos
+
+    async def initialize_client_sessions(self) -> None:
+        """
+        Create aiohttp sessions for the prefill/decode endpoints known at call time.
+        Call from async startup code. NOTE(review): replaced sessions are not closed.
+        """
         if (
             self.prefill_model_labels is not None
             and self.decode_model_labels is not None
         ):
+            endpoint_infos = self.get_endpoint_info()
             for endpoint_info in endpoint_infos:
                 if endpoint_info.model_label in self.prefill_model_labels:
-                    self.app.state.prefill_client = httpx.AsyncClient(
+                    self.app.state.prefill_client = aiohttp.ClientSession(
                         base_url=endpoint_info.url,
-                        timeout=None,
+                        timeout=aiohttp.ClientTimeout(total=None),
                     )
                 elif endpoint_info.model_label in self.decode_model_labels:
-                    self.app.state.decode_client = httpx.AsyncClient(
+                    self.app.state.decode_client = aiohttp.ClientSession(
                         base_url=endpoint_info.url,
-                        timeout=None,
+                        timeout=aiohttp.ClientTimeout(total=None),
                     )
-        return endpoint_infos
 
 
 class K8sPodIPServiceDiscovery(ServiceDiscovery):
@@ -629,20 +636,7 @@ def _add_engine(
                 namespace=self.namespace,
                 model_info=model_info,
             )
-            if (
-                self.prefill_model_labels is not None
-                and self.decode_model_labels is not None
-            ):
-                if model_label in self.prefill_model_labels:
-                    self.app.state.prefill_client = httpx.AsyncClient(
-                        base_url=f"http://{engine_ip}:{self.port}",
-                        timeout=None,
-                    )
-                elif model_label in self.decode_model_labels:
-                    self.app.state.decode_client = httpx.AsyncClient(
-                        base_url=f"http://{engine_ip}:{self.port}",
-                        timeout=None,
-                    )
+
             # Store model information in the endpoint info
             self.available_engines[engine_name].model_info = model_info
 
@@ -720,6 +714,28 @@ def close(self):
         self.k8s_watcher.stop()
         self.watcher_thread.join()
 
+    async def initialize_client_sessions(self) -> None:
+        """
+        Create aiohttp sessions for the prefill/decode endpoints known at call time.
+        Call from async startup code. NOTE(review): replaced sessions are not closed.
+        """
+        if (
+            self.prefill_model_labels is not None
+            and self.decode_model_labels is not None
+        ):
+            endpoint_infos = self.get_endpoint_info()
+            for endpoint_info in endpoint_infos:
+                if endpoint_info.model_label in self.prefill_model_labels:
+                    self.app.state.prefill_client = aiohttp.ClientSession(
+                        base_url=endpoint_info.url,
+                        timeout=aiohttp.ClientTimeout(total=None),
+                    )
+                elif endpoint_info.model_label in self.decode_model_labels:
+                    self.app.state.decode_client = aiohttp.ClientSession(
+                        base_url=endpoint_info.url,
+                        timeout=aiohttp.ClientTimeout(total=None),
+                    )
+
 
 class K8sServiceNameServiceDiscovery(ServiceDiscovery):
     def __init__(
@@ -1024,20 +1040,7 @@ def _add_engine(self, engine_name: str, model_names: List[str], model_label: str
                 namespace=self.namespace,
                 model_info=model_info,
             )
-            if (
-                self.prefill_model_labels is not None
-                and self.decode_model_labels is not None
-            ):
-                if model_label in self.prefill_model_labels:
-                    self.app.state.prefill_client = httpx.AsyncClient(
-                        base_url=f"http://{engine_name}:{self.port}",
-                        timeout=None,
-                    )
-                elif model_label in self.decode_model_labels:
-                    self.app.state.decode_client = httpx.AsyncClient(
-                        base_url=f"http://{engine_name}:{self.port}",
-                        timeout=None,
-                    )
+
             # Store model information in the endpoint info
             self.available_engines[engine_name].model_info = model_info
 
@@ -1114,6 +1117,28 @@ def close(self):
         self.k8s_watcher.stop()
         self.watcher_thread.join()
 
+    async def initialize_client_sessions(self) -> None:
+        """
+        Create aiohttp sessions for the prefill/decode endpoints known at call time.
+        Call from async startup code. NOTE(review): replaced sessions are not closed.
+        """
+        if (
+            self.prefill_model_labels is not None
+            and self.decode_model_labels is not None
+        ):
+            endpoint_infos = self.get_endpoint_info()
+            for endpoint_info in endpoint_infos:
+                if endpoint_info.model_label in self.prefill_model_labels:
+                    self.app.state.prefill_client = aiohttp.ClientSession(
+                        base_url=endpoint_info.url,
+                        timeout=aiohttp.ClientTimeout(total=None),
+                    )
+                elif endpoint_info.model_label in self.decode_model_labels:
+                    self.app.state.decode_client = aiohttp.ClientSession(
+                        base_url=endpoint_info.url,
+                        timeout=aiohttp.ClientTimeout(total=None),
+                    )
+
 
 def _create_service_discovery(
     service_discovery_type: ServiceDiscoveryType, *args, **kwargs
-Original file line number
+Diff line change
@@ -1,5 +1,5 @@
+    aiohttp
     fastapi
-    httpx
     openai
     uvicorn
     vllm