huggingface · BjarniHaukur · Mar 28, 2025 · Mar 28, 2025 · Mar 29, 2025 · Mar 29, 2025
diff --git a/tests/test_vllm_client_server.py b/tests/test_vllm_client_server.py
@@ -252,3 +252,209 @@ def tearDownClass(cls):
             child.send_signal(signal.SIGTERM)
         cls.server_process.terminate()
         cls.server_process.wait()
+
+
+@pytest.mark.slow
+@require_torch_multi_gpu
+class TestVLLMClientServerAsync(unittest.TestCase):
+    model_id = "Qwen/Qwen2.5-1.5B"
+
+    @classmethod
+    def setUpClass(cls):
+        # We want the server to run on GPU 1, so the subprocess environment sets CUDA_VISIBLE_DEVICES to "1"
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = "1"  # Restrict to GPU 1
+
+        # Start the server process (`trl vllm-serve-async`), with its stdout/stderr redirected to pipes
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve-async", "--model", cls.model_id], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env
+        )
+
+        # Initialize the client and its communicator (tearDownClass closes the communicator again)
+        cls.client = VLLMClient(connection_timeout=240)
+        cls.client.init_communicator()
+
+    def test_generate(self):
+        prompt = "Hello, AI! Tell me a joke."
+        response = self.client.session.post(
+            url="http://localhost:8000/v1/completions",
+            json={
+                "model": self.model_id,
+                "prompt": prompt,
+                "max_tokens": 50
+            }
+        )
+        response.raise_for_status()
+        response_json = response.json()
+
+        # Check basic response structure: "choices" must be present and non-empty
+        self.assertIn("choices", response_json)
+        self.assertGreater(len(response_json["choices"]), 0)
+
+        # Check that we got a non-empty text response in the first choice
+        first_choice = response_json["choices"][0]
+        self.assertIn("text", first_choice)
+        self.assertGreater(len(first_choice["text"]), 0)
+
+    def test_update_model_params(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        self.client.update_model_params(model)  # Nothing is asserted; the call only has to succeed
+
+    def test_reset_prefix_cache(self):
+        # Test resetting the prefix cache; nothing is asserted, the call only has to complete without raising
+        self.client.reset_prefix_cache()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+
+        # Close the client's communicator
+        cls.client.close_communicator()
+
+        # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
+        # kill the server process and its children explicitly.
+        parent = psutil.Process(cls.server_process.pid)
+        children = parent.children(recursive=True)
+        for child in children:
+            child.send_signal(signal.SIGTERM)
+        cls.server_process.terminate()
+        cls.server_process.wait()
+
+
+@pytest.mark.slow
+@require_3_gpus
+class TestVLLMClientAsyncServerTP(unittest.TestCase):
+    model_id = "Qwen/Qwen2.5-1.5B"
+
+    @classmethod
+    def setUpClass(cls):
+        # We want the server to run on GPU 1 and 2, so the subprocess environment sets CUDA_VISIBLE_DEVICES to "1,2"
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = "1,2"  # Restrict to GPU 1 and 2
+
+        # Start the server process (`trl vllm-serve-async` with --tensor_parallel_size 2), stdout/stderr piped
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve-async", "--model", cls.model_id, "--tensor_parallel_size", "2"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env,
+        )
+
+        # Initialize the client and its communicator (tearDownClass closes the communicator again)
+        cls.client = VLLMClient(connection_timeout=240)
+        cls.client.init_communicator()
+
+    def test_generate(self):
+        prompt = "Hello, AI! Tell me a joke."
+        response = self.client.session.post(
+            url="http://localhost:8000/v1/completions",
+            json={
+                "model": self.model_id,
+                "prompt": prompt,
+                "max_tokens": 50
+            }
+        )
+        response.raise_for_status()
+        response_json = response.json()
+
+        # Check basic response structure: "choices" must be present and non-empty
+        self.assertIn("choices", response_json)
+        self.assertGreater(len(response_json["choices"]), 0)
+
+        # Check that we got a non-empty text response in the first choice
+        first_choice = response_json["choices"][0]
+        self.assertIn("text", first_choice)
+        self.assertGreater(len(first_choice["text"]), 0)
+
+    def test_update_model_params(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        self.client.update_model_params(model)  # Nothing is asserted; the call only has to succeed
+
+    def test_reset_prefix_cache(self):
+        # Test resetting the prefix cache; nothing is asserted, the call only has to complete without raising
+        self.client.reset_prefix_cache()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+
+        # Close the client's communicator
+        cls.client.close_communicator()
+
+        # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
+        # kill the server process and its children explicitly.
+        parent = psutil.Process(cls.server_process.pid)
+        children = parent.children(recursive=True)
+        for child in children:
+            child.send_signal(signal.SIGTERM)
+        cls.server_process.terminate()
+        cls.server_process.wait()
+
+
+@pytest.mark.slow
+@require_3_gpus
+class TestVLLMClientAsyncServerDP(unittest.TestCase):
+    model_id = "Qwen/Qwen2.5-1.5B"
+
+    @classmethod
+    def setUpClass(cls):
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = "1,2"  # We want the server subprocess to run on GPU 1 and 2 only
+
+        # Start the server process (`trl vllm-serve-async` with --data_parallel_size 2), stdout/stderr piped
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve-async", "--model", cls.model_id, "--data_parallel_size", "2"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env,
+        )
+
+        # Initialize the client and its communicator, like the other async server test classes do;
+        cls.client = VLLMClient(connection_timeout=240)
+        cls.client.init_communicator()  # tearDownClass closes the communicator again
+
+    def test_generate(self):
+        prompt = "Hello, AI! Tell me a joke."
+        response = self.client.session.post(
+            url="http://localhost:8000/v1/completions",
+            json={
+                "model": self.model_id,
+                "prompt": prompt,
+                "max_tokens": 50
+            }
+        )
+        response.raise_for_status()
+        response_json = response.json()
+
+        # Check basic response structure: "choices" must be present and non-empty
+        self.assertIn("choices", response_json)
+        self.assertGreater(len(response_json["choices"]), 0)
+
+        # Check that we got a non-empty text response in the first choice
+        first_choice = response_json["choices"][0]
+        self.assertIn("text", first_choice)
+        self.assertGreater(len(first_choice["text"]), 0)
+
+    def test_update_model_params(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        self.client.update_model_params(model)  # Nothing is asserted; the call only has to succeed
+
+    def test_reset_prefix_cache(self):
+        # Test resetting the prefix cache; nothing is asserted, the call only has to complete without raising
+        self.client.reset_prefix_cache()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+
+        # Close the client's communicator
+        cls.client.close_communicator()
+
+        # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
+        # kill the server process and its children explicitly.
+        parent = psutil.Process(cls.server_process.pid)
+        children = parent.children(recursive=True)
+        for child in children:
+            child.send_signal(signal.SIGTERM)
+        cls.server_process.terminate()
+        cls.server_process.wait()
diff --git a/trl/cli.py b/trl/cli.py
@@ -29,6 +29,8 @@
 from .scripts.utils import TrlParser
 from .scripts.vllm_serve import main as vllm_serve_main
 from .scripts.vllm_serve import make_parser as make_vllm_serve_parser
+from .scripts.vllm_serve_async import main as vllm_serve_async_main
+from .scripts.vllm_serve_async import make_parser as make_vllm_serve_async_parser
 
 
 def main():
@@ -45,6 +47,7 @@ def main():
     make_kto_parser(subparsers)
     make_sft_parser(subparsers)
     make_vllm_serve_parser(subparsers)
+    make_vllm_serve_async_parser(subparsers)
 
     # Parse the arguments; the remaining ones (`launch_args`) are passed to the 'accelerate launch' subparser.
     # Duplicates may occur if the same argument is provided in both the config file and CLI.
@@ -138,7 +141,13 @@ def main():
             )
 
         vllm_serve_main(script_args)
+
+    elif args.command == "vllm-serve-async":
+        # Here we defer to vllm's argument parser, so that we don't have to reimplement all of its logic
+        sys.argv = ["trl/scripts/vllm_serve_async.py"] + launch_args
+        vllm_serve_async_main()
 
 
 if __name__ == "__main__":
     main()
+
diff --git a/trl/import_utils.py b/trl/import_utils.py
@@ -35,6 +35,7 @@
 _requests_available = _is_package_available("requests")
 _unsloth_available = _is_package_available("unsloth")
 _uvicorn_available = _is_package_available("uvicorn")
+_uvloop_available = _is_package_available("uvloop")
 _vllm_available = _is_package_available("vllm")
 _vllm_ascend_available = _is_package_available("vllm_ascend")
 _joblib_available = _is_package_available("joblib")
@@ -80,6 +81,10 @@ def is_uvicorn_available() -> bool:
     return _uvicorn_available
 
 
+def is_uvloop_available() -> bool:
+    return _uvloop_available  # Module-level flag computed by _is_package_available("uvloop")
+
+
 def is_vllm_available() -> bool:
     return _vllm_available