2 changes: 1 addition & 1 deletion tests/test_batching.py
@@ -790,7 +790,7 @@ def test_multiple_concurrent_requests(self, model_and_tokenizer):
assert len(finished) == len(prompts), f"Only {len(finished)} requests finished"


-@pytest.mark.asyncio
+@pytest.mark.anyio
class TestEngineAsync:
"""Async tests for the engine."""

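The marker swap above (and in the files below) moves the suite from pytest-asyncio to the anyio pytest plugin. The diff itself does not show how the backend is selected; as a point of reference, here is a minimal sketch of the anyio_backend fixture the anyio plugin consults, assuming the project pins it to asyncio in a conftest.py (hypothetical, not part of this PR):

# conftest.py -- hypothetical sketch, not part of this diff
import pytest

@pytest.fixture
def anyio_backend():
    # Tests marked @pytest.mark.anyio run on the backend named here;
    # returning "asyncio" keeps behaviour closest to the previous
    # pytest-asyncio setup. The plugin's built-in fixture already
    # defaults to "asyncio", so this override is optional.
    return "asyncio"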
22 changes: 11 additions & 11 deletions tests/test_batching_deterministic.py
@@ -37,7 +37,7 @@ def sampling_params():
class TestDeterministicSingleRequest:
"""Test single request determinism."""

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_same_prompt_same_output(self, model_and_tokenizer, sampling_params):
"""Same prompt should produce same output with temp=0."""
from vllm_mlx import AsyncEngineCore, EngineConfig, SchedulerConfig
@@ -68,7 +68,7 @@ def test_same_prompt_same_output(self, model_and_tokenizer, sampling_param
assert len(outputs) == 3
assert outputs[0] == outputs[1] == outputs[2], f"Outputs differ: {outputs}"

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_token_streaming_order(self, model_and_tokenizer, sampling_params):
"""Tokens should stream in order."""
from vllm_mlx import AsyncEngineCore
@@ -94,7 +94,7 @@ async def test_token_streaming_order(self, model_and_tokenizer, sampling_params)
class TestDeterministicConcurrentRequests:
"""Test concurrent request handling with determinism."""

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_concurrent_same_prompt(self, model_and_tokenizer):
"""Multiple concurrent requests with same prompt should get same output."""
from vllm_mlx import (
@@ -137,7 +137,7 @@ async def get_output(rid):
# All should be the same
assert all(r == results[0] for r in results), f"Outputs differ: {results}"

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_concurrent_different_prompts(self, model_and_tokenizer):
"""Different prompts should get different (but deterministic) outputs."""
from vllm_mlx import (
@@ -191,7 +191,7 @@ async def get_output(rid):
class TestBatchingPerformance:
"""Test that batching improves throughput."""

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_batched_faster_than_sequential(self, model_and_tokenizer):
"""Batched requests should be faster than sequential."""
from vllm_mlx import (
@@ -274,7 +274,7 @@ async def get_output(rid):
class TestRequestManagement:
"""Test request lifecycle management."""

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_abort_request(self, model_and_tokenizer):
"""Test aborting a request mid-generation."""
from vllm_mlx import AsyncEngineCore, SamplingParams
@@ -304,7 +304,7 @@ async def test_abort_request(self, model_and_tokenizer):
stats = engine.get_stats()
assert stats["active_requests"] == 0

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_engine_stats(self, model_and_tokenizer):
"""Test engine statistics tracking."""
from vllm_mlx import (
@@ -343,7 +343,7 @@ async def test_engine_stats(self, model_and_tokenizer):
class TestSchedulerPolicy:
"""Test scheduler policies."""

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_fcfs_ordering(self, model_and_tokenizer):
"""Test that FCFS policy processes requests in order."""
from vllm_mlx import (
@@ -396,7 +396,7 @@ async def track_completion(rid, name):
class TestEdgeCases:
"""Test edge cases and error handling."""

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_empty_prompt(self, model_and_tokenizer):
"""Test handling of empty prompt."""
from vllm_mlx import AsyncEngineCore, SamplingParams
@@ -414,7 +414,7 @@ async def test_empty_prompt(self, model_and_tokenizer):
assert out.finished
break

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_very_short_max_tokens(self, model_and_tokenizer):
"""Test with max_tokens=1."""
from vllm_mlx import AsyncEngineCore, SamplingParams
@@ -436,7 +436,7 @@ async def test_very_short_max_tokens(self, model_and_tokenizer):
# Should generate exactly 1 token
assert token_count == 1

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_multiple_start_stop(self, model_and_tokenizer):
"""Test starting and stopping engine multiple times."""
from vllm_mlx import AsyncEngineCore, SamplingParams
2 changes: 1 addition & 1 deletion tests/test_continuous_batching.py
@@ -53,7 +53,7 @@ def test_scheduler_config_batching_params(self):
assert config.completion_batch_size == 32


-@pytest.mark.asyncio
+@pytest.mark.anyio
class TestContinuousBatchingIntegration:
"""Integration tests requiring actual model loading."""

12 changes: 4 additions & 8 deletions tests/test_server.py
@@ -629,9 +629,7 @@ def test_verify_api_key_rejects_invalid(self):

# Should raise HTTPException with 401
with pytest.raises(HTTPException) as exc_info:
-asyncio.get_event_loop().run_until_complete(
-    server.verify_api_key(credentials)
-)
+asyncio.run(server.verify_api_key(credentials))

assert exc_info.value.status_code == 401
assert "Invalid API key" in str(exc_info.value.detail)
@@ -657,9 +655,7 @@ def test_verify_api_key_accepts_valid(self):
)

# Should not raise any exception
-result = asyncio.get_event_loop().run_until_complete(
-    server.verify_api_key(credentials)
-)
+result = asyncio.run(server.verify_api_key(credentials))
# verify_api_key returns True on success (no exception raised)
assert result is True or result is None
finally:
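Both API-key tests drive a coroutine from synchronous test code; the change replaces the implicit-event-loop pattern with asyncio.run(), which creates a fresh event loop, runs the coroutine to completion, and closes the loop, whereas asyncio.get_event_loop() emits a DeprecationWarning in recent Python versions when no loop is running. A standalone sketch of the same migration with hypothetical names (not code from this PR):

# Hypothetical example of the same pattern; verify() stands in for an
# async check such as server.verify_api_key(credentials).
import asyncio

async def verify(token: str) -> bool:
    return token == "secret"

# Old style: fetches (or implicitly creates) an event loop and drives it.
# loop = asyncio.get_event_loop()
# result = loop.run_until_complete(verify("secret"))

# New style: one call that owns the loop's full lifecycle.
result = asyncio.run(verify("secret"))
assert result is True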
@@ -716,7 +712,7 @@ def test_rate_limiter_window_cleanup(self):
class TestStreamChatCompletion:
"""Tests for streaming chat completion behavior."""

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_reasoning_stream_emits_structured_tool_calls(self, monkeypatch):
"""Tool markup after </think> should emit tool_calls chunks."""
from vllm_mlx.engine.base import GenerationOutput
@@ -837,7 +833,7 @@ def extract_tool_calls_streaming(
"total_tokens": 10,
}

-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_reasoning_stream_skips_tool_parser_until_markup_appears(
self, monkeypatch
):
2 changes: 1 addition & 1 deletion tests/test_streaming_latency.py
@@ -206,7 +206,7 @@ async def run_benchmark(
print(f"Throughput: {throughput:.1f} tokens/sec")


-@pytest.mark.asyncio
+@pytest.mark.anyio
async def test_output_collector():
"""Unit test for RequestOutputCollector."""
import sys