diff --git a/tests/test_batching.py b/tests/test_batching.py
index fc2eefcde..6cb536aa5 100644
--- a/tests/test_batching.py
+++ b/tests/test_batching.py
@@ -790,7 +790,7 @@ def test_multiple_concurrent_requests(self, model_and_tokenizer):
         assert len(finished) == len(prompts), f"Only {len(finished)} requests finished"
 
 
-@pytest.mark.asyncio
+@pytest.mark.anyio
 class TestEngineAsync:
     """Async tests for the engine."""
 
diff --git a/tests/test_batching_deterministic.py b/tests/test_batching_deterministic.py
index 52b0fd49b..0e6072ce9 100644
--- a/tests/test_batching_deterministic.py
+++ b/tests/test_batching_deterministic.py
@@ -37,7 +37,7 @@ def sampling_params():
 class TestDeterministicSingleRequest:
     """Test single request determinism."""
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_same_prompt_same_output(self, model_and_tokenizer, sampling_params):
         """Same prompt should produce same output with temp=0."""
         from vllm_mlx import AsyncEngineCore, EngineConfig, SchedulerConfig
@@ -68,7 +68,7 @@ async def test_same_prompt_same_output(self, model_and_tokenizer, sampling_param
         assert len(outputs) == 3
         assert outputs[0] == outputs[1] == outputs[2], f"Outputs differ: {outputs}"
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_token_streaming_order(self, model_and_tokenizer, sampling_params):
         """Tokens should stream in order."""
         from vllm_mlx import AsyncEngineCore
@@ -94,7 +94,7 @@ async def test_token_streaming_order(self, model_and_tokenizer, sampling_params)
 class TestDeterministicConcurrentRequests:
     """Test concurrent request handling with determinism."""
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_concurrent_same_prompt(self, model_and_tokenizer):
         """Multiple concurrent requests with same prompt should get same output."""
         from vllm_mlx import (
@@ -137,7 +137,7 @@ async def get_output(rid):
         # All should be the same
         assert all(r == results[0] for r in results), f"Outputs differ: {results}"
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_concurrent_different_prompts(self, model_and_tokenizer):
         """Different prompts should get different (but deterministic) outputs."""
         from vllm_mlx import (
@@ -191,7 +191,7 @@ async def get_output(rid):
 class TestBatchingPerformance:
     """Test that batching improves throughput."""
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_batched_faster_than_sequential(self, model_and_tokenizer):
         """Batched requests should be faster than sequential."""
         from vllm_mlx import (
@@ -274,7 +274,7 @@ async def get_output(rid):
 class TestRequestManagement:
     """Test request lifecycle management."""
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_abort_request(self, model_and_tokenizer):
         """Test aborting a request mid-generation."""
         from vllm_mlx import AsyncEngineCore, SamplingParams
@@ -304,7 +304,7 @@ async def test_abort_request(self, model_and_tokenizer):
         stats = engine.get_stats()
         assert stats["active_requests"] == 0
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_engine_stats(self, model_and_tokenizer):
         """Test engine statistics tracking."""
         from vllm_mlx import (
@@ -343,7 +343,7 @@
 class TestSchedulerPolicy:
     """Test scheduler policies."""
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_fcfs_ordering(self, model_and_tokenizer):
         """Test that FCFS policy processes requests in order."""
         from vllm_mlx import (
@@ -396,7 +396,7 @@ async def track_completion(rid, name):
 class TestEdgeCases:
     """Test edge cases and error handling."""
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_empty_prompt(self, model_and_tokenizer):
         """Test handling of empty prompt."""
         from vllm_mlx import AsyncEngineCore, SamplingParams
@@ -414,7 +414,7 @@ async def test_empty_prompt(self, model_and_tokenizer):
             assert out.finished
             break
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_very_short_max_tokens(self, model_and_tokenizer):
         """Test with max_tokens=1."""
         from vllm_mlx import AsyncEngineCore, SamplingParams
@@ -436,7 +436,7 @@ async def test_very_short_max_tokens(self, model_and_tokenizer):
         # Should generate exactly 1 token
         assert token_count == 1
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_multiple_start_stop(self, model_and_tokenizer):
         """Test starting and stopping engine multiple times."""
         from vllm_mlx import AsyncEngineCore, SamplingParams
diff --git a/tests/test_continuous_batching.py b/tests/test_continuous_batching.py
index fd10fe808..0e196a226 100644
--- a/tests/test_continuous_batching.py
+++ b/tests/test_continuous_batching.py
@@ -53,7 +53,7 @@ def test_scheduler_config_batching_params(self):
         assert config.completion_batch_size == 32
 
 
-@pytest.mark.asyncio
+@pytest.mark.anyio
 class TestContinuousBatchingIntegration:
     """Integration tests requiring actual model loading."""
 
diff --git a/tests/test_server.py b/tests/test_server.py
index c0450548d..c20957211 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -629,9 +629,7 @@ def test_verify_api_key_rejects_invalid(self):
 
         # Should raise HTTPException with 401
         with pytest.raises(HTTPException) as exc_info:
-            asyncio.get_event_loop().run_until_complete(
-                server.verify_api_key(credentials)
-            )
+            asyncio.run(server.verify_api_key(credentials))
 
         assert exc_info.value.status_code == 401
         assert "Invalid API key" in str(exc_info.value.detail)
@@ -657,9 +655,7 @@ def test_verify_api_key_accepts_valid(self):
             )
 
             # Should not raise any exception
-            result = asyncio.get_event_loop().run_until_complete(
-                server.verify_api_key(credentials)
-            )
+            result = asyncio.run(server.verify_api_key(credentials))
             # verify_api_key returns True on success (no exception raised)
             assert result is True or result is None
         finally:
@@ -716,7 +712,7 @@ def test_rate_limiter_window_cleanup(self):
 class TestStreamChatCompletion:
     """Tests for streaming chat completion behavior."""
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_reasoning_stream_emits_structured_tool_calls(self, monkeypatch):
         """Tool markup after reasoning should emit tool_calls chunks."""
         from vllm_mlx.engine.base import GenerationOutput
@@ -837,7 +833,7 @@ def extract_tool_calls_streaming(
         "total_tokens": 10,
     }
 
-    @pytest.mark.asyncio
+    @pytest.mark.anyio
     async def test_reasoning_stream_skips_tool_parser_until_markup_appears(
         self, monkeypatch
     ):
diff --git a/tests/test_streaming_latency.py b/tests/test_streaming_latency.py
index cae95f5fb..116ee9dfa 100644
--- a/tests/test_streaming_latency.py
+++ b/tests/test_streaming_latency.py
@@ -206,7 +206,7 @@ async def run_benchmark(
     print(f"Throughput: {throughput:.1f} tokens/sec")
 
 
-@pytest.mark.asyncio
+@pytest.mark.anyio
 async def test_output_collector():
     """Unit test for RequestOutputCollector."""
     import sys
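Note on the marker migration above: @pytest.mark.anyio comes from the AnyIO pytest plugin, which runs each marked test on the backend named by the anyio_backend fixture; the plugin's built-in fixture defaults to asyncio, so behavior matches the old @pytest.mark.asyncio marker. The conftest.py sketch below is illustrative, not part of this diff; it shows how a project could parametrize the marked tests across backends (running on "trio" additionally assumes the trio package is installed). Likewise, the asyncio.run(...) replacements in test_server.py are the modern equivalent of asyncio.get_event_loop().run_until_complete(...), which is deprecated on recent Pythons when no event loop is running.

# conftest.py -- illustrative sketch only, not part of this diff.
# Overriding the AnyIO plugin's default anyio_backend fixture parametrizes
# every @pytest.mark.anyio test across the listed backends.
import pytest


@pytest.fixture(params=["asyncio", "trio"])
def anyio_backend(request):
    # Each marked test runs once per backend name returned here.
    return request.param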