vllm-project · DarkLight1337 · May 2, 2026 · May 1, 2026 · May 1, 2026 · May 1, 2026
@@ -26,6 +26,9 @@
 from ....models.registry import HF_EXAMPLE_MODELS
 from ....utils import RemoteOpenAIServer
 
+# --max_num_seqs tuned to prevent OOM on 18GB GPUs in transcription correctness tests.
+MAX_SEQS_FOR_TRANSCRIPTION_TEST = 32
+
 
 def to_bytes(y, sr):
     buffer = io.BytesIO()
@@ -184,6 +187,7 @@ def test_wer_correctness(
     server_args = [
         "--enforce-eager",
         f"--tokenizer_mode={model_info.tokenizer_mode}",
+        f"--max_num_seqs={MAX_SEQS_FOR_TRANSCRIPTION_TEST}",
     ]
     if model_info.trust_remote_code:
         server_args.append("--trust-remote-code")