
Commit

move to weight_loading test to deal with CI cuda memory issues
dsikka committed Aug 21, 2024
1 parent 70ce46e commit 20b5940
Showing 2 changed files with 1 addition and 7 deletions.
7 changes: 0 additions & 7 deletions tests/quantization/test_compressed_tensors.py
@@ -161,10 +161,3 @@ def test_compressed_tensors_kv_cache(vllm_runner):
     with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
         output = llm.generate_greedy("Hello world!", max_tokens=20)
         assert output
-
-
-def test_compressed_tensors_fused_moe(vllm_runner):
-    model_path = "nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized"
-    with vllm_runner(model_path) as llm:
-        output = llm.generate_greedy("Hello world!", max_tokens=20)
-        assert output
1 change: 1 addition & 0 deletions tests/weight_loading/models.txt
@@ -14,5 +14,6 @@ compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
 compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main

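For context, each line in tests/weight_loading/models.txt encodes a quantization method, a model identifier, and a revision, so the fused-MoE check removed from the pytest file above survives as the new entry in this list. Below is a minimal sketch of how such a file could drive a parametrized weight-loading smoke test; the read_model_entries helper and the keyword arguments passed to the vllm_runner fixture are illustrative assumptions, not the actual vLLM CI harness.

```python
# Hypothetical sketch: parametrize a weight-loading smoke test from models.txt.
# The "quantization, model, revision" line format matches the diff above; the
# parsing helper and fixture arguments are assumptions, not vLLM's real harness.
from pathlib import Path

import pytest


def read_model_entries(path: str = "tests/weight_loading/models.txt"):
    entries = []
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        quantization, model, revision = (part.strip() for part in line.split(","))
        entries.append((quantization, model, revision))
    return entries


@pytest.mark.parametrize("quantization, model, revision", read_model_entries())
def test_weight_loading(vllm_runner, quantization, model, revision):
    # Load each listed checkpoint and run a tiny greedy generation,
    # mirroring the assertions in test_compressed_tensors.py.
    with vllm_runner(model, quantization=quantization, revision=revision) as llm:
        output = llm.generate_greedy("Hello world!", max_tokens=20)
        assert output
```

Running the heavy Mixtral load through this list rather than the in-process quantization suite keeps the large checkpoint out of the CI job that was hitting CUDA memory limits.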