Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions tests/e2e/singlecard/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,43 @@ def test_qwen3_w8a8_quant():
name_1="vllm_quant_w8a8_outputs",
)

# fmt: off
def test_qwen3_w8a8_quant_auto_detect():
    """Check that ModelSlim quantization is picked up without ``--quantization``.

    Runs the same W8A8 checkpoint as ``test_qwen3_w8a8_quant`` but passes no
    quantization argument to the runner. The reference output is therefore
    only expected to match if ``maybe_auto_detect_quantization()`` finds
    ``quant_model_description.json`` by itself.
    """
    new_token_budget = 5
    prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    # Reference result as an (ids, text) pair; presumably the ids cover the
    # prompt plus the greedily generated continuation -- confirm against
    # generate_greedy().
    reference_ids = [
        85, 4086, 44, 374, 264, 1550, 42747, 628, 323, 4938, 72816, 44378, 323,
        13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
    ]
    reference_text = 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
    reference_outputs = [(reference_ids, reference_text)]
    # fmt: on

    # Deliberately no quantization argument: detection has to happen on its own.
    runner = VllmRunner(
        "vllm-ascend/Qwen3-0.6B-W8A8",
        max_model_len=8192,
        gpu_memory_utilization=0.7,
        cudagraph_capture_sizes=[1, 2, 4, 8],
    )
    with runner as vllm_model:
        auto_detected_outputs = vllm_model.generate_greedy(
            prompts, new_token_budget)

    check_outputs_equal(
        outputs_0_lst=reference_outputs,
        outputs_1_lst=auto_detected_outputs,
        name_0="vllm_target_outputs",
        name_1="vllm_quant_auto_detect_outputs",
    )


# fmt: off
def test_qwen3_dense_w8a16():
max_tokens = 5
Expand Down
95 changes: 93 additions & 2 deletions tests/ut/quantization/test_modelslim_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import json
import os
import tempfile
from unittest.mock import MagicMock, patch

from vllm.model_executor.layers.fused_moe import FusedMoE
Expand All @@ -6,7 +9,10 @@

from tests.ut.base import TestBase
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig
from vllm_ascend.quantization.modelslim_config import (
MODELSLIM_CONFIG_FILENAME,
AscendModelSlimConfig,
)
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is

if vllm_version_is("v0.15.0"):
Expand Down Expand Up @@ -54,7 +60,7 @@ def test_get_min_capability(self):

def test_get_config_filenames(self):
    """``get_config_filenames()`` is expected to return an empty list.

    The earlier expectation of ``["quant_model_description.json"]`` was
    replaced by this change and contradicted the assertion below, so it is
    no longer checked. Presumably the description file is now loaded through
    ``maybe_update_config()`` instead -- confirm against the implementation.
    """
    filenames = AscendModelSlimConfig.get_config_filenames()
    self.assertEqual(filenames, [])

def test_from_config(self):
config = AscendModelSlimConfig.from_config(self.sample_config)
Expand Down Expand Up @@ -162,5 +168,90 @@ def test_is_layer_skipped_ascend(self):
with self.assertRaises(ValueError):
config.is_layer_skipped_ascend("fused_layer", fused_mapping)

def test_init_with_none_config(self):
    """An explicit ``None`` argument yields an empty quant description."""
    description = AscendModelSlimConfig(None).quant_description
    self.assertEqual(description, {})

def test_init_with_default_config(self):
    """Constructing without arguments yields an empty quant description."""
    description = AscendModelSlimConfig().quant_description
    self.assertEqual(description, {})

def test_maybe_update_config_already_populated(self):
    """A config that already holds a quant description is left untouched."""
    populated = self.ascend_config
    # Precondition: the fixture config must start out non-empty.
    self.assertTrue(len(populated.quant_description) > 0)

    populated.maybe_update_config("/some/model/path")

    # The attribute is re-read after the call and must still equal the
    # fixture data.
    self.assertEqual(populated.quant_description, self.sample_config)

def test_maybe_update_config_loads_from_file(self):
    """An empty config takes its description from the file in the model dir."""
    cfg = AscendModelSlimConfig()
    self.assertEqual(cfg.quant_description, {})

    expected = {"layer1.weight": "INT8", "layer2.weight": "FLOAT"}
    with tempfile.TemporaryDirectory() as model_dir:
        description_file = os.path.join(model_dir, MODELSLIM_CONFIG_FILENAME)
        with open(description_file, "w") as handle:
            json.dump(expected, handle)

        # Must run while the temporary directory still exists.
        cfg.maybe_update_config(model_dir)

    self.assertEqual(cfg.quant_description, expected)

def test_maybe_update_config_raises_when_file_missing(self):
    """An empty model directory makes ``maybe_update_config`` raise ValueError.

    The message is expected to carry the "not found" banner and the name of
    the missing description file.
    """
    cfg = AscendModelSlimConfig()

    with tempfile.TemporaryDirectory() as empty_dir, \
            self.assertRaises(ValueError) as raised:
        cfg.maybe_update_config(empty_dir)

    message = str(raised.exception)
    for fragment in ("ModelSlim Quantization Config Not Found",
                     MODELSLIM_CONFIG_FILENAME):
        self.assertIn(fragment, message)

def test_maybe_update_config_raises_with_json_files_listed(self):
    """The ValueError message mentions other JSON files found in the directory."""
    cfg = AscendModelSlimConfig()

    with tempfile.TemporaryDirectory() as model_dir:
        # A JSON file that is present but is NOT the ModelSlim description.
        unrelated_json = os.path.join(model_dir, "config.json")
        with open(unrelated_json, "w") as handle:
            json.dump({"dummy": True}, handle)

        with self.assertRaises(ValueError) as raised:
            cfg.maybe_update_config(model_dir)

    message = str(raised.exception)
    self.assertIn("config.json", message)

def test_maybe_update_config_non_directory_raises(self):
    """A model path that is not a directory also ends in the "not found" error."""
    cfg = AscendModelSlimConfig()
    # NOTE(review): this is a relative name, so the test assumes no such
    # entry exists in the current working directory -- confirm.
    bogus_path = "not_a_real_directory_path"

    with self.assertRaises(ValueError) as raised:
        cfg.maybe_update_config(bogus_path)

    self.assertIn("ModelSlim Quantization Config Not Found",
                  str(raised.exception))

def test_apply_extra_quant_adaptations_shared_head(self):
    """A ``shared_head`` entry is also exposed under the key without it.

    After ``_apply_extra_quant_adaptations()`` the description must contain
    ``model.layers.0.weight`` carrying the original entry's value.
    """
    cfg = AscendModelSlimConfig()
    cfg.quant_description = {"model.layers.0.shared_head.weight": "INT8"}

    cfg._apply_extra_quant_adaptations()

    adapted_key = "model.layers.0.weight"
    self.assertIn(adapted_key, cfg.quant_description)
    self.assertEqual(cfg.quant_description[adapted_key], "INT8")

def test_apply_extra_quant_adaptations_weight_packed(self):
    """A ``weight_packed`` entry is also exposed under the plain ``weight`` key.

    After ``_apply_extra_quant_adaptations()`` the description must contain
    ``model.layers.0.weight`` carrying the original entry's value.
    """
    cfg = AscendModelSlimConfig()
    cfg.quant_description = {"model.layers.0.weight_packed": "INT8"}

    cfg._apply_extra_quant_adaptations()

    adapted_key = "model.layers.0.weight"
    self.assertIn(adapted_key, cfg.quant_description)
    self.assertEqual(cfg.quant_description[adapted_key], "INT8")

def test_get_scaled_act_names(self):
self.assertEqual(self.ascend_config.get_scaled_act_names(), [])
182 changes: 182 additions & 0 deletions tests/ut/quantization/test_quant_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import json
import logging
import os
import tempfile
from unittest.mock import MagicMock, patch

from tests.ut.base import TestBase
from vllm_ascend.quantization.modelslim_config import MODELSLIM_CONFIG_FILENAME
from vllm_ascend.quantization.utils import (
detect_quantization_method,
maybe_auto_detect_quantization,
)
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD


class TestDetectQuantizationMethod(TestBase):
    """Tests for ``detect_quantization_method`` against temporary model dirs."""

    @staticmethod
    def _dump_json(directory, filename, payload):
        """Write *payload* as JSON to ``filename`` inside *directory*."""
        with open(os.path.join(directory, filename), "w") as handle:
            json.dump(payload, handle)

    def test_returns_none_for_non_directory(self):
        """A path that is not a directory yields ``None``."""
        self.assertIsNone(detect_quantization_method("/non/existent/path"))

    def test_detects_modelslim(self):
        """The ModelSlim description file alone selects the Ascend method."""
        with tempfile.TemporaryDirectory() as model_dir:
            self._dump_json(model_dir, MODELSLIM_CONFIG_FILENAME,
                            {"layer.weight": "INT8"})

            detected = detect_quantization_method(model_dir)
            self.assertEqual(detected, ASCEND_QUANTIZATION_METHOD)

    def test_detects_compressed_tensors(self):
        """``quant_method: compressed-tensors`` in config.json is recognized."""
        hf_config = {
            "quantization_config": {
                "quant_method": "compressed-tensors"
            }
        }
        with tempfile.TemporaryDirectory() as model_dir:
            self._dump_json(model_dir, "config.json", hf_config)

            detected = detect_quantization_method(model_dir)
            self.assertEqual(detected, COMPRESSED_TENSORS_METHOD)

    def test_returns_none_for_no_quant(self):
        """An empty directory yields ``None``."""
        with tempfile.TemporaryDirectory() as model_dir:
            self.assertIsNone(detect_quantization_method(model_dir))

    def test_returns_none_for_non_compressed_tensors_quant_method(self):
        """A different ``quant_method`` (here ``gptq``) yields ``None``."""
        hf_config = {
            "quantization_config": {
                "quant_method": "gptq"
            }
        }
        with tempfile.TemporaryDirectory() as model_dir:
            self._dump_json(model_dir, "config.json", hf_config)

            self.assertIsNone(detect_quantization_method(model_dir))

    def test_returns_none_for_config_without_quant_config(self):
        """A config.json lacking ``quantization_config`` yields ``None``."""
        with tempfile.TemporaryDirectory() as model_dir:
            self._dump_json(model_dir, "config.json", {"model_type": "llama"})

            self.assertIsNone(detect_quantization_method(model_dir))

    def test_returns_none_for_malformed_config_json(self):
        """An unparsable config.json yields ``None`` instead of raising."""
        with tempfile.TemporaryDirectory() as model_dir:
            # Written as raw text on purpose: the content must not be valid JSON.
            broken_config = os.path.join(model_dir, "config.json")
            with open(broken_config, "w") as handle:
                handle.write("not valid json{{{")

            self.assertIsNone(detect_quantization_method(model_dir))

    def test_modelslim_takes_priority_over_compressed_tensors(self):
        """ModelSlim wins when both kinds of config are present."""
        with tempfile.TemporaryDirectory() as model_dir:
            # ModelSlim description file.
            self._dump_json(model_dir, MODELSLIM_CONFIG_FILENAME,
                            {"layer.weight": "INT8"})
            # compressed-tensors marker in config.json.
            self._dump_json(
                model_dir,
                "config.json",
                {
                    "quantization_config": {
                        "quant_method": "compressed-tensors"
                    }
                },
            )

            detected = detect_quantization_method(model_dir)
            self.assertEqual(detected, ASCEND_QUANTIZATION_METHOD)


class TestMaybeAutoDetectQuantization(TestBase):
    """Tests for ``maybe_auto_detect_quantization`` with detection mocked out."""

    def _make_vllm_config(self, model_path="/fake/model", quantization=None):
        """Build a ``MagicMock`` config exposing ``model_config.model`` and
        ``model_config.quantization`` with the given values."""
        fake_config = MagicMock()
        model_config = fake_config.model_config
        model_config.model = model_path
        model_config.quantization = quantization
        return fake_config

    @patch("vllm_ascend.quantization.utils.detect_quantization_method",
           return_value=None)
    def test_no_detection_does_nothing(self, mock_detect):
        """With nothing detected, ``quantization`` stays ``None``."""
        fake_config = self._make_vllm_config()

        maybe_auto_detect_quantization(fake_config)

        self.assertIsNone(fake_config.model_config.quantization)

    @patch("vllm_ascend.quantization.utils.detect_quantization_method",
           return_value=ASCEND_QUANTIZATION_METHOD)
    def test_user_specified_same_method_no_change(self, mock_detect):
        """A user value equal to the detected method is kept as is."""
        fake_config = self._make_vllm_config(
            quantization=ASCEND_QUANTIZATION_METHOD)

        maybe_auto_detect_quantization(fake_config)

        self.assertEqual(fake_config.model_config.quantization,
                         ASCEND_QUANTIZATION_METHOD)

    # Stacked patches are injected bottom-up: the detection mock comes first.
    @patch("vllm.config.VllmConfig._get_quantization_config",
           return_value=MagicMock())
    @patch("vllm_ascend.quantization.utils.detect_quantization_method",
           return_value=ASCEND_QUANTIZATION_METHOD)
    def test_auto_detect_sets_quantization_and_logs_info(
            self, mock_detect, mock_get_quant_config):
        """Without ``--quantization`` but with a detected ModelSlim config,
        the method is set automatically and an INFO record is logged."""
        fake_config = self._make_vllm_config(
            model_path="/fake/quant_model", quantization=None)

        with self.assertLogs("vllm_ascend.quantization.utils",
                             level=logging.INFO) as captured:
            maybe_auto_detect_quantization(fake_config)

        self.assertEqual(fake_config.model_config.quantization,
                         ASCEND_QUANTIZATION_METHOD)
        logged = "\n".join(captured.output)
        for fragment in ("Auto-detected quantization method",
                         ASCEND_QUANTIZATION_METHOD,
                         "/fake/quant_model"):
            self.assertIn(fragment, logged)

    @patch("vllm_ascend.quantization.utils.detect_quantization_method",
           return_value=ASCEND_QUANTIZATION_METHOD)
    def test_user_mismatch_logs_warning(self, mock_detect):
        """A user value that differs from the detected method is respected,
        and a WARNING naming both methods is logged."""
        fake_config = self._make_vllm_config(
            model_path="/fake/quant_model",
            quantization=COMPRESSED_TENSORS_METHOD)

        with self.assertLogs("vllm_ascend.quantization.utils",
                             level=logging.WARNING) as captured:
            maybe_auto_detect_quantization(fake_config)

        # The user's explicit choice must survive.
        self.assertEqual(fake_config.model_config.quantization,
                         COMPRESSED_TENSORS_METHOD)
        logged = "\n".join(captured.output)
        for fragment in ("Auto-detected quantization method",
                         ASCEND_QUANTIZATION_METHOD,
                         COMPRESSED_TENSORS_METHOD):
            self.assertIn(fragment, logged)

    @patch("vllm_ascend.quantization.utils.detect_quantization_method",
           return_value=None)
    def test_no_detection_emits_no_log(self, mock_detect):
        """With nothing detected, no log record is emitted."""
        fake_config = self._make_vllm_config(quantization=None)
        logger_name = "vllm_ascend.quantization.utils"

        # assertLogs raises AssertionError when no logs are emitted, so the
        # outer assertRaises turns "no records" into a passing condition.
        with self.assertRaises(AssertionError), \
                self.assertLogs(logger_name, level=logging.DEBUG):
            maybe_auto_detect_quantization(fake_config)

        self.assertIsNone(fake_config.model_config.quantization)
Loading