Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
218 commits
Select commit Hold shift + click to select a range
dcea881
Automatic quant_model_description.json detection support
OrangeRedeng Dec 5, 2025
aa0a0aa
Add w4a4 support
OrangeRedeng Dec 5, 2025
6c845ad
Refactor w8a8
OrangeRedeng Dec 5, 2025
dee644b
Add import section
OrangeRedeng Dec 5, 2025
35b8983
Create quantization utils file
OrangeRedeng Dec 5, 2025
311cc28
Create w4a16
OrangeRedeng Dec 5, 2025
6869ebf
Create w4a8.py
OrangeRedeng Dec 5, 2025
c7d6dd5
Rename w4a16.py to w4a16_moe.py
OrangeRedeng Dec 5, 2025
7ffe0f6
Rename w4a8.py to w4a8_moe.py
OrangeRedeng Dec 5, 2025
e2d8889
Create w8a8_moe
OrangeRedeng Dec 5, 2025
41d3d3f
Create w4a8.py
OrangeRedeng Dec 5, 2025
6d0b035
Create msmodelslim structure, initial commit
TamirBaydasov Dec 8, 2025
66c7517
Working msmodelslim structure, W8A8, W8A8 MoE, W4A4
TamirBaydasov Dec 10, 2025
471ad1a
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 10, 2025
ccfe6f6
Delete w4a16_moe.py
OrangeRedeng Dec 11, 2025
0a48b2b
Delete w4a8.py
OrangeRedeng Dec 11, 2025
f4fdb0e
Delete w4a8_moe.py
OrangeRedeng Dec 11, 2025
1f4f870
Delete w8a8.py
OrangeRedeng Dec 11, 2025
b5fcf78
Delete w8a8_moe.py
OrangeRedeng Dec 11, 2025
ba57bc7
Delete utils.py
OrangeRedeng Dec 11, 2025
a5704f1
Move process_weights to kernel-side, add npu compressed-tensors w8a8i…
TamirBaydasov Dec 11, 2025
c42c8f1
Added check for empty scheme
OrangeRedeng Dec 12, 2025
25d0d09
Remove unnecessary method
OrangeRedeng Dec 12, 2025
ca4895e
Add w4a8 support
OrangeRedeng Dec 12, 2025
28ff8e0
Add w4a8 support (kernel)
OrangeRedeng Dec 12, 2025
d9412d4
Update fused_moe_method_npu.py
TamirBaydasov Dec 12, 2025
0f81db3
Fix w8a8_static bug
OrangeRedeng Dec 15, 2025
3175d8b
Improving the code structure
OrangeRedeng Dec 15, 2025
23db53f
Delete print()
OrangeRedeng Dec 15, 2025
393f7d1
Update w4a8 for MOE
OrangeRedeng Dec 15, 2025
5c60c95
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 15, 2025
d4d53e0
Fix w4a4 weights loading
OrangeRedeng Dec 16, 2025
2bb7acf
Update model_config.py
OrangeRedeng Dec 16, 2025
4a05e5d
Add w4a4 test
OrangeRedeng Dec 16, 2025
d0a577f
Add compressed-tensors unit-test
OrangeRedeng Dec 16, 2025
d9f8a41
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 16, 2025
77a923e
Pre-commit fixes
Dec 17, 2025
3917919
Revert "Pre-commit fixes"
OrangeRedeng Dec 17, 2025
df01a40
Pre-commit fixes
OrangeRedeng Dec 17, 2025
a16b69e
Fix model config loading, add NPU w8a8int8 MoE for compressed-tensors…
TamirBaydasov Dec 17, 2025
238759c
Pre-commit fixes
OrangeRedeng Dec 17, 2025
847e190
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 17, 2025
4640d05
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 17, 2025
5ca19cb
Delete comments
OrangeRedeng Dec 17, 2025
1f18881
Delete comments
OrangeRedeng Dec 17, 2025
2bee5c7
Update model_config.py
TamirBaydasov Dec 17, 2025
2670aa9
Quickfix
OrangeRedeng Dec 17, 2025
1e45ead
Update fused_moe_method_npu.py
TamirBaydasov Dec 17, 2025
d3298ec
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 17, 2025
afc11a6
Update CODEOWNERS
TamirBaydasov Dec 17, 2025
168b2a8
Pre-commit fixes
OrangeRedeng Dec 17, 2025
2185718
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 18, 2025
d551652
Update msmodelslim_w8a8_int8.py
TamirBaydasov Dec 18, 2025
1cf18c0
Update msmodelslim.py
TamirBaydasov Dec 18, 2025
3dccf89
Delete python/sglang/srt/hardware_backend/npu/quantization/modelslim.py
OrangeRedeng Dec 18, 2025
1842d0a
Removed unused code
OrangeRedeng Dec 18, 2025
75de787
Remove --quantization modelslim flag from doc
OrangeRedeng Dec 18, 2025
e958767
Delete --quantization "modelslim" flag
OrangeRedeng Dec 18, 2025
1567885
Delete --quantization "modelslim" flag
OrangeRedeng Dec 18, 2025
d34cb6f
Update test_ascend_hicache_mla.py
OrangeRedeng Dec 18, 2025
09a6d44
Delete --quantization "modelslim" flag
OrangeRedeng Dec 18, 2025
2b7003e
Update test_ascend_mla_w8a8int8.py
OrangeRedeng Dec 18, 2025
43b5d66
Create README.md for msModelSlim
OrangeRedeng Dec 18, 2025
420d6e8
Update README.md
OrangeRedeng Dec 18, 2025
f79f9ee
Update README.md
OrangeRedeng Dec 18, 2025
a7c43bb
Update fused_moe_method_npu.py
TamirBaydasov Dec 18, 2025
ef2fdb8
Update README.md
OrangeRedeng Dec 18, 2025
cb95c0a
Update README.md
OrangeRedeng Dec 18, 2025
ca38c59
Update layer.py
TamirBaydasov Dec 18, 2025
583cb4d
Update compressed_tensors.py
TamirBaydasov Dec 18, 2025
8af0033
Update compressed_tensors_moe.py
TamirBaydasov Dec 18, 2025
9f8c407
Quickfix
OrangeRedeng Dec 18, 2025
d31e96c
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 18, 2025
72efd3a
Update README.md
OrangeRedeng Dec 18, 2025
384835b
Update msmodelslim_moe.py
OrangeRedeng Dec 18, 2025
4ebfb54
Update fused_moe_method_npu.py
OrangeRedeng Dec 18, 2025
0cfbd93
Create test_ascend_w4a4_quantization.py in srt/ascend
TamirBaydasov Dec 18, 2025
87b65a8
Delete test/manual/ascend/test_ascend_w4a4_quantization.py
TamirBaydasov Dec 18, 2025
177102d
Create test_ascend_w8a8_quantization.py
TamirBaydasov Dec 18, 2025
16ca773
Update run_suite.py
TamirBaydasov Dec 18, 2025
c6def39
Update test_ascend_w8a8_quantization.py
TamirBaydasov Dec 18, 2025
d0dd427
Create ascend_npu_quantization.md
OrangeRedeng Dec 18, 2025
2e1219f
Bugfix
OrangeRedeng Dec 18, 2025
9d6ffbd
Pre-commit fixes
OrangeRedeng Dec 18, 2025
17a6248
Update fused_moe_method_npu.py
OrangeRedeng Dec 18, 2025
0bf3389
Fix missprint
OrangeRedeng Dec 18, 2025
69d3438
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 18, 2025
1d28157
Pre-commit fixes
OrangeRedeng Dec 18, 2025
a5b88e9
Update ascend_npu_quantization.md
OrangeRedeng Dec 18, 2025
30f7b10
Update ascend_npu_quantization.md
OrangeRedeng Dec 18, 2025
22c85ce
Update python/sglang/srt/configs/model_config.py
TamirBaydasov Dec 19, 2025
21b9219
Update compressed_tensors.py
TamirBaydasov Dec 19, 2025
52b1088
Update compressed_tensors_moe.py
TamirBaydasov Dec 19, 2025
2a5f745
Update __init__.py
TamirBaydasov Dec 19, 2025
309e5ef
Update compressed_tensors_w8a8_int8.py
TamirBaydasov Dec 19, 2025
611546d
Update README.md
TamirBaydasov Dec 19, 2025
d2cc722
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 19, 2025
d2888fd
Update linear_method_npu.py
OrangeRedeng Dec 19, 2025
554027a
Fix group_size
OrangeRedeng Dec 19, 2025
ad52cda
Fix group_size
OrangeRedeng Dec 19, 2025
1d0eddb
Update fused_moe_method_npu.py
OrangeRedeng Dec 19, 2025
aff9585
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 19, 2025
c2e972f
Update fused_moe_method_npu.py
OrangeRedeng Dec 22, 2025
3bc7faf
Pre-commit fixes
OrangeRedeng Dec 22, 2025
c7480fb
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 22, 2025
ff1f793
Fix Qwen3-32B AWQ issue
OrangeRedeng Dec 22, 2025
7cbf964
Update ascend_npu_quantization.md
OrangeRedeng Dec 22, 2025
7b20ccf
Update ascend_npu_quantization.md
OrangeRedeng Dec 22, 2025
ed9c68a
Merge branch 'main' into npu_quantization_refactor
ping1jing2 Dec 22, 2025
e1cabfa
Update fused_moe_method_npu.py
TamirBaydasov Dec 22, 2025
734ab1d
Update linear_method_npu.py
TamirBaydasov Dec 22, 2025
93533b0
Update base_config.py
TamirBaydasov Dec 22, 2025
0cd79c6
Update compressed_tensors_moe.py
TamirBaydasov Dec 22, 2025
a9d4847
Update compressed_tensors_w8a8_int8.py
TamirBaydasov Dec 22, 2025
af3756b
Update msmodelslim.py
TamirBaydasov Dec 22, 2025
1ddd8d4
Update msmodelslim_moe.py
TamirBaydasov Dec 22, 2025
76a1e94
Update msmodelslim_w4a4_int4.py
TamirBaydasov Dec 22, 2025
f773ee4
Update msmodelslim_w8a8_int8.py
TamirBaydasov Dec 22, 2025
a6d1619
Update msmodelslim_moe.py
OrangeRedeng Dec 23, 2025
789c246
Fix lint issue
OrangeRedeng Dec 23, 2025
2cc4db4
Fix lint issue
OrangeRedeng Dec 23, 2025
94827ef
Fix lint issue
OrangeRedeng Dec 23, 2025
1a30a42
Change local path to modelscope
OrangeRedeng Dec 23, 2025
f539100
Update test_ascend_w4a4_quantization.py
OrangeRedeng Dec 23, 2025
01f6c58
Temporary fix
OrangeRedeng Dec 23, 2025
0dacfd2
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 23, 2025
07e1f84
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 24, 2025
c9a8122
Update test_ascend_w8a8_quantization.py
OrangeRedeng Dec 24, 2025
6bb9f20
Update run_suite.py
OrangeRedeng Dec 24, 2025
836dc16
Update test_ascend_w4a4_quantization.py
OrangeRedeng Dec 24, 2025
14b6ab8
Update test_ascend_w4a4_quantization.py
OrangeRedeng Dec 24, 2025
a8a03a0
Merge branch 'main' into npu_quantization_refactor
AniZpZ Dec 25, 2025
15040cc
Update msmodelslim_moe.py
TamirBaydasov Dec 25, 2025
5a1c7ec
Update msmodelslim_moe.py
TamirBaydasov Dec 25, 2025
a26d9e6
Update run_suite.py
OrangeRedeng Dec 25, 2025
c560c8f
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 26, 2025
1d44466
Add modelslim to optimized methods
TamirBaydasov Dec 26, 2025
18377d0
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 26, 2025
686966b
Resolve conflicts 1/2
eshoguli Dec 29, 2025
1c888e0
Update test_ascend_w4a4_quantization.py
OrangeRedeng Dec 29, 2025
1830d74
Resolve conflicts 1/2
OrangeRedeng Dec 29, 2025
46a3570
Resolve conflicts 2/2
OrangeRedeng Dec 29, 2025
217536f
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 29, 2025
ffdc7dc
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
c38e16f
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
ef216f4
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
5d43c4a
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
bee77f0
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
ee59b95
Update fused_moe_method_npu.py
OrangeRedeng Dec 29, 2025
02d7a6a
Update msmodelslim_moe.py
OrangeRedeng Dec 29, 2025
ff41d73
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
8d1bb48
Fix lint issue
OrangeRedeng Dec 29, 2025
6b46093
Fix lint issue
OrangeRedeng Dec 29, 2025
567a771
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
1b2f289
Update msmodelslim_moe.py
OrangeRedeng Dec 29, 2025
fe7067c
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
2e390e3
Fix lint issue
OrangeRedeng Dec 29, 2025
ee17e0c
Update msmodelslim_moe.py
OrangeRedeng Dec 29, 2025
662fada
Update msmodelslim_moe.py
OrangeRedeng Dec 29, 2025
2fb272d
Update compressed_tensors_moe.py
OrangeRedeng Dec 29, 2025
ae7875c
Update msmodelslim_moe.py
OrangeRedeng Dec 29, 2025
b4c0ebe
Update fused_moe_method_npu.py
OrangeRedeng Dec 29, 2025
897094c
Update msmodelslim_moe.py
OrangeRedeng Dec 29, 2025
b463625
Fix lint issue
OrangeRedeng Dec 29, 2025
349dcd0
Fix lint issue
OrangeRedeng Dec 29, 2025
b776895
Update fused_moe_method_npu.py
OrangeRedeng Dec 29, 2025
56c8d06
Fix lint issue
OrangeRedeng Dec 29, 2025
4e9c0d0
Fix lint issue
OrangeRedeng Dec 29, 2025
f091ab0
Fix lint issue
OrangeRedeng Dec 29, 2025
30ea24e
Fix lint issue
OrangeRedeng Dec 29, 2025
b430667
Update fused_moe_method_npu.py
OrangeRedeng Dec 29, 2025
47e8406
Update test_ascend_w4a4_quantization.py
OrangeRedeng Dec 29, 2025
206bb5d
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 29, 2025
97b38e4
Rename MsModelSlim -> ModelSlim
OrangeRedeng Dec 29, 2025
7edefee
Merge branch 'main' into npu_quantization_refactor
ping1jing2 Dec 30, 2025
d6f0064
Fix w4a4 test
OrangeRedeng Dec 30, 2025
0aad1d1
Fix link issue
OrangeRedeng Dec 30, 2025
e861924
Return run_decode to test_ascend_w4a4_quantization.py
OrangeRedeng Dec 30, 2025
a443cf9
Update modelslim_moe.py
OrangeRedeng Dec 30, 2025
373b9c5
Fix link
OrangeRedeng Dec 30, 2025
86093bb
Fix link again
OrangeRedeng Dec 30, 2025
2de91b8
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 30, 2025
70f2fab
Add w4a8 strategy to compressed-tensors
OrangeRedeng Dec 30, 2025
f9450c8
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 30, 2025
d5ad3a1
Fix test again
OrangeRedeng Dec 30, 2025
529773d
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 30, 2025
45d6421
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Dec 30, 2025
cb58406
Merge branch 'main' into npu_quantization_refactor
iforgetmyname Jan 3, 2026
bf617a5
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 3, 2026
a657e87
Update test order
OrangeRedeng Jan 3, 2026
b94d390
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 4, 2026
ff565db
Move w4a4_test to a2-tp1 suite
OrangeRedeng Jan 4, 2026
c97c232
Move w4a4_test to a2-tp1 suite
OrangeRedeng Jan 4, 2026
c190ea3
Return w4a4 to A3
OrangeRedeng Jan 4, 2026
659fa07
Remove unused is_npu()
OrangeRedeng Jan 4, 2026
b3e2021
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 5, 2026
9aec4b9
Merge branch 'main' into npu_quantization_refactor
iforgetmyname Jan 6, 2026
77e9fa8
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 12, 2026
4716b73
Update test_ascend_w4a4_quantization.py
OrangeRedeng Jan 12, 2026
2ace366
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 12, 2026
42d849e
Fix test_ascend_piecewise_graph_prefill test
OrangeRedeng Jan 12, 2026
0a10c5f
Merge branch 'main' into npu_quantization_refactor
ping1jing2 Jan 12, 2026
64d25e9
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 12, 2026
de0cd1d
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 12, 2026
9a95ff8
Move w4a4 test to A2
OrangeRedeng Jan 13, 2026
d323c6a
Update test_ascend_w4a4_quantization.py
OrangeRedeng Jan 13, 2026
7e3d281
Update run_suite.py
OrangeRedeng Jan 13, 2026
601a349
Update test_ascend_w4a4_quantization.py
OrangeRedeng Jan 13, 2026
ef4ce00
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 13, 2026
0d16e53
Update test_ascend_w4a4_quantization.py
OrangeRedeng Jan 13, 2026
6bcf2f2
Merge branch 'main' into npu_quantization_refactor
TamirBaydasov Jan 13, 2026
7b9e614
Fix w4a4 test
OrangeRedeng Jan 13, 2026
a79e4b9
Fix w4a4 test
OrangeRedeng Jan 13, 2026
c113924
Merge branch 'main' into npu_quantization_refactor
iforgetmyname Jan 13, 2026
bfb87cf
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 13, 2026
eeb3875
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 13, 2026
cd881ee
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 14, 2026
27b373b
Merge branch 'main' into npu_quantization_refactor
OrangeRedeng Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
/python/sglang/srt/function_call @CatherineSue @JustinTong0323
/python/sglang/srt/grpc @CatherineSue @slin1237
/python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname
/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname
/python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1
/python/sglang/srt/layers/attention @merrymercy @Fridge003 @ispobock @Qiaolin-Yu @hebiao064
/python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064
Expand Down
5 changes: 0 additions & 5 deletions docs/platforms/ascend_npu_deepseek_example.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ python3 -m sglang.launch_server \
--trust-remote-code \
--attention-backend ascend \
--device npu \
--quantization modelslim \
--watchdog-timeout 9000 \
--cuda-graph-bs 8 16 24 28 32 \
--mem-fraction-static 0.68 \
Expand Down Expand Up @@ -88,7 +87,6 @@ python -m sglang.launch_server \
--mem-fraction-static 0.6 \
--attention-backend ascend \
--device npu \
--quantization modelslim \
--max-running-requests 8 \
--context-length 8192 \
--disable-radix-cache \
Expand Down Expand Up @@ -144,7 +142,6 @@ python -m sglang.launch_server \
--max-running-requests 352 \
--attention-backend ascend \
--device npu \
--quantization modelslim \
--moe-a2a-backend deepep \
--enable-dp-attention \
--deepep-mode low_latency \
Expand Down Expand Up @@ -216,7 +213,6 @@ do
--mem-fraction-static 0.81 \
--attention-backend ascend \
--device npu \
--quantization modelslim \
--max-running-requests 8 \
--context-length 8192 \
--disable-radix-cache \
Expand Down Expand Up @@ -279,7 +275,6 @@ do
--max-running-requests 832 \
--attention-backend ascend \
--device npu \
--quantization modelslim \
--moe-a2a-backend deepep \
--enable-dp-attention \
--deepep-mode low_latency \
Expand Down
21 changes: 21 additions & 0 deletions docs/platforms/ascend_npu_quantization.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Quantization on Ascend

To load an already-quantized model, simply load the model weights and config. If the model has been quantized offline, there is no need to pass the `--quantization` argument when starting the engine: the quantization method is parsed automatically from the downloaded `quant_model_description.json` or `config.json` config.

[ModelSlim on Ascend support](https://github.com/sgl-project/sglang/pull/14504):
- [x] W4A4 dynamic linear
- [x] W8A8 static linear
- [x] W8A8 dynamic linear
- [x] W4A8 dynamic MOE
- [x] W8A8 dynamic MOE

[AWQ on Ascend support](https://github.com/sgl-project/sglang/pull/10158):
- [x] W4A16 linear
- [x] W8A16 linear *(needs testing)*
- [x] W4A16 MOE *(needs testing)*

Compressed-tensors (LLM Compressor) on Ascend support:
- [x] [W4A8 dynamic MOE with/without activation clip](https://github.com/sgl-project/sglang/pull/14736) *(needs testing)*
- [x] [W4A16 MOE](https://github.com/sgl-project/sglang/pull/12759)
- [x] [W8A8 dynamic linear](https://github.com/sgl-project/sglang/pull/14504)
- [x] [W8A8 dynamic MOE](https://github.com/sgl-project/sglang/pull/14504)
29 changes: 27 additions & 2 deletions python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import math
import os
from enum import Enum, IntEnum, auto
from pathlib import Path
from typing import Any, List, Optional, Set, Union

import torch
Expand Down Expand Up @@ -632,6 +633,18 @@ def _parse_quant_hf_config(self):
quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
return quant_cfg

def _find_quant_modelslim_config(self):
quant_config_file = Path(self.model_path, "quant_model_description.json")
quant_cfg = None
if quant_config_file.is_file():
with open(quant_config_file) as f:
quant_cfg = json.load(f)
# This field is required for flagless model loading but is not present in
# modelslim model description, so we're adding it here manually.
quant_cfg["quant_method"] = "modelslim"

return quant_cfg

def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
"""Parse ModelOpt quantization config and return the appropriate quant_method."""
json_quant_configs = quant_config_dict["quantization"]
Expand Down Expand Up @@ -744,6 +757,7 @@ def _verify_quantization(self) -> None:
"w4afp8",
"petit_nvfp4",
"quark",
"modelslim",
]
compatible_quantization_methods = {
"modelopt_fp8": ["modelopt"],
Expand All @@ -755,8 +769,19 @@ def _verify_quantization(self) -> None:
if self.quantization is not None:
self.quantization = self.quantization.lower()

# Parse quantization method from the HF model config, if available.
quant_cfg = self._parse_quant_hf_config()
# Parse quantization method from the HF and ModelSlim model config, if available.
# Only one function should return config, other should return None.
cfg_list = []
cfg_list.append(self._parse_quant_hf_config())
cfg_list.append(self._find_quant_modelslim_config())

# Filter out None values
cfg_list = [item for item in cfg_list if item is not None]
if len(cfg_list) > 1:
raise ValueError(
"Config list contains configs from 2 methods, must be only 1"
)
quant_cfg = cfg_list[0] if cfg_list else None

if quant_cfg is not None:
quant_method = quant_cfg.get(
Expand Down
Loading
Loading