From 8a7349ce08ee1f6c9ab83d3e9d236516da28defa Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Sat, 22 Nov 2025 21:49:50 +0000
Subject: [PATCH 1/5] basic design

Signed-off-by: Kyle Sayers
---
 vllm/model_executor/layers/linear.py          |  6 ++-
 .../schemes/compressed_tensors_w8a8_fp8.py    | 48 +++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index dfcc601a1c53..906fa019963d 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -863,6 +863,10 @@ def weight_loader_v2(
             tp_rank=self.tp_rank,
         )
 
+    @property
+    def all_shards(self):
+        return range(len(self.output_sizes))
+
 
 class QKVParallelLinear(ColumnParallelLinear):
     """Linear layers for the attention's QKV transformation.
@@ -1421,4 +1425,4 @@ def extra_repr(self) -> str:
         s += f", bias={self.bias is not None}"
         s += f", tp_size={self.tp_size}"
         s += f", reduce_results={self.reduce_results}"
-        return s
+        return s
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index ee99572f5f49..6b9711694089 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -31,8 +31,10 @@
 from vllm.model_executor.parameter import (
     BlockQuantScaleParameter,
     ChannelQuantScaleParameter,
+    ModelWeightParameter,
     PerTensorScaleParameter,
 )
+from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsW8A8Fp8"]
 
@@ -100,6 +102,27 @@ def create_weights(
         layer.weight_block_size = None
         layer.orig_dtype = params_dtype
 
+        online_quantization = True
+        if online_quantization:
+            weight = ModelWeightParameter(
+                data=torch.empty(
+                    output_size_per_partition,
+                    input_size_per_partition,
+                    device="meta",  # (1) Initialize unquantized weight on meta device
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=online_quantize(
+                    layer,
+                    weight_loader,
+                    name="weight",
+                    wait_for_params=["weight"],
+                    wait_for_shards=layer.all_shards,
+                ),
+            )
+            layer.register_parameter("weight", weight)
+            return
+
         if self.strategy == QuantizationStrategy.BLOCK:
             assert self.weight_block_size is not None
             layer.weight_block_size = self.weight_block_size
@@ -198,3 +221,28 @@ def apply_weights(
             input_scale=layer.input_scale,
             bias=bias,
         )
+
+
+def online_quantize(layer, weight_loader, name, wait_for_params, wait_for_shards):
+    wait_for_keys = [
+        (param_name, shard_id)
+        for param_name in wait_for_params
+        for shard_id in wait_for_shards
+    ]  # params are determined by quant method, shards are determined by layer
+
+    def wrapped_weight_loader(param, loaded_weight, **kwargs):
+        # (2) allocate unquantized buffer on GPU for loading
+        if getattr(layer, name).device.type == "meta":
+            layer.register_parameter(name, param.to(torch.cuda.current_device()))
+
+        # (3) load into unquantized GPU buffer
+        loaded_shard_id = kwargs.get("loaded_shard_id", None)
+        weight_loader(getattr(layer, name), loaded_weight, loaded_shard_id)
+
+        # check if all necessary weights and shards are loaded
+        wait_for_keys.remove((name, loaded_shard_id))
+        if len(wait_for_keys) <= 0:
+            # (4) do quantization and allocate new quantized params
+            layer.weight, layer.scale = fp8_quantize(layer.weight)
+
+    return wrapped_weight_loader
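Note: the series calls fp8_quantize() without defining or importing it. The
sketch below shows one plausible dynamic per-tensor implementation in plain
PyTorch, matching the (quantized weight, scale) return shape the patches
assume; vLLM's actual FP8 path uses its own fused kernels whose signatures
and layouts differ.

    import torch

    def fp8_quantize(weight: torch.Tensor):
        # Dynamic per-tensor scale: map the absolute max onto the FP8 range.
        finfo = torch.finfo(torch.float8_e4m3fn)
        scale = weight.abs().amax().clamp(min=1e-12) / finfo.max
        qweight = (weight / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
        # Return parameters so they can be re-registered on the layer.
        return (
            torch.nn.Parameter(qweight, requires_grad=False),
            torch.nn.Parameter(scale.float(), requires_grad=False),
        )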
From 23ec02028e306d5d317506a116a8c844f60eb8dc Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Sat, 22 Nov 2025 21:50:49 +0000
Subject: [PATCH 2/5] reduce diff

Signed-off-by: Kyle Sayers
---
 .../compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 6b9711694089..1277cf9712f3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -34,7 +34,6 @@
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
-from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsW8A8Fp8"]
 
From 0e33e3dc43292ed72de609d0980b4dc0b350d62e Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Sat, 22 Nov 2025 21:51:37 +0000
Subject: [PATCH 3/5] reduce diff

Signed-off-by: Kyle Sayers
---
 vllm/model_executor/layers/linear.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 906fa019963d..67551284d4f8 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -1425,4 +1425,4 @@ def extra_repr(self) -> str:
         s += f", bias={self.bias is not None}"
         s += f", tp_size={self.tp_size}"
         s += f", reduce_results={self.reduce_results}"
-        return s
\ No newline at end of file
+        return s
From 69d61914809472a3ca0d862b094b46c6e6219e88 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Sat, 22 Nov 2025 22:00:15 +0000
Subject: [PATCH 4/5] use numel

Signed-off-by: Kyle Sayers
---
 vllm/model_executor/layers/linear.py          |  4 ----
 .../schemes/compressed_tensors_w8a8_fp8.py    | 15 +++++----------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 67551284d4f8..dfcc601a1c53 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -863,10 +863,6 @@ def weight_loader_v2(
             tp_rank=self.tp_rank,
         )
 
-    @property
-    def all_shards(self):
-        return range(len(self.output_sizes))
-
 
 class QKVParallelLinear(ColumnParallelLinear):
     """Linear layers for the attention's QKV transformation.
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 1277cf9712f3..8525c989e205 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -117,7 +117,6 @@ def create_weights(
                     weight_loader,
                     name="weight",
                     wait_for_params=["weight"],
-                    wait_for_shards=layer.all_shards,
                 ),
             )
             layer.register_parameter("weight", weight)
@@ -223,11 +222,7 @@ def apply_weights(
 
 
 def online_quantize(layer, weight_loader, name, wait_for_params, wait_for_shards):
-    wait_for_keys = [
-        (param_name, shard_id)
-        for param_name in wait_for_params
-        for shard_id in wait_for_shards
-    ]  # params are determined by quant method, shards are determined by layer
+    numel_loaded = 0
 
     def wrapped_weight_loader(param, loaded_weight, **kwargs):
         # (2) allocate unquantized buffer on GPU for loading
@@ -235,12 +230,12 @@ def wrapped_weight_loader(param, loaded_weight, **kwargs):
         if getattr(layer, name).device.type == "meta":
             layer.register_parameter(name, param.to(torch.cuda.current_device()))
 
         # (3) load into unquantized GPU buffer
-        loaded_shard_id = kwargs.get("loaded_shard_id", None)
-        weight_loader(getattr(layer, name), loaded_weight, loaded_shard_id)
+        weight_loader(getattr(layer, name), loaded_weight, **kwargs)
 
         # check if all necessary weights and shards are loaded
-        wait_for_keys.remove((name, loaded_shard_id))
-        if len(wait_for_keys) <= 0:
+        nonlocal numel_loaded
+        numel_loaded += loaded_weight.numel()
+        if numel_loaded >= getattr(layer, name).numel():
             # (4) do quantization and allocate new quantized params
             layer.weight, layer.scale = fp8_quantize(layer.weight)
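Note: PATCH 4/5 replaces the per-(param, shard) key bookkeeping with a running
element count: each shard load contributes loaded_weight.numel(), and
quantization fires once the total covers the whole parameter, which also lets
the all_shards property on the layer be dropped. A self-contained sketch of
that completion check (FakeLayer and make_online_loader are illustrative
stand-ins, not vLLM APIs):

    import torch

    class FakeLayer(torch.nn.Module):
        pass

    def make_online_loader(layer, name):
        numel_loaded = 0  # closed-over running total, as in the patch

        def loader(loaded_weight, out_offset):
            nonlocal numel_loaded
            param = getattr(layer, name)
            # copy one output-dim shard into the full buffer
            param.data[out_offset : out_offset + loaded_weight.shape[0]] = loaded_weight
            numel_loaded += loaded_weight.numel()
            if numel_loaded >= param.numel():
                print("all shards loaded -> quantize")

        return loader

    layer = FakeLayer()
    layer.weight = torch.nn.Parameter(torch.empty(4, 8))
    load = make_online_loader(layer, "weight")
    load(torch.randn(2, 8), 0)  # first shard: no trigger
    load(torch.randn(2, 8), 2)  # final shard: triggers quantization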
From c123ccecd842380455e7371a07a0a032b14e5416 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Sat, 22 Nov 2025 22:07:47 +0000
Subject: [PATCH 5/5] wait_for_params

Signed-off-by: Kyle Sayers
---
 .../schemes/compressed_tensors_w8a8_fp8.py    | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 8525c989e205..63611b3aff03 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -6,6 +6,7 @@
 import torch
 from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from torch.nn import Parameter
+from collections import defaultdict
 
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
@@ -221,21 +222,23 @@ def apply_weights(
         )
 
 
-def online_quantize(layer, weight_loader, name, wait_for_params, wait_for_shards):
-    numel_loaded = 0
+def online_quantize(layer, weight_loader, param_name, wait_for_params):
+    numel_loaded = defaultdict(int)
 
     def wrapped_weight_loader(param, loaded_weight, **kwargs):
         # (2) allocate unquantized buffer on GPU for loading
-        if getattr(layer, name).device.type == "meta":
-            layer.register_parameter(name, param.to(torch.cuda.current_device()))
+        if getattr(layer, param_name).device.type == "meta":
+            layer.register_parameter(param_name, param.to(torch.cuda.current_device()))
 
         # (3) load into unquantized GPU buffer
-        weight_loader(getattr(layer, name), loaded_weight, **kwargs)
+        weight_loader(getattr(layer, param_name), loaded_weight, **kwargs)
 
         # check if all necessary weights and shards are loaded
-        nonlocal numel_loaded
-        numel_loaded += loaded_weight.numel()
-        if numel_loaded >= getattr(layer, name).numel():
+        numel_loaded[param_name] += loaded_weight.numel()
+        if all(
+            numel_loaded[name] >= getattr(layer, name).numel()
+            for name in wait_for_params
+        ):
             # (4) do quantization and allocate new quantized params
             layer.weight, layer.scale = fp8_quantize(layer.weight)
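Note: after PATCH 5/5 the loader counts loaded elements per parameter name in
a defaultdict, so a scheme can delay quantization until several parameters,
not just the weight, are fully loaded. One inconsistency remains in the
series: the create_weights call site from PATCH 1/5 still passes name="weight",
while the renamed signature expects param_name. A hedged usage sketch of the
final API, assuming the create_weights context from PATCH 1/5 (out_size and
in_size stand in for the per-partition sizes):

    weight = ModelWeightParameter(
        data=torch.empty(out_size, in_size, device="meta"),
        input_dim=1,
        output_dim=0,
        weight_loader=online_quantize(
            layer,
            weight_loader,
            param_name="weight",         # matches the renamed argument
            wait_for_params=["weight"],  # could also wait on e.g. "input_scale"
        ),
    )
    layer.register_parameter("weight", weight)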