Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
from torch.nn import Parameter
from collections import defaultdict

from vllm._aiter_ops import rocm_aiter_ops
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
Expand All @@ -31,6 +32,7 @@
from vllm.model_executor.parameter import (
BlockQuantScaleParameter,
ChannelQuantScaleParameter,
ModelWeightParameter,
PerTensorScaleParameter,
)

Expand Down Expand Up @@ -100,6 +102,26 @@
layer.weight_block_size = None
layer.orig_dtype = params_dtype

online_quantization = True
if online_quantization:
weight = ModelWeightParameter(
data=torch.empty(
output_size_per_partition,
input_size_per_partition,
),
input_dim=1,
output_dim=0,
weight_loader=weight_loader,
device="meta", # (1) Initialize unquantized weight on meta device
weight_loader=online_quantize(

Check failure on line 116 in vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

View workflow job for this annotation

GitHub Actions / pre-commit

Unexpected keyword argument "name" for "online_quantize" [call-arg]

Check failure on line 116 in vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

View workflow job for this annotation

GitHub Actions / pre-commit

Unexpected keyword argument "name" for "online_quantize" [call-arg]

Check failure on line 116 in vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

View workflow job for this annotation

GitHub Actions / pre-commit

Unexpected keyword argument "name" for "online_quantize" [call-arg]
layer,
weight_loader,
name="weight",
wait_for_params=["weight"],
),

Check failure on line 121 in vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (invalid-syntax)

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py:116:17: invalid-syntax: Duplicate keyword argument "weight_loader"
)
return

if self.strategy == QuantizationStrategy.BLOCK:
assert self.weight_block_size is not None
layer.weight_block_size = self.weight_block_size
Expand Down Expand Up @@ -198,3 +220,26 @@
input_scale=layer.input_scale,
bias=bias,
)


def online_quantize(layer, weight_loader, name, wait_for_params):
    """Wrap ``weight_loader`` to FP8-quantize ``layer``'s weights after loading.

    The returned loader materializes the meta-device parameter on the GPU on
    first use, forwards each shard to the original ``weight_loader``, and —
    once every parameter named in ``wait_for_params`` is fully loaded —
    quantizes the weight in place.

    Args:
        layer: Module that owns the parameters being loaded.
        weight_loader: The original per-shard loader to delegate to.
        name: Attribute name of the parameter this wrapper loads.
            (Renamed from ``param_name``: the call site passes ``name=``,
            which mypy flagged as an unexpected keyword argument.)
        wait_for_params: Parameter names that must be fully loaded before
            quantization runs.

    Returns:
        A ``weight_loader``-compatible callable.
    """
    # Elements loaded so far per parameter, accumulated across shards.
    # (Annotation added: mypy var-annotated error.)
    numel_loaded: defaultdict[str, int] = defaultdict(int)

    def wrapped_weight_loader(param, loaded_weight, **kwargs):
        # (2) On first shard, allocate the unquantized buffer on the GPU.
        # Compare .device.type rather than device == "meta": device/str
        # equality is version-dependent in torch.
        if getattr(layer, name).device.type == "meta":
            # NOTE(review): .to() on a meta tensor allocates uninitialized
            # storage; confirm it preserves the vLLM parameter subclass
            # (register_parameter requires an nn.Parameter).
            layer.register_parameter(name, param.to(torch.cuda.current_device()))

        # (3) Load this shard into the unquantized GPU buffer.
        weight_loader(getattr(layer, name), loaded_weight, **kwargs)

        # Check whether all required weights and shards have been loaded.
        numel_loaded[name] += loaded_weight.numel()
        if all(
            numel_loaded[p] >= getattr(layer, p).numel() for p in wait_for_params
        ):
            # (4) Quantize and replace with the quantized parameters.
            # TODO(review): fp8_quantize is undefined in this module
            # (mypy name-defined) — import/implement the FP8 quant helper.
            layer.weight, layer.scale = fp8_quantize(layer.weight)

    return wrapped_weight_loader