From 2a2387ead0f374c07425d3ff850244b21b54c7fb Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 20:30:08 +0000 Subject: [PATCH 01/12] first end to end run with rowparallellinear in the fun format --- vllm/model_executor/layers/linear.py | 13 +- .../layers/quantization/smoothquant.py | 118 +++++++++++++++++- vllm/model_executor/models/llama.py | 19 +-- 3 files changed, 138 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f3d4d1789db2..c8236860ee1a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -525,8 +525,12 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype) + input_size_per_partition=self.input_size_per_partition, + output_size_per_partition=self.output_size, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + logical_widths=[self.output_size]) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -555,6 +559,11 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + + # TODO: canon + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index d9d82e6cfbc3..8b9fae111311 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -1,8 +1,9 @@ -from typing import Any, Dict, List, Tuple, 
Optional +from typing import Any, Dict, List, Tuple, Optional, Union import torch from torch._tensor import Tensor from torch.nn.parameter import Parameter +from torch.nn import ParameterList import threading from vllm._C import ops @@ -94,7 +95,6 @@ def __new__(cls, *args, **kwargs): def get_i8cugemm(self): return self.i8cugemm - class SQLinearMethod(LinearMethodBase): """Linear method for SmoothQuant. """ @@ -106,7 +106,8 @@ def __init__(self, gemm): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, - params_dtype: torch.dtype) -> Dict[str, Tensor]: + params_dtype: torch.dtype, + logical_widths=None) -> Dict[str, Tensor]: weight = Parameter( torch.empty( output_size_per_partition, @@ -173,6 +174,117 @@ def apply_weights(self, return y +class FunSQLinearMethod(LinearMethodBase): + """Linear method for SmoothQuant. + """ + + def __init__( + self, + per_token_quant, + quant_dtype, + dequant_dtype, + ): + self.per_token_quant = per_token_quant + self.quant_dtype = quant_dtype + self.dequant_dtype = dequant_dtype + self.i8cugemm = Int8GEMM().get_i8cugemm() + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + logical_widths: Optional[List[int]] = None, + ) -> Dict[str, Tensor]: + weight = Parameter( + torch.empty( + output_size_per_partition, + input_size_per_partition, + device="cuda", + dtype=torch.int8, + ), requires_grad=False, + ) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + }) + + dequant_scale = Parameter( + torch.tensor( + [1.0] * len(logical_widths), + dtype=torch.float32, + device='cpu' + ), requires_grad=False + ) + + return { + "weight": weight, + "dequant_scale": dequant_scale, + "logical_widths": logical_widths, + } + + + def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): + x_dq = torch.empty_like(x_q, 
dtype=self.dequant_dtype) + + # Split into shards. + x_q_split = x_q.split(logical_widths, dim=-1) + x_dq_split = x_dq.split(logical_widths, dim=-1) + + # Dequantize each shard. + for xq, weight_scale, activation_scale, xdq in zip( + x_q_split, weight_scales, activation_scales, x_dq_split): + ops.dequant(xdq, xq, activation_scale, weight_scale) + + # Return dequantized activation. + return x_dq + + + def _quantize(self, x, per_token_quant: bool): + x_q = torch.empty_like(x, dtype=self.quant_dtype) + + # Compute activation scale if per token. + if per_token_quant: + activation_scale = torch.empty( + x.numel() // x.shape[-1], + dtype=torch.float32, + device=x.device) + ops.quant(x_q, x, activation_scale) + # Set activation scale if per tensor. TODO: why 1.0? << static? + else: + activation_scale = None + ops.quant(x_q, x, 1.0) + + return x_q, activation_scale + + + def apply_weights(self, + weights: Dict[str, Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> Tensor: + assert bias is None + weight = weights["weight"] + dequant_scale = weights["dequant_scale"] + logical_widths = weights["logical_widths"] + + # Q + x_q, activation_scale = self._quantize(x, self.per_token_quant) + + # GEMM + x_q = x_q.view(-1, x_q.shape[-1]) + out_q = torch.empty( + (x_q.shape[0], weight.shape[0]), + dtype=torch.int32, device=x.device) + + self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) + out_q = out_q.view(*x_q.shape[:-1], -1) + + # DQ + return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) + + class SQLinearMethodQKV(SQLinearMethod): def __init__(self, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1fffbc5fa30c..fbe6c3c9e8e1 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -34,6 +34,7 @@ SQLinearMethod, SQLinearMethodQKV, SQLinearMethodOProj, + FunSQLinearMethod, SQLinearMethodGateUpProj, SQLinearMethodDownProj) from 
vllm.model_executor.layers.layernorm import RMSNorm @@ -93,10 +94,15 @@ def __init__( if self.use_int8: # override gate_up linear method assert isinstance(linear_method, SQLinearMethod) - down_proj_linear_method = SQLinearMethodDownProj( - gemm=Int8GEMM, + # down_proj_linear_method = SQLinearMethodDownProj( + # gemm=Int8GEMM, + # quant_dtype=torch.int8, + # dequant_dtype=torch.float) + down_proj_linear_method = FunSQLinearMethod( + per_token_quant=True, quant_dtype=torch.int8, dequant_dtype=torch.float) + self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, @@ -193,11 +199,10 @@ def __init__( if self.use_int8: # override o_proj linear method assert isinstance(linear_method, SQLinearMethod) - o_proj_linear_method = SQLinearMethodOProj( - gemm=Int8GEMM, - use_per_token_quant=True, # TODO (varun) : Read from config - quant_dtype = torch.int8, - dequant_dtype= torch.float) + o_proj_linear_method = FunSQLinearMethod( + per_token_quant=True, + quant_dtype=torch.int8, + dequant_dtype=torch.float) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, From 7373f4c5e07f08c58a9ac11aa6acaec31c996366 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 21:53:21 +0000 Subject: [PATCH 02/12] got qkvproj working with funsqlinearmethod --- vllm/model_executor/layers/linear.py | 39 ++++++++++++++++--- .../layers/quantization/smoothquant.py | 18 ++++++--- vllm/model_executor/models/llama.py | 38 +++++++++++++----- 3 files changed, 74 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c8236860ee1a..146e654181eb 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -150,6 +150,9 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. 
+ logical_widths: Optional list of widths for logical weight matrices. + E.g. for QKVParallelLinear, this parameter defines + the width """ def __init__( @@ -161,6 +164,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, + logical_widths: Optional[List[int]] = None, ): super().__init__() @@ -179,8 +183,13 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype) + input_size_per_partition=self.input_size, + output_size_per_partition=self.output_size_per_partition, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + logical_widths=logical_widths, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -257,8 +266,15 @@ def __init__( self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, linear_method) + super().__init__( + input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + linear_method=linear_method, + logical_widths=output_sizes) def weight_loader(self, param: Parameter, @@ -383,8 +399,19 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size - super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method) + super().__init__( + input_size=input_size, + output_size=output_size, + bias=bias, + gather_output=False, + skip_bias_add=skip_bias_add, + 
params_dtype=params_dtype, + linear_method=linear_method, + logical_widths = [ + self.num_heads * self.head_size, # q_proj + self.total_num_kv_heads * self.head_size, # k_proj + self.total_num_kv_heads * self.head_size, # v_proj + ]) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 8b9fae111311..0d28701ae0a1 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -104,7 +104,8 @@ def __init__(self, gemm): self.i8cugemm = i8_gemm.get_i8cugemm() def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, + output_size_per_partition: int, + input_size: int, output_size: int, params_dtype: torch.dtype, logical_widths=None) -> Dict[str, Tensor]: @@ -233,10 +234,16 @@ def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): x_q_split = x_q.split(logical_widths, dim=-1) x_dq_split = x_dq.split(logical_widths, dim=-1) - # Dequantize each shard. - for xq, weight_scale, activation_scale, xdq in zip( - x_q_split, weight_scales, activation_scales, x_dq_split): - ops.dequant(xdq, xq, activation_scale, weight_scale) + # If QuantType is Static per Tensor: + if activation_scales is None: + for xdq, xq, weight_scale in zip(x_dq_split, x_q_split, weight_scales): + ops.dequant(xdq, xq, weight_scale) + + # If QuantType is Dynamic per Token: + else: + for xdq, xq, weight_scale, activation_scale in zip( + x_dq_split, x_q_split, weight_scales, activation_scales): + ops.dequant(xdq, xq, activation_scale, weight_scale) # Return dequantized activation. 
return x_dq @@ -315,6 +322,7 @@ def dequantize(self, x_q, weights : Dict[str, Tensor]): q_scale, k_scale, v_scale = (weights['q_dequant_scale'], weights['k_dequant_scale'], weights['v_dequant_scale']) + ops.dequant(q, q_q, q_scale) ops.dequant(k, k_q, k_scale) ops.dequant(v, v_q, v_scale) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fbe6c3c9e8e1..7a4a51cac96c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -92,12 +92,7 @@ def __init__( down_proj_linear_method = linear_method if self.use_int8: - # override gate_up linear method assert isinstance(linear_method, SQLinearMethod) - # down_proj_linear_method = SQLinearMethodDownProj( - # gemm=Int8GEMM, - # quant_dtype=torch.int8, - # dequant_dtype=torch.float) down_proj_linear_method = FunSQLinearMethod( per_token_quant=True, quant_dtype=torch.int8, @@ -181,11 +176,17 @@ def __init__( if self.use_int8: # override qkv linear method assert isinstance(linear_method, SQLinearMethod) - qkv_linear_method = SQLinearMethodQKV( - gemm=Int8GEMM, - qkv_sizes=(self.q_size, self.kv_size, self.kv_size), + # qkv_linear_method = SQLinearMethodQKV( + # gemm=Int8GEMM, + # qkv_sizes=(self.q_size, self.kv_size, self.kv_size), + # quant_dtype=torch.int8, + # dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) + qkv_linear_method = FunSQLinearMethod( + per_token_quant=False, quant_dtype=torch.int8, dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) + + self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -483,11 +484,28 @@ def load_weights(self, for (param_name, weight_name, _) in stacked_params_mapping: if weight_name not in name: continue + + k_proj = "k_proj" in name + v_proj = "v_proj" in name + q_proj = "q_proj" in name name = name.replace(weight_name, param_name) prefix = weight_name.split('_')[0] suffix = name.split('.')[-1] - new_name = prefix + '_' + suffix - param = params_dict[name.replace(suffix, new_name)] + + if "qkv" in 
name: + if q_proj: + param = params_dict[name.replace(suffix, "dequant_scale")][0] + elif k_proj: + param = params_dict[name.replace(suffix, "dequant_scale")][1] + else: + assert v_proj + param = params_dict[name.replace(suffix, "dequant_scale")][2] + + else: + suffix = name.split('.')[-1] + new_name = prefix + '_' + suffix + param = params_dict[name.replace(suffix, new_name)] + param.copy_(loaded_weight) is_fusion_scale = True break From a3d4ee587416389549683d48c272f62cc3a9f131 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:04:33 +0000 Subject: [PATCH 03/12] converted all SQLinearMethods to use FunSQLinearMethod -- now working to remove changes to Llama.py --- .../layers/quantization/smoothquant.py | 177 +----------------- vllm/model_executor/models/llama.py | 44 ++--- 2 files changed, 17 insertions(+), 204 deletions(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 0d28701ae0a1..3358ef9ba4d2 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -95,6 +95,7 @@ def __new__(cls, *args, **kwargs): def get_i8cugemm(self): return self.i8cugemm + class SQLinearMethod(LinearMethodBase): """Linear method for SmoothQuant. 
""" @@ -290,179 +291,3 @@ def apply_weights(self, # DQ return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) - - -class SQLinearMethodQKV(SQLinearMethod): - - def __init__(self, - gemm, - qkv_sizes : Tuple[int, int, int], - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.qkv_sizes = qkv_sizes - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x): - assert x.dtype != self.quant_dtype - x_q = torch.empty_like(x, dtype=self.quant_dtype) - ops.quant(x_q, x, 1.0) - return x_q - - def dequantize(self, x_q, weights : Dict[str, Tensor]): - # split to get the quantized qkv - q_q, k_q, v_q = x_q.split(list(self.qkv_sizes), dim=-1) - - # create dequant qkv buffer and split to get the individual dequant qkv - # buffers - qkv = torch.empty_like(x_q, dtype=self.dequant_dtype) - q, k, v = qkv.split(list(self.qkv_sizes), dim=-1) - - q_scale, k_scale, v_scale = (weights['q_dequant_scale'], - weights['k_dequant_scale'], - weights['v_dequant_scale']) - - ops.dequant(q, q_q, q_scale) - ops.dequant(k, k_q, k_scale) - ops.dequant(v, v_q, v_scale) - - return qkv - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - x_q = self.quantize(x) - y_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(y_q, weights) - -class SQLinearMethodOProj(SQLinearMethod): - - def __init__(self, - gemm, - use_per_token_quant:bool, - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.use_per_token_quant = use_per_token_quant - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - # x is the paged-attention output - assert x.dtype != self.quant_dtype - act_scale = None - x_q = torch.empty_like(x, dtype=self.quant_dtype) - if 
self.use_per_token_quant: - act_scale = torch.empty(x.numel() // x.shape[-1], - dtype=torch.float32, - device=x.device) - ops.quant(x_q, x, act_scale) - else: - ops.quant(x_q, x, 1.0) - return x_q, act_scale - - def dequantize(self, x_q, weights : Dict[str, Tensor], act_scale : torch.Tensor) -> torch.Tensor: - o_dequant_scale = weights['dequant_scale'] - x = torch.empty_like( - x_q, - dtype=self.dequant_dtype, - device=x_q.device) - ops.dequant(x, x_q, act_scale, o_dequant_scale) - return x - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - pass - x_q, act_scale = self.quantize(x) - y_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(y_q, weights, act_scale) - -class SQLinearMethodGateUpProj(SQLinearMethod): - - def __init__(self, - gemm, - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x) -> torch.Tensor: - # x is the attention output - assert x.dtype != self.quant_dtype - x_q = torch.empty_like(x, dtype=self.quant_dtype, device=x.device) - ops.quant(x_q, x, 1.0) - return x_q - - def dequantize(self, gate_up_q: torch.Tensor, weights : Dict[str, Tensor]) -> torch.Tensor: - - def split_gate_up(gate_up : torch.Tensor): - d = gate_up.shape[-1] - return (torch.narrow(gate_up, 1, 0, d//2), - torch.narrow(gate_up, 1, d//2, d//2)) - - # create a dequant gate_up buffer and split it into constituent parts. - gate_up = torch.empty_like(gate_up_q, - dtype=self.dequant_dtype, - device=gate_up_q.device) - - # split quantized gate_up into constituent parts. - gate_q, up_q = split_gate_up(gate_up_q) - # split output gate_up buffer into constituent parts. 
- gate, up = split_gate_up(gate_up) - - gate_scale, up_scale = (weights['gate_dequant_scale'], - weights['up_dequant_scale']) - ops.dequant(gate, gate_q, gate_scale) - ops.dequant(up, up_q, up_scale) - - return gate_up - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - x_q = self.quantize(x) - gate_up_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(gate_up_q, weights) - -class SQLinearMethodDownProj(SQLinearMethod): - - def __init__(self, - gemm, - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x) -> Tuple[torch.Tensor, torch.Tensor]: - assert x.dtype != self.quant_dtype - # TODO (varun) : This is per-token quant - Read from config - x_q = torch.empty_like(x, dtype=self.quant_dtype) - scale = torch.empty(x.numel() // x.shape[-1], - dtype=torch.float32, - device=x.device) - ops.quant(x_q, x, scale) - return x_q, scale - - def dequantize(self, x_q, weights : Dict[str, Tensor], act_scale : torch.Tensor) -> torch.Tensor: - down_dequant_scale = weights['dequant_scale'] - x = torch.empty_like( - x_q, - dtype=self.dequant_dtype, - device=x_q.device) - ops.dequant(x, x_q, act_scale, down_dequant_scale) - return x - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - x_q, act_scale = self.quantize(x) - y_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(y_q, weights, act_scale) \ No newline at end of file diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 7a4a51cac96c..efb59c71afb7 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,13 +30,8 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import LoRAConfig 
from vllm.model_executor.layers.quantization.smoothquant import ( - Int8GEMM, SQLinearMethod, - SQLinearMethodQKV, - SQLinearMethodOProj, - FunSQLinearMethod, - SQLinearMethodGateUpProj, - SQLinearMethodDownProj) + FunSQLinearMethod,) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -78,8 +73,8 @@ def __init__( if self.use_int8: # override gate_up linear method assert isinstance(linear_method, SQLinearMethod) - gate_up_linear_method = SQLinearMethodGateUpProj( - gemm=Int8GEMM, + gate_up_linear_method = FunSQLinearMethod( + per_token_quant=False, quant_dtype=torch.int8, dequant_dtype=torch.float) self.gate_up_proj = MergedColumnParallelLinear( @@ -484,28 +479,21 @@ def load_weights(self, for (param_name, weight_name, _) in stacked_params_mapping: if weight_name not in name: continue - - k_proj = "k_proj" in name - v_proj = "v_proj" in name - q_proj = "q_proj" in name + if "q_proj" in name: + idx = 0 + elif "k_proj" in name: + idx = 1 + elif "v_proj" in name: + idx = 2 + elif "gate_proj" in name: + idx = 0 + else: + assert "up_proj" in name + idx = 1 + name = name.replace(weight_name, param_name) - prefix = weight_name.split('_')[0] suffix = name.split('.')[-1] - - if "qkv" in name: - if q_proj: - param = params_dict[name.replace(suffix, "dequant_scale")][0] - elif k_proj: - param = params_dict[name.replace(suffix, "dequant_scale")][1] - else: - assert v_proj - param = params_dict[name.replace(suffix, "dequant_scale")][2] - - else: - suffix = name.split('.')[-1] - new_name = prefix + '_' + suffix - param = params_dict[name.replace(suffix, new_name)] - + param = params_dict[name.replace(suffix, "dequant_scale")][idx] param.copy_(loaded_weight) is_fusion_scale = True break From 4bb02750be43685cd46b06690555e2bcd014022c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:30:12 +0000 Subject: [PATCH 04/12] stash --- 
vllm/model_executor/layers/linear.py | 4 +- .../layers/quantization/smoothquant.py | 104 ++---------------- vllm/model_executor/model_loader.py | 3 +- vllm/model_executor/models/llama.py | 82 ++------------ 4 files changed, 25 insertions(+), 168 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 146e654181eb..57095dff6bb0 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -189,6 +189,7 @@ def __init__( output_size=self.output_size, params_dtype=self.params_dtype, logical_widths=logical_widths, + per_token_quant=False, ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): @@ -557,7 +558,8 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - logical_widths=[self.output_size]) + logical_widths=[self.output_size], + per_token_quant=True,) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 3358ef9ba4d2..13b995921377 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -72,7 +72,7 @@ def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": return cls(weight_bits, quant_map) def get_linear_method(self) -> "SQLinearMethod": - return SQLinearMethod(Int8GEMM) + return SQLinearMethod() def get_scaled_act_names(self) -> List[str]: return [] @@ -95,100 +95,11 @@ def __new__(cls, *args, **kwargs): def get_i8cugemm(self): return self.i8cugemm - class SQLinearMethod(LinearMethodBase): - """Linear method for SmoothQuant. + """Linear method for AutoSmoothQuant. 
""" - def __init__(self, gemm): - i8_gemm = gemm() - self.i8cugemm = i8_gemm.get_i8cugemm() - - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - logical_widths=None) -> Dict[str, Tensor]: - weight = Parameter( - torch.empty( - output_size_per_partition, - input_size_per_partition, - device="cuda", - dtype=torch.int8, - ), - requires_grad=False, - ) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - }) - # q k v dequant_scales are used in QKVParallelLinear - q_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - k_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - v_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - # gate up dequant_scales are used in MergedColumnParallelLinear - gate_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - up_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - # dequant_scale is used in RowParallelLinear - dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - return { - "weight": weight, - "q_dequant_scale": q_dequant_scale, - "k_dequant_scale": k_dequant_scale, - "v_dequant_scale": v_dequant_scale, - "gate_dequant_scale": gate_dequant_scale, - "up_dequant_scale": up_dequant_scale, - "dequant_scale": dequant_scale - } - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - assert bias is None - weight = weights["weight"] - x_shape = x.shape - x = x.view(-1, x_shape[-1]) - y = torch.empty((x.shape[0], weight.shape[0]), - dtype=torch.int32, - device=x.device) - 
self.i8cugemm.linear_a8_w8_o32_(x, weight, y) - y = y.view(*x_shape[:-1], -1) - return y - - -class FunSQLinearMethod(LinearMethodBase): - """Linear method for SmoothQuant. - """ - - def __init__( - self, - per_token_quant, - quant_dtype, - dequant_dtype, - ): - self.per_token_quant = per_token_quant - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype + def __init__(self,): self.i8cugemm = Int8GEMM().get_i8cugemm() def create_weights( @@ -199,6 +110,7 @@ def create_weights( output_size: int, params_dtype: torch.dtype, logical_widths: Optional[List[int]] = None, + per_token_quant:bool = False, ) -> Dict[str, Tensor]: weight = Parameter( torch.empty( @@ -216,8 +128,8 @@ def create_weights( dequant_scale = Parameter( torch.tensor( [1.0] * len(logical_widths), - dtype=torch.float32, - device='cpu' + dtype=params_dtype, + device='cuda' ), requires_grad=False ) @@ -225,6 +137,7 @@ def create_weights( "weight": weight, "dequant_scale": dequant_scale, "logical_widths": logical_widths, + "per_token_quant": per_token_quant, } @@ -276,9 +189,10 @@ def apply_weights(self, weight = weights["weight"] dequant_scale = weights["dequant_scale"] logical_widths = weights["logical_widths"] + per_token_quant = weights["per_token_quant"] # Q - x_q, activation_scale = self._quantize(x, self.per_token_quant) + x_q, activation_scale = self._quantize(x, per_token_quant) # GEMM x_q = x_q.view(-1, x_q.shape[-1]) diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index b191dc4009b5..07fab1f1ed6f 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -83,8 +83,7 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig, # The weights will be initialized as empty tensors. 
with torch.device(device_config.device): if _is_support_smoothquant(model_config): - model = model_class(model_config.hf_config, linear_method, - quant_config) + model = model_class(model_config.hf_config, linear_method) elif hasattr(model_class, "supported_lora_modules"): model = model_class(model_config.hf_config, linear_method, lora_config) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index efb59c71afb7..d01d08ff6ff6 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -29,9 +29,6 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import LoRAConfig -from vllm.model_executor.layers.quantization.smoothquant import ( - SQLinearMethod, - FunSQLinearMethod,) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -62,42 +59,24 @@ def __init__( intermediate_size: int, hidden_act: str, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = hidden_size - self.use_int8 = quant_config is not None and quant_config.get_name( - ) == "smoothquant" - - gate_up_linear_method = linear_method - if self.use_int8: - # override gate_up linear method - assert isinstance(linear_method, SQLinearMethod) - gate_up_linear_method = FunSQLinearMethod( - per_token_quant=False, - quant_dtype=torch.int8, - dequant_dtype=torch.float) + self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, - linear_method=gate_up_linear_method) + linear_method=linear_method) + + self.down_proj = RowParallelLinear( + intermediate_size, hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - down_proj_linear_method = linear_method - if self.use_int8: - assert isinstance(linear_method, SQLinearMethod) - down_proj_linear_method = FunSQLinearMethod( - per_token_quant=True, - quant_dtype=torch.int8, - dequant_dtype=torch.float) - - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=down_proj_linear_method) - self.act_fn = SiluAndMul() def forward(self, x): @@ -145,10 +124,6 @@ def __init__( self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings - self.use_int8 = quant_config is not None and quant_config.get_name( - ) == "smoothquant" - # Needs to be ironed out!! - self.use_per_token_quant = self.use_int8 # This will be overwritten by model initialization if we are using it. # N.B. currently we only support per tensor scalar scaling factors @@ -167,44 +142,20 @@ def __init__( rope_scaling=rope_scaling, ) - qkv_linear_method = linear_method - if self.use_int8: - # override qkv linear method - assert isinstance(linear_method, SQLinearMethod) - # qkv_linear_method = SQLinearMethodQKV( - # gemm=Int8GEMM, - # qkv_sizes=(self.q_size, self.kv_size, self.kv_size), - # quant_dtype=torch.int8, - # dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) - qkv_linear_method = FunSQLinearMethod( - per_token_quant=False, - quant_dtype=torch.int8, - dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) - - self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, self.total_num_heads, self.total_num_kv_heads, bias=bias, - linear_method=qkv_linear_method, + linear_method=linear_method, ) - o_proj_linear_method = linear_method - if self.use_int8: - # override o_proj linear method - assert isinstance(linear_method, SQLinearMethod) - o_proj_linear_method = FunSQLinearMethod( - per_token_quant=True, - quant_dtype=torch.int8, - dequant_dtype=torch.float) - self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, 
hidden_size, bias=bias, - linear_method=o_proj_linear_method, + linear_method=linear_method, ) self.attn = Attention( @@ -235,12 +186,9 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size - self.use_int8 = quant_config is not None and quant_config.get_name( - ) == "smoothquant" self.tp_size = get_tensor_model_parallel_world_size() rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) @@ -256,7 +204,6 @@ def __init__( rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, linear_method=linear_method, - quant_config=quant_config, bias=getattr(config, "bias", False), sliding_window=sliding_window, ) @@ -265,7 +212,6 @@ def __init__( intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, linear_method=linear_method, - quant_config=quant_config, ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -309,7 +255,6 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() @@ -325,7 +270,7 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method, quant_config) + LlamaDecoderLayer(config, linear_method) for _ in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -391,15 +336,12 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config self.linear_method = linear_method - self.quant_config = quant_config - self.model = LlamaModel(config, 
linear_method, lora_config=lora_config, - quant_config = quant_config) + self.model = LlamaModel(config, linear_method, lora_config=lora_config) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size From fa35654b4a0ac4b5978ecf987d056c0970c79841 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:56:47 +0000 Subject: [PATCH 05/12] updated llama.py to minimze changes --- .../layers/quantization/smoothquant.py | 6 ++- vllm/model_executor/models/llama.py | 52 +++++++------------ 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 13b995921377..ec7344e39291 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -112,6 +112,8 @@ def create_weights( logical_widths: Optional[List[int]] = None, per_token_quant:bool = False, ) -> Dict[str, Tensor]: + self.output_dtype = params_dtype + weight = Parameter( torch.empty( output_size_per_partition, @@ -142,7 +144,7 @@ def create_weights( def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): - x_dq = torch.empty_like(x_q, dtype=self.dequant_dtype) + x_dq = torch.empty_like(x_q, dtype=self.output_dtype) # Split into shards. x_q_split = x_q.split(logical_widths, dim=-1) @@ -164,7 +166,7 @@ def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): def _quantize(self, x, per_token_quant: bool): - x_q = torch.empty_like(x, dtype=self.quant_dtype) + x_q = torch.empty_like(x, dtype=torch.int8) # Compute activation scale if per token. 
if per_token_quant: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d01d08ff6ff6..f0467007a170 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -29,14 +29,12 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import LoRAConfig -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) - -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler @@ -51,6 +49,7 @@ from vllm.sequence import SamplerOutput from vllm.utils import is_hip + class LlamaMLP(nn.Module): def __init__( @@ -61,8 +60,6 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() - self.hidden_size = hidden_size - self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, @@ -76,7 +73,6 @@ def __init__( if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - self.act_fn = SiluAndMul() def forward(self, x): @@ -96,28 +92,25 @@ def __init__( rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, bias: bool = False, sliding_window: Optional[int] = None, ) -> None: super().__init__() self.hidden_size = hidden_size - self.tp_size = get_tensor_model_parallel_world_size() + tp_size = get_tensor_model_parallel_world_size() self.total_num_heads = num_heads - assert self.total_num_heads % self.tp_size == 0 - self.num_heads = self.total_num_heads // self.tp_size + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size self.total_num_kv_heads = num_kv_heads - self.default_dtype = torch.get_default_dtype() - - if self.total_num_kv_heads >= self.tp_size: + if self.total_num_kv_heads >= tp_size: # Number of KV heads is greater than TP size, so we partition # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % self.tp_size == 0 + assert self.total_num_kv_heads % tp_size == 0 else: # Number of KV heads is less than TP size, so we replicate # the KV heads across multiple tensor parallel GPUs. 
- assert self.tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size) + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.head_dim = hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim @@ -134,14 +127,6 @@ def __init__( # scaling_factor = tensor_amax / FPtype_max self.kv_scale = 1.0 - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -150,7 +135,6 @@ def __init__( bias=bias, linear_method=linear_method, ) - self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, @@ -158,6 +142,14 @@ def __init__( linear_method=linear_method, ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention( self.num_heads, self.head_dim, @@ -189,7 +181,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - self.tp_size = get_tensor_model_parallel_world_size() rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", @@ -226,7 +217,6 @@ def forward( attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention if residual is None: residual = hidden_states @@ -234,7 +224,6 @@ def forward( else: hidden_states, residual = self.input_layernorm( hidden_states, residual) - hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -243,8 +232,8 @@ def forward( ) # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm(hidden_states, - residual) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -391,8 +380,6 @@ def load_weights(self, load_format: str = "auto", revision: Optional[str] = None): # For SmoothQuant - int8_fusion = self.quant_config is not None and \ - self.quant_config.get_name() == "smoothquant" stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -415,6 +402,7 @@ def load_weights(self, if "bias" in name: continue # load dequant scale for qkv_proj and gate_up_proj + int8_fusion = True if int8_fusion: is_fusion_scale = False if "scale" in name: From 53f19125a8707e998e69bc54bc93fcdef90c71bd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:58:14 +0000 Subject: [PATCH 06/12] updated llama.py to minimze changes --- vllm/model_executor/models/llama.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f0467007a170..9e6cee4b4d66 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -64,11 +64,10 @@ def __init__( hidden_size, [intermediate_size] * 2, bias=False, linear_method=linear_method) - - self.down_proj = RowParallelLinear( - intermediate_size, hidden_size, - bias=False, - linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" @@ -81,6 +80,7 @@ def forward(self, x): x, _ = self.down_proj(x) return x + class LlamaAttention(nn.Module): def __init__( @@ -149,13 +149,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, @@ -172,6 +170,7 @@ def forward( output, _ = self.o_proj(attn_output) return output + class LlamaDecoderLayer(nn.Module): def __init__( From f62ff3a59c4b27d7b282fe2175585dfc225bb28e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 23:00:20 +0000 Subject: [PATCH 07/12] minimize changes to llama.py --- vllm/model_executor/models/llama.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 9e6cee4b4d66..1831fbb31b4a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -68,7 +68,6 @@ def __init__( hidden_size, bias=False, linear_method=linear_method) - if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -378,7 +377,6 @@ def load_weights(self, cache_dir: Optional[str] = None, load_format: str = "auto", revision: Optional[str] = None): - # For SmoothQuant stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -397,9 +395,6 @@ def load_weights(self, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
continue - # bias is useless for llama - if "bias" in name: - continue # load dequant scale for qkv_proj and gate_up_proj int8_fusion = True if int8_fusion: From 888135fc57b262e39a91e3467be2d549f701a936 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 23:01:30 +0000 Subject: [PATCH 08/12] minimize changes to llama.py --- vllm/model_executor/layers/quantization/smoothquant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index ec7344e39291..e658cb4eaff6 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -110,7 +110,7 @@ def create_weights( output_size: int, params_dtype: torch.dtype, logical_widths: Optional[List[int]] = None, - per_token_quant:bool = False, + per_token_quant: bool = False, ) -> Dict[str, Tensor]: self.output_dtype = params_dtype From d5d223b91cc16afd58e6091962c011331344d0ba Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 23:09:37 +0000 Subject: [PATCH 09/12] tweak --- .../layers/quantization/smoothquant.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index e658cb4eaff6..09f112132d95 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -182,6 +182,15 @@ def _quantize(self, x, per_token_quant: bool): return x_q, activation_scale + def _gemm(self, x_q, weight): + x_q = x_q.view(-1, x_q.shape[-1]) + out_q = torch.empty( + (x_q.shape[0], weight.shape[0]), + dtype=torch.int32, device="cuda") + + self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) + return out_q.view(*x_q.shape[:-1], -1) + def apply_weights(self, weights: Dict[str, Tensor], @@ -195,15 +204,5 @@ def 
apply_weights(self, # Q x_q, activation_scale = self._quantize(x, per_token_quant) - - # GEMM - x_q = x_q.view(-1, x_q.shape[-1]) - out_q = torch.empty( - (x_q.shape[0], weight.shape[0]), - dtype=torch.int32, device=x.device) - - self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) - out_q = out_q.view(*x_q.shape[:-1], -1) - - # DQ - return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) + out_q = self._gemm(x_q, weight) + return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) From 10551e49ca95fd21b27c5842ca85097b94756105 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 13 Apr 2024 00:24:36 +0000 Subject: [PATCH 10/12] updated llama weight loading --- vllm/model_executor/layers/linear.py | 53 +++++++++++++++++-- .../layers/quantization/smoothquant.py | 36 ++++++++++++- vllm/model_executor/models/llama.py | 31 ++--------- 3 files changed, 87 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 57095dff6bb0..f9332c05cfe6 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -43,6 +43,12 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: """Apply the weights to the input tensor.""" raise NotImplementedError + + def maybe_update_loaded_weight_name(self, name: str) -> str: + """Update the name of a loaded weight to enable generic handling of + cases where serialized state_dict does not match vllm model definition. 
+ """ + return name class UnquantizedLinearMethod(LinearMethodBase): @@ -283,6 +289,18 @@ def weight_loader(self, loaded_shard_id: Optional[int] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) + param_shard_splitter = getattr(param, "shard_splitter", None) + if output_dim is not None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support output_dim != None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + if loaded_shard_id is None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support loaded_shard_id == None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: @@ -335,6 +353,10 @@ def weight_loader(self, start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + # If a param_shard_splitter is defined by the LinearMethod, use it. + elif param_shard_splitter is not None: + param_data, loaded_weight = param_shard_splitter( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -342,6 +364,7 @@ def weight_loader(self, "Loading a weight without `output_dim` attribute in " "MergedColumnParallelLinear, assume the weight is " "the same for all partitions.") + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -420,6 +443,18 @@ def weight_loader(self, loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) + param_shard_splitter = getattr(param, "shard_splitter", None) + + if output_dim is not None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support output_dim != None and " + "shard_splitter != None for a parameter. Please open an issue." 
+ ) + if loaded_shard_id is None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support loaded_shard_id == None and " + "shard_splitter != None for a parameter. Please open an issue." + ) if loaded_shard_id is None: # Loaded weight is already packed. @@ -455,6 +490,8 @@ def weight_loader(self, tp_rank = get_tensor_model_parallel_rank() assert loaded_shard_id in ["q", "k", "v"] + + # If output dim is defined, use the default loading process. if output_dim is not None: if loaded_shard_id == "q": shard_offset = 0 @@ -478,15 +515,19 @@ def weight_loader(self, shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) + param_data = param_data.narrow( + output_dim, shard_offset, shard_size) if loaded_shard_id == "q": shard_id = tp_rank else: shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + shard_size) + # If a param_shard_splitter is defined by the LinearMethod, use it. 
+ elif param_shard_splitter is not None: + param_data, loaded_weight = param_shard_splitter( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -494,7 +535,11 @@ def weight_loader(self, "Loading a weight without `output_dim` attribute in " "QKVParallelLinear, assume the weight is the same " "for all partitions.") - assert param_data.shape == loaded_weight.shape + + assert ( + param_data.shape == loaded_weight.shape or + (len(param_data.shape) == 0 and len(loaded_weight.shape) == 0) + ) param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 09f112132d95..9aaa9db45f57 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -102,6 +102,15 @@ class SQLinearMethod(LinearMethodBase): def __init__(self,): self.i8cugemm = Int8GEMM().get_i8cugemm() + + def maybe_update_loaded_weight_name(self, name: str) -> str: + # Convert prefix.k_dequant_scale >> prefix.dequant_scale. 
+ if "dequant_scale" in name: + suffix = name.split('.')[-1] + name.replace(suffix, "dequant_scale") + return name + + def create_weights( self, input_size_per_partition: int, @@ -134,6 +143,9 @@ def create_weights( device='cuda' ), requires_grad=False ) + set_weight_attrs(dequant_scale, { + "shard_splitter": self.shard_splitter_scales, + }) return { "weight": weight, @@ -142,6 +154,28 @@ def create_weights( "per_token_quant": per_token_quant, } + @staticmethod + def shard_splitter_scales( + param: Tensor, + loaded_weight: Tensor, + shard_id: Union[str, int] + ) -> Tuple[Tensor, Tensor]: + index_in_param = None + if shard_id == "q": + index_in_param = 0 + elif shard_id == "k": + index_in_param = 1 + elif shard_id == "v": + index_in_param = 2 + elif type(shard_id) == int: + index_in_param = shard_id + else: + raise ValueError( + f"shard_id must be 'q', 'k', 'v' or an int, but got {shard_id}" + ) + + return param[index_in_param], loaded_weight + def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): x_dq = torch.empty_like(x_q, dtype=self.output_dtype) @@ -202,7 +236,7 @@ def apply_weights(self, logical_widths = weights["logical_widths"] per_token_quant = weights["per_token_quant"] - # Q + # Q --> GEMM --> DQ x_q, activation_scale = self._quantize(x, per_token_quant) out_q = self._gemm(x_q, weight) return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1831fbb31b4a..868394c76109 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -388,6 +388,9 @@ def load_weights(self, params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): + # Update name of the loaded_weight if needed by the LinearMethod. 
+ name = self.linear_method.maybe_update_loaded_weight_name(name) + if "rotary_emb.inv_freq" in name: continue if ("rotary_emb.cos_cached" in name @@ -395,34 +398,6 @@ def load_weights(self, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - # load dequant scale for qkv_proj and gate_up_proj - int8_fusion = True - if int8_fusion: - is_fusion_scale = False - if "scale" in name: - for (param_name, weight_name, _) in stacked_params_mapping: - if weight_name not in name: - continue - if "q_proj" in name: - idx = 0 - elif "k_proj" in name: - idx = 1 - elif "v_proj" in name: - idx = 2 - elif "gate_proj" in name: - idx = 0 - else: - assert "up_proj" in name - idx = 1 - - name = name.replace(weight_name, param_name) - suffix = name.split('.')[-1] - param = params_dict[name.replace(suffix, "dequant_scale")][idx] - param.copy_(loaded_weight) - is_fusion_scale = True - break - if is_fusion_scale: - continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From a0480f7b7d0819e543f799360dbd214cf684a67b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 13 Apr 2024 02:14:37 +0000 Subject: [PATCH 11/12] added TODO --- examples/offline_inference.py | 2 +- vllm/model_executor/layers/linear.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 9b758fa2479f..6b548d5e8921 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -11,7 +11,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. -llm = LLM(model="facebook/opt-125m") +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f9332c05cfe6..8f231e0b021c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -65,7 +65,7 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + params_dtype: torch.dtype, logical_widths: Optional[List[int]]) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -195,6 +195,7 @@ def __init__( output_size=self.output_size, params_dtype=self.params_dtype, logical_widths=logical_widths, + # TODO: remove this, should be coming through the quant config. per_token_quant=False, ) for name, weight in self.linear_weights.items(): @@ -604,7 +605,9 @@ def __init__( output_size=self.output_size, params_dtype=self.params_dtype, logical_widths=[self.output_size], - per_token_quant=True,) + # TODO: remove this, should be coming through the quant config. + per_token_quant=True, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) From 1afab71335248dfe23aac8e0d32640b5012d9309 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Tue, 16 Apr 2024 13:32:26 -0400 Subject: [PATCH 12/12] [2/N] Rs/vllm quantization - Refactor refactor to support non-uniform via config (#188) Refactored to support nonuniform quantization by adding a new layer of Abstraction. Now, `SmoothQuantLinearMethod` can hold a `SmoothQuantFormat`, which implements the details of how to do quant and dequant operations. 
There are two `SmoothQuantFormat` classes: - `SmoothQuantDynamicPerToken` - `SmoothQuantStaticPerTensor` We have the following lifecycle: - `LinearMethod` is created during `get_model`, has access to `QuantizationConfig` - `Layer` is initialized and passed a `LinearMethod` - `Layer` calls `LinearMethod.create_weights`, which creates a dictionary of weights and metadata - `Layer` calls `LinearMethod.apply_weights` during inference, passing the dictionary created during `create_weights` This PR modifies the `LinearMethod.create_weights` API to receive a `layer_name` as argument. The `LinearMethod` then looks in the `config` to determine which `SmoothQuantFormat` to use for the layer with `layer_name` - As a result, the `LinearMethod` is responsible for parsing the config from disk and making decisions about what the inference format should look like. In this specific case, since the `SmoothQuantConfig` is not very good, we just match on the suffix `qkv` to determine what each layer should use --> but for SparseMLConfig, we could use a similar structure In this PR, the `SmoothQuantFormat` is passed in the dictionary returned by `create_weights` and then is used by `apply_weights` ### In Summary I think this is a good overall structure because it: - (a) allows us to make minimal changes to the existing models - (b) allows us to make no changes to the model loading lifecycle (i.e. 
config / constructor / linear method) ** critically requires having one LinearMethod that propagates through the whole model - (c) encapsulates the nonuniform logic into the `LinearMethod`, allowing us to have a clean interface into ### For SparseML Models We could imagine the following architecture: #### Config Config is responsible for: - loading config from disk - mapping layer_names --> `SparseMLFormat` ```python class SparseMLConfig def from_dict() def get_layer_format(layer_name): return SparseMLFormat ``` #### LinearMethod Config is responsible for: - interface between layers and kernels (so LinearMethod is what is used by the model) ```python class SparseMLLinearMethod: def __init__(self, sparseml_config) self.sparseml_config = sparseml_config def create_weights(layer_name, ...): # this, e.g. is where nonuniform might be supported format = self.sparseml_config.get_layer_format(layer_name) weights = format.get_weights() weights["format"] = format return weights # wrapper around the SparseML format def apply_weights(x, weights, ...) format = weights["format"] weights = weights["weights"] return format.apply_weights(x, weights) ``` #### SparseMLFormat Format is responsible for: - actual weight creation and forward ```python class SparseMLLinearMethod: def __init__(self, sparseml_config) self.sparseml_config = sparseml_config def get_weights(sizes): # returns dictionary , e.g. return { "weights": x "scales": y } def apply_weights(weights, x): # calls cuda kernel return output ``` Sample Formats: - `W8A8DynamicPerToken` - `SparseW8A8StaticPerTensorAsymmetric` - `W4A8DynamicPerToken` - ... 
--- examples/offline_quantized_inference.py | 2 +- examples/simple_test.py | 35 ++ vllm/config.py | 4 +- vllm/model_executor/layers/linear.py | 76 +++-- .../model_executor/layers/quantization/awq.py | 15 +- .../layers/quantization/base_config.py | 6 +- .../layers/quantization/gptq.py | 7 +- .../layers/quantization/marlin.py | 6 +- .../layers/quantization/smoothquant.py | 242 -------------- .../quantization/smoothquant/__init__.py | 14 + .../layers/quantization/smoothquant/config.py | 306 ++++++++++++++++++ .../quantization/smoothquant/formats.py | 100 ++++++ .../layers/quantization/squeezellm.py | 15 +- vllm/model_executor/model_loader.py | 8 +- vllm/model_executor/models/llama.py | 45 ++- 15 files changed, 571 insertions(+), 310 deletions(-) create mode 100644 examples/simple_test.py delete mode 100644 vllm/model_executor/layers/quantization/smoothquant.py create mode 100644 vllm/model_executor/layers/quantization/smoothquant/__init__.py create mode 100644 vllm/model_executor/layers/quantization/smoothquant/config.py create mode 100644 vllm/model_executor/layers/quantization/smoothquant/formats.py diff --git a/examples/offline_quantized_inference.py b/examples/offline_quantized_inference.py index 124a99468704..8b3dbea72ae6 100644 --- a/examples/offline_quantized_inference.py +++ b/examples/offline_quantized_inference.py @@ -17,7 +17,7 @@ # Create an LLM. 
llm = LLM( - model=model_path, + model="nm-testing/Nous-Hermes-Llama2-13b-smoothquant", gpu_memory_utilization=0.9, max_model_len=2048, quantization="smoothquant", diff --git a/examples/simple_test.py b/examples/simple_test.py new file mode 100644 index 000000000000..dcf8b8c7ed1e --- /dev/null +++ b/examples/simple_test.py @@ -0,0 +1,35 @@ +import argparse +from vllm import LLM, SamplingParams + +MODELS = { + "tinyllama-fp16": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "tinyllama-marlin": "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", + "tinyllama-gptq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", + "tinyllama-awq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ", +} + +parser = argparse.ArgumentParser() +parser.add_argument("--model", type=str) +parser.add_argument("--tensor-parallel-size", type=int, default=1) +args = parser.parse_args() + +if args.model not in MODELS: + print(f"Got model id of {args.model}; Must be in {list(MODELS.keys())}") + raise ValueError +else: + model_id = MODELS[args.model] + print(f"Using model_id = {model_id}") + +messages=[{ + "role": "system", + "content": "You are a helpful assistant." +}, { + "role": "user", + "content": "What is deep learning?" 
+}] + +model = LLM(model_id, enforce_eager=True, max_model_len=2048, tensor_parallel_size=args.tensor_parallel_size, dtype="float16") +prompt = model.llm_engine.tokenizer.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +out = model.generate(prompt, SamplingParams(max_tokens=50)) +print(f"\n-----prompt\n{prompt}") +print(f"\n-----generation\n{out[0].outputs[0].text}") diff --git a/vllm/config.py b/vllm/config.py index 3149aaf68914..cd48fe4f1b9d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -173,8 +173,8 @@ def _verify_tokenizer_mode(self) -> None: self.tokenizer_mode = tokenizer_mode def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm", "smoothquant"] - rocm_not_supported_quantization = ["awq", "marlin"] + supported_quantization = ["awq", "gptq", "marlin", "squeezellm", "smoothquant"] + rocm_not_supported_quantization = ["awq", "marlin", "smoothquant"] if self.quantization is not None: self.quantization = self.quantization.lower() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8f231e0b021c..2598156bbed3 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -29,8 +29,11 @@ class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @abstractmethod - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, + def create_weights(self, + layer_name: str, + input_size_per_partition: int, + output_sizes_per_partition: List[int], + input_size: int, output_size: int, params_dtype: torch.dtype) -> Dict[str, Any]: """Create weights for a linear layer.""" @@ -62,17 +65,20 @@ class UnquantizedLinearMethod(LinearMethodBase): def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, 
- output_size: int, - params_dtype: torch.dtype, logical_widths: Optional[List[int]]) -> Dict[str, Any]: - weight = Parameter(torch.empty(output_size_per_partition, + def create_weights(self, + layer_name: str, + input_size_per_partition: int, + output_sizes_per_partition: List[int], + input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + weight = Parameter(torch.empty(sum(output_sizes_per_partition), input_size_per_partition, dtype=params_dtype), requires_grad=False) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) return {"weight": weight} + def apply_weights(self, weights: Dict[str, torch.Tensor], x: torch.Tensor, @@ -89,6 +95,7 @@ class ReplicatedLinear(torch.nn.Module): """Replicated linear layer. Args: + layer_name: name of the layer in the state dict. input_size: input dimension of the linear layer. output_size: output dimension of the linear layer. bias: If true, add bias. @@ -99,6 +106,7 @@ class ReplicatedLinear(torch.nn.Module): def __init__( self, + layer_name: str, input_size: int, output_size: int, bias: bool = True, @@ -109,6 +117,7 @@ def __init__( super().__init__() # Keep input parameters + self.layer_name = layer_name self.input_size = input_size self.output_size = output_size self.skip_bias_add = skip_bias_add @@ -119,8 +128,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.layer_name, self.input_size, [self.output_size], + self.input_size, self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -145,6 +154,7 @@ class ColumnParallelLinear(torch.nn.Module): its second dimension as A = [A_1, ..., A_p]. Args: + layer_name: name of the layer in the state dict. input_size: first dimension of matrix A. 
output_size: second dimension of matrix A. bias: If true, add bias. @@ -163,6 +173,7 @@ class ColumnParallelLinear(torch.nn.Module): def __init__( self, + layer_name: str, input_size: int, output_size: int, bias: bool = True, @@ -170,17 +181,24 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, - logical_widths: Optional[List[int]] = None, ): super().__init__() # Keep input parameters + self.layer_name = layer_name self.input_size = input_size self.output_size = output_size self.gather_output = gather_output # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() - self.output_size_per_partition = divide(output_size, tp_size) + self.output_size_per_partition = divide(self.output_size, tp_size) + self.output_sizes_per_partition = [self.output_size_per_partition] + # If QKV or MergedColumn, use output size of each partition. + if self.output_sizes is not None: + self.output_sizes_per_partition = [ + divide(output_size, tp_size) for output_size in self.output_sizes + ] + self.skip_bias_add = skip_bias_add if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -189,14 +207,12 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( + layer_name=self.layer_name, input_size_per_partition=self.input_size, - output_size_per_partition=self.output_size_per_partition, + output_sizes_per_partition=self.output_sizes_per_partition, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - logical_widths=logical_widths, - # TODO: remove this, should be coming through the quant config. 
- per_token_quant=False, ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): @@ -263,6 +279,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): def __init__( self, + layer_name: str, input_size: int, output_sizes: List[int], bias: bool = True, @@ -275,14 +292,14 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) super().__init__( + layer_name=layer_name, input_size=input_size, output_size=sum(output_sizes), bias=bias, gather_output=gather_output, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - linear_method=linear_method, - logical_widths=output_sizes) + linear_method=linear_method) def weight_loader(self, param: Parameter, @@ -381,6 +398,7 @@ class QKVParallelLinear(ColumnParallelLinear): be replicated while the query heads are partitioned. Args: + layer_name: name of the layer in the state dict. hidden_size: input hidden state size of the transformer. head_size: size of each attention head. total_num_heads: total number of attention query heads. 
@@ -396,6 +414,7 @@ class QKVParallelLinear(ColumnParallelLinear): def __init__( self, + layer_name: str, hidden_size: int, head_size: int, total_num_heads: int, @@ -424,19 +443,21 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + self.output_sizes = [ + self.num_heads * self.head_size * tp_size, # q_proj + self.num_kv_heads * self.head_size * tp_size, # k_proj + self.num_kv_heads * self.head_size * tp_size, # v_proj + ] + super().__init__( + layer_name=layer_name, input_size=input_size, output_size=output_size, bias=bias, gather_output=False, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - linear_method=linear_method, - logical_widths = [ - self.num_heads * self.head_size, # q_proj - self.total_num_kv_heads * self.head_size, # k_proj - self.total_num_kv_heads * self.head_size, # v_proj - ]) + linear_method=linear_method) def weight_loader(self, param: Parameter, @@ -557,6 +578,7 @@ class RowParallelLinear(torch.nn.Module): | A_p | - - Arguments: + layer_name: name of the layer in the state dict. input_size: first dimension of matrix A. output_size: second dimension of matrix A. bias: If true, add bias. Note that bias is not parallelized. 
@@ -572,6 +594,7 @@ class RowParallelLinear(torch.nn.Module): def __init__( self, + layer_name: str, input_size: int, output_size: int, bias: bool = True, @@ -583,6 +606,7 @@ def __init__( ): super().__init__() # Keep input parameters + self.layer_name = layer_name self.input_size = input_size self.output_size = output_size self.input_is_parallel = input_is_parallel @@ -599,14 +623,12 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( + layer_name=self.layer_name, input_size_per_partition=self.input_size_per_partition, - output_size_per_partition=self.output_size, + output_sizes_per_partition=[self.output_size], input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - logical_widths=[self.output_size], - # TODO: remove this, should be coming through the quant config. - per_token_quant=True, ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 2caef5f1ebf5..7cf94ae9f44e 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -79,10 +79,17 @@ class AWQLinearMethod(LinearMethodBase): def __init__(self, quant_config: AWQConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + def create_weights( + self, + layer_name: str, + input_size_per_partition: int, + output_sizes_per_partition: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + del layer_name, input_size, output_size # Unused. 
+ output_size_per_partition = sum(output_sizes_per_partition) + if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 6115e7c3be95..868e09252bb2 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -51,14 +51,12 @@ def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: "quantization config.") @abstractmethod - def get_linear_method(self) -> LinearMethodBase: + def get_linear_method(self, name) -> LinearMethodBase: """Get the linear method to use for the quantized linear layer.""" raise NotImplementedError - @abstractmethod def get_scaled_act_names(self) -> List[str]: """Returns the activation function names that should be post-scaled. - For now, this is only used by AWQ. """ - raise NotImplementedError + return [] diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 53baf710ed81..8c3492ae67d8 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -89,13 +89,16 @@ def __init__(self, quant_config: GPTQConfig): def create_weights( self, + layer_name: str, input_size_per_partition: int, - output_size_per_partition: int, + output_sizes_per_partition: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: - del output_size # Unused. + del output_size, layer_name # Unused. 
+ output_size_per_partition = sum(output_sizes_per_partition) + if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 784229878edf..59d217567919 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -91,13 +91,15 @@ def __init__(self, quant_config: MarlinConfig): def create_weights( self, + layer_name: str, input_size_per_partition: int, - output_size_per_partition: int, + output_sizes_per_partition: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: - del output_size # Unused. + del layer_name, input_size, output_size # Unused. + output_size_per_partition = sum(output_sizes_per_partition) if params_dtype != torch.float16: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py deleted file mode 100644 index 9aaa9db45f57..000000000000 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ /dev/null @@ -1,242 +0,0 @@ -from typing import Any, Dict, List, Tuple, Optional, Union - -import torch -from torch._tensor import Tensor -from torch.nn.parameter import Parameter -from torch.nn import ParameterList -import threading - -from vllm._C import ops -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig - - -class SmoothQuantConfig(QuantizationConfig): - """Config class for SmoothQuant - - Reference: https://github.com/mit-han-lab/smoothquant - """ - - def __init__(self, - weight_bits: int = 8, - quant_map: dict[str:str] = None) -> None: - self.weight_bits = weight_bits - self.quant_map = quant_map - - if self.weight_bits != 8: - raise ValueError( - 
"Currently, only w8a8 quantization is supported for " - f"SmoothQuant, but got {self.weight_bits} bits.") - if self.quant_map is None or self.quant_map == {}: - raise ValueError( - 'Quant_map for SmoothQuant should not be None or an empty dict. ' - 'For example, when using llama, you should set a quant_config.json in model directory, like ' - '{ "qkv": "per-tensor", "out": "per-token", "fc1": "per-tensor", "fc2": "per-token" }' - ) - - def __repr__(self) -> str: - return (f"SmoothQuantConfig(weight_bits={self.weight_bits}, " - f"quant_map={self.quant_map})") - - def get_name(self) -> str: - return "smoothquant" - - def get_supported_act_dtypes(self) -> List[torch.dtype]: - return [torch.half, torch.float] - - def get_min_capability(self) -> int: - # The smoothquant kernel only supports Ampere or newer GPUs. - return 80 - - @classmethod - def get_config_filenames(cls) -> List[str]: - """List of filenames to search for in the model directory.""" - return [ - "quant_config.json", - "quantize_config.json", - ] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": - try: - weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) - except ValueError as e: - weight_bits = 8 - print(str(e) + " Set weight_bits = 8 by default.") - - quant_map = {} - for key, value in config.items(): - if value in ["per-tensor", "per-token"]: - quant_map[key] = value - return cls(weight_bits, quant_map) - - def get_linear_method(self) -> "SQLinearMethod": - return SQLinearMethod() - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class Int8GEMM(object): - _instance_lock = threading.Lock() - - def __init__(self): - if not hasattr(self, "i8cugemm"): - self.i8cugemm = ops.I8CUGEMM() - - def __new__(cls, *args, **kwargs): - if not hasattr(Int8GEMM, "_instance"): - with Int8GEMM._instance_lock: - if not hasattr(Int8GEMM, "_instance"): - Int8GEMM._instance = object.__new__(cls) - return Int8GEMM._instance - - def get_i8cugemm(self): - return 
self.i8cugemm - -class SQLinearMethod(LinearMethodBase): - """Linear method for AutoSmoothQuant. - """ - - def __init__(self,): - self.i8cugemm = Int8GEMM().get_i8cugemm() - - - def maybe_update_loaded_weight_name(self, name: str) -> str: - # Convert prefix.k_dequant_scale >> prefix.dequant_scale. - if "dequant_scale" in name: - suffix = name.split('.')[-1] - name.replace(suffix, "dequant_scale") - return name - - - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - logical_widths: Optional[List[int]] = None, - per_token_quant: bool = False, - ) -> Dict[str, Tensor]: - self.output_dtype = params_dtype - - weight = Parameter( - torch.empty( - output_size_per_partition, - input_size_per_partition, - device="cuda", - dtype=torch.int8, - ), requires_grad=False, - ) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - }) - - dequant_scale = Parameter( - torch.tensor( - [1.0] * len(logical_widths), - dtype=params_dtype, - device='cuda' - ), requires_grad=False - ) - set_weight_attrs(dequant_scale, { - "shard_splitter": self.shard_splitter_scales, - }) - - return { - "weight": weight, - "dequant_scale": dequant_scale, - "logical_widths": logical_widths, - "per_token_quant": per_token_quant, - } - - @staticmethod - def shard_splitter_scales( - param: Tensor, - loaded_weight: Tensor, - shard_id: Union[str, int] - ) -> Tuple[Tensor, Tensor]: - index_in_param = None - if shard_id == "q": - index_in_param = 0 - elif shard_id == "k": - index_in_param = 1 - elif shard_id == "v": - index_in_param = 2 - elif type(shard_id) == int: - index_in_param = shard_id - else: - raise ValueError( - f"shard_id must be 'q', 'k', 'v' or an int, but got {shard_id}" - ) - - return param[index_in_param], loaded_weight - - - def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): - x_dq = torch.empty_like(x_q, dtype=self.output_dtype) - - # Split 
into shards. - x_q_split = x_q.split(logical_widths, dim=-1) - x_dq_split = x_dq.split(logical_widths, dim=-1) - - # If QuantType is Static per Tensor: - if activation_scales is None: - for xdq, xq, weight_scale in zip(x_dq_split, x_q_split, weight_scales): - ops.dequant(xdq, xq, weight_scale) - - # If QuantType is Dynamic per Token: - else: - for xdq, xq, weight_scale, activation_scale in zip( - x_dq_split, x_q_split, weight_scales, activation_scales): - ops.dequant(xdq, xq, activation_scale, weight_scale) - - # Return dequantized activation. - return x_dq - - - def _quantize(self, x, per_token_quant: bool): - x_q = torch.empty_like(x, dtype=torch.int8) - - # Compute activation scale if per token. - if per_token_quant: - activation_scale = torch.empty( - x.numel() // x.shape[-1], - dtype=torch.float32, - device=x.device) - ops.quant(x_q, x, activation_scale) - # Set activation scale if per tensor. TODO: why 1.0? << static? - else: - activation_scale = None - ops.quant(x_q, x, 1.0) - - return x_q, activation_scale - - def _gemm(self, x_q, weight): - x_q = x_q.view(-1, x_q.shape[-1]) - out_q = torch.empty( - (x_q.shape[0], weight.shape[0]), - dtype=torch.int32, device="cuda") - - self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) - return out_q.view(*x_q.shape[:-1], -1) - - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - assert bias is None - weight = weights["weight"] - dequant_scale = weights["dequant_scale"] - logical_widths = weights["logical_widths"] - per_token_quant = weights["per_token_quant"] - - # Q --> GEMM --> DQ - x_q, activation_scale = self._quantize(x, per_token_quant) - out_q = self._gemm(x_q, weight) - return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) diff --git a/vllm/model_executor/layers/quantization/smoothquant/__init__.py b/vllm/model_executor/layers/quantization/smoothquant/__init__.py new file mode 100644 index 
000000000000..2f62cee49d95 --- /dev/null +++ b/vllm/model_executor/layers/quantization/smoothquant/__init__.py @@ -0,0 +1,14 @@ +from vllm.model_executor.layers.quantization.smoothquant.formats import ( + SmoothQuantFormat +) + +from vllm.model_executor.layers.quantization.smoothquant.config import ( + SmoothQuantConfig, + SmoothQuantLinearMethod +) + +__all__ = [ + "SmoothQuantFormat", + "SmoothQuantConfig", + "SmoothQuantLinearMethod", +] diff --git a/vllm/model_executor/layers/quantization/smoothquant/config.py b/vllm/model_executor/layers/quantization/smoothquant/config.py new file mode 100644 index 000000000000..885ffce3e36d --- /dev/null +++ b/vllm/model_executor/layers/quantization/smoothquant/config.py @@ -0,0 +1,306 @@ +from typing import Any, Dict, List, Tuple, Type, Optional, Union +import threading + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import ( + LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.quantization.smoothquant.formats import ( + SmoothQuantFormat, + SmoothQuantDynamicPerToken, + SmoothQuantStaticPerTensor, +) + +LAYER_KEYS = ["qkv", "out", "fc1", "fc2"] +FORMAT_REGISTRY = { + "per-token": SmoothQuantDynamicPerToken, + "per-tensor": SmoothQuantStaticPerTensor, +} + +def get_sq_format_cls(format_key: str) -> Type[SmoothQuantFormat]: + if format_key not in FORMAT_REGISTRY: + raise ValueError(f"Invalid smoothquant format: {format_key}") + return FORMAT_REGISTRY[format_key] + +class SmoothQuantConfig(QuantizationConfig): + """Config class for SmoothQuant. 
+ + Reference: https://github.com/mit-han-lab/smoothquant + """ + def __init__(self, + layer_format_map: Dict[str, str]) -> None: + self.layer_format_map = layer_format_map + + for key, format in self.layer_format_map.items(): + if key not in LAYER_KEYS: + raise ValueError( + f"Found key of {key} in {self.layer_format_map}, " + f"but key must be one of {LAYER_KEYS}" + ) + if format not in FORMAT_REGISTRY: + raise ValueError( + f"Found format of {format} in {self.layer_format_map}, " + f"but format must be one of {FORMAT_REGISTRY}" + ) + for key in LAYER_KEYS: + if key not in self.layer_format_map: + raise ValueError( + f"Could not find {key} in {layer_format_map}" + ) + + def __repr__(self) -> str: + return (f"SmoothQuantConfig(layer_format_map={self.layer_format_map})") + + def get_name(self) -> str: + return "smoothquant" + + def get_supported_act_dtypes(self) -> List[torch.dtype]: + # TODO: check if we support fp16 / bf16 as well. + return [torch.float] + + def get_min_capability(self) -> int: + # TODO: check if this is right. + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + """List of filenames to search for in the model directory.""" + return [ + "quant_config.json", + ] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": + layer_format_map: Dict[str, str] = {} + for layer_key, format in config.items(): + if format in FORMAT_REGISTRY: + layer_format_map[layer_key] = format + return cls(layer_format_map) + + def get_linear_method(self) -> "SmoothQuantLinearMethod": + return SmoothQuantLinearMethod(self) + + +# TODO: why is this needed? 
+class Int8GEMM(object):
+    _instance_lock = threading.Lock()
+
+    def __init__(self):
+        if not hasattr(self, "i8cugemm"):
+            self.i8cugemm = ops.I8CUGEMM()
+
+    def __new__(cls, *args, **kwargs):
+        if not hasattr(Int8GEMM, "_instance"):
+            with Int8GEMM._instance_lock:
+                if not hasattr(Int8GEMM, "_instance"):
+                    Int8GEMM._instance = object.__new__(cls)
+        return Int8GEMM._instance
+
+    def get_i8cugemm(self):
+        return self.i8cugemm
+
+
+class SmoothQuantLinearMethod(LinearMethodBase):
+    def __init__(self, sq_config: SmoothQuantConfig) -> None:
+        self.sq_config = sq_config
+        self.sq_type = None
+        self.i8cugemm = Int8GEMM().get_i8cugemm()
+
+    def maybe_update_loaded_weight_name(self,
+                                        name: str) -> str:
+        """Convert serialized name k_dequant_scale to dequant_scale.
+
+        This function is called by model_cls.load_weights() during the weight
+        loading process to match on disk state dict to vllm state dict.
+        """
+        if "dequant_scale" in name:
+            suffix = name.split('.')[-1]
+            name.replace(suffix, "dequant_scale")
+        return name
+
+    def scales_shard_splitter(self,
+                              param: torch.Tensor,
+                              loaded_weight: torch.Tensor,
+                              shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Index into param for loading.
+
+        This function is called by QKVColumnLinear and MergedColumnParallelLinear
+        during weight loading to put the scales from disk in the right spot.
+        """
+        if type(shard_id) == str:
+            qkv_idxs = { "q": 0, "k": 1, "v": 2 }
+            if shard_id not in qkv_idxs:
+                raise ValueError(f"Invalid shard_id {shard_id}")
+            shard_id = qkv_idxs[shard_id]
+        elif type(shard_id) != int:
+            raise ValueError(f"Invalid shard id {shard_id}")
+
+        return param[shard_id], loaded_weight
+
+    def get_layer_format(self, layer_name: str) -> SmoothQuantFormat:
+        """
+        Gets the SmoothQuantFormat for a specific layer.
+
+        SmoothQuantLinearMethod uses SmoothQuantType to support non-uniform quantization
+        (where each layer has a different format). 
To determine the SmoothQuantFormat
+        for a layer, we match the layer_name to the layer_keys=["qkv","out","fc1","fc2"]
+        and use layer_format_map to determine the SQFormat.
+
+        Args:
+            layer_name: Name of the layer we are creating the LinearMethod for.
+        Returns:
+            sq_linear_method: SmoothQuantLinearMethod with the right SQFormat.
+        """
+        # Note: AutoSmoothQuant Serialization is not very good yet.
+        #
+        # It looks like the following (which does not map to layer names in the model):
+        # {
+        #     "qkv": "per-tensor",
+        #     "out": "per-token",
+        #     "fc1": "per-tensor",
+        #     "fc2": "per-token"
+        # }
+        #
+        # So, this is a hack for llama now. But with the SparseMLConfig, we can make robust,
+        # where we actually use the layer_name in the model to look up what the format is
+        # based on the config.
+        #
+        # What it would actually look like:
+        #   layer_config is None
+        #   for supported_key in SUPPORTED_LAYER_KEYS:
+        #       if supported_key in layer_name:
+        #           sq_format = self.layer_mapping[lookup_key]
+        #           return get_sq_format_cls(sq_format)()
+
+        HACKED_REMAP_FOR_LLAMA = {
+            "qkv": "qkv",
+            "o_proj": "out",
+            "gate_up": "fc1",
+            "down": "fc2",
+        }
+
+        for match_key, lookup_key in HACKED_REMAP_FOR_LLAMA.items():
+            if match_key in layer_name:
+                sq_format = self.sq_config.layer_format_map[lookup_key]
+                return get_sq_format_cls(sq_format)()
+
+        raise ValueError
+
+    def create_weights(self,
+                       layer_name: str,
+                       input_size_per_partition: int,
+                       output_sizes_per_partition: int,
+                       input_size: int,
+                       output_size: int,
+                       params_dtype: torch.dtype) -> Dict[str, torch.Tensor]:
+        del input_size, output_size
+
+        # Statically Quantized Weights.
+        weight = Parameter(
+            torch.empty(
+                sum(output_sizes_per_partition),
+                input_size_per_partition,
+                device="cuda", dtype=torch.int8,
+            ), requires_grad=False,
+        )
+        set_weight_attrs(weight, {
+            "input_dim": 1,
+            "output_dim": 0,
+        })
+
+        # Static scale for each logical weight (e.g. 3 for QKV). 
+ dequant_scale = Parameter( + torch.empty( + len(output_sizes_per_partition), + device='cuda', dtype=params_dtype, + ), requires_grad=False + ) + set_weight_attrs(dequant_scale, { + "shard_splitter": self.scales_shard_splitter, + }) + + return { + "weight": weight, + "dequant_scale": dequant_scale, + "logical_widths": output_sizes_per_partition, + "sq_format": self.get_layer_format(layer_name) + } + + def _quantize(self, + x: torch.Tensor, + sq_format: SmoothQuantFormat) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Quantize activations. + + Args: + x: Activation at floating point precision. + Returns: + x_q: Quantized activation at INT8 + activation_scales: Optional dynamic scales for each token. + """ + x_q = torch.empty_like(x, dtype=torch.int8) + x_q, activation_scales = sq_format.quantize_op(x, x_q) + return x_q, activation_scales + + def _dequantize(self, + x_q: torch.Tensor, + dynamic_scales: Optional[torch.Tensor], + static_scales: torch.Tensor, + logical_widths: List[int], + dtype: torch.dtype, + sq_format: SmoothQuantFormat) -> torch.Tensor: + """Dequantize activations. + + Args: + x_q: quantized activations. + dynamic_scales: Optional dynamic scales. + static_scales: Static dequantization scales. + logical_widths: Width of each logical activation (for QKV case). + dtype: Datatype to dequantize to. + Returns: + x_dq: dequantized activation at output_dtype precision + """ + # Split X_q and X_dq buffer into logical activations (for QKV case). + x_q_split = x_q.split(logical_widths, dim=-1) + x_dq = torch.empty_like(x_q, dtype=dtype) + x_dq_split = x_dq.split(logical_widths, dim=-1) + # Dequantize in place and return. + sq_format.dequantize_op(x_q_split, x_dq_split, dynamic_scales, static_scales) + return x_dq + + + def apply_weights(self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """Forward method. Computes Q --> GEMM --> DQ. 
+
+        Args:
+            weights: Dictionary of weights, scales, and metadata.
+            x: Input in floating point precision.
+            bias: Optional bias.
+        Returns:
+            a_dq: Dequantized activation at floating point precision.
+        """
+        if bias is not None:
+            raise NotImplementedError
+        weight_q = weights["weight"]
+        static_scales = weights["dequant_scale"]
+        logical_widths = weights["logical_widths"]
+        sq_format = weights["sq_format"]
+
+        # Q
+        x_q, activation_scales = self._quantize(x, sq_format)
+
+        # GEMM
+        x_q = x_q.view(-1, x_q.shape[-1])
+        a_q = torch.empty((x_q.shape[0], weight_q.shape[0]), dtype=torch.int32, device="cuda")
+        self.i8cugemm.linear_a8_w8_o32_(x_q, weight_q, a_q)
+        a_q = a_q.view(*x_q.shape[:-1], -1)
+
+        # DQ
+        return self._dequantize(a_q, activation_scales, static_scales, logical_widths, x.dtype, sq_format)
diff --git a/vllm/model_executor/layers/quantization/smoothquant/formats.py b/vllm/model_executor/layers/quantization/smoothquant/formats.py
new file mode 100644
index 000000000000..b8ddd642c888
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/smoothquant/formats.py
@@ -0,0 +1,100 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional, Tuple, Type
+
+import torch
+
+from vllm._C import ops
+
+
+class SmoothQuantFormat(ABC):
+    @abstractmethod
+    def dequantize_op(self,
+                      x_qs: List[torch.Tensor],
+                      x_dqs: List[torch.Tensor],
+                      dynamic_scales: Optional[torch.Tensor],
+                      static_scales: torch.Tensor) -> None:
+        """Dequantize the activations. x_dq is updated in place.
+
+        Args:
+            x_qs: List of N quantized activations.
+            x_dqs: List of N buffers to fill with dequantized values.
+            dynamic_scales: Optional dynamic scales for dequantization.
+            static_scales: Static scales for dequantization. N values.
+        """
+        raise NotImplementedError
+
+
+    @abstractmethod
+    def quantize_op(self,
+                    x: torch.Tensor,
+                    x_q: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Quantize the input and (optionally) compute dequant scales. 
+
+        Args:
+            x: Input data in floating point format.
+            x_q: Buffer for quantized inputs.
+        Returns:
+            x_q: Quantized input.
+            activation_scales: Optional dynamic scales for the activations.
+        """
+        raise NotImplementedError
+
+
+class SmoothQuantDynamicPerToken(SmoothQuantFormat):
+    def dequantize_op(self,
+                      x_qs: List[torch.Tensor],
+                      x_dqs: List[torch.Tensor],
+                      dynamic_scales: Optional[torch.Tensor],
+                      static_scales: torch.Tensor) -> None:
+        """Notes:
+            dynamic_scales: N scales for N tokens in the activation.
+            static_scales: K scales for K logical activations (equals just w_scale).
+        """
+        if dynamic_scales is None:
+            raise ValueError
+
+        # Dequantize each logical activation.
+        # TODO: test this for case when logical_widths > 1 (may need to reshape)
+        for x_dq, x_q, dynamic_scale, static_scale in zip(
+            x_dqs, x_qs, dynamic_scales, static_scales):
+
+            # Dequantize (updates x_dq in place).
+            ops.dequant(x_dq, x_q, dynamic_scale, static_scale)
+
+
+    def quantize_op(self,
+                    x: torch.Tensor,
+                    x_q: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Notes:
+            Returns quantized activation and dynamic activation scales.
+        """
+        activation_scales = torch.empty(x.numel() // x.shape[-1], dtype=x.dtype, device=x.device)
+        ops.quant(x_q, x, activation_scales)
+        return x_q, activation_scales
+
+
+class SmoothQuantStaticPerTensor(SmoothQuantFormat):
+    def dequantize_op(self,
+                      x_qs: List[torch.Tensor],
+                      x_dqs: List[torch.Tensor],
+                      dynamic_scales: Optional[torch.Tensor],
+                      static_scales: torch.Tensor) -> None:
+        """Notes:
+            dynamic_scales: None
+            static_scales: K scales for K logical activations (equals w_scale * a_scale).
+        """
+        if dynamic_scales is not None:
+            raise ValueError
+
+        # Dequantize each logical activation. 
+        for xdq, xq, static_scale in zip(x_dqs, x_qs, static_scales):
+            ops.dequant(xdq, xq, static_scale)
+
+    def quantize_op(self,
+                    x: torch.Tensor,
+                    x_q: torch.Tensor) -> Tuple[torch.Tensor, None]:
+        """Notes:
+            Returns quantized activation and no dynamic scales.
+        """
+        ops.quant(x_q, x, 1.0)
+        return x_q, None
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index ed25455e6ec1..893e6781089d 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -68,10 +68,17 @@ class SqueezeLLMLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: SqueezeLLMConfig):
         self.quant_config = quant_config
 
-    def create_weights(self, input_size_per_partition: int,
-                       output_size_per_partition: int, input_size: int,
-                       output_size: int,
-                       params_dtype: torch.dtype) -> Dict[str, Any]:
+    def create_weights(
+        self,
+        layer_name: str,
+        input_size_per_partition: int,
+        output_sizes_per_partition: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype) -> Dict[str, Any]:
+        del layer_name, input_size  # Unused. 
+ output_size_per_partition = sum(output_sizes_per_partition) + if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 07fab1f1ed6f..de7910c4860b 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -46,10 +46,6 @@ def _get_model_architecture( def get_architecture_class_name(model_config: ModelConfig) -> str: return _get_model_architecture(model_config)[1] -def _is_support_smoothquant(model_config: ModelConfig) -> bool: - architectures = getattr(model_config.hf_config, "architectures", []) - supported_archs = ModelRegistry.get_supported_smoothquant_archs() - return any(arch in supported_archs for arch in architectures) def get_model(model_config: ModelConfig, device_config: DeviceConfig, **kwargs) -> nn.Module: @@ -82,9 +78,7 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig, # Create a model instance. # The weights will be initialized as empty tensors. 
with torch.device(device_config.device): - if _is_support_smoothquant(model_config): - model = model_class(model_config.hf_config, linear_method) - elif hasattr(model_class, "supported_lora_modules"): + if hasattr(model_class, "supported_lora_modules"): model = model_class(model_config.hf_config, linear_method, lora_config) else: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 868394c76109..0b6c75705764 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,17 +57,22 @@ def __init__( hidden_size: int, intermediate_size: int, hidden_act: str, + parent_name: str, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + layer_name=f"{parent_name}.gate_up_proj", + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear( + layer_name=f"{parent_name}.down_proj", + input_size=intermediate_size, + output_size=hidden_size, bias=False, linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -87,6 +92,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + parent_name: str, rope_theta: float = 10000, rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, @@ -127,16 +133,18 @@ def __init__( self.kv_scale = 1.0 self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, + layer_name=f"{parent_name}.qkv_proj", + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, bias=bias, linear_method=linear_method, ) self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, + layer_name=f"{parent_name}.o_proj", + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, bias=bias, linear_method=linear_method, ) @@ -175,6 +183,7 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, config: LlamaConfig, + parent_name: str, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() @@ -192,6 +201,7 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, + parent_name=f"{parent_name}.self_attn", linear_method=linear_method, bias=getattr(config, "bias", False), sliding_window=sliding_window, @@ -200,6 +210,7 @@ def __init__( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, + parent_name=f"{parent_name}.mlp", linear_method=linear_method, ) self.input_layernorm = RMSNorm(config.hidden_size, @@ -242,7 +253,7 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, + lora_config: Optional[LoRAConfig] = None ) -> None: super().__init__() self.config = config @@ -257,8 +268,10 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - 
LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) + LlamaDecoderLayer(config, + parent_name=f"model.layers.{idx}", + linear_method=linear_method) + for idx in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -386,10 +399,12 @@ def load_weights(self, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): # Update name of the loaded_weight if needed by the LinearMethod. - name = self.linear_method.maybe_update_loaded_weight_name(name) + if self.linear_method: + name = self.linear_method.maybe_update_loaded_weight_name(name) if "rotary_emb.inv_freq" in name: continue