From 2a2387ead0f374c07425d3ff850244b21b54c7fb Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 20:30:08 +0000 Subject: [PATCH 01/12] first end to end run with rowparallellinear in the fun format --- vllm/model_executor/layers/linear.py | 13 +- .../layers/quantization/smoothquant.py | 118 +++++++++++++++++- vllm/model_executor/models/llama.py | 19 +-- 3 files changed, 138 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f3d4d1789db2..c8236860ee1a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -525,8 +525,12 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype) + input_size_per_partition=self.input_size_per_partition, + output_size_per_partition=self.output_size, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + logical_widths=[self.output_size]) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -555,6 +559,11 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + + # TODO: canon + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index d9d82e6cfbc3..8b9fae111311 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -1,8 +1,9 @@ -from typing import Any, Dict, List, Tuple, 
Optional +from typing import Any, Dict, List, Tuple, Optional, Union import torch from torch._tensor import Tensor from torch.nn.parameter import Parameter +from torch.nn import ParameterList import threading from vllm._C import ops @@ -94,7 +95,6 @@ def __new__(cls, *args, **kwargs): def get_i8cugemm(self): return self.i8cugemm - class SQLinearMethod(LinearMethodBase): """Linear method for SmoothQuant. """ @@ -106,7 +106,8 @@ def __init__(self, gemm): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, - params_dtype: torch.dtype) -> Dict[str, Tensor]: + params_dtype: torch.dtype, + logical_widths=None) -> Dict[str, Tensor]: weight = Parameter( torch.empty( output_size_per_partition, @@ -173,6 +174,117 @@ def apply_weights(self, return y +class FunSQLinearMethod(LinearMethodBase): + """Linear method for SmoothQuant. + """ + + def __init__( + self, + per_token_quant, + quant_dtype, + dequant_dtype, + ): + self.per_token_quant = per_token_quant + self.quant_dtype = quant_dtype + self.dequant_dtype = dequant_dtype + self.i8cugemm = Int8GEMM().get_i8cugemm() + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + logical_widths: Optional[List[int]] = None, + ) -> Dict[str, Tensor]: + weight = Parameter( + torch.empty( + output_size_per_partition, + input_size_per_partition, + device="cuda", + dtype=torch.int8, + ), requires_grad=False, + ) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + }) + + dequant_scale = Parameter( + torch.tensor( + [1.0] * len(logical_widths), + dtype=torch.float32, + device='cpu' + ), requires_grad=False + ) + + return { + "weight": weight, + "dequant_scale": dequant_scale, + "logical_widths": logical_widths, + } + + + def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): + x_dq = torch.empty_like(x_q, 
dtype=self.dequant_dtype) + + # Split into shards. + x_q_split = x_q.split(logical_widths, dim=-1) + x_dq_split = x_dq.split(logical_widths, dim=-1) + + # Dequantize each shard. + for xq, weight_scale, activation_scale, xdq in zip( + x_q_split, weight_scales, activation_scales, x_dq_split): + ops.dequant(xdq, xq, activation_scale, weight_scale) + + # Return dequantized activation. + return x_dq + + + def _quantize(self, x, per_token_quant: bool): + x_q = torch.empty_like(x, dtype=self.quant_dtype) + + # Compute activation scale if per token. + if per_token_quant: + activation_scale = torch.empty( + x.numel() // x.shape[-1], + dtype=torch.float32, + device=x.device) + ops.quant(x_q, x, activation_scale) + # Set activation scale if per tensor. TODO: why 1.0? << static? + else: + activation_scale = None + ops.quant(x_q, x, 1.0) + + return x_q, activation_scale + + + def apply_weights(self, + weights: Dict[str, Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> Tensor: + assert bias is None + weight = weights["weight"] + dequant_scale = weights["dequant_scale"] + logical_widths = weights["logical_widths"] + + # Q + x_q, activation_scale = self._quantize(x, self.per_token_quant) + + # GEMM + x_q = x_q.view(-1, x_q.shape[-1]) + out_q = torch.empty( + (x_q.shape[0], weight.shape[0]), + dtype=torch.int32, device=x.device) + + self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) + out_q = out_q.view(*x_q.shape[:-1], -1) + + # DQ + return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) + + class SQLinearMethodQKV(SQLinearMethod): def __init__(self, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1fffbc5fa30c..fbe6c3c9e8e1 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -34,6 +34,7 @@ SQLinearMethod, SQLinearMethodQKV, SQLinearMethodOProj, + FunSQLinearMethod, SQLinearMethodGateUpProj, SQLinearMethodDownProj) from 
vllm.model_executor.layers.layernorm import RMSNorm @@ -93,10 +94,15 @@ def __init__( if self.use_int8: # override gate_up linear method assert isinstance(linear_method, SQLinearMethod) - down_proj_linear_method = SQLinearMethodDownProj( - gemm=Int8GEMM, + # down_proj_linear_method = SQLinearMethodDownProj( + # gemm=Int8GEMM, + # quant_dtype=torch.int8, + # dequant_dtype=torch.float) + down_proj_linear_method = FunSQLinearMethod( + per_token_quant=True, quant_dtype=torch.int8, dequant_dtype=torch.float) + self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, @@ -193,11 +199,10 @@ def __init__( if self.use_int8: # override o_proj linear method assert isinstance(linear_method, SQLinearMethod) - o_proj_linear_method = SQLinearMethodOProj( - gemm=Int8GEMM, - use_per_token_quant=True, # TODO (varun) : Read from config - quant_dtype = torch.int8, - dequant_dtype= torch.float) + o_proj_linear_method = FunSQLinearMethod( + per_token_quant=True, + quant_dtype=torch.int8, + dequant_dtype=torch.float) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, From 7373f4c5e07f08c58a9ac11aa6acaec31c996366 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 21:53:21 +0000 Subject: [PATCH 02/12] got qkvproj working with funsqlinearmethod --- vllm/model_executor/layers/linear.py | 39 ++++++++++++++++--- .../layers/quantization/smoothquant.py | 18 ++++++--- vllm/model_executor/models/llama.py | 38 +++++++++++++----- 3 files changed, 74 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c8236860ee1a..146e654181eb 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -150,6 +150,9 @@ class ColumnParallelLinear(torch.nn.Module): skip adding bias but instead return it. params_dtype: Data type for the parameters. linear_method: (Maybe quantized) linear method. 
+ logical_widths: Optional list of widths for logical weight matrices. + E.g. for QKVParallelLinear, this parameter defines + the width """ def __init__( @@ -161,6 +164,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, + logical_widths: Optional[List[int]] = None, ): super().__init__() @@ -179,8 +183,13 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype) + input_size_per_partition=self.input_size, + output_size_per_partition=self.output_size_per_partition, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + logical_widths=logical_widths, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -257,8 +266,15 @@ def __init__( self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, linear_method) + super().__init__( + input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + linear_method=linear_method, + logical_widths=output_sizes) def weight_loader(self, param: Parameter, @@ -383,8 +399,19 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size - super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method) + super().__init__( + input_size=input_size, + output_size=output_size, + bias=bias, + gather_output=False, + skip_bias_add=skip_bias_add, + 
params_dtype=params_dtype, + linear_method=linear_method, + logical_widths = [ + self.num_heads * self.head_size, # q_proj + self.total_num_kv_heads * self.head_size, # k_proj + self.total_num_kv_heads * self.head_size, # v_proj + ]) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 8b9fae111311..0d28701ae0a1 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -104,7 +104,8 @@ def __init__(self, gemm): self.i8cugemm = i8_gemm.get_i8cugemm() def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, + output_size_per_partition: int, + input_size: int, output_size: int, params_dtype: torch.dtype, logical_widths=None) -> Dict[str, Tensor]: @@ -233,10 +234,16 @@ def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): x_q_split = x_q.split(logical_widths, dim=-1) x_dq_split = x_dq.split(logical_widths, dim=-1) - # Dequantize each shard. - for xq, weight_scale, activation_scale, xdq in zip( - x_q_split, weight_scales, activation_scales, x_dq_split): - ops.dequant(xdq, xq, activation_scale, weight_scale) + # If QuantType is Static per Tensor: + if activation_scales is None: + for xdq, xq, weight_scale in zip(x_dq_split, x_q_split, weight_scales): + ops.dequant(xdq, xq, weight_scale) + + # If QuantType is Dynamic per Token: + else: + for xdq, xq, weight_scale, activation_scale in zip( + x_dq_split, x_q_split, weight_scales, activation_scales): + ops.dequant(xdq, xq, activation_scale, weight_scale) # Return dequantized activation. 
return x_dq @@ -315,6 +322,7 @@ def dequantize(self, x_q, weights : Dict[str, Tensor]): q_scale, k_scale, v_scale = (weights['q_dequant_scale'], weights['k_dequant_scale'], weights['v_dequant_scale']) + ops.dequant(q, q_q, q_scale) ops.dequant(k, k_q, k_scale) ops.dequant(v, v_q, v_scale) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fbe6c3c9e8e1..7a4a51cac96c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -92,12 +92,7 @@ def __init__( down_proj_linear_method = linear_method if self.use_int8: - # override gate_up linear method assert isinstance(linear_method, SQLinearMethod) - # down_proj_linear_method = SQLinearMethodDownProj( - # gemm=Int8GEMM, - # quant_dtype=torch.int8, - # dequant_dtype=torch.float) down_proj_linear_method = FunSQLinearMethod( per_token_quant=True, quant_dtype=torch.int8, @@ -181,11 +176,17 @@ def __init__( if self.use_int8: # override qkv linear method assert isinstance(linear_method, SQLinearMethod) - qkv_linear_method = SQLinearMethodQKV( - gemm=Int8GEMM, - qkv_sizes=(self.q_size, self.kv_size, self.kv_size), + # qkv_linear_method = SQLinearMethodQKV( + # gemm=Int8GEMM, + # qkv_sizes=(self.q_size, self.kv_size, self.kv_size), + # quant_dtype=torch.int8, + # dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) + qkv_linear_method = FunSQLinearMethod( + per_token_quant=False, quant_dtype=torch.int8, dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) + + self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -483,11 +484,28 @@ def load_weights(self, for (param_name, weight_name, _) in stacked_params_mapping: if weight_name not in name: continue + + k_proj = "k_proj" in name + v_proj = "v_proj" in name + q_proj = "q_proj" in name name = name.replace(weight_name, param_name) prefix = weight_name.split('_')[0] suffix = name.split('.')[-1] - new_name = prefix + '_' + suffix - param = params_dict[name.replace(suffix, new_name)] + + if "qkv" in 
name: + if q_proj: + param = params_dict[name.replace(suffix, "dequant_scale")][0] + elif k_proj: + param = params_dict[name.replace(suffix, "dequant_scale")][1] + else: + assert v_proj + param = params_dict[name.replace(suffix, "dequant_scale")][2] + + else: + suffix = name.split('.')[-1] + new_name = prefix + '_' + suffix + param = params_dict[name.replace(suffix, new_name)] + param.copy_(loaded_weight) is_fusion_scale = True break From a3d4ee587416389549683d48c272f62cc3a9f131 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:04:33 +0000 Subject: [PATCH 03/12] converted all SQLinearMethods to use FunSQLinearMethod -- now working to remove changes to Llama.py --- .../layers/quantization/smoothquant.py | 177 +----------------- vllm/model_executor/models/llama.py | 44 ++--- 2 files changed, 17 insertions(+), 204 deletions(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 0d28701ae0a1..3358ef9ba4d2 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -95,6 +95,7 @@ def __new__(cls, *args, **kwargs): def get_i8cugemm(self): return self.i8cugemm + class SQLinearMethod(LinearMethodBase): """Linear method for SmoothQuant. 
""" @@ -290,179 +291,3 @@ def apply_weights(self, # DQ return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) - - -class SQLinearMethodQKV(SQLinearMethod): - - def __init__(self, - gemm, - qkv_sizes : Tuple[int, int, int], - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.qkv_sizes = qkv_sizes - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x): - assert x.dtype != self.quant_dtype - x_q = torch.empty_like(x, dtype=self.quant_dtype) - ops.quant(x_q, x, 1.0) - return x_q - - def dequantize(self, x_q, weights : Dict[str, Tensor]): - # split to get the quantized qkv - q_q, k_q, v_q = x_q.split(list(self.qkv_sizes), dim=-1) - - # create dequant qkv buffer and split to get the individual dequant qkv - # buffers - qkv = torch.empty_like(x_q, dtype=self.dequant_dtype) - q, k, v = qkv.split(list(self.qkv_sizes), dim=-1) - - q_scale, k_scale, v_scale = (weights['q_dequant_scale'], - weights['k_dequant_scale'], - weights['v_dequant_scale']) - - ops.dequant(q, q_q, q_scale) - ops.dequant(k, k_q, k_scale) - ops.dequant(v, v_q, v_scale) - - return qkv - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - x_q = self.quantize(x) - y_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(y_q, weights) - -class SQLinearMethodOProj(SQLinearMethod): - - def __init__(self, - gemm, - use_per_token_quant:bool, - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.use_per_token_quant = use_per_token_quant - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - # x is the paged-attention output - assert x.dtype != self.quant_dtype - act_scale = None - x_q = torch.empty_like(x, dtype=self.quant_dtype) - if 
self.use_per_token_quant: - act_scale = torch.empty(x.numel() // x.shape[-1], - dtype=torch.float32, - device=x.device) - ops.quant(x_q, x, act_scale) - else: - ops.quant(x_q, x, 1.0) - return x_q, act_scale - - def dequantize(self, x_q, weights : Dict[str, Tensor], act_scale : torch.Tensor) -> torch.Tensor: - o_dequant_scale = weights['dequant_scale'] - x = torch.empty_like( - x_q, - dtype=self.dequant_dtype, - device=x_q.device) - ops.dequant(x, x_q, act_scale, o_dequant_scale) - return x - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - pass - x_q, act_scale = self.quantize(x) - y_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(y_q, weights, act_scale) - -class SQLinearMethodGateUpProj(SQLinearMethod): - - def __init__(self, - gemm, - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x) -> torch.Tensor: - # x is the attention output - assert x.dtype != self.quant_dtype - x_q = torch.empty_like(x, dtype=self.quant_dtype, device=x.device) - ops.quant(x_q, x, 1.0) - return x_q - - def dequantize(self, gate_up_q: torch.Tensor, weights : Dict[str, Tensor]) -> torch.Tensor: - - def split_gate_up(gate_up : torch.Tensor): - d = gate_up.shape[-1] - return (torch.narrow(gate_up, 1, 0, d//2), - torch.narrow(gate_up, 1, d//2, d//2)) - - # create a dequant gate_up buffer and split it into constituent parts. - gate_up = torch.empty_like(gate_up_q, - dtype=self.dequant_dtype, - device=gate_up_q.device) - - # split quantized gate_up into constituent parts. - gate_q, up_q = split_gate_up(gate_up_q) - # split output gate_up buffer into constituent parts. 
- gate, up = split_gate_up(gate_up) - - gate_scale, up_scale = (weights['gate_dequant_scale'], - weights['up_dequant_scale']) - ops.dequant(gate, gate_q, gate_scale) - ops.dequant(up, up_q, up_scale) - - return gate_up - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - x_q = self.quantize(x) - gate_up_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(gate_up_q, weights) - -class SQLinearMethodDownProj(SQLinearMethod): - - def __init__(self, - gemm, - quant_dtype : torch.dtype = torch.int8, - dequant_dtype : torch.dtype = torch.float): - super().__init__(gemm) - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype - - def quantize(self, x) -> Tuple[torch.Tensor, torch.Tensor]: - assert x.dtype != self.quant_dtype - # TODO (varun) : This is per-token quant - Read from config - x_q = torch.empty_like(x, dtype=self.quant_dtype) - scale = torch.empty(x.numel() // x.shape[-1], - dtype=torch.float32, - device=x.device) - ops.quant(x_q, x, scale) - return x_q, scale - - def dequantize(self, x_q, weights : Dict[str, Tensor], act_scale : torch.Tensor) -> torch.Tensor: - down_dequant_scale = weights['dequant_scale'] - x = torch.empty_like( - x_q, - dtype=self.dequant_dtype, - device=x_q.device) - ops.dequant(x, x_q, act_scale, down_dequant_scale) - return x - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - x_q, act_scale = self.quantize(x) - y_q = super().apply_weights(weights, x_q, bias) - return self.dequantize(y_q, weights, act_scale) \ No newline at end of file diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 7a4a51cac96c..efb59c71afb7 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,13 +30,8 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import LoRAConfig 
from vllm.model_executor.layers.quantization.smoothquant import ( - Int8GEMM, SQLinearMethod, - SQLinearMethodQKV, - SQLinearMethodOProj, - FunSQLinearMethod, - SQLinearMethodGateUpProj, - SQLinearMethodDownProj) + FunSQLinearMethod,) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -78,8 +73,8 @@ def __init__( if self.use_int8: # override gate_up linear method assert isinstance(linear_method, SQLinearMethod) - gate_up_linear_method = SQLinearMethodGateUpProj( - gemm=Int8GEMM, + gate_up_linear_method = FunSQLinearMethod( + per_token_quant=False, quant_dtype=torch.int8, dequant_dtype=torch.float) self.gate_up_proj = MergedColumnParallelLinear( @@ -484,28 +479,21 @@ def load_weights(self, for (param_name, weight_name, _) in stacked_params_mapping: if weight_name not in name: continue - - k_proj = "k_proj" in name - v_proj = "v_proj" in name - q_proj = "q_proj" in name + if "q_proj" in name: + idx = 0 + elif "k_proj" in name: + idx = 1 + elif "v_proj" in name: + idx = 2 + elif "gate_proj" in name: + idx = 0 + else: + assert "up_proj" in name + idx = 1 + name = name.replace(weight_name, param_name) - prefix = weight_name.split('_')[0] suffix = name.split('.')[-1] - - if "qkv" in name: - if q_proj: - param = params_dict[name.replace(suffix, "dequant_scale")][0] - elif k_proj: - param = params_dict[name.replace(suffix, "dequant_scale")][1] - else: - assert v_proj - param = params_dict[name.replace(suffix, "dequant_scale")][2] - - else: - suffix = name.split('.')[-1] - new_name = prefix + '_' + suffix - param = params_dict[name.replace(suffix, new_name)] - + param = params_dict[name.replace(suffix, "dequant_scale")][idx] param.copy_(loaded_weight) is_fusion_scale = True break From 4bb02750be43685cd46b06690555e2bcd014022c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:30:12 +0000 Subject: [PATCH 04/12] stash --- 
vllm/model_executor/layers/linear.py | 4 +- .../layers/quantization/smoothquant.py | 104 ++---------------- vllm/model_executor/model_loader.py | 3 +- vllm/model_executor/models/llama.py | 82 ++------------ 4 files changed, 25 insertions(+), 168 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 146e654181eb..57095dff6bb0 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -189,6 +189,7 @@ def __init__( output_size=self.output_size, params_dtype=self.params_dtype, logical_widths=logical_widths, + per_token_quant=False, ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): @@ -557,7 +558,8 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - logical_widths=[self.output_size]) + logical_widths=[self.output_size], + per_token_quant=True,) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 3358ef9ba4d2..13b995921377 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -72,7 +72,7 @@ def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": return cls(weight_bits, quant_map) def get_linear_method(self) -> "SQLinearMethod": - return SQLinearMethod(Int8GEMM) + return SQLinearMethod() def get_scaled_act_names(self) -> List[str]: return [] @@ -95,100 +95,11 @@ def __new__(cls, *args, **kwargs): def get_i8cugemm(self): return self.i8cugemm - class SQLinearMethod(LinearMethodBase): - """Linear method for SmoothQuant. + """Linear method for AutoSmoothQuant. 
""" - def __init__(self, gemm): - i8_gemm = gemm() - self.i8cugemm = i8_gemm.get_i8cugemm() - - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - logical_widths=None) -> Dict[str, Tensor]: - weight = Parameter( - torch.empty( - output_size_per_partition, - input_size_per_partition, - device="cuda", - dtype=torch.int8, - ), - requires_grad=False, - ) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - }) - # q k v dequant_scales are used in QKVParallelLinear - q_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - k_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - v_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - # gate up dequant_scales are used in MergedColumnParallelLinear - gate_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - up_dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - # dequant_scale is used in RowParallelLinear - dequant_scale = Parameter( - torch.tensor(1.0, dtype=torch.float32, device='cpu'), - requires_grad=False, - ) - return { - "weight": weight, - "q_dequant_scale": q_dequant_scale, - "k_dequant_scale": k_dequant_scale, - "v_dequant_scale": v_dequant_scale, - "gate_dequant_scale": gate_dequant_scale, - "up_dequant_scale": up_dequant_scale, - "dequant_scale": dequant_scale - } - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - assert bias is None - weight = weights["weight"] - x_shape = x.shape - x = x.view(-1, x_shape[-1]) - y = torch.empty((x.shape[0], weight.shape[0]), - dtype=torch.int32, - device=x.device) - 
self.i8cugemm.linear_a8_w8_o32_(x, weight, y) - y = y.view(*x_shape[:-1], -1) - return y - - -class FunSQLinearMethod(LinearMethodBase): - """Linear method for SmoothQuant. - """ - - def __init__( - self, - per_token_quant, - quant_dtype, - dequant_dtype, - ): - self.per_token_quant = per_token_quant - self.quant_dtype = quant_dtype - self.dequant_dtype = dequant_dtype + def __init__(self,): self.i8cugemm = Int8GEMM().get_i8cugemm() def create_weights( @@ -199,6 +110,7 @@ def create_weights( output_size: int, params_dtype: torch.dtype, logical_widths: Optional[List[int]] = None, + per_token_quant:bool = False, ) -> Dict[str, Tensor]: weight = Parameter( torch.empty( @@ -216,8 +128,8 @@ def create_weights( dequant_scale = Parameter( torch.tensor( [1.0] * len(logical_widths), - dtype=torch.float32, - device='cpu' + dtype=params_dtype, + device='cuda' ), requires_grad=False ) @@ -225,6 +137,7 @@ def create_weights( "weight": weight, "dequant_scale": dequant_scale, "logical_widths": logical_widths, + "per_token_quant": per_token_quant, } @@ -276,9 +189,10 @@ def apply_weights(self, weight = weights["weight"] dequant_scale = weights["dequant_scale"] logical_widths = weights["logical_widths"] + per_token_quant = weights["per_token_quant"] # Q - x_q, activation_scale = self._quantize(x, self.per_token_quant) + x_q, activation_scale = self._quantize(x, per_token_quant) # GEMM x_q = x_q.view(-1, x_q.shape[-1]) diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index b191dc4009b5..07fab1f1ed6f 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -83,8 +83,7 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig, # The weights will be initialized as empty tensors. 
with torch.device(device_config.device): if _is_support_smoothquant(model_config): - model = model_class(model_config.hf_config, linear_method, - quant_config) + model = model_class(model_config.hf_config, linear_method) elif hasattr(model_class, "supported_lora_modules"): model = model_class(model_config.hf_config, linear_method, lora_config) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index efb59c71afb7..d01d08ff6ff6 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -29,9 +29,6 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import LoRAConfig -from vllm.model_executor.layers.quantization.smoothquant import ( - SQLinearMethod, - FunSQLinearMethod,) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -62,42 +59,24 @@ def __init__( intermediate_size: int, hidden_act: str, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = hidden_size - self.use_int8 = quant_config is not None and quant_config.get_name( - ) == "smoothquant" - - gate_up_linear_method = linear_method - if self.use_int8: - # override gate_up linear method - assert isinstance(linear_method, SQLinearMethod) - gate_up_linear_method = FunSQLinearMethod( - per_token_quant=False, - quant_dtype=torch.int8, - dequant_dtype=torch.float) + self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, - linear_method=gate_up_linear_method) + linear_method=linear_method) + + self.down_proj = RowParallelLinear( + intermediate_size, hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - down_proj_linear_method = linear_method - if self.use_int8: - assert isinstance(linear_method, SQLinearMethod) - down_proj_linear_method = FunSQLinearMethod( - per_token_quant=True, - quant_dtype=torch.int8, - dequant_dtype=torch.float) - - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=down_proj_linear_method) - self.act_fn = SiluAndMul() def forward(self, x): @@ -145,10 +124,6 @@ def __init__( self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings - self.use_int8 = quant_config is not None and quant_config.get_name( - ) == "smoothquant" - # Needs to be ironed out!! - self.use_per_token_quant = self.use_int8 # This will be overwritten by model initialization if we are using it. # N.B. currently we only support per tensor scalar scaling factors @@ -167,44 +142,20 @@ def __init__( rope_scaling=rope_scaling, ) - qkv_linear_method = linear_method - if self.use_int8: - # override qkv linear method - assert isinstance(linear_method, SQLinearMethod) - # qkv_linear_method = SQLinearMethodQKV( - # gemm=Int8GEMM, - # qkv_sizes=(self.q_size, self.kv_size, self.kv_size), - # quant_dtype=torch.int8, - # dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) - qkv_linear_method = FunSQLinearMethod( - per_token_quant=False, - quant_dtype=torch.int8, - dequant_dtype=self.rotary_emb.cos_sin_cache.dtype) - - self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, self.total_num_heads, self.total_num_kv_heads, bias=bias, - linear_method=qkv_linear_method, + linear_method=linear_method, ) - o_proj_linear_method = linear_method - if self.use_int8: - # override o_proj linear method - assert isinstance(linear_method, SQLinearMethod) - o_proj_linear_method = FunSQLinearMethod( - per_token_quant=True, - quant_dtype=torch.int8, - dequant_dtype=torch.float) - self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, 
hidden_size, bias=bias, - linear_method=o_proj_linear_method, + linear_method=linear_method, ) self.attn = Attention( @@ -235,12 +186,9 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size - self.use_int8 = quant_config is not None and quant_config.get_name( - ) == "smoothquant" self.tp_size = get_tensor_model_parallel_world_size() rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) @@ -256,7 +204,6 @@ def __init__( rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, linear_method=linear_method, - quant_config=quant_config, bias=getattr(config, "bias", False), sliding_window=sliding_window, ) @@ -265,7 +212,6 @@ def __init__( intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, linear_method=linear_method, - quant_config=quant_config, ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -309,7 +255,6 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() @@ -325,7 +270,7 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method, quant_config) + LlamaDecoderLayer(config, linear_method) for _ in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -391,15 +336,12 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config self.linear_method = linear_method - self.quant_config = quant_config - self.model = LlamaModel(config, 
linear_method, lora_config=lora_config, - quant_config = quant_config) + self.model = LlamaModel(config, linear_method, lora_config=lora_config) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size From fa35654b4a0ac4b5978ecf987d056c0970c79841 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:56:47 +0000 Subject: [PATCH 05/12] updated llama.py to minimze changes --- .../layers/quantization/smoothquant.py | 6 ++- vllm/model_executor/models/llama.py | 52 +++++++------------ 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 13b995921377..ec7344e39291 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -112,6 +112,8 @@ def create_weights( logical_widths: Optional[List[int]] = None, per_token_quant:bool = False, ) -> Dict[str, Tensor]: + self.output_dtype = params_dtype + weight = Parameter( torch.empty( output_size_per_partition, @@ -142,7 +144,7 @@ def create_weights( def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): - x_dq = torch.empty_like(x_q, dtype=self.dequant_dtype) + x_dq = torch.empty_like(x_q, dtype=self.output_dtype) # Split into shards. x_q_split = x_q.split(logical_widths, dim=-1) @@ -164,7 +166,7 @@ def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): def _quantize(self, x, per_token_quant: bool): - x_q = torch.empty_like(x, dtype=self.quant_dtype) + x_q = torch.empty_like(x, dtype=torch.int8) # Compute activation scale if per token. 
if per_token_quant: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d01d08ff6ff6..f0467007a170 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -29,14 +29,12 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import LoRAConfig -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) - -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler @@ -51,6 +49,7 @@ from vllm.sequence import SamplerOutput from vllm.utils import is_hip + class LlamaMLP(nn.Module): def __init__( @@ -61,8 +60,6 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() - self.hidden_size = hidden_size - self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, @@ -76,7 +73,6 @@ def __init__( if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - self.act_fn = SiluAndMul() def forward(self, x): @@ -96,28 +92,25 @@ def __init__( rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, linear_method: Optional[LinearMethodBase] = None, - quant_config: Optional[QuantizationConfig] = None, bias: bool = False, sliding_window: Optional[int] = None, ) -> None: super().__init__() self.hidden_size = hidden_size - self.tp_size = get_tensor_model_parallel_world_size() + tp_size = get_tensor_model_parallel_world_size() self.total_num_heads = num_heads - assert self.total_num_heads % self.tp_size == 0 - self.num_heads = self.total_num_heads // self.tp_size + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size self.total_num_kv_heads = num_kv_heads - self.default_dtype = torch.get_default_dtype() - - if self.total_num_kv_heads >= self.tp_size: + if self.total_num_kv_heads >= tp_size: # Number of KV heads is greater than TP size, so we partition # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % self.tp_size == 0 + assert self.total_num_kv_heads % tp_size == 0 else: # Number of KV heads is less than TP size, so we replicate # the KV heads across multiple tensor parallel GPUs. 
- assert self.tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size) + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.head_dim = hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim @@ -134,14 +127,6 @@ def __init__( # scaling_factor = tensor_amax / FPtype_max self.kv_scale = 1.0 - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -150,7 +135,6 @@ def __init__( bias=bias, linear_method=linear_method, ) - self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, @@ -158,6 +142,14 @@ def __init__( linear_method=linear_method, ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention( self.num_heads, self.head_dim, @@ -189,7 +181,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - self.tp_size = get_tensor_model_parallel_world_size() rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", @@ -226,7 +217,6 @@ def forward( attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention if residual is None: residual = hidden_states @@ -234,7 +224,6 @@ def forward( else: hidden_states, residual = self.input_layernorm( hidden_states, residual) - hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, @@ -243,8 +232,8 @@ def forward( ) # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm(hidden_states, - residual) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -391,8 +380,6 @@ def load_weights(self, load_format: str = "auto", revision: Optional[str] = None): # For SmoothQuant - int8_fusion = self.quant_config is not None and \ - self.quant_config.get_name() == "smoothquant" stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -415,6 +402,7 @@ def load_weights(self, if "bias" in name: continue # load dequant scale for qkv_proj and gate_up_proj + int8_fusion = True if int8_fusion: is_fusion_scale = False if "scale" in name: From 53f19125a8707e998e69bc54bc93fcdef90c71bd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 22:58:14 +0000 Subject: [PATCH 06/12] updated llama.py to minimze changes --- vllm/model_executor/models/llama.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f0467007a170..9e6cee4b4d66 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -64,11 +64,10 @@ def __init__( hidden_size, [intermediate_size] * 2, bias=False, linear_method=linear_method) - - self.down_proj = RowParallelLinear( - intermediate_size, hidden_size, - bias=False, - linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" @@ -81,6 +80,7 @@ def forward(self, x): x, _ = self.down_proj(x) return x + class LlamaAttention(nn.Module): def __init__( @@ -149,13 +149,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, @@ -172,6 +170,7 @@ def forward( output, _ = self.o_proj(attn_output) return output + class LlamaDecoderLayer(nn.Module): def __init__( From f62ff3a59c4b27d7b282fe2175585dfc225bb28e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 23:00:20 +0000 Subject: [PATCH 07/12] minimize changes to llama.py --- vllm/model_executor/models/llama.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 9e6cee4b4d66..1831fbb31b4a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -68,7 +68,6 @@ def __init__( hidden_size, bias=False, linear_method=linear_method) - if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -378,7 +377,6 @@ def load_weights(self, cache_dir: Optional[str] = None, load_format: str = "auto", revision: Optional[str] = None): - # For SmoothQuant stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -397,9 +395,6 @@ def load_weights(self, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
continue - # bias is useless for llama - if "bias" in name: - continue # load dequant scale for qkv_proj and gate_up_proj int8_fusion = True if int8_fusion: From 888135fc57b262e39a91e3467be2d549f701a936 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 23:01:30 +0000 Subject: [PATCH 08/12] minimize changes to llama.py --- vllm/model_executor/layers/quantization/smoothquant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index ec7344e39291..e658cb4eaff6 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -110,7 +110,7 @@ def create_weights( output_size: int, params_dtype: torch.dtype, logical_widths: Optional[List[int]] = None, - per_token_quant:bool = False, + per_token_quant: bool = False, ) -> Dict[str, Tensor]: self.output_dtype = params_dtype From d5d223b91cc16afd58e6091962c011331344d0ba Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 12 Apr 2024 23:09:37 +0000 Subject: [PATCH 09/12] tweak --- .../layers/quantization/smoothquant.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index e658cb4eaff6..09f112132d95 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -182,6 +182,15 @@ def _quantize(self, x, per_token_quant: bool): return x_q, activation_scale + def _gemm(self, x_q, weight): + x_q = x_q.view(-1, x_q.shape[-1]) + out_q = torch.empty( + (x_q.shape[0], weight.shape[0]), + dtype=torch.int32, device="cuda") + + self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) + return out_q.view(*x_q.shape[:-1], -1) + def apply_weights(self, weights: Dict[str, Tensor], @@ -195,15 +204,5 @@ def 
apply_weights(self, # Q x_q, activation_scale = self._quantize(x, per_token_quant) - - # GEMM - x_q = x_q.view(-1, x_q.shape[-1]) - out_q = torch.empty( - (x_q.shape[0], weight.shape[0]), - dtype=torch.int32, device=x.device) - - self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) - out_q = out_q.view(*x_q.shape[:-1], -1) - - # DQ - return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) + out_q = self._gemm(x_q, weight) + return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) From 10551e49ca95fd21b27c5842ca85097b94756105 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 13 Apr 2024 00:24:36 +0000 Subject: [PATCH 10/12] updated llama weight loading --- vllm/model_executor/layers/linear.py | 53 +++++++++++++++++-- .../layers/quantization/smoothquant.py | 36 ++++++++++++- vllm/model_executor/models/llama.py | 31 ++--------- 3 files changed, 87 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 57095dff6bb0..f9332c05cfe6 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -43,6 +43,12 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: """Apply the weights to the input tensor.""" raise NotImplementedError + + def maybe_update_loaded_weight_name(self, name: str) -> str: + """Update the name of a loaded weight to enable generic handling of + cases where serialized state_dict does not match vllm model definition. 
+ """ + return name class UnquantizedLinearMethod(LinearMethodBase): @@ -283,6 +289,18 @@ def weight_loader(self, loaded_shard_id: Optional[int] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) + param_shard_splitter = getattr(param, "shard_splitter", None) + if output_dim is not None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support output_dim != None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + if loaded_shard_id is None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support loaded_shard_id == None and " + "shard_splitter != None for a parameter. Please open an issue." + ) + if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: @@ -335,6 +353,10 @@ def weight_loader(self, start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + # If a param_shard_splitter is defined by the LinearMethod, use it. + elif param_shard_splitter is not None: + param_data, loaded_weight = param_shard_splitter( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -342,6 +364,7 @@ def weight_loader(self, "Loading a weight without `output_dim` attribute in " "MergedColumnParallelLinear, assume the weight is " "the same for all partitions.") + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -420,6 +443,18 @@ def weight_loader(self, loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) + param_shard_splitter = getattr(param, "shard_splitter", None) + + if output_dim is not None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support output_dim != None and " + "shard_splitter != None for a parameter. Please open an issue." 
+ ) + if loaded_shard_id is None and param_shard_splitter is not None: + raise NotImplementedError( + "We do not currently support loaded_shard_id == None and " + "shard_splitter != None for a parameter. Please open an issue." + ) if loaded_shard_id is None: # Loaded weight is already packed. @@ -455,6 +490,8 @@ def weight_loader(self, tp_rank = get_tensor_model_parallel_rank() assert loaded_shard_id in ["q", "k", "v"] + + # If output dim is defined, use the default loading process. if output_dim is not None: if loaded_shard_id == "q": shard_offset = 0 @@ -478,15 +515,19 @@ def weight_loader(self, shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) + param_data = param_data.narrow( + output_dim, shard_offset, shard_size) if loaded_shard_id == "q": shard_id = tp_rank else: shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) + shard_size) + # If a param_shard_splitter is defined by the LinearMethod, use it. 
+ elif param_shard_splitter is not None: + param_data, loaded_weight = param_shard_splitter( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -494,7 +535,11 @@ def weight_loader(self, "Loading a weight without `output_dim` attribute in " "QKVParallelLinear, assume the weight is the same " "for all partitions.") - assert param_data.shape == loaded_weight.shape + + assert ( + param_data.shape == loaded_weight.shape or + (len(param_data.shape) == 0 and len(loaded_weight.shape) == 0) + ) param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py index 09f112132d95..9aaa9db45f57 100644 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ b/vllm/model_executor/layers/quantization/smoothquant.py @@ -102,6 +102,15 @@ class SQLinearMethod(LinearMethodBase): def __init__(self,): self.i8cugemm = Int8GEMM().get_i8cugemm() + + def maybe_update_loaded_weight_name(self, name: str) -> str: + # Convert prefix.k_dequant_scale >> prefix.dequant_scale. 
+ if "dequant_scale" in name: + suffix = name.split('.')[-1] + name.replace(suffix, "dequant_scale") + return name + + def create_weights( self, input_size_per_partition: int, @@ -134,6 +143,9 @@ def create_weights( device='cuda' ), requires_grad=False ) + set_weight_attrs(dequant_scale, { + "shard_splitter": self.shard_splitter_scales, + }) return { "weight": weight, @@ -142,6 +154,28 @@ def create_weights( "per_token_quant": per_token_quant, } + @staticmethod + def shard_splitter_scales( + param: Tensor, + loaded_weight: Tensor, + shard_id: Union[str, int] + ) -> Tuple[Tensor, Tensor]: + index_in_param = None + if shard_id == "q": + index_in_param = 0 + elif shard_id == "k": + index_in_param = 1 + elif shard_id == "v": + index_in_param = 2 + elif type(shard_id) == int: + index_in_param = shard_id + else: + raise ValueError( + f"shard_id must be 'q', 'k', 'v' or an int, but got {shard_id}" + ) + + return param[index_in_param], loaded_weight + def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): x_dq = torch.empty_like(x_q, dtype=self.output_dtype) @@ -202,7 +236,7 @@ def apply_weights(self, logical_widths = weights["logical_widths"] per_token_quant = weights["per_token_quant"] - # Q + # Q --> GEMM --> DQ x_q, activation_scale = self._quantize(x, per_token_quant) out_q = self._gemm(x_q, weight) return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1831fbb31b4a..868394c76109 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -388,6 +388,9 @@ def load_weights(self, params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): + # Update name of the loaded_weight if needed by the LinearMethod. 
+ name = self.linear_method.maybe_update_loaded_weight_name(name) + if "rotary_emb.inv_freq" in name: continue if ("rotary_emb.cos_cached" in name @@ -395,34 +398,6 @@ def load_weights(self, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - # load dequant scale for qkv_proj and gate_up_proj - int8_fusion = True - if int8_fusion: - is_fusion_scale = False - if "scale" in name: - for (param_name, weight_name, _) in stacked_params_mapping: - if weight_name not in name: - continue - if "q_proj" in name: - idx = 0 - elif "k_proj" in name: - idx = 1 - elif "v_proj" in name: - idx = 2 - elif "gate_proj" in name: - idx = 0 - else: - assert "up_proj" in name - idx = 1 - - name = name.replace(weight_name, param_name) - suffix = name.split('.')[-1] - param = params_dict[name.replace(suffix, "dequant_scale")][idx] - param.copy_(loaded_weight) - is_fusion_scale = True - break - if is_fusion_scale: - continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From a0480f7b7d0819e543f799360dbd214cf684a67b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 13 Apr 2024 02:14:37 +0000 Subject: [PATCH 11/12] added TODO --- examples/offline_inference.py | 2 +- vllm/model_executor/layers/linear.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 9b758fa2479f..6b548d5e8921 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -11,7 +11,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. -llm = LLM(model="facebook/opt-125m") +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f9332c05cfe6..8f231e0b021c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -65,7 +65,7 @@ def __init__(self, separate_bias_add: bool = False): def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + params_dtype: torch.dtype, logical_widths: Optional[List[int]]) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=params_dtype), @@ -195,6 +195,7 @@ def __init__( output_size=self.output_size, params_dtype=self.params_dtype, logical_widths=logical_widths, + # TODO: remove this, should be coming through the quant config. per_token_quant=False, ) for name, weight in self.linear_weights.items(): @@ -604,7 +605,9 @@ def __init__( output_size=self.output_size, params_dtype=self.params_dtype, logical_widths=[self.output_size], - per_token_quant=True,) + # TODO: remove this, should be coming through the quant config. + per_token_quant=True, + ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) From 1afab71335248dfe23aac8e0d32640b5012d9309 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Tue, 16 Apr 2024 13:32:26 -0400 Subject: [PATCH 12/12] [2/N] Rs/vllm quantization - Refactor refactor to support non-uniform via config (#188) Refactored to support nonuniform quantization by adding a new layer of Abstraction. Now, `SmoothQuantLinearMethod` can hold a `SmoothQuantFormat`, which implements the details of how to do quant and dequant operations. 
There are two `SmoothQuantFormat` classes: - `SmoothQuantDynamicPerToken` - `SmoothQuantStaticPerTensor` We have the following lifecycle: - `LinearMethod` is created during `get_model`, has access to `QuantizationConfig` - `Layer` is initialized and passed a `LinearMethod` - `Layer` calls `LinearMethod.create_weights`, which creates a dictionary of weights and metadata - `Layer` calls `LinearMethod.apply_weights` during inference, passing the dictionary created during `create_weights` This PR modifies the `LinearMethod.create_weights` API to receive a `layer_name` as argument. The `LinearMethod` then looks in the `config` to determine which `SmoothQuantFormat` to use for the layer with `layer_name` - As a result, the `LinearMethod` is responsible for parsing the config from disk and making decisions about what the inference format should look like. In this specific case, since the `SmoothQuantConfig` is not very good, we just match on the suffix `qkv` to determine what each layer should use --> but for SparseMLConfig, we could use a similar structure In this PR, the `SmoothQuantFormat` is passed in the dictionary returned by `create_weights` and then is used by `apply_weights` ### In Summary I think this is a good overall structure because it: - (a) allows us to make minimal changes to the existing models - (b) allows us to make no changes to the model loading lifecycle (i.e. 
config / constructor / linear method) ** critically requires having one LinearMethod that propagates through the whole model - (c) encapsulates the nonuniform logic into the `LinearMethod`, allowing us to have a clean interface into ### For SparseML Models We could imagine the following architecture: #### Config Config is responsible for: - loading config from disk - mapping layer_names --> `SparseMLFormat` ```python class SparseMLConfig def from_dict() def get_layer_format(layer_name): return SparseMLFormat ``` #### LinearMethod Config is responsible for: - interface between layers and kernels (so LinearMethod is what is used by the model) ```python class SparseMLLinearMethod: def __init__(self, sparseml_config) self.sparseml_config = sparseml_config def create_weights(layer_name, ...): # this, e.g. is where nonuniform might be supported format = self.sparseml_config.get_layer_format(layer_name) weights = format.get_weights() weights["format"] = format return weights # wrapper around the SparseML format def apply_weights(x, weights, ...) format = weights["format"] weights = weights["weights"] return format.apply_weights(x, weights) ``` #### SparseMLFormat Format is responsible for: - actual weight creation and forward ```python class SparseMLLinearMethod: def __init__(self, sparseml_config) self.sparseml_config = sparseml_config def get_weights(sizes): # returns dictionary , e.g. return { "weights": x "scales": y } def apply_weights(weights, x): # calls cuda kernel return output ``` Sample Formats: - `W8A8DynamicPerToken` - `SparseW8A8StaticPerTensorAsymmetric` - `W4A8DynamicPerToken` - ... 
--- examples/offline_quantized_inference.py | 2 +- examples/simple_test.py | 35 ++ vllm/config.py | 4 +- vllm/model_executor/layers/linear.py | 76 +++-- .../model_executor/layers/quantization/awq.py | 15 +- .../layers/quantization/base_config.py | 6 +- .../layers/quantization/gptq.py | 7 +- .../layers/quantization/marlin.py | 6 +- .../layers/quantization/smoothquant.py | 242 -------------- .../quantization/smoothquant/__init__.py | 14 + .../layers/quantization/smoothquant/config.py | 306 ++++++++++++++++++ .../quantization/smoothquant/formats.py | 100 ++++++ .../layers/quantization/squeezellm.py | 15 +- vllm/model_executor/model_loader.py | 8 +- vllm/model_executor/models/llama.py | 45 ++- 15 files changed, 571 insertions(+), 310 deletions(-) create mode 100644 examples/simple_test.py delete mode 100644 vllm/model_executor/layers/quantization/smoothquant.py create mode 100644 vllm/model_executor/layers/quantization/smoothquant/__init__.py create mode 100644 vllm/model_executor/layers/quantization/smoothquant/config.py create mode 100644 vllm/model_executor/layers/quantization/smoothquant/formats.py diff --git a/examples/offline_quantized_inference.py b/examples/offline_quantized_inference.py index 124a99468704..8b3dbea72ae6 100644 --- a/examples/offline_quantized_inference.py +++ b/examples/offline_quantized_inference.py @@ -17,7 +17,7 @@ # Create an LLM. 
llm = LLM( - model=model_path, + model="nm-testing/Nous-Hermes-Llama2-13b-smoothquant", gpu_memory_utilization=0.9, max_model_len=2048, quantization="smoothquant", diff --git a/examples/simple_test.py b/examples/simple_test.py new file mode 100644 index 000000000000..dcf8b8c7ed1e --- /dev/null +++ b/examples/simple_test.py @@ -0,0 +1,35 @@ +import argparse +from vllm import LLM, SamplingParams + +MODELS = { + "tinyllama-fp16": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "tinyllama-marlin": "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", + "tinyllama-gptq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", + "tinyllama-awq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ", +} + +parser = argparse.ArgumentParser() +parser.add_argument("--model", type=str) +parser.add_argument("--tensor-parallel-size", type=int, default=1) +args = parser.parse_args() + +if args.model not in MODELS: + print(f"Got model id of {args.model}; Must be in {list(MODELS.keys())}") + raise ValueError +else: + model_id = MODELS[args.model] + print(f"Using model_id = {model_id}") + +messages=[{ + "role": "system", + "content": "You are a helpful assistant." +}, { + "role": "user", + "content": "What is deep learning?" 
+}] + +model = LLM(model_id, enforce_eager=True, max_model_len=2048, tensor_parallel_size=args.tensor_parallel_size, dtype="float16") +prompt = model.llm_engine.tokenizer.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +out = model.generate(prompt, SamplingParams(max_tokens=50)) +print(f"\n-----prompt\n{prompt}") +print(f"\n-----generation\n{out[0].outputs[0].text}") diff --git a/vllm/config.py b/vllm/config.py index 3149aaf68914..cd48fe4f1b9d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -173,8 +173,8 @@ def _verify_tokenizer_mode(self) -> None: self.tokenizer_mode = tokenizer_mode def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm", "smoothquant"] - rocm_not_supported_quantization = ["awq", "marlin"] + supported_quantization = ["awq", "gptq", "marlin", "squeezellm", "smoothquant"] + rocm_not_supported_quantization = ["awq", "marlin", "smoothquant"] if self.quantization is not None: self.quantization = self.quantization.lower() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8f231e0b021c..2598156bbed3 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -29,8 +29,11 @@ class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @abstractmethod - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, + def create_weights(self, + layer_name: str, + input_size_per_partition: int, + output_sizes_per_partition: List[int], + input_size: int, output_size: int, params_dtype: torch.dtype) -> Dict[str, Any]: """Create weights for a linear layer.""" @@ -62,17 +65,20 @@ class UnquantizedLinearMethod(LinearMethodBase): def __init__(self, separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, 
- output_size: int, - params_dtype: torch.dtype, logical_widths: Optional[List[int]]) -> Dict[str, Any]: - weight = Parameter(torch.empty(output_size_per_partition, + def create_weights(self, + layer_name: str, + input_size_per_partition: int, + output_sizes_per_partition: List[int], + input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + weight = Parameter(torch.empty(sum(output_sizes_per_partition), input_size_per_partition, dtype=params_dtype), requires_grad=False) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) return {"weight": weight} + def apply_weights(self, weights: Dict[str, torch.Tensor], x: torch.Tensor, @@ -89,6 +95,7 @@ class ReplicatedLinear(torch.nn.Module): """Replicated linear layer. Args: + layer_name: name of the layer in the state dict. input_size: input dimension of the linear layer. output_size: output dimension of the linear layer. bias: If true, add bias. @@ -99,6 +106,7 @@ class ReplicatedLinear(torch.nn.Module): def __init__( self, + layer_name: str, input_size: int, output_size: int, bias: bool = True, @@ -109,6 +117,7 @@ def __init__( super().__init__() # Keep input parameters + self.layer_name = layer_name self.input_size = input_size self.output_size = output_size self.skip_bias_add = skip_bias_add @@ -119,8 +128,8 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype) + self.layer_name, self.input_size, [self.output_size], + self.input_size, self.output_size, self.params_dtype) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): self.register_parameter(name, weight) @@ -145,6 +154,7 @@ class ColumnParallelLinear(torch.nn.Module): its second dimension as A = [A_1, ..., A_p]. Args: + layer_name: name of the layer in the state dict. input_size: first dimension of matrix A. 
output_size: second dimension of matrix A. bias: If true, add bias. @@ -163,6 +173,7 @@ class ColumnParallelLinear(torch.nn.Module): def __init__( self, + layer_name: str, input_size: int, output_size: int, bias: bool = True, @@ -170,17 +181,24 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, linear_method: Optional[LinearMethodBase] = None, - logical_widths: Optional[List[int]] = None, ): super().__init__() # Keep input parameters + self.layer_name = layer_name self.input_size = input_size self.output_size = output_size self.gather_output = gather_output # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() - self.output_size_per_partition = divide(output_size, tp_size) + self.output_size_per_partition = divide(self.output_size, tp_size) + self.output_sizes_per_partition = [self.output_size_per_partition] + # If QKV or MergedColumn, use output size of each partition. + if self.output_sizes is not None: + self.output_sizes_per_partition = [ + divide(output_size, tp_size) for output_size in self.output_sizes + ] + self.skip_bias_add = skip_bias_add if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -189,14 +207,12 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( + layer_name=self.layer_name, input_size_per_partition=self.input_size, - output_size_per_partition=self.output_size_per_partition, + output_sizes_per_partition=self.output_sizes_per_partition, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - logical_widths=logical_widths, - # TODO: remove this, should be coming through the quant config. 
- per_token_quant=False, ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): @@ -263,6 +279,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): def __init__( self, + layer_name: str, input_size: int, output_sizes: List[int], bias: bool = True, @@ -275,14 +292,14 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) super().__init__( + layer_name=layer_name, input_size=input_size, output_size=sum(output_sizes), bias=bias, gather_output=gather_output, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - linear_method=linear_method, - logical_widths=output_sizes) + linear_method=linear_method) def weight_loader(self, param: Parameter, @@ -381,6 +398,7 @@ class QKVParallelLinear(ColumnParallelLinear): be replicated while the query heads are partitioned. Args: + layer_name: name of the layer in the state dict. hidden_size: input hidden state size of the transformer. head_size: size of each attention head. total_num_heads: total number of attention query heads. 
@@ -396,6 +414,7 @@ class QKVParallelLinear(ColumnParallelLinear): def __init__( self, + layer_name: str, hidden_size: int, head_size: int, total_num_heads: int, @@ -424,19 +443,21 @@ def __init__( input_size = self.hidden_size output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + self.output_sizes = [ + self.num_heads * self.head_size * tp_size, # q_proj + self.num_kv_heads * self.head_size * tp_size, # k_proj + self.num_kv_heads * self.head_size * tp_size, # v_proj + ] + super().__init__( + layer_name=layer_name, input_size=input_size, output_size=output_size, bias=bias, gather_output=False, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - linear_method=linear_method, - logical_widths = [ - self.num_heads * self.head_size, # q_proj - self.total_num_kv_heads * self.head_size, # k_proj - self.total_num_kv_heads * self.head_size, # v_proj - ]) + linear_method=linear_method) def weight_loader(self, param: Parameter, @@ -557,6 +578,7 @@ class RowParallelLinear(torch.nn.Module): | A_p | - - Arguments: + layer_name: name of the layer in the state dict. input_size: first dimension of matrix A. output_size: second dimension of matrix A. bias: If true, add bias. Note that bias is not parallelized. 
@@ -572,6 +594,7 @@ class RowParallelLinear(torch.nn.Module): def __init__( self, + layer_name: str, input_size: int, output_size: int, bias: bool = True, @@ -583,6 +606,7 @@ def __init__( ): super().__init__() # Keep input parameters + self.layer_name = layer_name self.input_size = input_size self.output_size = output_size self.input_is_parallel = input_is_parallel @@ -599,14 +623,12 @@ def __init__( linear_method = UnquantizedLinearMethod() self.linear_method = linear_method self.linear_weights = self.linear_method.create_weights( + layer_name=self.layer_name, input_size_per_partition=self.input_size_per_partition, - output_size_per_partition=self.output_size, + output_sizes_per_partition=[self.output_size], input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - logical_widths=[self.output_size], - # TODO: remove this, should be coming through the quant config. - per_token_quant=True, ) for name, weight in self.linear_weights.items(): if isinstance(weight, torch.Tensor): diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 2caef5f1ebf5..7cf94ae9f44e 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -79,10 +79,17 @@ class AWQLinearMethod(LinearMethodBase): def __init__(self, quant_config: AWQConfig): self.quant_config = quant_config - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: + def create_weights( + self, + layer_name: str, + input_size_per_partition: int, + output_sizes_per_partition: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + del layer_name, input_size, output_size # Unused. 
+ output_size_per_partition = sum(output_sizes_per_partition) + if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 6115e7c3be95..868e09252bb2 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -51,14 +51,12 @@ def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: "quantization config.") @abstractmethod - def get_linear_method(self) -> LinearMethodBase: + def get_linear_method(self, name) -> LinearMethodBase: """Get the linear method to use for the quantized linear layer.""" raise NotImplementedError - @abstractmethod def get_scaled_act_names(self) -> List[str]: """Returns the activation function names that should be post-scaled. - For now, this is only used by AWQ. """ - raise NotImplementedError + return [] diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 53baf710ed81..8c3492ae67d8 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -89,13 +89,16 @@ def __init__(self, quant_config: GPTQConfig): def create_weights( self, + layer_name: str, input_size_per_partition: int, - output_size_per_partition: int, + output_sizes_per_partition: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: - del output_size # Unused. + del output_size, layer_name # Unused. 
+ output_size_per_partition = sum(output_sizes_per_partition) + if input_size_per_partition % self.quant_config.group_size != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 784229878edf..59d217567919 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -91,13 +91,15 @@ def __init__(self, quant_config: MarlinConfig): def create_weights( self, + layer_name: str, input_size_per_partition: int, - output_size_per_partition: int, + output_sizes_per_partition: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: - del output_size # Unused. + del layer_name, input_size, output_size # Unused. + output_size_per_partition = sum(output_sizes_per_partition) if params_dtype != torch.float16: raise ValueError( diff --git a/vllm/model_executor/layers/quantization/smoothquant.py b/vllm/model_executor/layers/quantization/smoothquant.py deleted file mode 100644 index 9aaa9db45f57..000000000000 --- a/vllm/model_executor/layers/quantization/smoothquant.py +++ /dev/null @@ -1,242 +0,0 @@ -from typing import Any, Dict, List, Tuple, Optional, Union - -import torch -from torch._tensor import Tensor -from torch.nn.parameter import Parameter -from torch.nn import ParameterList -import threading - -from vllm._C import ops -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig - - -class SmoothQuantConfig(QuantizationConfig): - """Config class for SmoothQuant - - Reference: https://github.com/mit-han-lab/smoothquant - """ - - def __init__(self, - weight_bits: int = 8, - quant_map: dict[str:str] = None) -> None: - self.weight_bits = weight_bits - self.quant_map = quant_map - - if self.weight_bits != 8: - raise ValueError( - 
"Currently, only w8a8 quantization is supported for " - f"SmoothQuant, but got {self.weight_bits} bits.") - if self.quant_map is None or self.quant_map == {}: - raise ValueError( - 'Quant_map for SmoothQuant should not be None or an empty dict. ' - 'For example, when using llama, you should set a quant_config.json in model directory, like ' - '{ "qkv": "per-tensor", "out": "per-token", "fc1": "per-tensor", "fc2": "per-token" }' - ) - - def __repr__(self) -> str: - return (f"SmoothQuantConfig(weight_bits={self.weight_bits}, " - f"quant_map={self.quant_map})") - - def get_name(self) -> str: - return "smoothquant" - - def get_supported_act_dtypes(self) -> List[torch.dtype]: - return [torch.half, torch.float] - - def get_min_capability(self) -> int: - # The smoothquant kernel only supports Ampere or newer GPUs. - return 80 - - @classmethod - def get_config_filenames(cls) -> List[str]: - """List of filenames to search for in the model directory.""" - return [ - "quant_config.json", - "quantize_config.json", - ] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": - try: - weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) - except ValueError as e: - weight_bits = 8 - print(str(e) + " Set weight_bits = 8 by default.") - - quant_map = {} - for key, value in config.items(): - if value in ["per-tensor", "per-token"]: - quant_map[key] = value - return cls(weight_bits, quant_map) - - def get_linear_method(self) -> "SQLinearMethod": - return SQLinearMethod() - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class Int8GEMM(object): - _instance_lock = threading.Lock() - - def __init__(self): - if not hasattr(self, "i8cugemm"): - self.i8cugemm = ops.I8CUGEMM() - - def __new__(cls, *args, **kwargs): - if not hasattr(Int8GEMM, "_instance"): - with Int8GEMM._instance_lock: - if not hasattr(Int8GEMM, "_instance"): - Int8GEMM._instance = object.__new__(cls) - return Int8GEMM._instance - - def get_i8cugemm(self): - return 
self.i8cugemm - -class SQLinearMethod(LinearMethodBase): - """Linear method for AutoSmoothQuant. - """ - - def __init__(self,): - self.i8cugemm = Int8GEMM().get_i8cugemm() - - - def maybe_update_loaded_weight_name(self, name: str) -> str: - # Convert prefix.k_dequant_scale >> prefix.dequant_scale. - if "dequant_scale" in name: - suffix = name.split('.')[-1] - name.replace(suffix, "dequant_scale") - return name - - - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - logical_widths: Optional[List[int]] = None, - per_token_quant: bool = False, - ) -> Dict[str, Tensor]: - self.output_dtype = params_dtype - - weight = Parameter( - torch.empty( - output_size_per_partition, - input_size_per_partition, - device="cuda", - dtype=torch.int8, - ), requires_grad=False, - ) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - }) - - dequant_scale = Parameter( - torch.tensor( - [1.0] * len(logical_widths), - dtype=params_dtype, - device='cuda' - ), requires_grad=False - ) - set_weight_attrs(dequant_scale, { - "shard_splitter": self.shard_splitter_scales, - }) - - return { - "weight": weight, - "dequant_scale": dequant_scale, - "logical_widths": logical_widths, - "per_token_quant": per_token_quant, - } - - @staticmethod - def shard_splitter_scales( - param: Tensor, - loaded_weight: Tensor, - shard_id: Union[str, int] - ) -> Tuple[Tensor, Tensor]: - index_in_param = None - if shard_id == "q": - index_in_param = 0 - elif shard_id == "k": - index_in_param = 1 - elif shard_id == "v": - index_in_param = 2 - elif type(shard_id) == int: - index_in_param = shard_id - else: - raise ValueError( - f"shard_id must be 'q', 'k', 'v' or an int, but got {shard_id}" - ) - - return param[index_in_param], loaded_weight - - - def _dequantize(self, x_q, weight_scales, activation_scales, logical_widths): - x_dq = torch.empty_like(x_q, dtype=self.output_dtype) - - # Split 
into shards. - x_q_split = x_q.split(logical_widths, dim=-1) - x_dq_split = x_dq.split(logical_widths, dim=-1) - - # If QuantType is Static per Tensor: - if activation_scales is None: - for xdq, xq, weight_scale in zip(x_dq_split, x_q_split, weight_scales): - ops.dequant(xdq, xq, weight_scale) - - # If QuantType is Dynamic per Token: - else: - for xdq, xq, weight_scale, activation_scale in zip( - x_dq_split, x_q_split, weight_scales, activation_scales): - ops.dequant(xdq, xq, activation_scale, weight_scale) - - # Return dequantized activation. - return x_dq - - - def _quantize(self, x, per_token_quant: bool): - x_q = torch.empty_like(x, dtype=torch.int8) - - # Compute activation scale if per token. - if per_token_quant: - activation_scale = torch.empty( - x.numel() // x.shape[-1], - dtype=torch.float32, - device=x.device) - ops.quant(x_q, x, activation_scale) - # Set activation scale if per tensor. TODO: why 1.0? << static? - else: - activation_scale = None - ops.quant(x_q, x, 1.0) - - return x_q, activation_scale - - def _gemm(self, x_q, weight): - x_q = x_q.view(-1, x_q.shape[-1]) - out_q = torch.empty( - (x_q.shape[0], weight.shape[0]), - dtype=torch.int32, device="cuda") - - self.i8cugemm.linear_a8_w8_o32_(x_q, weight, out_q) - return out_q.view(*x_q.shape[:-1], -1) - - - def apply_weights(self, - weights: Dict[str, Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> Tensor: - assert bias is None - weight = weights["weight"] - dequant_scale = weights["dequant_scale"] - logical_widths = weights["logical_widths"] - per_token_quant = weights["per_token_quant"] - - # Q --> GEMM --> DQ - x_q, activation_scale = self._quantize(x, per_token_quant) - out_q = self._gemm(x_q, weight) - return self._dequantize(out_q, dequant_scale, activation_scale, logical_widths) diff --git a/vllm/model_executor/layers/quantization/smoothquant/__init__.py b/vllm/model_executor/layers/quantization/smoothquant/__init__.py new file mode 100644 index 
000000000000..2f62cee49d95 --- /dev/null +++ b/vllm/model_executor/layers/quantization/smoothquant/__init__.py @@ -0,0 +1,14 @@ +from vllm.model_executor.layers.quantization.smoothquant.formats import ( + SmoothQuantFormat +) + +from vllm.model_executor.layers.quantization.smoothquant.config import ( + SmoothQuantConfig, + SmoothQuantLinearMethod +) + +__all__ = [ + "SmoothQuantFormat", + "SmoothQuantConfig", + "SmoothQuantLinearMethod", +] diff --git a/vllm/model_executor/layers/quantization/smoothquant/config.py b/vllm/model_executor/layers/quantization/smoothquant/config.py new file mode 100644 index 000000000000..885ffce3e36d --- /dev/null +++ b/vllm/model_executor/layers/quantization/smoothquant/config.py @@ -0,0 +1,306 @@ +from typing import Any, Dict, List, Tuple, Type, Optional, Union +import threading + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import ( + LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.quantization.smoothquant.formats import ( + SmoothQuantFormat, + SmoothQuantDynamicPerToken, + SmoothQuantStaticPerTensor, +) + +LAYER_KEYS = ["qkv", "out", "fc1", "fc2"] +FORMAT_REGISTRY = { + "per-token": SmoothQuantDynamicPerToken, + "per-tensor": SmoothQuantStaticPerTensor, +} + +def get_sq_format_cls(format_key: str) -> Type[SmoothQuantFormat]: + if format_key not in FORMAT_REGISTRY: + raise ValueError(f"Invalid smoothquant format: {format_key}") + return FORMAT_REGISTRY[format_key] + +class SmoothQuantConfig(QuantizationConfig): + """Config class for SmoothQuant. 
+ + Reference: https://github.com/mit-han-lab/smoothquant + """ + def __init__(self, + layer_format_map: Dict[str, str]) -> None: + self.layer_format_map = layer_format_map + + for key, format in self.layer_format_map.items(): + if key not in LAYER_KEYS: + raise ValueError( + f"Found key of {key} in {self.layer_format_map}, " + f"but key must be one of {LAYER_KEYS}" + ) + if format not in FORMAT_REGISTRY: + raise ValueError( + f"Found format of {format} in {self.layer_format_map}, " + f"but format must be one of {FORMAT_REGISTRY}" + ) + for key in LAYER_KEYS: + if key not in self.layer_format_map: + raise ValueError( + f"Could not find {key} in {layer_format_map}" + ) + + def __repr__(self) -> str: + return (f"SmoothQuantConfig(layer_format_map={self.layer_format_map})") + + def get_name(self) -> str: + return "smoothquant" + + def get_supported_act_dtypes(self) -> List[torch.dtype]: + # TODO: check if we support fp16 / bf16 as well. + return [torch.float] + + def get_min_capability(self) -> int: + # TODO: check if this is right. + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + """List of filenames to search for in the model directory.""" + return [ + "quant_config.json", + ] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": + layer_format_map: Dict[str, str] = {} + for layer_key, format in config.items(): + if format in FORMAT_REGISTRY: + layer_format_map[layer_key] = format + return cls(layer_format_map) + + def get_linear_method(self) -> "SmoothQuantLinearMethod": + return SmoothQuantLinearMethod(self) + + +# TODO: why is this needed? 
+class Int8GEMM(object):
+    _instance_lock = threading.Lock()
+
+    def __init__(self):
+        if not hasattr(self, "i8cugemm"):
+            self.i8cugemm = ops.I8CUGEMM()
+
+    def __new__(cls, *args, **kwargs):
+        if not hasattr(Int8GEMM, "_instance"):
+            with Int8GEMM._instance_lock:
+                if not hasattr(Int8GEMM, "_instance"):
+                    Int8GEMM._instance = object.__new__(cls)
+        return Int8GEMM._instance
+
+    def get_i8cugemm(self):
+        return self.i8cugemm
+
+
+class SmoothQuantLinearMethod(LinearMethodBase):
+    def __init__(self, sq_config: SmoothQuantConfig) -> None:
+        self.sq_config = sq_config
+        self.sq_type = None
+        self.i8cugemm = Int8GEMM().get_i8cugemm()
+
+    def maybe_update_loaded_weight_name(self,
+                                        name: str) -> str:
+        """Convert serialized name k_dequant_scale to dequant_scale.
+
+        This function is called by model_cls.load_weights() during the weight
+        loading process to match on disk state dict to vllm state dict.
+        """
+        if "dequant_scale" in name:
+            suffix = name.split('.')[-1]
+            name.replace(suffix, "dequant_scale")
+        return name
+
+    def scales_shard_splitter(self,
+                              param: torch.Tensor,
+                              loaded_weight: torch.Tensor,
+                              shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Index into param for loading.
+
+        This function is called by QKVColumnLinear and MergedColumnParallelLinear
+        during weight loading to put the scales from disk in the right spot.
+        """
+        if type(shard_id) == str:
+            qkv_idxs = { "q": 0, "k": 1, "v": 2 }
+            if shard_id not in qkv_idxs:
+                raise ValueError(f"Invalid shard_id {shard_id}")
+            shard_id = qkv_idxs[shard_id]
+        elif type(shard_id) != int:
+            raise ValueError(f"Invalid shard id {shard_id}")
+
+        return param[shard_id], loaded_weight
+
+    def get_layer_format(self, layer_name: str) -> SmoothQuantFormat:
+        """
+        Gets the SmoothQuantFormat for a specific layer.
+
+        SmoothQuantLinearMethod uses SmoothQuantType to support non-uniform quantization
+        (where each layer has a different format). 
To determine the SmoothQuantFormat
+        for a layer, we match the layer_name to the layer_keys=["qkv","out","fc1","fc2"]
+        and use layer_format_map to determine the SQFormat.
+
+        Args:
+            layer_name: Name of the layer we are creating the LinearMethod for.
+        Returns:
+            sq_linear_method: SmoothQuantLinearMethod with the right SQFormat.
+        """
+        # Note: AutoSmoothQuant Serialization is not very good yet.
+        #
+        # It looks like the following (which does not map to layer names in the model):
+        # {
+        #     "qkv": "per-tensor",
+        #     "out": "per-token",
+        #     "fc1": "per-tensor",
+        #     "fc2": "per-token"
+        # }
+        #
+        # So, this is a hack for llama now. But with the SparseMLConfig, we can make robust,
+        # where we actually use the layer_name in the model to look up what the format is
+        # based on the config.
+        #
+        # What it would actually look like:
+        #   layer_config is None
+        #   for supported_key in SUPPORTED_LAYER_KEYS:
+        #       if supported_key in layer_name:
+        #           sq_format = self.layer_mapping[lookup_key]
+        #           return get_sq_format_cls(sq_format)()
+
+        HACKED_REMAP_FOR_LLAMA = {
+            "qkv": "qkv",
+            "o_proj": "out",
+            "gate_up": "fc1",
+            "down": "fc2",
+        }
+
+        for match_key, lookup_key in HACKED_REMAP_FOR_LLAMA.items():
+            if match_key in layer_name:
+                sq_format = self.sq_config.layer_format_map[lookup_key]
+                return get_sq_format_cls(sq_format)()
+
+        raise ValueError
+
+    def create_weights(self,
+                       layer_name: str,
+                       input_size_per_partition: int,
+                       output_sizes_per_partition: int,
+                       input_size: int,
+                       output_size: int,
+                       params_dtype: torch.dtype) -> Dict[str, torch.Tensor]:
+        del input_size, output_size
+
+        # Statically Quantized Weights.
+        weight = Parameter(
+            torch.empty(
+                sum(output_sizes_per_partition),
+                input_size_per_partition,
+                device="cuda", dtype=torch.int8,
+            ), requires_grad=False,
+        )
+        set_weight_attrs(weight, {
+            "input_dim": 1,
+            "output_dim": 0,
+        })
+
+        # Static scale for each logical weight (e.g. 3 for QKV). 
+ dequant_scale = Parameter( + torch.empty( + len(output_sizes_per_partition), + device='cuda', dtype=params_dtype, + ), requires_grad=False + ) + set_weight_attrs(dequant_scale, { + "shard_splitter": self.scales_shard_splitter, + }) + + return { + "weight": weight, + "dequant_scale": dequant_scale, + "logical_widths": output_sizes_per_partition, + "sq_format": self.get_layer_format(layer_name) + } + + def _quantize(self, + x: torch.Tensor, + sq_format: SmoothQuantFormat) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Quantize activations. + + Args: + x: Activation at floating point precision. + Returns: + x_q: Quantized activation at INT8 + activation_scales: Optional dynamic scales for each token. + """ + x_q = torch.empty_like(x, dtype=torch.int8) + x_q, activation_scales = sq_format.quantize_op(x, x_q) + return x_q, activation_scales + + def _dequantize(self, + x_q: torch.Tensor, + dynamic_scales: Optional[torch.Tensor], + static_scales: torch.Tensor, + logical_widths: List[int], + dtype: torch.dtype, + sq_format: SmoothQuantFormat) -> torch.Tensor: + """Dequantize activations. + + Args: + x_q: quantized activations. + dynamic_scales: Optional dynamic scales. + static_scales: Static dequantization scales. + logical_widths: Width of each logical activation (for QKV case). + dtype: Datatype to dequantize to. + Returns: + x_dq: dequantized activation at output_dtype precision + """ + # Split X_q and X_dq buffer into logical activations (for QKV case). + x_q_split = x_q.split(logical_widths, dim=-1) + x_dq = torch.empty_like(x_q, dtype=dtype) + x_dq_split = x_dq.split(logical_widths, dim=-1) + # Dequantize in place and return. + sq_format.dequantize_op(x_q_split, x_dq_split, dynamic_scales, static_scales) + return x_dq + + + def apply_weights(self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """Forward method. Computes Q --> GEMM --> DQ. 
+
+        Args:
+            weights: Dictionary of weights, scales, and metadata.
+            x: Input in floating point precision.
+            bias: Optional bias.
+        Returns:
+            a_dq: Dequantized activation at floating point precision.
+        """
+        if bias is not None:
+            raise NotImplementedError
+        weight_q = weights["weight"]
+        static_scales = weights["dequant_scale"]
+        logical_widths = weights["logical_widths"]
+        sq_format = weights["sq_format"]
+
+        # Q
+        x_q, activation_scales = self._quantize(x, sq_format)
+
+        # GEMM
+        x_q = x_q.view(-1, x_q.shape[-1])
+        a_q = torch.empty((x_q.shape[0], weight_q.shape[0]), dtype=torch.int32, device="cuda")
+        self.i8cugemm.linear_a8_w8_o32_(x_q, weight_q, a_q)
+        a_q = a_q.view(*x_q.shape[:-1], -1)
+
+        # DQ
+        return self._dequantize(a_q, activation_scales, static_scales, logical_widths, x.dtype, sq_format)
diff --git a/vllm/model_executor/layers/quantization/smoothquant/formats.py b/vllm/model_executor/layers/quantization/smoothquant/formats.py
new file mode 100644
index 000000000000..b8ddd642c888
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/smoothquant/formats.py
@@ -0,0 +1,100 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional, Tuple, Type
+
+import torch
+
+from vllm._C import ops
+
+
+class SmoothQuantFormat(ABC):
+    @abstractmethod
+    def dequantize_op(self,
+                      x_qs: List[torch.Tensor],
+                      x_dqs: List[torch.Tensor],
+                      dynamic_scales: Optional[torch.Tensor],
+                      static_scales: torch.Tensor) -> None:
+        """Dequantize the activations. x_dq is updated in place.
+
+        Args:
+            x_qs: List of N quantized activations.
+            x_dqs: List of N buffers to fill with dequantized values.
+            dynamic_scales: Optional dynamic scales for dequantization.
+            static_scales: Static scales for dequantization. N values.
+        """
+        raise NotImplementedError
+
+
+    @abstractmethod
+    def quantize_op(self,
+                    x: torch.Tensor,
+                    x_q: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Quantize the input and (optionally) compute dequant scales. 
+
+        Args:
+            x: Input data in floating point format.
+            x_q: Buffer for quantized inputs.
+        Returns:
+            x_q: Quantized input.
+            activation_scales: Optional dynamic scales for the activations.
+        """
+        raise NotImplementedError
+
+
+class SmoothQuantDynamicPerToken(SmoothQuantFormat):
+    def dequantize_op(self,
+                      x_qs: List[torch.Tensor],
+                      x_dqs: List[torch.Tensor],
+                      dynamic_scales: Optional[torch.Tensor],
+                      static_scales: torch.Tensor) -> None:
+        """Notes:
+            dynamic_scales: N scales for N tokens in the activation.
+            static_scales: K scales for K logical activations (equals just w_scale).
+        """
+        if dynamic_scales is None:
+            raise ValueError
+
+        # Dequantize each logical activation.
+        # TODO: test this for case when logical_widths > 1 (may need to reshape)
+        for x_dq, x_q, dynamic_scale, static_scale in zip(
+            x_dqs, x_qs, dynamic_scales, static_scales):
+
+            # Dequantize (updates x_dq in place).
+            ops.dequant(x_dq, x_q, dynamic_scale, static_scale)
+
+
+    def quantize_op(self,
+                    x: torch.Tensor,
+                    x_q: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Notes:
+            Returns quantized activation and dynamic activation scales.
+        """
+        activation_scales = torch.empty(x.numel() // x.shape[-1], dtype=x.dtype, device=x.device)
+        ops.quant(x_q, x, activation_scales)
+        return x_q, activation_scales
+
+
+class SmoothQuantStaticPerTensor(SmoothQuantFormat):
+    def dequantize_op(self,
+                      x_qs: List[torch.Tensor],
+                      x_dqs: List[torch.Tensor],
+                      dynamic_scales: Optional[torch.Tensor],
+                      static_scales: torch.Tensor) -> None:
+        """Notes:
+            dynamic_scales: None
+            static_scales: K scales for K logical activations (equals w_scale * a_scale).
+        """
+        if dynamic_scales is not None:
+            raise ValueError
+
+        # Dequantize each logical activation. 
+        for xdq, xq, static_scale in zip(x_dqs, x_qs, static_scales):
+            ops.dequant(xdq, xq, static_scale)
+
+    def quantize_op(self,
+                    x: torch.Tensor,
+                    x_q: torch.Tensor) -> Tuple[torch.Tensor, None]:
+        """Notes:
+            Returns quantized activation and no dynamic scales.
+        """
+        ops.quant(x_q, x, 1.0)
+        return x_q, None
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index ed25455e6ec1..893e6781089d 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -68,10 +68,17 @@ class SqueezeLLMLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: SqueezeLLMConfig):
         self.quant_config = quant_config
 
-    def create_weights(self, input_size_per_partition: int,
-                       output_size_per_partition: int, input_size: int,
-                       output_size: int,
-                       params_dtype: torch.dtype) -> Dict[str, Any]:
+    def create_weights(
+        self,
+        layer_name: str,
+        input_size_per_partition: int,
+        output_sizes_per_partition: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype) -> Dict[str, Any]:
+        del layer_name, input_size  # Unused. 
+ output_size_per_partition = sum(output_sizes_per_partition) + if input_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( "The input size is not aligned with the quantized " diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 07fab1f1ed6f..de7910c4860b 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -46,10 +46,6 @@ def _get_model_architecture( def get_architecture_class_name(model_config: ModelConfig) -> str: return _get_model_architecture(model_config)[1] -def _is_support_smoothquant(model_config: ModelConfig) -> bool: - architectures = getattr(model_config.hf_config, "architectures", []) - supported_archs = ModelRegistry.get_supported_smoothquant_archs() - return any(arch in supported_archs for arch in architectures) def get_model(model_config: ModelConfig, device_config: DeviceConfig, **kwargs) -> nn.Module: @@ -82,9 +78,7 @@ def get_model(model_config: ModelConfig, device_config: DeviceConfig, # Create a model instance. # The weights will be initialized as empty tensors. 
with torch.device(device_config.device): - if _is_support_smoothquant(model_config): - model = model_class(model_config.hf_config, linear_method) - elif hasattr(model_class, "supported_lora_modules"): + if hasattr(model_class, "supported_lora_modules"): model = model_class(model_config.hf_config, linear_method, lora_config) else: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 868394c76109..0b6c75705764 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,17 +57,22 @@ def __init__( hidden_size: int, intermediate_size: int, hidden_act: str, + parent_name: str, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + layer_name=f"{parent_name}.gate_up_proj", + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear( + layer_name=f"{parent_name}.down_proj", + input_size=intermediate_size, + output_size=hidden_size, bias=False, linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -87,6 +92,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + parent_name: str, rope_theta: float = 10000, rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, @@ -127,16 +133,18 @@ def __init__( self.kv_scale = 1.0 self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, + layer_name=f"{parent_name}.qkv_proj", + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, bias=bias, linear_method=linear_method, ) self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, + layer_name=f"{parent_name}.o_proj", + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, bias=bias, linear_method=linear_method, ) @@ -175,6 +183,7 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, config: LlamaConfig, + parent_name: str, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() @@ -192,6 +201,7 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, + parent_name=f"{parent_name}.self_attn", linear_method=linear_method, bias=getattr(config, "bias", False), sliding_window=sliding_window, @@ -200,6 +210,7 @@ def __init__( hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, + parent_name=f"{parent_name}.mlp", linear_method=linear_method, ) self.input_layernorm = RMSNorm(config.hidden_size, @@ -242,7 +253,7 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, + lora_config: Optional[LoRAConfig] = None ) -> None: super().__init__() self.config = config @@ -257,8 +268,10 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - 
LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) + LlamaDecoderLayer(config, + parent_name=f"model.layers.{idx}", + linear_method=linear_method) + for idx in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -386,10 +399,12 @@ def load_weights(self, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): # Update name of the loaded_weight if needed by the LinearMethod. - name = self.linear_method.maybe_update_loaded_weight_name(name) + if self.linear_method: + name = self.linear_method.maybe_update_loaded_weight_name(name) if "rotary_emb.inv_freq" in name: continue