vllm-project · Tostino · Oct 9, 2023 · Oct 9, 2023 · Oct 9, 2023 · Oct 9, 2023
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
@@ -70,7 +70,7 @@ def run_to_completion(profile: bool = False):
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', None],
+                        choices=['awq', 'squeezellm', None],
                         default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
@@ -201,7 +201,7 @@ def main(args: argparse.Namespace):
     parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', None],
+                        choices=['awq', 'squeezellm', None],
                         default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",

diff --git a/csrc/quantization.cpp b/csrc/quantization.cpp
@@ -7,9 +7,13 @@ torch::Tensor awq_gemm(
   torch::Tensor _zeros,
   int split_k_iters);
 
+void squeezellm_gemm(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor lookup_table);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "awq_gemm",
-    &awq_gemm,
-    "Quantized GEMM for AWQ");
+  m.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
+  m.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
 }
diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -0,0 +1,148 @@
+#include <torch/all.h>
+#include <torch/python.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+// half-tensor
+#include <c10/cuda/CUDAStream.h>
+#include <ATen/cuda/CUDATensorMethods.cuh>
+
+#define BLOCKWIDTH 128
+#define BLOCKHEIGHT4 16
+
+namespace vllm {
+namespace squeezellm {
+
+__device__ inline unsigned int as_unsigned(int i) {
+  return *reinterpret_cast<unsigned int*>(&i);
+}
+
+// 4-bit matvec kernel (LUT-based)
+__global__ void NUQ4MatMulKernel(
+    const  half2* __restrict__ vec,
+    const    int* __restrict__ mat,
+           half2* __restrict__ mul,
+    const  __half* __restrict__ lookup_table,
+    int height,
+    int width,
+    int batch,
+    int vec_height
+) {
+
+  const int blockwidth2 = BLOCKWIDTH / 2;
+
+  int row = BLOCKHEIGHT4 * blockIdx.x;
+  int col =  BLOCKWIDTH * blockIdx.y + threadIdx.x;
+
+  __shared__ half2 blockvec[blockwidth2];
+
+  __shared__ __half deq2[16][BLOCKWIDTH];
+  int off = threadIdx.x;
+  int column_offset = col * 16;
+  for (int val = 0; val < 16; val += 1) {
+    int lut_index = column_offset + val;
+    deq2[val][off] = lookup_table[lut_index];
+  }
+
+  __half res;
+  half2 res2;
+  half2 tmp2;
+
+  int i;
+  int k;
+
+  unsigned int tmp1;
+  unsigned int lut_index1, lut_index2;
+
+  for (int b = 0; b < batch; ++b){
+    i = width * row + col;
+    res = __int2half_rd(0);
+    k = 0;
+
+    __syncthreads();
+    if (threadIdx.x < blockwidth2)
+      blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x];
+    __syncthreads();
+
+    while (k < blockwidth2) {
+      tmp1 = as_unsigned(mat[i]);
+
+      res2 = {};
+      tmp2 = {};
+
+      lut_index1 = tmp1 & 0xF;
+      lut_index2 = (tmp1 >> 4) & 0xF;
+      tmp2.x = deq2[lut_index1][off];
+      tmp2.y = deq2[lut_index2][off];
+      res2 = __hfma2(tmp2, blockvec[k + 0], res2);
+
+      lut_index1 = (tmp1 >> 8) & 0xF;
+      lut_index2 = (tmp1 >> 12) & 0xF;
+      tmp2.x = deq2[lut_index1][off];
+      tmp2.y = deq2[lut_index2][off];
+      res2 = __hfma2(tmp2, blockvec[k + 1], res2);
+
+      lut_index1 = (tmp1 >> 16) & 0xF;
+      lut_index2 = (tmp1 >> 20) & 0xF;
+      tmp2.x = deq2[lut_index1][off];
+      tmp2.y = deq2[lut_index2][off];
+      res2 = __hfma2(tmp2, blockvec[k + 2], res2);
+
+      lut_index1 = (tmp1 >> 24) & 0xF;
+      lut_index2 = (tmp1 >> 28) & 0xF;
+      tmp2.x = deq2[lut_index1][off];
+      tmp2.y = deq2[lut_index2][off];
+      res2 = __hfma2(tmp2, blockvec[k + 3], res2);
+
+      res = __hadd(__hadd(res2.x, res2.y), res);
+
+      i += width;
+      k += 4;
+    }
+
+    // col%2 -> only set one of the two values
+    half2 res3 = {};
+    if (col % 2 == 0) {
+      res3.x = res;
+    } else {
+      res3.y = res;
+    }
+
+    atomicAdd(&mul[b * width / 2 + col / 2], res3);
+  }
+}
+
+} // namespace squeezellm
+} // namespace vllm
+
+// 4-bit matvec kernel (LUT-based)
+void squeezellm_gemm(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor lookup_table
+) {
+  int height = mat.size(0);
+  int width = mat.size(1);
+
+  int batch = vec.size(0);
+  int vec_height = vec.size(1);
+
+  dim3 blocks(
+    (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+
+  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads>>>(
+    (half2*) vec.data<at::Half>(),
+    mat.data_ptr<int>(),
+    (half2*) mul.data<at::Half>(),
+    (__half*) lookup_table.data<at::Half>(),
+    height, width, batch, vec_height
+  );
+}
+
+#undef BLOCKWIDTH
+#undef BLOCKHEIGHT4
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
@@ -95,7 +95,7 @@ Start the server:
     $ python -m vllm.entrypoints.openai.api_server \
     $     --model facebook/opt-125m
 
-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
+By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. You can override the chat template by using the ``--chat-template`` argument which points to a json file. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
 
 This server can be queried in the same format as OpenAI API. For example, list the models:
 

diff --git a/examples/conversation_template_example.json b/examples/conversation_template_example.json
@@ -0,0 +1,13 @@
+{
+  "name": "example",
+  "system_template": "{system_message}",
+  "system_message": "A chat between a user and an artificial intelligence assistant.",
+  "roles": ["USER", "ASSISTANT"],
+  "messages": [],
+  "offset": 0,
+  "sep_style": "ADD_COLON_TWO",
+  "sep": "\n",
+  "sep2": "</s>",
+  "stop_str": "",
+  "stop_token_ids": []
+}
diff --git a/setup.py b/setup.py
@@ -200,6 +200,7 @@ def get_torch_arch_list() -> Set[str]:
     sources=[
         "csrc/quantization.cpp",
         "csrc/quantization/awq/gemm_kernels.cu",
+        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
     ],
     extra_compile_args={
         "cxx": CXX_FLAGS,

diff --git a/vllm/config.py b/vllm/config.py
@@ -103,7 +103,7 @@ def _verify_tokenizer_mode(self) -> None:
         self.tokenizer_mode = tokenizer_mode
 
     def _verify_quantization(self) -> None:
-        supported_quantization = ["awq"]
+        supported_quantization = ["awq", "squeezellm"]
         if self.quantization is None:
             return
         quantization = self.quantization.lower()

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -168,7 +168,7 @@ def add_cli_args(
         parser.add_argument('--quantization',
                             '-q',
                             type=str,
-                            choices=['awq', None],
+                            choices=['awq', 'squeezellm', None],
                             default=None,
                             help='Method used to quantize the weights')
         return parser

@@ -45,6 +45,7 @@
 served_model = None
 app = fastapi.FastAPI()
 engine = None
+chat_template = None
 
 
 def create_error_response(status_code: HTTPStatus,
@@ -70,50 +71,58 @@ async def check_model(request) -> Optional[JSONResponse]:
 
 
 async def get_gen_prompt(request) -> str:
-    if not _fastchat_available:
-        raise ModuleNotFoundError(
-            "fastchat is not installed. Please install fastchat to use "
-            "the chat completion and conversation APIs: `$ pip install fschat`"
+    if chat_template is not None:
+        return tokenizer.apply_chat_template(conversation=request.messages,
+                                             chat_template=chat_template,
+                                             tokenize=False)
+    elif tokenizer.chat_template is not None:
+        return tokenizer.apply_chat_template(conversation=request.messages,
+                                             tokenize=False)
+    else:
+        if not _fastchat_available:
+            raise ModuleNotFoundError(
+                "fastchat is not installed. Please install fastchat "
+                "to use the chat completion and conversation APIs: "
+                "`$ pip install fschat`")
+        if version.parse(fastchat.__version__) < version.parse("0.2.23"):
+            raise ImportError("fastchat version is low. "
+                              f"Current version: {fastchat.__version__} "
+                              "Please upgrade fastchat to use: "
+                              "`$ pip install -U fschat`")
+        template = get_conversation_template(request.model)
+        conv = Conversation(
+            name=template.name,
+            system_template=template.system_template,
+            system_message=template.system_message,
+            roles=template.roles,
+            messages=list(template.messages),  # prevent in-place modification
+            offset=template.offset,
+            sep_style=SeparatorStyle(template.sep_style),
+            sep=template.sep,
+            sep2=template.sep2,
+            stop_str=template.stop_str,
+            stop_token_ids=template.stop_token_ids,
         )
-    if version.parse(fastchat.__version__) < version.parse("0.2.23"):
-        raise ImportError(
-            f"fastchat version is low. Current version: {fastchat.__version__} "
-            "Please upgrade fastchat to use: `$ pip install -U fschat`")
-
-    conv = get_conversation_template(request.model)
-    conv = Conversation(
-        name=conv.name,
-        system_template=conv.system_template,
-        system_message=conv.system_message,
-        roles=conv.roles,
-        messages=list(conv.messages),  # prevent in-place modification
-        offset=conv.offset,
-        sep_style=SeparatorStyle(conv.sep_style),
-        sep=conv.sep,
-        sep2=conv.sep2,
-        stop_str=conv.stop_str,
-        stop_token_ids=conv.stop_token_ids,
-    )
 
-    if isinstance(request.messages, str):
-        prompt = request.messages
-    else:
-        for message in request.messages:
-            msg_role = message["role"]
-            if msg_role == "system":
-                conv.system_message = message["content"]
-            elif msg_role == "user":
-                conv.append_message(conv.roles[0], message["content"])
-            elif msg_role == "assistant":
-                conv.append_message(conv.roles[1], message["content"])
-            else:
-                raise ValueError(f"Unknown role: {msg_role}")
+        if isinstance(request.messages, str):
+            prompt = request.messages
+        else:
+            for message in request.messages:
+                msg_role = message["role"]
+                if msg_role == "system":
+                    conv.system_message = message["content"]
+                elif msg_role == "user":
+                    conv.append_message(conv.roles[0], message["content"])
+                elif msg_role == "assistant":
+                    conv.append_message(conv.roles[1], message["content"])
+                else:
+                    raise ValueError(f"Unknown role: {msg_role}")
 
-        # Add a blank message for the assistant.
-        conv.append_message(conv.roles[1], None)
-        prompt = conv.get_prompt()
+            # Add a blank message for the assistant.
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
 
-    return prompt
+        return prompt
 
 
 async def check_length(
@@ -590,6 +599,11 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
                         help="The model name used in the API. If not "
                         "specified, the model name will be the same as "
                         "the huggingface name.")
+    parser.add_argument("--chat-template",
+                        type=str,
+                        default=None,
+                        help="The path to the chat template to use "
+                        "with the specified model.")
 
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
@@ -609,6 +623,10 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
     else:
         served_model = args.model
 
+    if args.chat_template is not None:
+        with open(args.chat_template, "r") as f:
+            chat_template = f.read()
+
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())

diff --git a/vllm/model_executor/layers/quantized_linear/__init__.py b/vllm/model_executor/layers/quantized_linear/__init__.py
@@ -1,10 +1,14 @@
 from vllm.model_executor.layers.quantized_linear.awq import (
     AWQColumnParallelLinear, AWQRowParallelLinear)
+from vllm.model_executor.layers.quantized_linear.squeezellm import (
+    SqueezeLLMColumnParallelLinear, SqueezeLLMRowParallelLinear)
 from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
                                                        RowParallelLinear)
 
 _QUANTIZED_LINEAR_REGISTRY = {
     "awq": (AWQColumnParallelLinear, AWQRowParallelLinear),
+    "squeezellm":
+    (SqueezeLLMColumnParallelLinear, SqueezeLLMRowParallelLinear),
 }