Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
9d5480e
Add support for conversation-template argument which takes the path t…
Tostino Oct 9, 2023
93236ad
Merge branch 'vllm-project:main' into main
Tostino Oct 9, 2023
990e5b1
Fix formatting
Tostino Oct 9, 2023
19d20c5
Merge remote-tracking branch 'origin/main'
Tostino Oct 9, 2023
d7db472
Fix formatting as yapf wants it to be...
Tostino Oct 9, 2023
7116b29
Additional formatting fixes.
Tostino Oct 9, 2023
b4c267d
Add support for HF chat templates for OpenAI endpoint. Add support fo…
Tostino Oct 16, 2023
600fdbe
Add support for conversation-template argument which takes the path t…
Tostino Oct 9, 2023
ea8eead
Fix formatting
Tostino Oct 9, 2023
218a55a
Fix formatting as yapf wants it to be...
Tostino Oct 9, 2023
ac18629
Additional formatting fixes.
Tostino Oct 9, 2023
591d41d
Add support for HF chat templates for OpenAI endpoint. Add support fo…
Tostino Oct 16, 2023
2572d92
Merge remote-tracking branch 'origin/main'
Tostino Oct 16, 2023
2438c28
Fix readme for incorrect reference.
Tostino Oct 16, 2023
082dac2
Additional formatting changes...unsure how some of these got messed u…
Tostino Oct 16, 2023
6fe0752
More failing formatting because some code was more indented than orig…
Tostino Oct 16, 2023
5d28cf6
More failing formatting...
Tostino Oct 16, 2023
14a849f
More failing formatting...I am getting sick of this.
Tostino Oct 16, 2023
a38afb1
More failing formatting...I am getting extremely sick of this.
Tostino Oct 16, 2023
2ae9043
Merge branch 'vllm-project:main' into main
Tostino Oct 19, 2023
6152cc3
Merge branch 'vllm-project:main' into main
Tostino Oct 21, 2023
1f24755
Support SqueezeLLM (#1326)
chooper1 Oct 22, 2023
a2de84d
Add support for conversation-template argument which takes the path t…
Tostino Oct 9, 2023
c6e13a0
Fix formatting
Tostino Oct 9, 2023
d2b9959
Fix formatting as yapf wants it to be...
Tostino Oct 9, 2023
da3c7e2
Additional formatting fixes.
Tostino Oct 9, 2023
c9a0068
Add support for HF chat templates for OpenAI endpoint. Add support fo…
Tostino Oct 16, 2023
3b999c6
Fix readme for incorrect reference.
Tostino Oct 16, 2023
8d36a3c
Additional formatting changes...unsure how some of these got messed u…
Tostino Oct 16, 2023
6d7195b
More failing formatting because some code was more indented than orig…
Tostino Oct 16, 2023
76464e9
More failing formatting...
Tostino Oct 16, 2023
aad4d8d
More failing formatting...I am getting sick of this.
Tostino Oct 16, 2023
758a08c
More failing formatting...I am getting extremely sick of this.
Tostino Oct 16, 2023
83f2861
Merge remote-tracking branch 'origin/main'
lucasstinson Oct 25, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def run_to_completion(profile: bool = False):
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=['awq', None],
choices=['awq', 'squeezellm', None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def main(args: argparse.Namespace):
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=['awq', None],
choices=['awq', 'squeezellm', None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
Expand Down
12 changes: 8 additions & 4 deletions csrc/quantization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@ torch::Tensor awq_gemm(
torch::Tensor _zeros,
int split_k_iters);

void squeezellm_gemm(
torch::Tensor vec,
torch::Tensor mat,
torch::Tensor mul,
torch::Tensor lookup_table);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"awq_gemm",
&awq_gemm,
"Quantized GEMM for AWQ");
m.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
m.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
}
148 changes: 148 additions & 0 deletions csrc/quantization/squeezellm/quant_cuda_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include <torch/all.h>
#include <torch/python.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// half-tensor
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDATensorMethods.cuh>

#define BLOCKWIDTH 128
#define BLOCKHEIGHT4 16

namespace vllm {
namespace squeezellm {

// Return the 32-bit word `i` as an unsigned value with the same bit pattern.
// The int -> unsigned conversion is modular, so on two's-complement targets it
// yields exactly the bits a pointer reinterpretation would read.
__device__ inline unsigned int as_unsigned(int i) {
  const unsigned int bits = static_cast<unsigned int>(i);
  return bits;
}

// 4-bit matvec kernel (LUT-based)
//
// Each thread handles one column `col` of `mat`; each block handles a tile of
// BLOCKHEIGHT4 packed rows (blockIdx.x) by BLOCKWIDTH columns (blockIdx.y).
// Every int in `mat` packs eight 4-bit indices (lowest nibble first); each
// index selects one of 16 entries in that column's lookup table.
//
// Parameters:
//   vec          - input values read as half2; batch `b` starts at
//                  element b * vec_height / 2.
//   mat          - packed 4-bit indices, indexed as mat[width * r + col].
//   mul          - output, read-modify-written with atomicAdd as half2 at
//                  mul[b * width / 2 + col / 2].
//                  presumably zero-initialized by the caller - TODO confirm.
//   lookup_table - 16 __half entries per column, at lookup_table[col * 16 + v].
//   height       - passed in but not read anywhere in this kernel.
//   width        - row stride of `mat` and (in halves) of one batch of `mul`.
//   batch        - number of input vectors processed sequentially.
//   vec_height   - length (in halves) of one input vector.
//
// NOTE(review): there are no bounds checks on `row` or `col`. If the launch
// grid is rounded up past `height` / `width`, the extra threads index past
// lookup_table, mat, vec and mul. This looks like it assumes height is a
// multiple of BLOCKHEIGHT4 and width a multiple of BLOCKWIDTH - confirm.
// NOTE(review): the `/ 2` indexing also assumes width and vec_height are even.
__global__ void NUQ4MatMulKernel(
  const half2* __restrict__ vec,
  const int* __restrict__ mat,
  half2* __restrict__ mul,
  const __half* __restrict__ lookup_table,
  int height,
  int width,
  int batch,
  int vec_height
) {

  // Number of half2 elements of `vec` consumed by one block (BLOCKWIDTH halves).
  const int blockwidth2 = BLOCKWIDTH / 2;

  // First packed row of this block's tile, and this thread's column.
  int row = BLOCKHEIGHT4 * blockIdx.x;
  int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;

  // Slice of the input vector shared by all threads of the block.
  __shared__ half2 blockvec[blockwidth2];

  // Per-thread copy of this column's 16-entry lookup table; thread `off`
  // only ever touches deq2[*][off].
  __shared__ __half deq2[16][BLOCKWIDTH];
  int off = threadIdx.x;
  int column_offset = col * 16;
  for (int val = 0; val < 16; val += 1) {
    int lut_index = column_offset + val;
    deq2[val][off] = lookup_table[lut_index];
  }

  __half res;   // running dot product for (b, col), accumulated in half
  half2 res2;   // partial sum for the 8 values packed in one int
  half2 tmp2;   // two dequantized weights, multiplied against one half2 of vec

  int i;        // linear index into mat
  int k;        // index into blockvec (in half2 units)

  unsigned int tmp1;
  unsigned int lut_index1, lut_index2;

  for (int b = 0; b < batch; ++b){
    i = width * row + col;
    res = __int2half_rd(0);
    k = 0;

    // First barrier: all threads are done reading blockvec from the previous
    // batch before it is overwritten. Second barrier: the new contents are
    // visible to every thread before the dot product starts.
    __syncthreads();
    if (threadIdx.x < blockwidth2)
      blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x];
    __syncthreads();

    // One iteration per packed row: consumes one int of mat (8 weights) and
    // four half2 entries of blockvec, so the loop runs blockwidth2 / 4 times.
    while (k < blockwidth2) {
      tmp1 = as_unsigned(mat[i]);

      res2 = {};
      tmp2 = {};

      // Nibbles 0 and 1.
      lut_index1 = tmp1 & 0xF;
      lut_index2 = (tmp1 >> 4) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 0], res2);

      // Nibbles 2 and 3.
      lut_index1 = (tmp1 >> 8) & 0xF;
      lut_index2 = (tmp1 >> 12) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 1], res2);

      // Nibbles 4 and 5.
      lut_index1 = (tmp1 >> 16) & 0xF;
      lut_index2 = (tmp1 >> 20) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 2], res2);

      // Nibbles 6 and 7.
      lut_index1 = (tmp1 >> 24) & 0xF;
      lut_index2 = (tmp1 >> 28) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 3], res2);

      // Fold both half2 lanes into the scalar accumulator.
      res = __hadd(__hadd(res2.x, res2.y), res);

      i += width;  // next packed row, same column
      k += 4;
    }

    // Two adjacent columns share one half2 slot of `mul`. Put this thread's
    // result in the lane matching col's parity and leave the other lane zero,
    // so the half2 atomicAdd only changes this column's value.
    half2 res3 = {};
    if (col % 2 == 0) {
      res3.x = res;
    } else {
      res3.y = res;
    }

    // Atomic because blocks with different blockIdx.x (different row tiles)
    // contribute partial sums to the same output column.
    atomicAdd(&mul[b * width / 2 + col / 2], res3);
  }
}

} // namespace squeezellm
} // namespace vllm

// 4-bit matvec kernel (LUT-based)
//
// Host-side launcher for vllm::squeezellm::NUQ4MatMulKernel.
//
// Parameters:
//   vec          - fp16 input; size(0) is used as the batch count and size(1)
//                  as the per-vector length.
//   mat          - int32 tensor of packed 4-bit indices; size(0) / size(1)
//                  are passed to the kernel as height / width.
//   mul          - fp16 output tensor; the kernel accumulates into it with
//                  atomicAdd, so it is modified in place.
//   lookup_table - fp16 lookup table consumed by the kernel.
//
// The grid is sized with ceiling division, one block per
// BLOCKHEIGHT4 x BLOCKWIDTH tile of `mat`, and BLOCKWIDTH threads per block.
void squeezellm_gemm(
  torch::Tensor vec,
  torch::Tensor mat,
  torch::Tensor mul,
  torch::Tensor lookup_table
) {
  const int height = mat.size(0);
  const int width = mat.size(1);

  const int batch = vec.size(0);
  const int vec_height = vec.size(1);

  const dim3 blocks(
    (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
  );
  const dim3 threads(BLOCKWIDTH);

  // Launch on PyTorch's current CUDA stream rather than the legacy default
  // stream, so the kernel is ordered with the ops that produced the inputs
  // and those that will consume `mul`, even when the caller runs on a
  // non-default stream.
  const cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream();

  // data_ptr<T>() replaces the deprecated data<T>() accessor and matches the
  // accessor already used for `mat`.
  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
    reinterpret_cast<const half2*>(vec.data_ptr<at::Half>()),
    mat.data_ptr<int>(),
    reinterpret_cast<half2*>(mul.data_ptr<at::Half>()),
    reinterpret_cast<const __half*>(lookup_table.data_ptr<at::Half>()),
    height, width, batch, vec_height
  );
}

#undef BLOCKWIDTH
#undef BLOCKHEIGHT4
2 changes: 1 addition & 1 deletion docs/source/getting_started/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Start the server:
$ python -m vllm.entrypoints.openai.api_server \
$ --model facebook/opt-125m

By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. You can override the model's chat template with the ``--chat-template`` argument, which takes the path to a file containing the template text (the file is read as-is and passed to the tokenizer's ``apply_chat_template``, so it should be a Hugging Face chat template rather than a JSON document). The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.

This server can be queried in the same format as OpenAI API. For example, list the models:

Expand Down
13 changes: 13 additions & 0 deletions examples/conversation_template_example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"name": "example",
"system_template": "{system_message}",
"system_message": "A chat between a user and an artificial intelligence assistant.",
"roles": ["USER", "ASSISTANT"],
"messages": [],
"offset": 0,
"sep_style": "ADD_COLON_TWO",
"sep": "\n",
"sep2": "</s>",
"stop_str": "",
"stop_token_ids": []
}
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ def get_torch_arch_list() -> Set[str]:
sources=[
"csrc/quantization.cpp",
"csrc/quantization/awq/gemm_kernels.cu",
"csrc/quantization/squeezellm/quant_cuda_kernel.cu",
],
extra_compile_args={
"cxx": CXX_FLAGS,
Expand Down
2 changes: 1 addition & 1 deletion vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def _verify_tokenizer_mode(self) -> None:
self.tokenizer_mode = tokenizer_mode

def _verify_quantization(self) -> None:
supported_quantization = ["awq"]
supported_quantization = ["awq", "squeezellm"]
if self.quantization is None:
return
quantization = self.quantization.lower()
Expand Down
2 changes: 1 addition & 1 deletion vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def add_cli_args(
parser.add_argument('--quantization',
'-q',
type=str,
choices=['awq', None],
choices=['awq', 'squeezellm', None],
default=None,
help='Method used to quantize the weights')
return parser
Expand Down
98 changes: 58 additions & 40 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
served_model = None
app = fastapi.FastAPI()
engine = None
chat_template = None


def create_error_response(status_code: HTTPStatus,
Expand All @@ -70,50 +71,58 @@ async def check_model(request) -> Optional[JSONResponse]:


async def get_gen_prompt(request) -> str:
if not _fastchat_available:
raise ModuleNotFoundError(
"fastchat is not installed. Please install fastchat to use "
"the chat completion and conversation APIs: `$ pip install fschat`"
if chat_template is not None:
return tokenizer.apply_chat_template(conversation=request.messages,
chat_template=chat_template,
tokenize=False)
elif tokenizer.chat_template is not None:
return tokenizer.apply_chat_template(conversation=request.messages,
tokenize=False)
else:
if not _fastchat_available:
raise ModuleNotFoundError(
"fastchat is not installed. Please install fastchat "
"to use the chat completion and conversation APIs: "
"`$ pip install fschat`")
if version.parse(fastchat.__version__) < version.parse("0.2.23"):
raise ImportError("fastchat version is low. "
f"Current version: {fastchat.__version__} "
"Please upgrade fastchat to use: "
"`$ pip install -U fschat`")
template = get_conversation_template(request.model)
conv = Conversation(
name=template.name,
system_template=template.system_template,
system_message=template.system_message,
roles=template.roles,
messages=list(template.messages), # prevent in-place modification
offset=template.offset,
sep_style=SeparatorStyle(template.sep_style),
sep=template.sep,
sep2=template.sep2,
stop_str=template.stop_str,
stop_token_ids=template.stop_token_ids,
)
if version.parse(fastchat.__version__) < version.parse("0.2.23"):
raise ImportError(
f"fastchat version is low. Current version: {fastchat.__version__} "
"Please upgrade fastchat to use: `$ pip install -U fschat`")

conv = get_conversation_template(request.model)
conv = Conversation(
name=conv.name,
system_template=conv.system_template,
system_message=conv.system_message,
roles=conv.roles,
messages=list(conv.messages), # prevent in-place modification
offset=conv.offset,
sep_style=SeparatorStyle(conv.sep_style),
sep=conv.sep,
sep2=conv.sep2,
stop_str=conv.stop_str,
stop_token_ids=conv.stop_token_ids,
)

if isinstance(request.messages, str):
prompt = request.messages
else:
for message in request.messages:
msg_role = message["role"]
if msg_role == "system":
conv.system_message = message["content"]
elif msg_role == "user":
conv.append_message(conv.roles[0], message["content"])
elif msg_role == "assistant":
conv.append_message(conv.roles[1], message["content"])
else:
raise ValueError(f"Unknown role: {msg_role}")
if isinstance(request.messages, str):
prompt = request.messages
else:
for message in request.messages:
msg_role = message["role"]
if msg_role == "system":
conv.system_message = message["content"]
elif msg_role == "user":
conv.append_message(conv.roles[0], message["content"])
elif msg_role == "assistant":
conv.append_message(conv.roles[1], message["content"])
else:
raise ValueError(f"Unknown role: {msg_role}")

# Add a blank message for the assistant.
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
# Add a blank message for the assistant.
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

return prompt
return prompt


async def check_length(
Expand Down Expand Up @@ -590,6 +599,11 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
help="The model name used in the API. If not "
"specified, the model name will be the same as "
"the huggingface name.")
parser.add_argument("--chat-template",
type=str,
default=None,
help="The path to the chat template to use "
"with the specified model.")

parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
Expand All @@ -609,6 +623,10 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
else:
served_model = args.model

if args.chat_template is not None:
with open(args.chat_template, "r") as f:
chat_template = f.read()

engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
engine_model_config = asyncio.run(engine.get_model_config())
Expand Down
4 changes: 4 additions & 0 deletions vllm/model_executor/layers/quantized_linear/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
from vllm.model_executor.layers.quantized_linear.awq import (
AWQColumnParallelLinear, AWQRowParallelLinear)
from vllm.model_executor.layers.quantized_linear.squeezellm import (
SqueezeLLMColumnParallelLinear, SqueezeLLMRowParallelLinear)
from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
RowParallelLinear)

_QUANTIZED_LINEAR_REGISTRY = {
"awq": (AWQColumnParallelLinear, AWQRowParallelLinear),
"squeezellm":
(SqueezeLLMColumnParallelLinear, SqueezeLLMRowParallelLinear),
}


Expand Down
Loading