From 7b8a84113b6959cf3b6df3e828f919eb74591f92 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 26 Nov 2024 17:10:28 +0800 Subject: [PATCH 01/40] refactor --- CMakeLists.txt | 2 +- lmdeploy/turbomind/turbomind.py | 8 +- src/turbomind/models/llama/LlamaBatch.h | 4 +- src/turbomind/models/llama/LlamaV2.h | 6 +- src/turbomind/python/bind.cpp | 214 ++++++------- .../triton_backend/llama/LlamaTritonModel.cc | 191 ++++++------ .../triton_backend/llama/LlamaTritonModel.h | 70 ++--- .../llama/LlamaTritonModelInstance.cc | 206 +++++-------- .../llama/LlamaTritonModelInstance.h | 36 +-- .../transformer_triton_backend.cpp | 52 ++-- .../transformer_triton_backend.hpp | 283 ++---------------- src/turbomind/utils/Tensor.h | 10 + src/turbomind/utils/instance_comm.h | 16 - 13 files changed, 386 insertions(+), 712 deletions(-) delete mode 100644 src/turbomind/utils/instance_comm.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ff2ac7dded..356da56f58 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,7 +304,7 @@ link_directories( # add_subdirectory(3rdparty) add_subdirectory(src) -add_subdirectory(examples) +# add_subdirectory(examples) if(BUILD_TEST) add_subdirectory(tests/csrc) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 05bc3e400e..a1b2fff944 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -358,12 +358,10 @@ def _forward_callback(self, result, ctx): self.que.put((False, result)) def _forward_thread(self, inputs): - instance_comm = self.tm_model.model_comm.create_instance_comm( - self.gpu_count) def _func(): try: - output = self.model_inst.forward(inputs, instance_comm) + output = self.model_inst.forward(inputs) except Exception as e: logger.error(f'unhandled exception: {e}') self.que.put((-1, None)) @@ -377,12 +375,10 @@ def _async_forward_callback(self, result, ctx, que: LifoQueue): que.put((False, result)) def _async_forward_thread(self, inputs, que: LifoQueue): - instance_comm = self.tm_model.model_comm.create_instance_comm( - self.gpu_count) def _func(): try: - output = self.model_inst.forward(inputs, instance_comm) + output = self.model_inst.forward(inputs) except Exception as e: logger.error(f'unhandled exception: {e}') que.put((-1, None)) diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 9c66948999..f952da6bae 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -12,7 +12,6 @@ #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/instance_comm.h" #include #include #include @@ -32,8 +31,7 @@ struct SharedState { }; struct Control { - AbstractInstanceComm* comm; - Request::Callback callback; + Request::Callback callback; }; struct BatchState { diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7c..a32183d41b 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -21,6 +21,9 @@ #pragma once +#include +#include + #include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaBatch.h" @@ -31,10 +34,7 @@ #include "src/turbomind/models/llama/unified_decoder.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" -#include "src/turbomind/utils/instance_comm.h" #include 
"src/turbomind/utils/nccl_utils.h" -#include -#include namespace turbomind { diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249ff..da20b94837 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,34 +1,38 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/python/dlpack.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/nccl_utils.h" -#include #include +#include + +#include + #include #include #include #include #include -#include + +#include "src/turbomind/python/dlpack.h" +#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" +#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" +#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/nccl_utils.h" namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; // prepare to bind container -using TensorVector = std::vector; +using TensorVector = std::vector; PYBIND11_MAKE_OPAQUE(TensorVector); -using TensorMap = std::unordered_map; +using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(triton::Tensor& tensor) +DLDevice getDLDevice(ft::Tensor& tensor) { int device_id = 0; - if (tensor.where == triton::MEMORY_GPU) { + if (tensor.where == ft::MEMORY_GPU) { cudaPointerAttributes ptr_attr; cudaPointerGetAttributes(&ptr_attr, tensor.data); device_id = ptr_attr.device; @@ -37,13 +41,13 @@ DLDevice getDLDevice(triton::Tensor& tensor) DLDevice device{kDLCPU, device_id}; switch (tensor.where) { - case triton::MEMORY_CPU: + case ft::MEMORY_CPU: device.device_type = DLDeviceType::kDLCPU; break; - case triton::MEMORY_CPU_PINNED: + case ft::MEMORY_CPU_PINNED: device.device_type = DLDeviceType::kDLCUDAHost; break; - case triton::MEMORY_GPU: + case ft::MEMORY_GPU: device.device_type = DLDeviceType::kDLCUDA; break; default: @@ -53,62 +57,62 @@ DLDevice getDLDevice(triton::Tensor& tensor) return device; } -DLManagedTensor* TritonTensorToDLManagedTensor(triton::Tensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) { DLDevice device = getDLDevice(tensor); DLDataType data_type{0, 0, 1}; switch (tensor.type) { - case triton::TYPE_BOOL: + case ft::TYPE_BOOL: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; break; - case triton::TYPE_UINT8: + case ft::TYPE_UINT8: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 8; break; - case triton::TYPE_UINT16: + case ft::TYPE_UINT16: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 16; break; - case triton::TYPE_UINT32: + case ft::TYPE_UINT32: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 32; break; - case triton::TYPE_UINT64: + case ft::TYPE_UINT64: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 64; break; - case triton::TYPE_INT8: - case triton::TYPE_BYTES: + case ft::TYPE_INT8: + case ft::TYPE_BYTES: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 8; break; - case triton::TYPE_INT16: + case ft::TYPE_INT16: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 16; break; - case triton::TYPE_INT32: + case ft::TYPE_INT32: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 32; break; - case triton::TYPE_INT64: + case ft::TYPE_INT64: data_type.code = 
DLDataTypeCode::kDLInt; data_type.bits = 64; break; - case triton::TYPE_FP16: + case ft::TYPE_FP16: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 16; break; - case triton::TYPE_FP32: + case ft::TYPE_FP32: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 32; break; - case triton::TYPE_FP64: + case ft::TYPE_FP64: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 64; break; - case triton::TYPE_BF16: + case ft::TYPE_BF16: data_type.code = DLDataTypeCode::kDLBfloat; data_type.bits = 16; break; @@ -125,78 +129,78 @@ DLManagedTensor* TritonTensorToDLManagedTensor(triton::Tensor& tensor) return new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor* dlmt) { delete dlmt; }}; } -triton::MemoryType getMemoryType(DLDevice device) +ft::MemoryType getMemoryType(DLDevice device) { switch (device.device_type) { case DLDeviceType::kDLCUDAHost: - return triton::MemoryType::MEMORY_CPU_PINNED; + return ft::MemoryType::MEMORY_CPU_PINNED; case DLDeviceType::kDLCUDA: - return triton::MemoryType::MEMORY_GPU; + return ft::MemoryType::MEMORY_GPU; case DLDeviceType::kDLCPU: default: - return triton::MemoryType::MEMORY_CPU; + return ft::MemoryType::MEMORY_CPU; } } -triton::DataType getDataType(DLDataType data_type) +ft::DataType getDataType(DLDataType data_type) { switch (data_type.code) { case DLDataTypeCode::kDLUInt: switch (data_type.bits) { case 8: - return triton::TYPE_UINT8; + return ft::TYPE_UINT8; case 16: - return triton::TYPE_UINT16; + return ft::TYPE_UINT16; case 32: - return triton::TYPE_UINT32; + return ft::TYPE_UINT32; case 64: - return triton::TYPE_UINT64; + return ft::TYPE_UINT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLInt: switch (data_type.bits) { case 8: - return triton::TYPE_INT8; + return ft::TYPE_INT8; case 16: - return triton::TYPE_INT16; + return ft::TYPE_INT16; case 32: - return triton::TYPE_INT32; + return ft::TYPE_INT32; case 64: - return triton::TYPE_INT64; + return ft::TYPE_INT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLFloat: switch (data_type.bits) { case 16: - return triton::TYPE_FP16; + return ft::TYPE_FP16; case 32: - return triton::TYPE_FP32; + return ft::TYPE_FP32; case 64: - return triton::TYPE_FP64; + return ft::TYPE_FP64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBfloat: switch (data_type.bits) { case 16: - return triton::TYPE_BF16; + return ft::TYPE_BF16; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBool: - return triton::TYPE_BOOL; + return ft::TYPE_BOOL; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); @@ -205,7 +209,7 @@ std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* t std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); auto data = dl_tensor.data; - return std::make_shared(where, dtype, shape, data); + return std::make_shared(where, dtype, shape, data); } DLTensor GetDLTensor(py::object obj) @@ -225,70 +229,65 @@ PYBIND11_MODULE(_turbomind, m) // custom comm py::class_>(m, "AbstractCustomComm"); - // instance comm - py::class_(m, "AbstractInstanceComm"); - // data type - py::enum_(m, "DataType") - .value("TYPE_INVALID", 
triton::DataType::TYPE_INVALID) - .value("TYPE_BOOL", triton::DataType::TYPE_BOOL) - .value("TYPE_UINT8", triton::DataType::TYPE_UINT8) - .value("TYPE_UINT16", triton::DataType::TYPE_UINT16) - .value("TYPE_UINT32", triton::DataType::TYPE_UINT32) - .value("TYPE_UINT64", triton::DataType::TYPE_UINT64) - .value("TYPE_INT8", triton::DataType::TYPE_INT8) - .value("TYPE_INT16", triton::DataType::TYPE_INT16) - .value("TYPE_INT32", triton::DataType::TYPE_INT32) - .value("TYPE_INT64", triton::DataType::TYPE_INT64) - .value("TYPE_FP16", triton::DataType::TYPE_FP16) - .value("TYPE_FP32", triton::DataType::TYPE_FP32) - .value("TYPE_FP64", triton::DataType::TYPE_FP64) - .value("TYPE_BYTES", triton::DataType::TYPE_BYTES) - .value("TYPE_BF16", triton::DataType::TYPE_BF16); + py::enum_(m, "DataType") + .value("TYPE_INVALID", ft::DataType::TYPE_INVALID) + .value("TYPE_BOOL", ft::DataType::TYPE_BOOL) + .value("TYPE_UINT8", ft::DataType::TYPE_UINT8) + .value("TYPE_UINT16", ft::DataType::TYPE_UINT16) + .value("TYPE_UINT32", ft::DataType::TYPE_UINT32) + .value("TYPE_UINT64", ft::DataType::TYPE_UINT64) + .value("TYPE_INT8", ft::DataType::TYPE_INT8) + .value("TYPE_INT16", ft::DataType::TYPE_INT16) + .value("TYPE_INT32", ft::DataType::TYPE_INT32) + .value("TYPE_INT64", ft::DataType::TYPE_INT64) + .value("TYPE_FP16", ft::DataType::TYPE_FP16) + .value("TYPE_FP32", ft::DataType::TYPE_FP32) + .value("TYPE_FP64", ft::DataType::TYPE_FP64) + .value("TYPE_BYTES", ft::DataType::TYPE_BYTES) + .value("TYPE_BF16", ft::DataType::TYPE_BF16); // memory type - py::enum_(m, "MemoryType") - .value("MEMORY_CPU", triton::MemoryType::MEMORY_CPU) - .value("MEMORY_CPU_PINNED", triton::MemoryType::MEMORY_CPU_PINNED) - .value("MEMORY_GPU", triton::MemoryType::MEMORY_GPU); + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", ft::MemoryType::MEMORY_CPU) + .value("MEMORY_CPU_PINNED", ft::MemoryType::MEMORY_CPU_PINNED) + .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); // tensor - py::class_>(m, "Tensor") - .def_readonly("where", &triton::Tensor::where) - .def_readonly("type", &triton::Tensor::type) - .def_readonly("shape", &triton::Tensor::shape) - .def_readonly("data", &triton::Tensor::data) - .def(py::init([](const triton::MemoryType where, - const triton::DataType type, - const std::vector& shape, - const long data) { - auto data_ptr = reinterpret_cast(data); - return new triton::Tensor(where, type, shape, data_ptr); - })) + py::class_>(m, "Tensor") + .def_readonly("where", &ft::Tensor::where) + .def_readonly("type", &ft::Tensor::type) + .def_readonly("shape", &ft::Tensor::shape) + .def_readonly("data", &ft::Tensor::data) + .def(py::init( + [](const ft::MemoryType where, const ft::DataType type, const std::vector& shape, const long data) { + auto data_ptr = reinterpret_cast(data); + return new ft::Tensor(where, type, shape, data_ptr); + })) .def( "view", - [](triton::Tensor* self, triton::DataType new_type) { - return new triton::Tensor(self->where, new_type, self->shape, self->data); + [](ft::Tensor* self, ft::DataType new_type) { + return new ft::Tensor(self->where, new_type, self->shape, self->data); }, "new_type"_a) .def( "view", - [](triton::Tensor* self, std::vector new_shape) { - return new triton::Tensor(self->where, self->type, new_shape, self->data); + [](ft::Tensor* self, std::vector new_shape) { + return new ft::Tensor(self->where, self->type, new_shape, self->data); }, "new_shape"_a) .def( "copy_from", - [](triton::Tensor* self, py::object obj) { + [](ft::Tensor* self, py::object obj) { py::capsule cap = 
obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); switch (self->type) { - case triton::TYPE_FP16: - case triton::TYPE_FP32: - case triton::TYPE_INT32: - case triton::TYPE_BF16: { + case ft::TYPE_FP16: + case ft::TYPE_FP32: + case ft::TYPE_INT32: + case ft::TYPE_BF16: { auto num_element = std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; @@ -304,7 +303,7 @@ PYBIND11_MODULE(_turbomind, m) "tensor"_a) .def( "__dlpack__", - [](triton::Tensor* self, long stream) { + [](ft::Tensor* self, long stream) { DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(*self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = @@ -320,7 +319,7 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](triton::Tensor* self) { + .def("__dlpack_device__", [](ft::Tensor* self) { auto device = getDLDevice(*self); return std::tuple(int(device.device_type), device.device_id); }); @@ -336,19 +335,19 @@ PYBIND11_MODULE(_turbomind, m) "dl_managed_tensor"_a); // transformer model instance + using ft::AbstractTransformerModelInstance; py::bind_map>(m, "TensorMap"); py::class_(m, "AbstractTransformerModelInstance") .def( "forward", - [](AbstractTransformerModelInstance* model, - std::shared_ptr input_tensors, - ft::AbstractInstanceComm* inst_comm) { return model->forward(input_tensors, inst_comm); }, + [](AbstractTransformerModelInstance* model, std::shared_ptr input_tensors) { + return model->forward(input_tensors); + }, py::call_guard(), - "input_tensors"_a, - "inst_comm"_a = nullptr) + "input_tensors"_a) .def( "register_callback", - [](AbstractTransformerModelInstance* self, triton_stream_cb_t cb, py::object ctx) { + [](AbstractTransformerModelInstance* self, ft::triton_stream_cb_t cb, py::object ctx) { self->registerCallback(cb, ctx.ptr()); }, "callback"_a, @@ -356,6 +355,8 @@ PYBIND11_MODULE(_turbomind, m) .def("unregister_callback", &AbstractTransformerModelInstance::unRegisterCallback); // transformer model + using ft::AbstractTransformerModel; + using ft::LlamaTritonModel; py::class_>(m, "AbstractTransformerModel") .def_static( "create_llama_model", @@ -419,7 +420,6 @@ PYBIND11_MODULE(_turbomind, m) return ret; }, "world_size"_a) - .def("create_instance_comm", &AbstractTransformerModel::createInstanceComm, "size"_a) .def( "create_model_instance", [](AbstractTransformerModel* model, diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2deca46380..aab9287762 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -27,17 +27,18 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/allocator.h" +#include "src/turbomind/utils/cuda_utils.h" + #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cuda_utils.h" -namespace ft = turbomind; +namespace turbomind { -static std::optional get_moe_method() +static std::optional get_moe_method() { 
- static const auto value = []() -> std::optional { + static const auto value = []() -> std::optional { const auto p = std::getenv("TM_MOE_METHOD"); if (p) { std::string str(p); @@ -45,10 +46,10 @@ static std::optional get_moe_method() x = std::tolower(x); } if (str == "naive") { - return ft::MoeParam::kNaive; + return MoeParam::kNaive; } else if (str == "fused") { - return ft::MoeParam::kFused; + return MoeParam::kFused; } else { std::cerr << "[WARNING] unrecognised MoE method: " << str << "\n"; @@ -67,7 +68,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; @@ -91,7 +92,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } else { @@ -103,7 +104,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } return nullptr; @@ -205,10 +206,10 @@ void LlamaTritonModel::handleMissingParams() template LlamaTritonModel::~LlamaTritonModel() { - ft::FT_CHECK(weights_.size() == engines_.size()); + FT_CHECK(weights_.size() == engines_.size()); for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { // Set device id before destructing CUDA resources - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); engines_[device_id].reset(); weights_[device_id].reset(); } @@ -222,7 +223,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, std::string config): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), - weights_(ft::getDeviceCount()), + weights_(getDeviceCount()), enable_custom_all_reduce_(enable_custom_all_reduce) { FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); @@ -242,7 +243,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto model_reader = reader["model_config"]; @@ -297,7 +298,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.policy = getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); lora_param_.r = lora_reader["lora_r"].as(0); lora_param_.scale = lora_reader["lora_scale"].as(0); lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); @@ -313,75 +314,75 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, handleMissingParams(); - shared_state_ = std::make_shared(); - shared_state_->barrier = std::make_shared(tensor_para_size); + shared_state_ = std::make_shared(); + shared_state_->barrier = std::make_shared(tensor_para_size); - const auto device_count = ft::getDeviceCount(); + const auto device_count = getDeviceCount(); engines_.resize(device_count); const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = 
ft::WeightType::kFP16; + weight_type_ = WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + weight_type_ = WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + weight_type_ = WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + weight_type_ = WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + weight_type_ = WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; - ft::FT_CHECK(0); + FT_CHECK(0); } if (auto method = get_moe_method()) { moe_param_.method = *method; } else { - moe_param_.method = ft::MoeParam::kFused; + moe_param_.method = MoeParam::kFused; } TM_LOG_INFO("%s", toString().c_str()); } template -std::unique_ptr> LlamaTritonModel::createSharedModelInstance( - int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +std::unique_ptr> +LlamaTritonModel::createSharedModelInstance(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); - auto ctx = std::make_unique>(device_id); + auto ctx = std::make_unique>(device_id); - ft::NcclParam tensor_para = nccl_params.first[comms_rank]; - ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; + NcclParam tensor_para = nccl_params.first[comms_rank]; + NcclParam pipeline_para = nccl_params.second[comms_rank]; - ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_); - ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); + FT_CHECK(tensor_para.world_size_ == tensor_para_size_); + FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); - auto model = std::make_unique>(model_param_, // - attn_param_, - moe_param_, - lora_param_, - tensor_para, - *ctx, - engine_param_.max_batch_size, - weights_[device_id]); + auto model = std::make_unique>(model_param_, // + attn_param_, + moe_param_, + lora_param_, + tensor_para, + *ctx, + engine_param_.max_batch_size, + weights_[device_id]); - auto engine = std::make_unique>(engine_param_, // - std::move(model), - std::move(ctx), - shared_state_, - device_id); + auto engine = std::make_unique>(engine_param_, // + std::move(model), + std::move(ctx), + shared_state_, + device_id); // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang // due to concurrent kernel launch & cudaMallocHost @@ -397,14 +398,14 @@ std::unique_ptr LlamaTritonModel::createModelInstance(int device_id, int rank, cudaStream_t stream, - std::pair, std::vector>, - std::shared_ptr) + std::pair, std::vector>, + std::shared_ptr) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(engines_[device_id] != nullptr); + FT_CHECK(engines_[device_id] != nullptr); - auto allocator = std::make_unique>(device_id, false); + auto allocator = std::make_unique>(device_id, false); allocator->setStream(stream); @@ -414,25 +415,25 @@ LlamaTritonModel::createModelInstance(int device_id, template void LlamaTritonModel::createSharedWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int tensor_para_rank = rank % 
tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; - ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.embedding_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); + weights_[device_id] = std::make_shared>(model_param_.head_num, + model_param_.kv_head_num, + model_param_.head_dim, + model_param_.hidden_units, + model_param_.inter_size, + model_param_.vocab_size, + model_param_.embedding_size, + model_param_.layer_num, + attn_bias_, + weight_type_, + group_size_, + lora_param_, + moe_param_, + tensor_para_size_, + tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -441,37 +442,41 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) } template -TensorMap LlamaTritonModel::getParams(int deviceId, int rank) +std::unordered_map LlamaTritonModel::getParams(int deviceId, int rank) { - ft::check_cuda_error(cudaSetDevice(deviceId)); + check_cuda_error(cudaSetDevice(deviceId)); + // shared_weight should be created before getParams - ft::FT_CHECK(weights_[deviceId] != nullptr); - ft::TensorMap output = weights_[deviceId]->getParams(); - TensorMap result; + FT_CHECK(weights_[deviceId] != nullptr); + + TensorMap output = weights_[deviceId]->getParams(); + + std::unordered_map result; for (auto [name, tensor] : output) { - result.emplace(name, triton::Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}); + result.insert({{name, Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}}}); } + return result; } template void LlamaTritonModel::processWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(weights_[device_id] != nullptr); + check_cuda_error(cudaSetDevice(device_id)); + FT_CHECK(weights_[device_id] != nullptr); cudaDeviceProp props{}; - ft::check_cuda_error(cudaGetDeviceProperties(&props, device_id)); + check_cuda_error(cudaGetDeviceProperties(&props, device_id)); weights_[device_id]->prepare(props); - ft::sync_check_cuda_error(); + sync_check_cuda_error(); } template -void LlamaTritonModel::createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +void LlamaTritonModel::createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); @@ -509,17 +514,11 @@ std::string LlamaTritonModel::toString() } template -void LlamaTritonModel::createCustomComms( - std::vector>* custom_all_reduce_comms, int world_size) +void LlamaTritonModel::createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) { - using commDataType = typename ft::CustomARCommTypeConverter::Type; - ft::initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); -} - -template -std::unique_ptr LlamaTritonModel::createInstanceComm(int size) -{ - return nullptr; + using commDataType = typename CustomARCommTypeConverter::Type; + initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); } 
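For orientation, here is a minimal Python-side sketch of how the getParams map refactored above (now a plain std::unordered_map of name to Tensor) can be consumed through the DLPack hooks that bind.cpp exposes on Tensor (__dlpack__ / __dlpack_device__). The binding name model_comm.get_params and the use of torch.from_dlpack are assumptions of this sketch, not part of the patch.

import torch

def export_params(model_comm, device_id=0, rank=0):
    # get_params(device_id, rank) is assumed to return the bound TensorMap;
    # iterating it is assumed to yield parameter names, indexing returns tensors.
    params = {}
    tm_params = model_comm.get_params(device_id, rank)
    for name in tm_params:
        # each bound Tensor implements __dlpack__/__dlpack_device__, so torch
        # can wrap the underlying (possibly GPU) buffer without copying it
        params[name] = torch.from_dlpack(tm_params[name])
    return params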
template @@ -541,3 +540,5 @@ template struct LlamaTritonModel; #ifdef ENABLE_BF16 template struct LlamaTritonModel<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e721..b1d00a7cb6 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -31,7 +31,7 @@ #include #include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModel: public AbstractTransformerModel { @@ -44,27 +44,25 @@ struct LlamaTritonModel: public AbstractTransformerModel { ~LlamaTritonModel() override; std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) override; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) override; void createSharedWeights(int deviceId, int rank) override; - TensorMap getParams(int deviceId, int rank) override; + std::unordered_map getParams(int deviceId, int rank) override; void processWeights(int deviceId, int rank) override; - void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) override; + void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) override; - void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) override; - - std::unique_ptr createInstanceComm(int size) override; + void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) override; void handleMissingParams(); @@ -78,27 +76,27 @@ struct LlamaTritonModel: public AbstractTransformerModel { int getPipelineParaSize() override; private: - std::unique_ptr> - createSharedModelInstance(int deviceId, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr); - - ft::ModelParam model_param_; - ft::AttentionParam attn_param_; - ft::MoeParam moe_param_; - ft::LoraParam lora_param_; - ft::EngineParam engine_param_; - size_t tensor_para_size_; - size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; - - std::shared_ptr shared_state_; + std::unique_ptr> + createSharedModelInstance(int deviceId, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr); + + ModelParam model_param_; + AttentionParam attn_param_; + MoeParam moe_param_; + LoraParam lora_param_; + EngineParam engine_param_; + size_t tensor_para_size_; + size_t pipeline_para_size_; + WeightType weight_type_; + bool attn_bias_; + int group_size_; + + std::shared_ptr shared_state_; // Weights & engine instances for the ranks - std::vector>> weights_; - std::vector>> engines_; + std::vector>> weights_; + std::vector>> engines_; bool is_fp16_; int enable_custom_all_reduce_ = 0; @@ -108,3 +106,5 @@ struct LlamaTritonModel: public AbstractTransformerModel { ffi_api_lock_ctrl_t ffi_lock_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index 8221f932ce..976fc9cc1d 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -31,78 +31,23 @@ 
#include #include -namespace ft = turbomind; +namespace turbomind { template -void triton_stream_callback(std::unordered_map* output_tensors, void* ctx) +void triton_stream_callback(std::unordered_map* outputs, void* ctx) { - LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); - auto result = LlamaTritonModelInstance::convert_outputs(*output_tensors); - - model->stream_cb_(result, model->stream_ctx_); + LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); + model->stream_cb_(std::make_shared>(*outputs), model->stream_ctx_); } template -LlamaTritonModelInstance::LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id): +LlamaTritonModelInstance::LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id): device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) { } -template -std::unordered_map LlamaTritonModelInstance::convert_inputs( - std::shared_ptr> input_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - const size_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const size_t input_data_len = input_tensors->at("input_ids").shape[1]; - h_total_output_lengths_ = - (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - - std::unordered_map ft_input_tensors{}; - - for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { - if (ft_input_tensors.count(t->first) == 0) { - ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()}); - } - } - - return ft_input_tensors; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::convert_outputs(const std::unordered_map& output_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - std::unordered_map* outputs_mapping = - new std::unordered_map(); - - for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) { - outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)}); - } - - return std::shared_ptr>(outputs_mapping); -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - template std::string format_vector(const std::vector& vec) { @@ -118,120 +63,109 @@ std::string format_vector(const std::vector& vec) } template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm* instance_comm) +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> inputs) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); // In some cases, this is needed to trigger the creation of CUDA context, or later `cudaMallocAsync` will die - ft::check_cuda_error(cudaSetDevice(device_id_)); + check_cuda_error(cudaSetDevice(device_id_)); - FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2, - "input_tensors->at(\"input_ids\").shape.size() == 2"); - FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1, - "input_tensors->at(\"input_lengths\").shape.size() == 1"); + FT_CHECK_WITH_INFO(inputs->at("input_ids").shape.size() == 2, "inputs->at(\"input_ids\").shape.size() == 2"); + FT_CHECK_WITH_INFO(inputs->at("input_lengths").shape.size() == 1, + "inputs->at(\"input_lengths\").shape.size() == 1"); - const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const uint32_t max_request_output_len = 
(size_t)*std::max_element( - (int*)input_tensors->at("request_output_len").data, - (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]); + const uint32_t request_batch_size = inputs->at("input_ids").shape[0]; + const uint32_t max_request_output_len = (size_t)*std::max_element((int*)inputs->at("request_output_len").data, + (int*)inputs->at("request_output_len").data + + inputs->at("request_output_len").shape[0]); // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; - const uint32_t beam_width = - input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1; + const uint32_t beam_width = inputs->count("beam_width") ? (size_t)(*(uint*)inputs->at("beam_width").data) : 1; FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented"); - std::unordered_map ft_input_tensors = convert_inputs(input_tensors); + h_total_output_lengths_ = + (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - const size_t max_input_len = input_tensors->at("input_ids").shape[1]; - const bool is_return_logits = - input_tensors->count("is_return_logits") && *(bool*)input_tensors->at("is_return_logits").data; + const size_t max_input_len = inputs->at("input_ids").shape[1]; + const bool is_return_logits = inputs->count("is_return_logits") && *(bool*)inputs->at("is_return_logits").data; const size_t vocab_size = instance_->model().vocab_size(); allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); - std::unordered_map output_tensors = std::unordered_map{ + std::unordered_map outputs{ {"output_ids", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, - d_output_ids_}}, + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, + d_output_ids_}}, {"sequence_length", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width}, - d_sequence_lengths_}}}; - - if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) { - output_tensors.insert({"output_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_request_output_len}, - d_output_log_probs_}}); - output_tensors.insert({"cum_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width}, - d_cum_log_probs_}}); + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths_}}}; + + if (inputs->count("is_return_log_probs") && *((bool*)inputs->at("is_return_log_probs").data)) { + outputs.insert({"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_request_output_len}, + d_output_log_probs_}}); + outputs.insert( + {"cum_log_probs", + Tensor{MEMORY_GPU, TYPE_FP32, std::vector{request_batch_size, beam_width}, d_cum_log_probs_}}); } - if (input_tensors->count("logprobs")) { + if (inputs->count("logprobs")) { size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; h_logprob_vals_ = (float*)std::realloc( - h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * ft::kMaxLogProb); - h_logprob_indexes_ = (uint32_t*)std::realloc(h_logprob_indexes_, - sizeof(uint32_t) * request_batch_size * 
beam_width - * max_logprob_length * ft::kMaxLogProb); - h_logprob_nums_ = (uint32_t*)std::realloc( + h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_indexes_ = (uint32_t*)std::realloc( + h_logprob_indexes_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_nums_ = (uint32_t*)std::realloc( h_logprob_nums_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length); - output_tensors.insert( - {{"logprob_vals", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_vals_}}}); - - output_tensors.insert( - {{"logprob_indexes", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_indexes_}}}); - - output_tensors.insert({{"logprob_nums", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length}, - h_logprob_nums_}}}); + outputs.insert({{"logprob_vals", + Tensor{MEMORY_CPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_vals_}}}); + + outputs.insert({{"logprob_indexes", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_indexes_}}}); + + outputs.insert({{"logprob_nums", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length}, + h_logprob_nums_}}}); } if (is_return_logits) { - output_tensors.insert( - {"logits", - {ft::MEMORY_GPU, ft::TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}); + outputs.insert( + {{"logits", {MEMORY_GPU, TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}}); } try { - ft::Request::Callback callback; + Request::Callback callback; if (stream_cb_) { - callback = [this](std::unordered_map* outputs) { + callback = [this](std::unordered_map* outputs) { triton_stream_callback(outputs, this); }; } - ft::check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); - instance_->Submit(&output_tensors, &ft_input_tensors, {instance_comm, callback}); + check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); + + instance_->Submit(&outputs, inputs.get(), {callback}); // ! stream synced by the model before returning } catch (...) 
{ h_exception_ = std::current_exception(); - output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}}); + outputs.insert({"error_message", Tensor{MEMORY_CPU, TYPE_BYTES, {1}, &h_exception_}}); } - return convert_outputs(output_tensors); + return std::make_shared>(std::move(outputs)); } template @@ -278,3 +212,5 @@ template struct LlamaTritonModelInstance; #ifdef ENABLE_BF16 template struct LlamaTritonModelInstance<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index 08088c05d5..2cf69b9fa5 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -20,41 +20,29 @@ #pragma once +#include + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id); - ~LlamaTritonModelInstance(); - - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; + LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id); + ~LlamaTritonModelInstance() override; - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; - - std::shared_ptr> - forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm*) override; - - static std::shared_ptr> - convert_outputs(const std::unordered_map& output_tensors); + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; private: - ft::Engine* instance_; - const std::unique_ptr> allocator_; - - std::unordered_map - convert_inputs(std::shared_ptr> input_tensors); + Engine* instance_; + const std::unique_ptr> allocator_; void allocateBuffer(const size_t request_batch_size, const size_t max_input_len, @@ -88,3 +76,5 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { uint32_t* h_total_output_lengths_ = nullptr; std::exception_ptr h_exception_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.cpp b/src/turbomind/triton_backend/transformer_triton_backend.cpp index 16c64b17d5..acf5e06e88 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.cpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.cpp @@ -21,62 +21,66 @@ #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/nccl_utils.h" -std::pair, std::vector> +namespace turbomind { + +std::pair, std::vector> AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node) { - const int gpu_count = ft::getDeviceCount(); + const int gpu_count = getDeviceCount(); const int tensor_para_size = getTensorParaSize(); const int pipeline_para_size = getPipelineParaSize(); const int local_comm_size = multi_node ? 
gpu_count : tensor_para_size * pipeline_para_size; - ft::FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); - ft::FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); + FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); + FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); - std::vector nccl_ids; + std::vector nccl_ids; if (tensor_para_size > 1 || pipeline_para_size > 1) { nccl_ids.resize(tensor_para_size + pipeline_para_size); if (node_id == 0) { for (uint32_t i = 0; i < nccl_ids.size(); i++) { - ft::ftNcclGetUniqueId(nccl_ids[i]); + ftNcclGetUniqueId(nccl_ids[i]); } } } - std::vector tensor_para_params(local_comm_size); - std::vector pipeline_para_params(local_comm_size); + std::vector tensor_para_params(local_comm_size); + std::vector pipeline_para_params(local_comm_size); // Don't init comm when size == 1 if (tensor_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank( + NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank( tensor_para_params[gid - device_id_start], tensor_para_rank, tensor_para_size, tensor_para_nccl_uid); tensor_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } if (pipeline_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], - pipeline_para_rank, - pipeline_para_size, - pipeline_para_nccl_uid); + NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], + pipeline_para_rank, + pipeline_para_size, + pipeline_para_nccl_uid); pipeline_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } - return std::pair, std::vector>(tensor_para_params, pipeline_para_params); + return std::pair, std::vector>(tensor_para_params, pipeline_para_params); } + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 066d75a780..6d49df4578 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -30,242 +30,11 @@ #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/custom_ar_comm.h" -#include "src/turbomind/utils/instance_comm.h" #include "src/turbomind/utils/nccl_utils.h" -namespace ft = turbomind; +namespace turbomind { -namespace triton { 
-#ifdef USE_TRITONSERVER_DATATYPE - -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#ifndef TRITONSERVER_API_VERSION_MAJOR -#error TRITONSERVER_API_VERSION_MAJOR Undefined! -#endif - -#ifndef TRITONSERVER_API_VERSION_MINOR -#error TRITONSERVER_API_VERSION_MINOR Undefined! -#endif - -#if (TRITONSERVER_API_VERSION_MAJOR == 1 && TRITONSERVER_API_VERSION_MINOR >= 17) \ - || (TRITONSERVER_API_VERSION_MAJOR > 1) -#define ENABLE_TRITON_BF16 1 -#endif - -typedef TRITONSERVER_DataType DataType; -typedef TRITONSERVER_MemoryType MemoryType; - -constexpr TRITONSERVER_DataType TYPE_INVALID = TRITONSERVER_TYPE_INVALID; -constexpr TRITONSERVER_DataType TYPE_BOOL = TRITONSERVER_TYPE_BOOL; -constexpr TRITONSERVER_DataType TYPE_UINT8 = TRITONSERVER_TYPE_UINT8; -constexpr TRITONSERVER_DataType TYPE_UINT16 = TRITONSERVER_TYPE_UINT16; -constexpr TRITONSERVER_DataType TYPE_UINT32 = TRITONSERVER_TYPE_UINT32; -constexpr TRITONSERVER_DataType TYPE_UINT64 = TRITONSERVER_TYPE_UINT64; -constexpr TRITONSERVER_DataType TYPE_INT8 = TRITONSERVER_TYPE_INT8; -constexpr TRITONSERVER_DataType TYPE_INT16 = TRITONSERVER_TYPE_INT16; -constexpr TRITONSERVER_DataType TYPE_INT32 = TRITONSERVER_TYPE_INT32; -constexpr TRITONSERVER_DataType TYPE_INT64 = TRITONSERVER_TYPE_INT64; -constexpr TRITONSERVER_DataType TYPE_FP16 = TRITONSERVER_TYPE_FP16; -constexpr TRITONSERVER_DataType TYPE_FP32 = TRITONSERVER_TYPE_FP32; -constexpr TRITONSERVER_DataType TYPE_FP64 = TRITONSERVER_TYPE_FP64; -constexpr TRITONSERVER_DataType TYPE_BYTES = TRITONSERVER_TYPE_BYTES; - -#ifdef ENABLE_TRITON_BF16 -constexpr TRITONSERVER_DataType TYPE_BF16 = TRITONSERVER_TYPE_BF16; -#endif -constexpr TRITONSERVER_MemoryType MEMORY_CPU = TRITONSERVER_MEMORY_CPU; -constexpr TRITONSERVER_MemoryType MEMORY_CPU_PINNED = TRITONSERVER_MEMORY_CPU_PINNED; -constexpr TRITONSERVER_MemoryType MEMORY_GPU = TRITONSERVER_MEMORY_GPU; - -#else - -typedef ft::DataType DataType; -typedef ft::MemoryType MemoryType; - -constexpr DataType TYPE_INVALID = ft::TYPE_INVALID; -constexpr DataType TYPE_BOOL = ft::TYPE_BOOL; -constexpr DataType TYPE_UINT8 = ft::TYPE_UINT8; -constexpr DataType TYPE_UINT16 = ft::TYPE_UINT16; -constexpr DataType TYPE_UINT32 = ft::TYPE_UINT32; -constexpr DataType TYPE_UINT64 = ft::TYPE_UINT64; -constexpr DataType TYPE_INT8 = ft::TYPE_INT8; -constexpr DataType TYPE_INT16 = ft::TYPE_INT16; -constexpr DataType TYPE_INT32 = ft::TYPE_INT32; -constexpr DataType TYPE_INT64 = ft::TYPE_INT64; -constexpr DataType TYPE_FP16 = ft::TYPE_FP16; -constexpr DataType TYPE_FP32 = ft::TYPE_FP32; -constexpr DataType TYPE_FP64 = ft::TYPE_FP64; -constexpr DataType TYPE_BYTES = ft::TYPE_BYTES; -constexpr DataType TYPE_BF16 = ft::TYPE_BF16; -constexpr MemoryType MEMORY_CPU = ft::MEMORY_CPU; -constexpr MemoryType MEMORY_CPU_PINNED = ft::MEMORY_CPU_PINNED; -constexpr MemoryType MEMORY_GPU = ft::MEMORY_GPU; - -#endif - -struct Tensor { - const MemoryType where; - const DataType type; - const std::vector shape; - const void* data; - - Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): - where(_where), type(_type), shape(_shape), data(_data) - { - } - - static ft::DataType convertTritonTypeToFt(DataType tmp_type) - { - ft::DataType ft_data_type; - switch (tmp_type) { - case TYPE_INVALID: - ft_data_type = ft::DataType::TYPE_INVALID; - break; - case TYPE_BOOL: - ft_data_type = ft::DataType::TYPE_BOOL; - break; - case TYPE_UINT8: - ft_data_type = ft::DataType::TYPE_UINT8; - break; - case TYPE_UINT16: - ft_data_type = 
ft::DataType::TYPE_UINT16; - break; - case TYPE_UINT32: - ft_data_type = ft::DataType::TYPE_UINT32; - break; - case TYPE_UINT64: - ft_data_type = ft::DataType::TYPE_UINT64; - break; - case TYPE_INT8: - ft_data_type = ft::DataType::TYPE_INT8; - break; - case TYPE_INT16: - ft_data_type = ft::DataType::TYPE_INT16; - break; - case TYPE_INT32: - ft_data_type = ft::DataType::TYPE_INT32; - break; - case TYPE_INT64: - ft_data_type = ft::DataType::TYPE_INT64; - break; - case TYPE_FP16: - ft_data_type = ft::DataType::TYPE_FP16; - break; - case TYPE_FP32: - ft_data_type = ft::DataType::TYPE_FP32; - break; - case TYPE_FP64: - ft_data_type = ft::DataType::TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - ft_data_type = ft::DataType::TYPE_BF16; - break; -#endif - case TYPE_BYTES: - ft_data_type = ft::DataType::TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(tmp_type)); - break; - } - return ft_data_type; - } - - ft::Tensor convertTritonTensorToFt() - { - ft::DataType ft_data_type = convertTritonTypeToFt(type); - ft::MemoryType ft_memory_type; - switch (where) { - case MEMORY_CPU: - ft_memory_type = ft::MemoryType::MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - ft_memory_type = ft::MemoryType::MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - ft_memory_type = ft::MemoryType::MEMORY_GPU; - break; - } - return ft::Tensor{ft_memory_type, ft_data_type, shape, data}; - } - - static Tensor convertFtTensorToTriton(ft::Tensor ft_tensor) - { - DataType triton_data_type; - switch (ft_tensor.type) { - case TYPE_INVALID: - triton_data_type = TYPE_INVALID; - break; - case TYPE_BOOL: - triton_data_type = TYPE_BOOL; - break; - case TYPE_UINT8: - triton_data_type = TYPE_UINT8; - break; - case TYPE_UINT16: - triton_data_type = TYPE_UINT16; - break; - case TYPE_UINT32: - triton_data_type = TYPE_UINT32; - break; - case TYPE_UINT64: - triton_data_type = TYPE_UINT64; - break; - case TYPE_INT8: - triton_data_type = TYPE_INT8; - break; - case TYPE_INT16: - triton_data_type = TYPE_INT16; - break; - case TYPE_INT32: - triton_data_type = TYPE_INT32; - break; - case TYPE_INT64: - triton_data_type = TYPE_INT64; - break; - case TYPE_FP16: - triton_data_type = TYPE_FP16; - break; - case TYPE_FP32: - triton_data_type = TYPE_FP32; - break; - case TYPE_FP64: - triton_data_type = TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - triton_data_type = TYPE_BF16; - break; -#endif - case TYPE_BYTES: - triton_data_type = TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(ft_tensor.type)); - break; - } - MemoryType triton_memory_type; - switch (ft_tensor.where) { - case MEMORY_CPU: - triton_memory_type = MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - triton_memory_type = MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - triton_memory_type = MEMORY_GPU; - break; - } - return Tensor{triton_memory_type, triton_data_type, ft_tensor.shape, ft_tensor.data}; - } -}; - -} // namespace triton - -using triton_stream_cb_t = std::function>, void*)>; +using triton_stream_cb_t = std::function>, void*)>; struct AbstractTransformerModel; struct AbstractTransformerModelInstance; @@ -273,17 +42,8 @@ struct AbstractTransformerModelInstance; struct AbstractTransformerModelInstance { virtual ~AbstractTransformerModelInstance() = default; - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual 
std::shared_ptr> - forward(std::shared_ptr> input_tensors, ft::AbstractInstanceComm*) - { - return forward(input_tensors); - } + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) = 0; void registerCallback(triton_stream_cb_t cb, void* ctx) { @@ -301,43 +61,38 @@ struct AbstractTransformerModelInstance { void* stream_ctx_ = nullptr; }; -using TensorMap = std::unordered_map; - struct AbstractTransformerModel { static std::shared_ptr createLlamaModel(std::string model_dir); virtual ~AbstractTransformerModel() = default; - virtual std::pair, std::vector> + virtual std::pair, std::vector> createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false); - virtual void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) = 0; - - virtual std::unique_ptr createInstanceComm(int size) - { - return nullptr; - } + virtual void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) = 0; virtual std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) = 0; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) = 0; virtual void createSharedWeights(int deviceId, int rank) = 0; - virtual TensorMap getParams(int deviceId, int rank) = 0; + virtual std::unordered_map getParams(int deviceId, int rank) = 0; virtual void processWeights(int deviceId, int rank) = 0; - virtual void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) = 0; + virtual void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) = 0; virtual std::string toString() = 0; virtual int getTensorParaSize() = 0; virtual int getPipelineParaSize() = 0; }; + +} // namespace turbomind diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h index 6214f6bbc2..b2b8524e09 100644 --- a/src/turbomind/utils/Tensor.h +++ b/src/turbomind/utils/Tensor.h @@ -515,6 +515,16 @@ class TensorMap { return tensor_map_.end(); } + int count(const std::string& key) const + { + return tensor_map_.count(key); + } + + bool empty() const + { + return tensor_map_.empty(); + } + std::string toString(); static TensorMap fromNpyFolder(const std::string& base_folder); void saveNpy(const std::string& base_folder); diff --git a/src/turbomind/utils/instance_comm.h b/src/turbomind/utils/instance_comm.h deleted file mode 100644 index 5a25360a05..0000000000 --- a/src/turbomind/utils/instance_comm.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -namespace turbomind { - -class AbstractInstanceComm { -public: - virtual ~AbstractInstanceComm() = default; - - virtual void barrier() = 0; - - virtual void setSharedObject(void*) = 0; - - virtual void* getSharedObject() = 0; -}; - -} // namespace turbomind From 382f92bb5d214c639fd0f7f4ff477b2a3d0ba87c Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 5 Dec 2024 19:56:47 +0800 Subject: [PATCH 02/40] async interface --- lmdeploy/turbomind/chat.py | 60 ++- lmdeploy/turbomind/turbomind.py | 452 +++++++----------- src/turbomind/models/llama/LlamaBatch.cc | 189 +++----- src/turbomind/models/llama/LlamaBatch.h | 18 +- src/turbomind/models/llama/Request.h | 108 ++++- src/turbomind/python/bind.cpp | 229 ++++++--- src/turbomind/triton_backend/CMakeLists.txt | 2 +- .../triton_backend/llama/LlamaTritonModel.cc | 15 +- 
.../triton_backend/llama/LlamaTritonModel.h | 7 +- .../llama/LlamaTritonModelInstance.cc | 8 + .../llama/LlamaTritonModelInstance.h | 8 +- .../triton_backend/model_request.cpp | 153 ++++++ src/turbomind/triton_backend/model_request.h | 53 ++ .../transformer_triton_backend.hpp | 13 +- src/turbomind/utils/Tensor.h | 25 + 15 files changed, 807 insertions(+), 533 deletions(-) create mode 100644 src/turbomind/triton_backend/model_request.cpp create mode 100644 src/turbomind/triton_backend/model_request.h diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index e106beae17..7dc8778957 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import asyncio import os import random @@ -28,6 +29,40 @@ def input_prompt(model_name): return '\n'.join(iter(input, sentinel)) +def infer(generator, session_id, input_ids, gen_config, sequence_start, + sequence_end, step, stream_output, tokenizer, state): + for outputs in generator.stream_infer(session_id=session_id, + input_ids=input_ids, + gen_config=gen_config, + sequence_start=sequence_start, + sequence_end=sequence_end, + step=step, + stream_output=stream_output): + res, tokens = input_ids + outputs.token_ids, outputs.num_token + # decode res + response, state = tokenizer.detokenize_incrementally(res, state=state) + print(response, end='', flush=True) + return tokens + + +async def async_infer(generator, session_id, input_ids, gen_config, + sequence_start, sequence_end, step, stream_output, + tokenizer, state): + async for output in generator.async_stream_infer( + session_id=session_id, + input_ids=input_ids, + gen_config=gen_config, + sequence_start=sequence_start, + sequence_end=sequence_end, + step=step, + stream_output=stream_output): + res, tokens = input_ids + output.token_ids, output.num_token + # decode res + response, state = tokenizer.detokenize_incrementally(res, state=state) + print(response, end='', flush=True) + return tokens + + def main(model_path: str, session_id: int = 1, top_k: float = 40, @@ -47,6 +82,7 @@ def main(model_path: str, stream_output: bool = True, request_output_len: int = 1024, chat_template_config: ChatTemplateConfig = None, + use_async: bool = True, **kwargs): """An example to perform model inference through the command line interface. 
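A minimal sketch of driving the coroutine-based generator added in this patch, assuming a `generator` obtained from `tm_model.create_instance()`, a tokenized `input_ids` list and a prepared `gen_config`, mirroring the `async_infer()` helper above:

import asyncio

async def generate(generator, session_id, input_ids, gen_config):
    # consume the async generator until the engine reports the final output
    n_token = 0
    async for output in generator.async_stream_infer(session_id=session_id,
                                                     input_ids=input_ids,
                                                     gen_config=gen_config,
                                                     sequence_start=True,
                                                     sequence_end=True,
                                                     step=0,
                                                     stream_output=True):
        n_token = output.num_token
    return n_token

# tokens = asyncio.run(generate(generator, 1, input_ids, gen_config))
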
@@ -163,20 +199,16 @@ def main(model_path: str, print(f'{prompt}', end='', flush=True) state = DetokenizeState(len(input_ids)) - for outputs in generator.stream_infer( - session_id=session_id, - input_ids=[input_ids], - gen_config=gen_config, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stream_output=stream_output): - - res, tokens = input_ids + outputs.token_ids, outputs.num_token - # decode res - response, state = tokenizer.detokenize_incrementally( - res, state=state) - print(response, end='', flush=True) + + if use_async: + coro = async_infer(generator, session_id, input_ids, + gen_config, sequence_start, sequence_end, + step, stream_output, tokenizer, state) + tokens = asyncio.run(coro) + else: + tokens = infer(generator, session_id, input_ids, gen_config, + sequence_start, sequence_end, step, + stream_output, tokenizer, state) # update step step += len(input_ids) + tokens diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index a1b2fff944..07034c2f64 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -4,10 +4,12 @@ import json import os.path as osp import sys +import threading +from collections.abc import Sequence from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict from itertools import repeat -from queue import LifoQueue, Queue +from queue import Queue from typing import Dict, Iterable, List import numpy as np @@ -343,51 +345,12 @@ def __init__(self, # create model instances self.model_inst = self._create_model_instance(0) - self.que = Queue() - self.executor: ThreadPoolExecutor = None - self.future = None self.config = config def _create_model_instance(self, device_id): - rank = self.node_id * self.gpu_count + device_id - model_inst = self.tm_model.model_comm.create_model_instance( - device_id, rank, self.cuda_stream_id, self.nccl_params) + model_inst = self.tm_model.model_comm.create_model_instance(device_id) return model_inst - def _forward_callback(self, result, ctx): - self.que.put((False, result)) - - def _forward_thread(self, inputs): - - def _func(): - try: - output = self.model_inst.forward(inputs) - except Exception as e: - logger.error(f'unhandled exception: {e}') - self.que.put((-1, None)) - return - self.que.put((True, output)) - - self.executor = ThreadPoolExecutor(1) - self.future = self.executor.submit(_func) - - def _async_forward_callback(self, result, ctx, que: LifoQueue): - que.put((False, result)) - - def _async_forward_thread(self, inputs, que: LifoQueue): - - def _func(): - try: - output = self.model_inst.forward(inputs) - except Exception as e: - logger.error(f'unhandled exception: {e}') - que.put((-1, None)) - return - que.put((True, output)) - - self.executor = ThreadPoolExecutor(1) - self.future = self.executor.submit(_func) - def _get_logprobs(self, logprob_vals: torch.Tensor, logprob_indexes: torch.Tensor, @@ -506,61 +469,17 @@ def prepare_embeddings(self, return input_embeddings, input_embedding_ranges def prepare_inputs(self, - session_id, input_ids, gen_config: GenerationConfig, input_embeddings=None, - input_embedding_ranges=None, - sequence_start: bool = True, - sequence_end: bool = False, - step=0, - stop=False): + input_embedding_ranges=None): """Convert inputs format.""" - if len(input_ids) == 0: - input_ids = [[]] - if isinstance(input_ids[0], int): - input_ids = [input_ids] - - batch_size = len(input_ids) - - def _broadcast_np(data, dtype, shape=(batch_size, )): - if isinstance(data, Iterable): - assert len(data) == 
batch_size - return data - - return np.full(shape, data, dtype=dtype) - - input_ids = [torch.IntTensor(ids) for ids in input_ids] - input_lengths = torch.IntTensor([len(ids) for ids in input_ids]) - input_ids = pad_sequence(input_ids, - batch_first=True, - padding_value=self.eos_id) + assert isinstance(input_ids, Sequence) - if isinstance(session_id, int): - session_id = [session_id] - assert len(session_id) == batch_size + input_ids = torch.IntTensor(input_ids) + input_lengths = torch.IntTensor([len(input_ids)]) - step = _broadcast_np(step, np.int32) - - inputs = dict( - input_ids=input_ids, - input_lengths=input_lengths, - request_output_len=np.full(input_lengths.shape, - gen_config.max_new_tokens, - dtype=np.uint32), - runtime_top_k=_broadcast_np(gen_config.top_k, np.uint32), - runtime_top_p=_broadcast_np(gen_config.top_p, np.float32), - runtime_min_p=_broadcast_np(gen_config.min_p, np.float32), - temperature=_broadcast_np(gen_config.temperature, np.float32), - repetition_penalty=_broadcast_np(gen_config.repetition_penalty, - np.float32), - step=step, - - # session input - START=_broadcast_np((1 if sequence_start else 0), np.int32), - END=_broadcast_np((1 if sequence_end else 0), np.int32), - CORRID=np.array(session_id, dtype=np.uint64), - STOP=_broadcast_np((1 if stop else 0), np.int32)) + inputs = dict(input_ids=input_ids, ) input_embeddings, input_embedding_ranges = self.prepare_embeddings( input_embeddings, input_embedding_ranges) @@ -568,17 +487,6 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): inputs['input_embeddings'] = input_embeddings inputs['input_embedding_ranges'] = input_embedding_ranges - if gen_config.min_new_tokens is not None: - inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens, - np.int32) - - if gen_config.logprobs is not None and gen_config.logprobs > 0: - if gen_config.logprobs > MAX_LOGPROBS: - gen_config.logprobs = MAX_LOGPROBS - logger.warning('logprobs shoudd be in range [1, 1024]' - f'update logprobs={gen_config.logprobs}') - inputs['logprobs'] = _broadcast_np(gen_config.logprobs, np.int32) - bad_words = [] if gen_config.bad_token_ids is not None: bad_words.extend(gen_config.bad_token_ids) @@ -597,11 +505,17 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): if bad_words is not None: inputs['bad_words_list'] = bad_words - if gen_config.random_seed is not None: - inputs['random_seed'] = _broadcast_np(gen_config.random_seed, - np.uint64) return inputs, input_lengths + async def async_signal(self, state): + async with self.cond: + self.flag, self.state = 1, state + self.cond.notify() + + def async_signal_cb(self, state): + coro = self.async_signal(state) + asyncio.run_coroutine_threadsafe(coro, self.event_loop) + async def async_stream_infer(self, session_id, input_ids, @@ -631,99 +545,115 @@ async def async_stream_infer(self, kwargs (dict): kwargs for backward compatibility """ # start forward thread - que = LifoQueue() - from functools import partial - _forward_callback = partial(self._async_forward_callback, que=que) - _forward_thread = partial(self._async_forward_thread, que=que) - if stream_output and not stop: - logger.info(f'Register stream callback for {session_id}') - self.model_inst.register_callback(_forward_callback) - - inputs, input_lengths = self.prepare_inputs( - session_id=session_id, + + self.event_loop = asyncio.get_running_loop() + self.cond = asyncio.Condition() + self.flag = 0 + + gen_cfg = self._get_generation_config(gen_config) + + inputs, input_length = self.prepare_inputs( input_ids=input_ids, 
input_embeddings=input_embeddings, input_embedding_ranges=input_embedding_ranges, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stop=stop, gen_config=gen_config) - tm_inputs = _np_dict_to_tm_dict(inputs) - _forward_thread(tm_inputs) + session = _tm.SessionParam(id=session_id, + step=step, + start=sequence_start, + end=sequence_end, + stop=stop) - seq_start = input_lengths + input_lengths.new_tensor(step) + inputs = _np_dict_to_tm_dict(inputs) - out_logprobs = None - prev_len = 0 - # generator - while True: - while que.qsize() == 0: # let other requests in - await asyncio.sleep(0.002) - - finish, tm_outputs = que.get() - if finish < 0: - yield EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, - token_ids=[], - num_token=0) - self.executor.shutdown() - break + outputs = self.model_inst.forward(inputs, session, gen_cfg, + stream_output, self.async_signal_cb) - outputs = _tm_dict_to_torch_dict(tm_outputs) + outputs = _tm_dict_to_torch_dict(outputs) - output_ids = outputs['output_ids'][:, 0, :] - sequence_length = outputs['sequence_length'].long()[:, 0] - output_ids = [ - output_id[s:l] for output_id, s, l in zip( - output_ids, seq_start, sequence_length) - ] - sequence_length -= seq_start.to(sequence_length.device) - - if 'logprob_vals' in outputs: - logprob_vals = outputs['logprob_vals'][0, 0] - logprob_indexes = outputs['logprob_indexes'][0, 0] - logprob_nums = outputs['logprob_nums'][0, 0] - out_logprobs = self._get_logprobs(logprob_vals, - logprob_indexes, - logprob_nums, output_ids[0], - gen_config.logprobs, - sequence_length.cpu().item(), - out_logprobs, session_id) - - outputs = [] - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - for output, len_ in zip(output_ids, sequence_length): - output, len_ = output, len_.item() - if len(output) > 0 and output[-1].item() == self.eos_id \ - and not gen_config.ignore_eos: - outputs = EngineOutput(status, output[:-1].tolist(), - len_ - 1) - elif len(output) > 0 and \ - gen_config.stop_token_ids is not None and \ - output[-1].item() in gen_config.stop_token_ids: - outputs = EngineOutput(status, output[:-1].tolist(), len_) - else: - outputs = EngineOutput(status, output.tolist(), len_) - if outputs.num_token < prev_len and not finish: - continue - else: - prev_len = outputs.num_token - - if out_logprobs: - output_token_len = len(outputs.token_ids) - outputs.logprobs = out_logprobs[:output_token_len] - - yield outputs - - if finish: - self.future.result() - self.executor.shutdown() - break + output_ids_buf = outputs['output_ids'] - if stream_output and not stop: - logger.info(f'UN-register stream callback for {session_id}') - self.model_inst.unregister_callback() + seq_start = step + input_length[0] + + out_logprobs = None + finish = False + + try: + # generator + while True: + async with self.cond: + while not self.flag: + await self.cond.wait() + state, self.flag = self.state, 0 + + status, seq_len = state.status, state.seq_len + + if status == 7: + finish = True + status = 0 + elif status: + yield self._get_error_output() + break + + if seq_start == seq_len and not finish: + continue + + output_ids = output_ids_buf[seq_start:seq_len] + gen_len = seq_len - seq_start + status = ResponseType.FINISH if finish else ResponseType.SUCCESS + output = EngineOutput(status, output_ids.tolist(), gen_len, + out_logprobs) + yield output + + if finish: + break + + except Exception as e: + logger.error(e) + yield self._get_error_output() + + finally: + async with self.cond: + # Contract: `notfiy` won't be 
called again if status is non-zero + # wait for status to be set as `finish` or `error` + while not state or state.status == 0: + while not self.flag: + await self.cond.wait() + state = self.state + self.cond = None + self.event_loop = None + + def _get_error_output(self): + return EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, + token_ids=[], + num_token=0) + + def _get_generation_config(self, cfg: GenerationConfig): + c = _tm.GenerationConfig() + c.max_new_tokens = cfg.max_new_tokens + c.top_k = cfg.top_k + c.top_p = cfg.top_p + c.min_p = cfg.min_p + c.temperature = cfg.temperature + c.repetition_penalty = cfg.repetition_penalty + if cfg.min_new_tokens: + c.min_new_tokens = cfg.min_new_tokens + if cfg.logprobs: + if cfg.logprobs > MAX_LOGPROBS: + cfg.logprobs = MAX_LOGPROBS + logger.warning( + f'logprobs shoudd be in range [1, {MAX_LOGPROBS}]' + f'update logprobs={cfg.logprobs}') + c.output_logprobs = cfg.logprobs + if cfg.random_seed is not None: + c.random_seed = cfg.random_seed + # print (c) + return c + + def signal_cb(self, state): + with self.cond: + self.flag, self.state = 1, state + self.cond.notify() def stream_infer(self, session_id, @@ -753,96 +683,86 @@ def stream_infer(self, stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ - if stream_output and not stop: - logger.info(f'Register stream callback for {session_id}') - self.model_inst.register_callback(self._forward_callback) - inputs, input_lengths = self.prepare_inputs( - session_id=session_id, + gen_cfg = self._get_generation_config(gen_config) + + inputs, input_length = self.prepare_inputs( input_ids=input_ids, input_embeddings=input_embeddings, input_embedding_ranges=input_embedding_ranges, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stop=stop, gen_config=gen_config) - tm_inputs = _np_dict_to_tm_dict(inputs) - # start forward thread - self.que = Queue() - self._forward_thread(tm_inputs) + inputs = _np_dict_to_tm_dict(inputs) - seq_start = input_lengths + input_lengths.new_tensor(step) - out_logprobs = None + session = _tm.SessionParam(id=session_id, + step=step, + start=sequence_start, + end=sequence_end, + stop=stop) - # generator - while True: - while self.que.qsize() > 1: - self.que.get() - - finish, tm_outputs = self.que.get() - if finish < 0: - yield EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, - token_ids=[], - num_token=0) - self.executor.shutdown() - break + self.cond = threading.Condition() + self.flag = 0 - outputs = _tm_dict_to_torch_dict(tm_outputs) + outputs = self.model_inst.forward(inputs, session, gen_cfg, + stream_output, self.signal_cb) - output_ids = outputs['output_ids'][:, 0, :] - sequence_length = outputs['sequence_length'].long()[:, 0] - output_ids = [ - output_id[s:l] for output_id, s, l in zip( - output_ids, seq_start, sequence_length) - ] - sequence_length -= seq_start.to(sequence_length.device) - - if 'logprob_vals' in outputs: - logprob_vals = outputs['logprob_vals'][0, 0] - logprob_indexes = outputs['logprob_indexes'][0, 0] - logprob_nums = outputs['logprob_nums'][0, 0] - out_logprobs = self._get_logprobs(logprob_vals, - logprob_indexes, - logprob_nums, output_ids[0], - gen_config.logprobs, - sequence_length.cpu().item(), - out_logprobs, session_id) - - outputs = [] - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - for output, len_ in zip(output_ids, sequence_length): - output, len_ = output, len_.item() - if len(output) > 0 and output[-1].item() == self.eos_id \ - and 
not gen_config.ignore_eos: - outputs = EngineOutput(status, output[:-1].tolist(), - len_ - 1, out_logprobs) - elif len(output) > 0 and \ - gen_config.stop_token_ids is not None and \ - output[-1].item() in gen_config.stop_token_ids: - outputs = EngineOutput(status, output[:-1].tolist(), len_, - out_logprobs) - else: - outputs = EngineOutput(status, output.tolist(), len_, - out_logprobs) - - if out_logprobs: - output_token_len = len(outputs.token_ids) - outputs.logprobs = out_logprobs[:output_token_len] - - yield outputs - - if finish: - self.future.result() - self.executor.shutdown() - while self.que.qsize() > 0: - self.que.get() - break + outputs = _tm_dict_to_torch_dict(outputs) - if stream_output and not stop: - logger.info(f'UN-register stream callback for {session_id}') - self.model_inst.unregister_callback() + output_ids_buf = outputs['output_ids'] + + seq_start = step + input_length[0] + + out_logprobs = None + finish = False + state = None + + try: + # generator + while True: + with self.cond: + while not self.flag: + self.cond.wait() + state = self.state + self.flag = 0 + + status, seq_len = state.status, state.seq_len + + if status == 7: # TODO: use enum + finish = True + status = 0 + elif status: + yield self._get_error_output() + break + + output_ids = output_ids_buf[seq_start:seq_len] + gen_len = seq_len - seq_start + + status = ResponseType.FINISH if finish else ResponseType.SUCCESS + output = EngineOutput(status, output_ids.tolist(), gen_len, + out_logprobs) + + if out_logprobs: + output_token_len = len(output.token_ids) + output.logprobs = out_logprobs[:output_token_len] + + yield output + + if finish: + break + + except Exception as e: + logger.error(e) + yield self._get_error_output() + + finally: + with self.cond: + # Contract: `notfiy` won't be called again if status is non-zero + # wait for status to be set as `finish` or `error` + while not state or state.status == 0: + while not self.flag: + self.cond.wait() + state = self.state + self.cond = None def decode(self, input_ids, diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index ea321d06a0..309ca1b4b4 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -97,7 +97,16 @@ void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_r auto reject = [](const char* type, std::shared_ptr& req, int ec) { TM_LOG_WARNING( "[RejectInvalidRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec); - req->signal.set_value(ec); + /// FIXME: make these signals + if (req->cancel_cb) { + req->cancel_cb(ec); + } + else if (req->end_cb) { + req->end_cb(ec); + } + else if (req->forward_cb) { + FT_CHECK(0); // not implemtented + } req.reset(); }; @@ -106,21 +115,22 @@ void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_r if (r) { int ec = 0; - const int input_length = r->inputs.getVal("input_lengths", 0); + const int input_length = r->inputs.at("input_ids").shape[0]; const auto get_offset = [&](int token_count) { - return std::max(0, std::min(token_count, r->inputs.getVal("step", token_count))); + const int step = r->session.step < 0 ? 
token_count : r->session.step; + return std::max(0, std::min(token_count, r->session.step)); }; if (occurrence[r->id] != 1) { ec = Request::kConflict; } - else if (r->start_flag && r->stop_flag) { + else if (r->session.start_flag && r->session.stop_flag) { ec = Request::kInvalid; } else if (input_length > session_len_) { ec = Request::kTooLong; } - else if (!r->start_flag) { + else if (!r->session.start_flag) { if (auto seq = sequence_manager_->Get(r->id); seq == nullptr) { ec = Request::kInvalid; } @@ -154,7 +164,7 @@ void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_r // invalidate stop-only requests for inactive sequences for (auto& r : stop_reqs) { - if (r && r->end_flag == false) { + if (r && r->session.end_flag == false) { int ec = Request::kInactive; for (int i = 0; i < state_->size; ++i) { if (state_->requests[i] && state_->requests[i]->id == r->id) { @@ -203,20 +213,22 @@ auto LlamaBatch::ProcessStopRequests(const Requests& requests) -> std::vector // stop & optionally erase active sequence if (state_->requests[i] && state_->requests[i]->id == r->id) { ec = 0; - signals.push_back(Interrupt(i, true, r->end_flag)); + signals.push_back(Interrupt(i, true, r->session.end_flag)); ++count; break; } } // mismatch, try erase inactive sequence, in this case there is no active request to interrupt - if (ec && r->end_flag) { + if (ec && r->session.end_flag) { if (sequence_manager_->Erase(r->id)) { ec = 0; } } signals.push_back([=] { if (rank_ == 0) { - r->signal.set_value(ec); + if (r->cancel_cb) { + r->cancel_cb(ec); + } } }); } @@ -248,12 +260,12 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) state.requests[idx] = r; // get sequence for the request - state.sequences[idx] = r->start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); + state.sequences[idx] = r->session.start_flag ? 
sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); FT_CHECK(state.sequences[idx]); auto& seq = *state.sequences[idx]; - if (int step = r->inputs.getVal("step", -1); step >= 0) { + if (int step = r->session.step; step >= 0) { if (step <= seq.tokens.size()) { seq.tokens.resize(step); seq.cache_len = std::min(seq.cache_len, step); @@ -265,7 +277,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } } - const int input_length = r->inputs.getVal("input_lengths"); + const int input_length = r->inputs.at("input_ids").shape[0]; const int* input_ids = r->inputs.getPtr("input_ids"); { @@ -289,7 +301,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } // copy input tokens to prompt for prefix matching - if (input_length && r->start_flag && !r->inputs.isExist("input_embedding_ranges")) { + if (input_length && r->session.start_flag && !r->inputs.isExist("input_embedding_ranges")) { // TODO: truncate prompt to enable prefix caching for VLM seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); @@ -348,8 +360,8 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } } - const int request_output_len = state.requests[idx]->inputs.getVal("request_output_len"); - state.seq_len_limit[idx] = state.h_context_length[idx] + request_output_len; + const int max_new_tokens = state.requests[idx]->gen_cfg.max_new_tokens; + state.seq_len_limit[idx] = state.h_context_length[idx] + max_new_tokens; // `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len // the actual sequence length is seq_limit_len + 1, hence seq_limit_len must truncated to session_len - 1 if (state.seq_len_limit[idx] >= session_len_) { @@ -357,17 +369,17 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) if (rank_ == 0) { const int trunc_output_len = state.seq_len_limit[idx] - state.h_context_length[idx]; TM_LOG_WARNING( - "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `request_output_len` is truncated to %d", + "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `max_new_tokens` is truncated to %d", (long)seq.id, state.h_context_length[idx], - request_output_len, + max_new_tokens, (int)session_len_, trunc_output_len); } } // compute rope scaling factor - if (r->start_flag) { + if (r->session.start_flag) { seq.rope_theta = model_->attn_param_.rotary_embedding_base; if (model_->attn_param_.use_dynamic_ntk) { auto scaling_factor = model_->attn_param_.rope_scaling_factor; @@ -388,7 +400,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } state.h_rope_theta[idx] = seq.rope_theta; - if (r->start_flag) { + if (r->session.start_flag) { // prepare to initialize random state for new sequence h_random_seed_[idx] = r->inputs.getVal("random_seed", 0); } @@ -799,12 +811,6 @@ void LlamaBatch::AllocatePersistantBuffer(size_t max_batch_size, int cache_bl sampling_params_ = { {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_}, {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_}, - {"min_length", (std::byte*)h_min_length_, nullptr}, - {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr}, - {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr}, - {"runtime_min_p", (std::byte*)h_runtime_min_p_, nullptr}, - {"temperature", (std::byte*)h_temperature_, nullptr}, - {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr}, }; for (auto& s : states_) { @@ -1087,6 
+1093,27 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) Copy(h_seq_limit_len_, batch_size, seq_limit_len_); TensorMap inputs; + + auto member_to_tensor = [&](auto getter, auto key, auto dest, auto init) { + int count = 0; + for (int i = 0; i < batch_size; ++i) { + // `std::invoke` + dest[i] = state_->requests[i]->gen_cfg.*getter; + count += dest[i] != init; + } + if (count) { + inputs.insert(key, {MEMORY_CPU, getTensorType(), {(size_t)batch_size}, dest}); + } + }; + + using G = GenerationConfig; + member_to_tensor(&G::top_k, "runtime_top_k", h_runtime_top_k_, 0); + member_to_tensor(&G::top_p, "runtime_top_p", h_runtime_top_p_, 0); + member_to_tensor(&G::min_p, "runtime_min_p", h_runtime_min_p_, 0); + member_to_tensor(&G::temperature, "temperature", h_temperature_, 0.f); + member_to_tensor(&G::repetition_penalty, "repetition_penalty", h_repetition_penalty_, 1.f); + member_to_tensor(&G::min_new_tokens, "min_length", h_min_length_, 0); + for (const auto& [name, h_ptr, d_ptr] : sampling_params_) { // find an exemplar that matches the param name const Tensor* ptr{}; @@ -1333,10 +1360,11 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector // set output tokens ids and sequence length int* output_ptr = h_output_ids_; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && (state_->requests[i]->stream_cb || state_->h_finished[i])) { + if (state_->requests[i] && (state_->requests[i]->stream_output || state_->h_finished[i])) { auto output_ids = state_->requests[i]->outputs.getPtr("output_ids"); auto output_len = state_->requests[i]->outputs.getPtr("sequence_length"); const int count = state_->h_context_length[i]; + FT_CHECK(state_->requests[i]->outputs.at("output_ids").shape[0] >= count); // TODO: sync history output tokens at when receiving the request and copy the last token here std::copy(output_ptr, output_ptr + count, output_ids); *output_len = count; @@ -1372,12 +1400,12 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector signals.push_back(Interrupt(i)); ++g.finished_count; } - else if (state_->requests[i]->stream_cb) { + else if (state_->requests[i]->stream_output) { // Create signals by copying the request handles for non-finished streaming requests signals.push_back([this, r = state_->requests[i]] { if (rank_ == 0) { try { - r->stream_cb(&r->outputs.get()); + r->forward_cb({Request::kOk, r->outputs.getVal("sequence_length")}); } catch (const std::bad_function_call& e) { TM_LOG_ERROR("Null stream callback for (%s)", std::to_string(r->id).c_str()); @@ -1424,7 +1452,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str()); } - if (state_->requests[index]->end_flag || force_end) { + if (state_->requests[index]->session.end_flag || force_end) { // Sequence is ending this round or a stop request is issued to end it FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); } @@ -1457,12 +1485,14 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig state_->sequences[index] = nullptr; - auto ec = std::exchange(state_->errors[index], 0); + auto ec = std::exchange(state_->errors[index], Request::kOk); // move the request handle into the signal return [this, ec, r = std::move(state_->requests[index])] { if (rank_ == 0) { - r->signal.set_value(ec); + if (r->forward_cb) { + r->forward_cb({Request::kFinish, r->outputs.getVal("sequence_length")}); + } } }; } @@ -1586,7 +1616,7 @@ void 
LlamaBatch::OutputThreadEntry() if (rank_ == 0 && ffi_lock_) { ffi_lock_(1); } - // invoke stream cbs & signals + // send all bufferred signals for (const auto& s : signals) { s(); } @@ -1800,101 +1830,6 @@ bool LlamaBatch::Forward(GenerationState& g) return true; } -static inline Tensor slice(const Tensor& tensor, int index) -{ - auto shape = tensor.shape; - if (shape.at(0) == 1) { - return tensor; - } - shape[0] = 1; - const auto offset = std::accumulate(shape.begin(), shape.end(), (size_t)index, std::multiplies<>{}); - return tensor.slice(shape, offset); -} - -// ! implicit conversion from `unordered_map` to `TensorMap` drops 0-sized tensors -static inline TensorMap slice(const std::unordered_map& src, int index) -{ - TensorMap dst; - for (const auto& kv : src) { - dst.insert({kv.first, slice(kv.second, index)}); - } - return dst; -} - -template -void LlamaBatch::Submit(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control) -{ - if (debug_) { - for (const auto& kv : *inputs) { - TM_LOG_INFO("[Submit] INPUT: %s", format(kv).c_str()); - } - for (const auto& kv : *outputs) { - TM_LOG_INFO("[Submit] OUTPUT: %s", format(kv).c_str()); - } - } - - const int batch_size = outputs->at("output_ids").shape[0]; - - std::vector> requests(batch_size); - - // allocates all requests for the batch - for (int i = 0; i < batch_size; ++i) { - requests[i] = std::make_shared(); - } - - for (int i = 0; i < batch_size; ++i) { - auto& r = requests[i]; - - r->inputs = slice(*inputs, i); - r->outputs = slice(*outputs, i); - - r->id = r->inputs.getVal("CORRID", i); - r->start_flag = r->inputs.getVal("START", 1); - r->end_flag = r->inputs.getVal("END", 1); - r->stop_flag = r->inputs.getVal("STOP", 0); - r->stream_cb = control.callback; - } - - // Submits the tasks and wait for finish - std::vector error_codes; - bool has_error = 0; - - TM_LOG_INFO("[forward] Enqueue requests"); - - std::vector ids; - for (const auto& r : requests) { - ids.push_back(r->id); - } - - auto futures = shared_state_->request_queue.enqueue(std::move(requests)); - - FT_CHECK_WITH_INFO(ids.size() == futures.size(), "check failed"); - - TM_LOG_INFO("[forward] Wait for requests to complete ..."); - - for (int i = 0; i < futures.size(); ++i) { - auto ec = futures[i].get(); - error_codes.push_back(ec); - if (ec) { - has_error = true; - TM_LOG_WARNING("[forward] Request failed for %ld, code %d", (long)ids[i], (int)ec); - } - else { - TM_LOG_INFO("[forward] Request completed for %ld", (long)ids[i]); - } - } - - if (has_error) { - std::stringstream ss; - for (int i = 0; i < error_codes.size(); ++i) { - ss << (i ? 
"" : " ") << error_codes[i]; - } - throw std::runtime_error(ss.str()); - } -} - namespace { template diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index f952da6bae..bd9c4cc136 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -2,6 +2,13 @@ #pragma once +#include +#include +#include +#include + +#include + #include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/Request.h" @@ -12,10 +19,6 @@ #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include using ffi_api_lock_ctrl_t = std::function; @@ -30,9 +33,6 @@ struct SharedState { std::atomic free_size{std::numeric_limits::max()}; }; -struct Control { - Request::Callback callback; -}; struct BatchState { int* h_prompt_length; // history + input, ignore generated @@ -122,10 +122,6 @@ class LlamaBatch { void Start(); - void Submit(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control); - void set_ffi_lock(ffi_api_lock_ctrl_t func) { ffi_lock_ = func; diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h index 2a715e9c9c..5f2be68923 100644 --- a/src/turbomind/models/llama/Request.h +++ b/src/turbomind/models/llama/Request.h @@ -2,50 +2,123 @@ #pragma once -#include "src/turbomind/utils/Tensor.h" +#include #include #include -#include -#include +#include #include -#include + +#include "src/turbomind/utils/Tensor.h" namespace turbomind { -struct Request { - uint64_t id; // sequence id - uint64_t unique_id; // monotonic increasing +struct GenerationConfig { + int max_new_tokens = 0; + int min_new_tokens = 0; + + int top_k = 1; + float top_p = 0.f; + float min_p = 0.f; + float temperature = 1.f; + + float repetition_penalty = 1.f; + + uint64_t random_seed = 0; + + int output_logprobs = 0; + + // placeholders that are not implemented yet + bool output_hidden_states = false; + bool output_logits = false; +}; + +inline std::ostream& operator<<(std::ostream& os, const GenerationConfig& c) +{ + os << "GenerationConfig { "; + os << "max_new_tokens=" << c.max_new_tokens; + os << ", min_new_tokens=" << c.min_new_tokens; + os << ", top_p=" << c.top_p; + os << ", top_k=" << c.top_k; + os << ", min_p=" << c.min_p; + os << ", temperature=" << c.temperature; + os << ", repetition_penalty=" << c.repetition_penalty; + os << ", random_seed=" << c.random_seed; + os << ", output_logprobs=" << c.output_logprobs; + os << ", output_hidden_states=" << c.output_hidden_states; + os << ", output_logits=" << c.output_logits; + os << " }"; + return os; +} + +struct SessionParam { + uint64_t id; + + int step; bool start_flag; bool end_flag; bool stop_flag; +}; + +struct RequestState { + int status; + int seq_len; +}; + +struct AtomicRequestState { - // per rank inputs/outputs + AtomicRequestState(): data(std::make_shared()) {} + + void update(RequestState state) + { + std::atomic_store_explicit(&data, std::make_shared(std::move(state)), std::memory_order_release); + } + + std::shared_ptr load() + { + return std::atomic_load_explicit(&data, std::memory_order_acquire); + } + + std::shared_ptr data; +}; + +struct Request { + uint64_t id; // sequence id + uint64_t unique_id; // monotonic increasing + + SessionParam session; + GenerationConfig gen_cfg; + + bool stream_output; + + // reference to IO tensors TensorMap inputs; 
TensorMap outputs; - using Callback = std::function*)>; - Callback stream_cb; + std::function cancel_cb; + std::function end_cb; + + std::function forward_cb; enum { + kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set kConflict = 2, // Concurrent requests to the same sequence kBusy = 3, // Sequence is already running kInactive = 4, // Sequence to `stop` is not active kFail = 5, // Can't find sequence for `stop` request or internal error during inference - kTooLong = 6 // history + prompt > session_len + kTooLong = 6, // history + prompt > session_len, + kFinish = 7, }; - std::promise signal; + // std::promise signal; }; class RequestQueue { public: - std::vector> enqueue(std::vector> requests) + void enqueue(std::vector> requests) { - std::vector> futures; - futures.reserve(requests.size()); { std::lock_guard lock(mutex_); @@ -54,8 +127,8 @@ class RequestQueue { } for (auto& r : requests) { - futures.push_back(r->signal.get_future()); - if (r->stop_flag) { + // futures.push_back(r->signal.get_future()); + if (r->session.stop_flag) { stop_queue_.push(std::move(r)); } else { @@ -64,7 +137,6 @@ class RequestQueue { } } cv_.notify_one(); - return futures; } void dequeue(std::vector>& stop_requests, diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 71792a4be8..1977b0cffa 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,6 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. #include +#include #include #include @@ -11,8 +12,10 @@ #include #include +#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" +#include "src/turbomind/triton_backend/model_request.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" @@ -22,18 +25,19 @@ namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; +using ft::ManagedTensor; +using ft::Tensor; + // prepare to bind container -using TensorVector = std::vector; -PYBIND11_MAKE_OPAQUE(TensorVector); -using TensorMap = std::unordered_map; +using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(ft::Tensor& tensor) +DLDevice getDLDevice(const ft::Tensor& tensor) { int device_id = 0; if (tensor.where == ft::MEMORY_GPU) { - cudaPointerAttributes ptr_attr; + cudaPointerAttributes ptr_attr{}; cudaPointerGetAttributes(&ptr_attr, tensor.data); device_id = ptr_attr.device; } @@ -57,12 +61,12 @@ DLDevice getDLDevice(ft::Tensor& tensor) return device; } -DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(ManagedTensor& tensor) { - DLDevice device = getDLDevice(tensor); + DLDevice device = getDLDevice(*tensor); DLDataType data_type{0, 0, 1}; - switch (tensor.type) { + switch (tensor->type) { case ft::TYPE_BOOL: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; @@ -119,14 +123,26 @@ DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) default: break; } - DLTensor dl_tensor{const_cast(tensor.data), + ManagedTensor* ctx = new ManagedTensor(tensor); + DLTensor dl_tensor{const_cast((*ctx)->data), device, - (int32_t)(tensor.shape.size()), + (int32_t)((*ctx)->shape.size()), data_type, - reinterpret_cast(const_cast(tensor.shape.data())), + 
reinterpret_cast(const_cast((*ctx)->shape.data())), (int64_t*)(nullptr), 0}; - return new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor* dlmt) { delete dlmt; }}; + return new DLManagedTensor{dl_tensor, ctx, [](DLManagedTensor* dlmt) { // + // auto& x = *(ManagedTensor*)dlmt->manager_ctx; + // std::stringstream ss; + // ss << "("; + // for (const auto& d : x->shape) { + // ss << d << ","; + // } + // ss << ")"; + // std::cerr << "turbomind tensor dtor " << ss.str() << " " << std::endl; + delete (ManagedTensor*)dlmt->manager_ctx; + delete dlmt; + }}; } ft::MemoryType getMemoryType(DLDevice device) @@ -200,7 +216,7 @@ ft::DataType getDataType(DLDataType data_type) } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); @@ -209,14 +225,15 @@ std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tenso std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); auto data = dl_tensor.data; - return std::make_shared(where, dtype, shape, data); -} - -DLTensor GetDLTensor(py::object obj) -{ - py::capsule cap = obj.attr("__dlpack__")(); - DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); - return dlmt->dl_tensor; + auto ret = std::make_shared(); + ret->tensor = Tensor(where, dtype, std::move(shape), data); + ret->data_holder.reset((void*)nullptr, [tensor](void*) { + // std::cerr << "dlpack tensor dtor" << std::endl; + if (tensor->deleter) { + tensor->deleter(tensor); + } + }); + return ret; } static void safe_memcpy(void* dst, const void* src, size_t size) @@ -272,7 +289,54 @@ PYBIND11_MODULE(_turbomind, m) .def("__str__", &ft::NcclParam::toString); // custom comm - py::class_>(m, "AbstractCustomComm"); + (void)py::class_>(m, "AbstractCustomComm"); + + py::class_(m, "SessionParam") + .def(py::init([](uint64_t id, int step, bool start, bool end, bool stop) { + ft::SessionParam param{}; + param.id = id; + param.step = step; + param.start_flag = start; + param.end_flag = end; + param.stop_flag = stop; + return param; + }), + "id"_a, + "step"_a, + "start"_a, + "end"_a, + "stop"_a) + .def_readwrite("id", &ft::SessionParam::id) + .def_readwrite("step", &ft::SessionParam::step) + .def_readwrite("start", &ft::SessionParam::start_flag) + .def_readwrite("end", &ft::SessionParam::end_flag) + .def_readwrite("stop", &ft::SessionParam::stop_flag); + + py::class_(m, "GenerationConfig") + .def(py::init()) + .def_readwrite("max_new_tokens", &ft::GenerationConfig::max_new_tokens) + .def_readwrite("min_new_tokens", &ft::GenerationConfig::min_new_tokens) + .def_readwrite("top_p", &ft::GenerationConfig::top_p) + .def_readwrite("top_k", &ft::GenerationConfig::top_k) + .def_readwrite("min_p", &ft::GenerationConfig::min_p) + .def_readwrite("temperature", &ft::GenerationConfig::temperature) + .def_readwrite("repetition_penalty", &ft::GenerationConfig::repetition_penalty) + .def_readwrite("random_seed", &ft::GenerationConfig::random_seed) + .def_readwrite("output_logprobs", &ft::GenerationConfig::output_logprobs) + .def_readwrite("output_hidden_states", &ft::GenerationConfig::output_hidden_states) + .def_readwrite("output_logits", &ft::GenerationConfig::output_logits) + .def("__repr__", [](const ft::GenerationConfig& c) { + std::ostringstream oss; + oss << c; + return oss.str(); + }); + + py::class_>(m, "RequestState") + .def_readonly("status", &ft::RequestState::status) + 
.def_readonly("seq_len", &ft::RequestState::seq_len); + + py::class_>(m, "AtomicRequestState") + .def("load", [](ft::AtomicRequestState& s) { return s.load(); }); // data type py::enum_(m, "DataType") @@ -299,45 +363,46 @@ PYBIND11_MODULE(_turbomind, m) .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); // tensor - py::class_>(m, "Tensor") - .def_readonly("where", &ft::Tensor::where) - .def_readonly("type", &ft::Tensor::type) - .def_readonly("shape", &ft::Tensor::shape) - .def_readonly("data", &ft::Tensor::data) - .def(py::init( - [](const ft::MemoryType where, const ft::DataType type, const std::vector& shape, const long data) { - auto data_ptr = reinterpret_cast(data); - return new ft::Tensor(where, type, shape, data_ptr); - })) + py::class_>(m, "Tensor") + .def_property_readonly("where", [](const ManagedTensor& t) { return t->where; }) + .def_property_readonly("type", [](const ManagedTensor& t) { return t->type; }) + .def_property_readonly("shape", [](const ManagedTensor& t) { return t->shape; }) + .def_property_readonly("data", [](const ManagedTensor& t) { return t->data; }) .def( "view", - [](ft::Tensor* self, ft::DataType new_type) { - return new ft::Tensor(self->where, new_type, self->shape, self->data); + [](const ManagedTensor& self, ft::DataType new_type) { + auto x = self; + x->type = new_type; + return std::make_shared(std::move(x)); }, "new_type"_a) .def( "view", - [](ft::Tensor* self, std::vector new_shape) { - return new ft::Tensor(self->where, self->type, new_shape, self->data); + [](const ManagedTensor& self, std::vector new_shape) { + auto x = self; + x->shape = new_shape; + return std::make_shared(std::move(x)); }, "new_shape"_a) .def( "copy_from", - [](ft::Tensor* self, py::object obj) { + [](ManagedTensor& self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); + // take ownership of capsule's payload + cap.set_name("used_dltensor"); switch (self->type) { case ft::TYPE_FP16: case ft::TYPE_FP32: case ft::TYPE_INT32: case ft::TYPE_BF16: { - auto num_element = - std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); + auto num_element = std::accumulate( + (*src)->shape.begin(), (*src)->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - safe_memcpy(const_cast(self->data), src->data, num_bytes); + safe_memcpy(const_cast(self->data), (*src)->data, num_bytes); break; } default: @@ -347,8 +412,8 @@ PYBIND11_MODULE(_turbomind, m) "tensor"_a) .def( "__dlpack__", - [](ft::Tensor* self, long stream) { - DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(*self); + [](ManagedTensor& self, long stream) { + DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(obj, kDlTensorCapsuleName)); @@ -363,7 +428,7 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](ft::Tensor* self) { + .def("__dlpack_device__", [](const ManagedTensor& self) { auto device = getDLDevice(*self); return std::tuple(int(device.device_type), device.device_id); }); @@ -374,29 +439,60 @@ PYBIND11_MODULE(_turbomind, m) DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto ret = DLManagedTensorToTritonTensor(dlmt); + // 
take ownership of capsule's payload + cap.set_name("used_dltensor"); return ret; }, "dl_managed_tensor"_a); // transformer model instance - using ft::AbstractTransformerModelInstance; + using ft::ModelRequest; py::bind_map>(m, "TensorMap"); - py::class_(m, "AbstractTransformerModelInstance") + py::class_(m, "ModelRequest") .def( "forward", - [](AbstractTransformerModelInstance* model, std::shared_ptr input_tensors) { - return model->forward(input_tensors); + [](ModelRequest* model_request, + std::shared_ptr input_tensors, + const ft::SessionParam& session, + const ft::GenerationConfig& gen_cfg, + bool stream_output, + std::function cb) { + ModelRequest::InputParam param{}; + param.tensors = std::move(input_tensors); + param.session = session; + param.gen_cfg = gen_cfg; + param.stream_output = stream_output; + auto ret = model_request->Forward(std::move(param), [cb = std::move(cb)](ft::RequestState s) { + try { + cb(s); + } + catch (const py::error_already_set& e) { + std::cerr << e.what() << std::endl; + } + }); + return ret.tensors; }, py::call_guard(), - "input_tensors"_a) + "input_tensors"_a, + "session"_a, + "gen_cfg"_a, + "stream_output"_a, + "cb"_a) .def( - "register_callback", - [](AbstractTransformerModelInstance* self, ft::triton_stream_cb_t cb, py::object ctx) { - self->registerCallback(cb, ctx.ptr()); + "cancel", + [](ModelRequest* model_request, bool end, std::function cb) { + model_request->Cancel(end, std::move(cb)); // }, - "callback"_a, - "context"_a = nullptr) - .def("unregister_callback", &AbstractTransformerModelInstance::unRegisterCallback); + py::call_guard(), + "end"_a, + "cb"_a) + .def( + "end", + [](ModelRequest* model_request, std::function cb) { + model_request->End(std::move(cb)); // + }, + py::call_guard(), + "cb"_a); // transformer model using ft::AbstractTransformerModel; @@ -466,21 +562,9 @@ PYBIND11_MODULE(_turbomind, m) "world_size"_a) .def( "create_model_instance", - [](AbstractTransformerModel* model, - int deviceId, - int rank, - long stream_id, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) { - cudaStream_t stream = reinterpret_cast(stream_id); - return model->createModelInstance(deviceId, rank, stream, nccl_params, custom_all_reduce_comm); - }, + [](AbstractTransformerModel* model, int deviceId) { return model->createModelInstance(deviceId); }, py::call_guard(), - "device_id"_a, - "rank"_a, - "stream"_a, - "nccl_params"_a, - "custom_all_reduce_comm"_a = nullptr) + "device_id"_a) .def("create_shared_weights", &AbstractTransformerModel::createSharedWeights, py::call_guard(), @@ -489,8 +573,13 @@ PYBIND11_MODULE(_turbomind, m) .def( "get_params", [](AbstractTransformerModel* model, int deviceId, int rank) { - TensorMap output = model->getParams(deviceId, rank); - return output; + auto output = model->getParams(deviceId, rank); + TensorMap ret; + for (const auto& [k, v] : output) { + // export reference to weight data only (no ownership) + ret.emplace(k, ManagedTensor{v}); + } + return ret; }, py::call_guard(), "device_id"_a, diff --git a/src/turbomind/triton_backend/CMakeLists.txt b/src/turbomind/triton_backend/CMakeLists.txt index 4311d9d9be..d6aec06990 100644 --- a/src/turbomind/triton_backend/CMakeLists.txt +++ b/src/turbomind/triton_backend/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required (VERSION 3.18) project(tritonturbomindbackend LANGUAGES C CXX) -add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp) +add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp 
model_request.cpp) target_link_libraries(TransformerTritonBackend PUBLIC nccl_utils) set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 40c5ac8907..87eff96a58 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -27,6 +27,7 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/triton_backend/model_request.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" @@ -410,22 +411,14 @@ LlamaTritonModel::createSharedModelInstance(int } template -std::unique_ptr -LlamaTritonModel::createModelInstance(int device_id, - int rank, - cudaStream_t stream, - std::pair, std::vector>, - std::shared_ptr) +std::unique_ptr LlamaTritonModel::createModelInstance(int device_id) { check_cuda_error(cudaSetDevice(device_id)); FT_CHECK(engines_[device_id] != nullptr); - auto allocator = std::make_unique>(device_id, false); - - allocator->setStream(stream); - - return std::make_unique>(*engines_[device_id], std::move(allocator), device_id); + return std::make_unique( + &shared_state_->request_queue, engine_param_.session_len, model_param_.vocab_size); } template diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 8f473cd4cd..24c252bae6 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -43,12 +43,7 @@ struct LlamaTritonModel: public AbstractTransformerModel { ~LlamaTritonModel() override; - std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) override; + std::unique_ptr createModelInstance(int deviceId) override; void createSharedWeights(int deviceId, int rank) override; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index 976fc9cc1d..4e2b29d765 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -28,11 +28,15 @@ #include #include #include +#include #include +#include #include namespace turbomind { +#if 0 + template void triton_stream_callback(std::unordered_map* outputs, void* ctx) { @@ -213,4 +217,8 @@ template struct LlamaTritonModelInstance; template struct LlamaTritonModelInstance<__nv_bfloat16>; #endif +#endif + + + } // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index 2cf69b9fa5..38b1ade7f0 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -20,15 +20,17 @@ #pragma once +#include #include #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" +// #include 
"src/turbomind/triton_backend/transformer_triton_backend.hpp" namespace turbomind { +#if 0 template struct LlamaTritonModelInstance: AbstractTransformerModelInstance { @@ -77,4 +79,8 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { std::exception_ptr h_exception_ = nullptr; }; +#endif + + + } // namespace turbomind diff --git a/src/turbomind/triton_backend/model_request.cpp b/src/turbomind/triton_backend/model_request.cpp new file mode 100644 index 0000000000..d233dc0e11 --- /dev/null +++ b/src/turbomind/triton_backend/model_request.cpp @@ -0,0 +1,153 @@ + + +#include +#include +#include +#include +#include +#include +#include + +#include "src/turbomind/models/llama/Request.h" +#include "src/turbomind/triton_backend/model_request.h" +#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/utils/constant.h" +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +static ManagedTensor create(DataType dtype, MemoryType where, const std::vector& size, int64_t& byte_size) +{ + byte_size = std::accumulate(size.begin(), size.end(), Tensor::getTypeSize(dtype), std::multiplies<>{}); + void* data{}; + + if (where == MEMORY_GPU) { + check_cuda_error(cudaMallocAsync(&data, byte_size, nullptr)); + } + else { + data = std::malloc(byte_size); + } + + ManagedTensor ret; + ret.tensor = Tensor{where, dtype, std::vector(size.begin(), size.end()), data}; + // FT_CHECK_WITH_INFO(byte_size == ret.tensor.sizeBytes(), fmtstr("%ld vs %ld", byte_size, ret.tensor.sizeBytes())); + ret.data_holder.reset((void*)nullptr, [data, where](auto) { + // std::cerr << "turbomind tensor deallocate" << std::endl; + if (where == MEMORY_GPU) { + /// TODO: guard device id + check_cuda_error(cudaFreeAsync(data, nullptr)); + } + else { + std::free(data); + } + }); + return ret; +} + +template +static T get(const std::unordered_map& m, const std::string& key, T fallback = {}) +{ + auto it = m.find(key); + if (it != m.end()) { + return it->second->getVal(); + } + return fallback; +} + +ModelRequest::ModelRequest(RequestQueue* queue, int session_len, int vocab_size): + queue_{queue}, session_len_{session_len}, vocab_size_{vocab_size} +{ +} + +void ModelRequest::Cancel(bool end, std::function cb) +{ + auto r = std::make_shared(); + + r->id = session_id_; + // r->stop_flag = true; + + r->cancel_cb = std::move(cb); + + queue_->enqueue({std::move(r)}); +} + +void ModelRequest::End(std::function cb) +{ + auto r = std::make_shared(); + + r->id = session_id_; + // r->end_flag = true; + + r->end_cb = std::move(cb); + + queue_->enqueue({std::move(r)}); +} + +auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam +{ + inputs_ = std::make_shared(); + outputs_ = std::make_shared(); + + auto add = [](auto& dest, auto key, auto dtype, auto where, auto shape, auto&&... 
dims) { + std::vector shape_; + if constexpr (std::is_integral_v) { + shape_ = {shape, dims...}; + } + else { + shape_ = {shape.cbegin(), shape.cend()}; + } + int64_t byte_size{}; + auto it = dest->emplace(key, create(dtype, where, shape_, byte_size)).first; + return std::make_pair(it->second->data, byte_size); + }; + + auto& inputs = *param.tensors; + + const int batch_size = 1; + const int beam_width = 1; + + FT_CHECK(inputs.at("input_ids")->shape.size() == 1); + + const int input_len = inputs.at("input_ids")->shape[0]; + const int output_len = input_len + param.gen_cfg.max_new_tokens; + + for (auto& [k, v] : *param.tensors) { + inputs_->emplace(k, v); + } + + add(outputs_, "output_ids", TYPE_INT32, MEMORY_CPU, session_len_); + add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); + + if (param.gen_cfg.output_logprobs) { + const int max_logprob_len = std::min(output_len, session_len_) + 1; + add(outputs_, "logprob_vals", TYPE_FP32, MEMORY_CPU, max_logprob_len, kMaxLogProb); + add(outputs_, "logprob_indexes", TYPE_INT32, MEMORY_CPU, max_logprob_len, kMaxLogProb); + add(outputs_, "logprob_nums", TYPE_INT32, MEMORY_CPU, max_logprob_len); + } + + if (param.gen_cfg.output_logits) { + /// TODO: allow output logits on GPU + add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, output_len, vocab_size_); + } + + auto r = std::make_shared(); + + for (const auto& [k, v] : *inputs_) { + r->inputs.insert(k, *v); + } + for (const auto& [k, v] : *outputs_) { + r->outputs.insert(k, *v); + } + + r->id = param.session.id; + r->session = param.session; + r->gen_cfg = param.gen_cfg; + r->stream_output = param.stream_output; + r->forward_cb = std::move(cb); + + queue_->enqueue({std::move(r)}); + + return OutputParam{outputs_}; +} + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/model_request.h b/src/turbomind/triton_backend/model_request.h new file mode 100644 index 0000000000..e40e2fd48e --- /dev/null +++ b/src/turbomind/triton_backend/model_request.h @@ -0,0 +1,53 @@ + + +#pragma once + +#include + +#include "src/turbomind/models/llama/Request.h" +#include "src/turbomind/utils/Tensor.h" + +namespace turbomind { + +class ModelRequest { +public: + virtual ~ModelRequest() = default; + + ModelRequest(RequestQueue* queue, int session_len, int vocab_size); + + // Cancel running request, calls `cb` when done + void Cancel(bool end, std::function cb); + + // Reset the channel to uninitailized state, calls `notify` when done + void End(std::function cb); + + using TensorMap_ = std::unordered_map; + + struct InputParam { + std::shared_ptr tensors; + + SessionParam session; + GenerationConfig gen_cfg; + + bool stream_output; + }; + + struct OutputParam { + std::shared_ptr tensors; + }; + + OutputParam Forward(InputParam param, std::function cb); + +protected: + RequestQueue* queue_; + + uint64_t session_id_; + + int session_len_; + int vocab_size_; + + std::shared_ptr inputs_; // owned by caller + std::shared_ptr outputs_; // owned by `this` +}; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 6d49df4578..c283568ab7 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -22,16 +22,18 @@ #include #include -#include +#include + #ifdef __linux__ #include #endif -#include #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/custom_ar_comm.h" #include 
"src/turbomind/utils/nccl_utils.h" +#include "src/turbomind/triton_backend/model_request.h" + namespace turbomind { using triton_stream_cb_t = std::function>, void*)>; @@ -72,12 +74,7 @@ struct AbstractTransformerModel { virtual void createCustomComms(std::vector>* custom_all_reduce_comms, int world_size) = 0; - virtual std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) = 0; + virtual std::unique_ptr createModelInstance(int deviceId) = 0; virtual void createSharedWeights(int deviceId, int rank) = 0; diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h index b2b8524e09..1f6f737b72 100644 --- a/src/turbomind/utils/Tensor.h +++ b/src/turbomind/utils/Tensor.h @@ -530,4 +530,29 @@ class TensorMap { void saveNpy(const std::string& base_folder); }; +struct ManagedTensor { + Tensor tensor; + std::shared_ptr data_holder; + + Tensor* operator->() noexcept + { + return &tensor; + } + + const Tensor* operator->() const noexcept + { + return &tensor; + } + + Tensor& operator*() noexcept + { + return tensor; + } + + const Tensor& operator*() const noexcept + { + return tensor; + } +}; + } // namespace turbomind From 2cf49bd5ac924aa4b94f46a69a73da03335b2a3f Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 11 Dec 2024 14:30:36 +0800 Subject: [PATCH 03/40] update perf metrics & adaptive tokens per tick --- benchmark/profile_throughput.py | 231 ++++++++++-------- lmdeploy/turbomind/turbomind.py | 43 +++- .../kernels/sampling_topp_kernels.cu | 8 +- src/turbomind/models/llama/LlamaBatch.cc | 23 +- src/turbomind/models/llama/LlamaBatch.h | 2 +- src/turbomind/models/llama/Request.h | 22 +- src/turbomind/python/bind.cpp | 10 +- .../triton_backend/llama/LlamaTritonModel.cc | 6 +- .../triton_backend/model_request.cpp | 39 ++- src/turbomind/triton_backend/model_request.h | 12 +- src/turbomind/utils/cuda_utils.h | 3 +- 11 files changed, 255 insertions(+), 144 deletions(-) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 4f06fad4f9..55cb67ff22 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -2,6 +2,7 @@ import argparse import asyncio import csv +import itertools import json import os import random @@ -86,20 +87,23 @@ def __init__(self, model_path: str, self.csv = csv self.pbar = None - async def _inference(self, req_queue: Queue, res_queue: Queue, - session_id: int, temperature: float, top_p: float, - top_k: int, stream_output: bool): + async def _inference(self, req_queue: Queue, session_id: int, + temperature: float, top_p: float, top_k: int, + stream_output: bool, pretokenize: bool, + skip_detokenize: bool): model_inst = self.tm_model.create_instance() - stats = [] - # get each generated token's latency - per_token_latency_stats = [] + counters = [] for prompt, input_seqlen, output_seqlen in iter( req_queue.get_nowait, [None, None, None]): - _per_token_latency_stats = [0] * (output_seqlen + 1) - prev = time.perf_counter() - n_prev_token = 0 - input_ids = self.tokenizer(prompt).input_ids + ts = [time.perf_counter()] + ns = [0] + + if pretokenize: + input_ids = prompt + else: + input_ids = self.tokenizer(prompt).input_ids + state = DetokenizeState(len(input_ids)) async for outputs in model_inst.async_stream_infer( @@ -114,43 +118,37 @@ async def _inference(self, req_queue: Queue, res_queue: Queue, sequence_end=True, stream_output=stream_output): res, n_token = input_ids + outputs.token_ids, 
outputs.num_token - _, state = self.tokenizer.detokenize_incrementally(res, state) - now = time.perf_counter() - if n_prev_token != n_token: - _per_token_latency_stats[n_prev_token] = np.round( - now - prev, 3) - n_prev_token = n_token - prev = now + if not skip_detokenize: + _, state = self.tokenizer.detokenize_incrementally( + res, state) + # The following does not help + # await asyncio.sleep(0) + # _, state = await loop.run_in_executor(None, self.tokenizer.detokenize_incrementally, res, state) + + ts.append(time.perf_counter()) + ns.append(n_token) + # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): await model_inst.async_end(session_id) - assert output_seqlen <= n_token <= output_seqlen + 1, \ - f'Error. session_id({session_id}) request {output_seqlen} ' \ - f'tokens, but generate {n_token} tokens.\n' \ - f'prompt: {prompt}' - - first_token_latency = _per_token_latency_stats[0] - completion_tokens = n_token - total_tokens = n_token + input_seqlen - stats.append([ - first_token_latency, completion_tokens, output_seqlen, - total_tokens - ]) - # skip the first token latency - per_token_latency_stats.append(_per_token_latency_stats[1:]) + + counters.append((ts, ns, input_seqlen)) self.pbar.update(1) - res_queue.put_nowait((session_id, stats, per_token_latency_stats)) + + return counters def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output): - res_queue = Queue() + stream_output, pretokenize, skip_detokenize): req_queue = Queue() self.pbar = tqdm(total=len(requests)) # feed request to q for req in requests: - req_queue.put(req) + if pretokenize: + req_queue.put((self.tokenizer.encode(req[0]), *req[1:])) + else: + req_queue.put(req) for i in range(concurrency): req_queue.put([None, None, None]) @@ -162,87 +160,95 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, # start threads tasks = [] for i in range(concurrency): - task = self._inference(req_queue, res_queue, i, temperature, top_p, - top_k, stream_output) + task = self._inference(req_queue, i, temperature, top_p, top_k, + stream_output, pretokenize, skip_detokenize) tasks.append(task) async def _gather_tasks(tasks): return await asyncio.gather(*tasks) - event_loop.run_until_complete(_gather_tasks(tasks)) + counters = asyncio.run(_gather_tasks(tasks)) + + self.pbar.close() elapsed_time = time.time() - start - stats = [] - per_token_latency_stats = [] - while not res_queue.empty(): - session_id, _stats, _per_token_latency_stats = res_queue.get() - stats.append(np.array(_stats)) - per_token_latency_stats += [ - item for sublist in _per_token_latency_stats - for item in sublist - ] - stats = np.concatenate(stats).reshape(-1, 4) - - first_token_latency_min = np.min(stats[:, 0], axis=0) - first_token_latency_max = np.max(stats[:, 0], axis=0) - first_token_latency_ave = np.mean(stats[:, 0], axis=0) - completion_tokens = np.sum(stats[:, 1], axis=0) - total_tokens = np.sum(stats[:, 3], axis=0) - prompt_tokens = total_tokens - completion_tokens - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = total_tokens / elapsed_time + ttfts: List[float] = [] + tpots: List[float] = [] + e2es: List[float] = [] + itls: List[float] = [] + tpts: List[int] = [] + + total_output = 0 + total_input = 0 + + for ts, ns, input_len in itertools.chain.from_iterable(counters): + # print (ts) + # print (ns) + # assert 0 + total_output += ns[-1] + total_input += input_len + e2es.append(ts[-1] - ts[0]) + ttfts.append(ts[1] - ts[0]) + 
if ns[-1] > ns[1]: + tpots.append((ts[-1] - ts[1]) / (ns[-1] - ns[1])) + else: # no-stream-output + tpots.append((ts[-1] - ts[0]) / (ns[-1] - ns[0])) + t_dif = np.subtract(ts[1:], ts[:-1]) + n_dif = np.subtract(ns[1:], ns[:-1]) + itls.extend(t_dif[1:]) + tpts.extend(n_dif[1:]) + + output_throughput = total_output / elapsed_time + input_throughput = total_input / elapsed_time + + qs = (50, 75, 90, 99) + + tpot_ms_mean = np.mean(tpots) + tpot_ms_stat = tuple(np.percentile(tpots, qs)) + e2e_mean = np.mean(e2es) + e2e_stat = tuple(np.percentile(e2es, qs)) + + if stream_output: + ttft_ms_mean = np.mean(ttfts) + ttft_ms_stat = tuple(np.percentile(ttfts, qs)) + itls_ms_mean = np.mean(itls) + itls_ms_stat = tuple(np.percentile(itls, qs)) + tpts_ms_mean = np.mean(tpts) + tpts_ms_stat = tuple(np.percentile(tpts, qs).astype(int)) + rps = len(requests) / elapsed_time - rpm = rps * 60 - - per_token_latency_stats.sort() - percentiles = [ - np.round( - per_token_latency_stats[int(percent * - len(per_token_latency_stats))], 3) - for percent in [0.5, 0.75, 0.95, 0.99] - ] - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') + + def tab_row(name, *items): + + def fmt(x): + return '{:>10.3f}'.format(x) if isinstance( + x, float) else '{:>10}'.format(x) + + print('{:<35}{}'.format(name, ''.join([fmt(x) for x in items]))) + + print('\n{s:{c}^{n}}'.format(s=' Profile Throughput ', n=85, c='=')) + tab_row('Benchmark duration', elapsed_time) + tab_row('Total requests', len(requests)) + tab_row('Concurrency', concurrency) + tab_row('Stream output', str(stream_output).lower()) + tab_row('Pre-tokenization', str(pretokenize).lower()) + tab_row('Skip detokenization', str(skip_detokenize).lower()) + tab_row('Total input tokens', total_input) + tab_row('Total generated tokens', total_output) + tab_row('Input token throughput (tok/s)', input_throughput) + tab_row('Output token throughput (tok/s)', output_throughput) + tab_row('Request throughput (req/s)', rps) + print('-' * 85) + tab_row('', 'mean', *(f'P{q}' for q in qs)) + tab_row('End-to-end Latency', e2e_mean, *e2e_stat) + if stream_output: + tab_row('Time to First Token (TTFT)', ttft_ms_mean, *ttft_ms_stat) + tab_row('Time per Output Token (TPOT)', tpot_ms_mean, *tpot_ms_stat) if stream_output: - print(f'first token latency(s)(min, max, ave): ' - f'{first_token_latency_min:.3f}, ' - f'{first_token_latency_max:.3f}, ' - f'{first_token_latency_ave:.3f}') - print(f'per-token latency(s) percentile(50, 75, 95, 99): ' - f'{percentiles}\n') - print( - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - with open(self.csv, 'w') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_promts', 'RPS', 'RPM', 'FTL(ave)(s)', - 'FTL(min)(s)', 'FTL(max)(s)', '50%(s)', '75%(s)', '95%(s)', - '99%(s)', 'throughput(out tok/s)', - 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{first_token_latency_ave:.3f}' if stream_output else '-', - f'{first_token_latency_min:.3f}' if stream_output else '-', - f'{first_token_latency_max:.3f}' if stream_output else '-',
f'{percentiles[0]:.3f}' if stream_output else '-', - f'{percentiles[1]:.3f}' if stream_output else '-', - f'{percentiles[2]:.3f}' if stream_output else '-', - f'{percentiles[3]:.3f}' if stream_output else '-', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) + tab_row('Inter-token Latency (ITL)', itls_ms_mean, *itls_ms_stat) + tab_row('Tokens per Tick', tpts_ms_mean, *tpts_ms_stat) + print('=' * 85) def parse_args(): @@ -266,6 +272,15 @@ def parse_args(): type=int, help='Number of prompts to process', default=5000) + parser.add_argument('--no-stream-output', + action='store_true', + help='Use stream output') + parser.add_argument('--pre-tokenize', + action='store_true', + help='Pre-tokenize input prompts before starting') + parser.add_argument('--skip-detokenize', + action='store_true', + help='Skip detokenizing output tokens') parser.add_argument('--csv', type=str, help='Where to save the result.', @@ -350,7 +365,9 @@ def main(): top_p=args.top_p, top_k=args.top_k, concurrency=args.concurrency, - stream_output=True) + stream_output=not args.no_stream_output, + pretokenize=args.pre_tokenize, + skip_detokenize=args.skip_detokenize) if __name__ == '__main__': diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 07034c2f64..849c9f3b7b 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -132,6 +132,9 @@ def __init__(self, self.session_len = self.config.session_len self.eos_id = self.tokenizer.eos_token_id + self.pending_num = 0 + self.pending_cond = asyncio.Condition() + def _create_weight(self, model_comm): """Allocate weight buffer, load params if from_workspace.""" @@ -511,11 +514,18 @@ async def async_signal(self, state): async with self.cond: self.flag, self.state = 1, state self.cond.notify() + # self.flag, self.state = 1, state + # self.state_ready.set() def async_signal_cb(self, state): coro = self.async_signal(state) asyncio.run_coroutine_threadsafe(coro, self.event_loop) + def add_pending(self, n: int = 1): + # self.tm_model.pending_event.clear() + # self.tm_model.pending_num += n + self.tm_model.pending_num += n + async def async_stream_infer(self, session_id, input_ids, @@ -548,6 +558,7 @@ async def async_stream_infer(self, self.event_loop = asyncio.get_running_loop() self.cond = asyncio.Condition() + # self.state_ready = asyncio.Event() self.flag = 0 gen_cfg = self._get_generation_config(gen_config) @@ -573,14 +584,30 @@ async def async_stream_infer(self, output_ids_buf = outputs['output_ids'] - seq_start = step + input_length[0] + # seq_start = step + input_length[0] out_logprobs = None finish = False + # async with self.tm_model.pending_cond: + # self.tm_model.pending_num -= 1 + # if self.tm_model.pending_num == 0: + # self.tm_model.pending_cond.notify_all() + + # self.tm_model.pending_num -= 1 + # if self.tm_model.pending_num == 0: + # self.tm_model.pending_event.set() + output_ids = [] + output_len = 0 + prev_len = step + input_length[0] try: # generator while True: + # async with self.tm_model.pending_cond: + # while self.tm_model.pending_num > 0: + # await self.tm_model.pending_cond.wait() + # await self.tm_model.pending_event.wait() + async with self.cond: while not self.flag: await self.cond.wait() @@ -595,14 +622,20 @@ async def async_stream_infer(self, yield self._get_error_output() break - if seq_start == seq_len and not finish: + if seq_len == prev_len and not finish: continue - output_ids = output_ids_buf[seq_start:seq_len] - gen_len = seq_len - seq_start + output_ids += 
output_ids_buf[prev_len:seq_len] + output_len += seq_len - prev_len + + self.model_inst.report_tokens_per_tick(seq_len - prev_len) + status = ResponseType.FINISH if finish else ResponseType.SUCCESS - output = EngineOutput(status, output_ids.tolist(), gen_len, + output = EngineOutput(status, output_ids, output_len.item(), out_logprobs) + + prev_len = seq_len + yield output if finish: diff --git a/src/turbomind/kernels/sampling_topp_kernels.cu b/src/turbomind/kernels/sampling_topp_kernels.cu index 04ea0577d1..cf7faf95b4 100644 --- a/src/turbomind/kernels/sampling_topp_kernels.cu +++ b/src/turbomind/kernels/sampling_topp_kernels.cu @@ -216,9 +216,9 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream) size_t topp_id_val_buf_size = sizeof(int) * params.batch_size * params.vocab_size_padded; size_t begin_offset_buf_size = sizeof(int) * params.batch_size; size_t end_offset_buf_size = sizeof(int) * params.batch_size; - topp_id_val_buf_size = div_up(topp_id_val_buf_size, 256) * 256; - begin_offset_buf_size = div_up(begin_offset_buf_size, 256) * 256; - end_offset_buf_size = div_up(end_offset_buf_size, 256) * 256; + topp_id_val_buf_size = div_up(topp_id_val_buf_size, 256UL) * 256; + begin_offset_buf_size = div_up(begin_offset_buf_size, 256UL) * 256; + end_offset_buf_size = div_up(end_offset_buf_size, 256UL) * 256; if (params.workspace == nullptr) { size_t cub_temp_storage_size; @@ -236,7 +236,7 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream) 0, // begin_bit sizeof(T) * 8, // end_bit = sizeof(KeyT) * 8 stream)); // cudaStream_t - cub_temp_storage_size = div_up(cub_temp_storage_size, 256) * 256; + cub_temp_storage_size = div_up(cub_temp_storage_size, 256UL) * 256; params.workspace_size = topp_id_val_buf_size + begin_offset_buf_size + end_offset_buf_size + cub_temp_storage_size; return; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 309ca1b4b4..2fb132a40e 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -22,6 +22,7 @@ #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/nccl_utils.h" #include +#include #include #include #include @@ -1392,20 +1393,26 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector std::vector signals; { - NvtxScope _("stream_and_completion_signal"); + NvtxScope _("stream_and_completion_signal"); + const float tok_per_tick = shared_state_->tok_per_tick.load(); + const int tpt = std::min(std::max(1, (int)std::round(tok_per_tick)), 8); for (int i = 0; i < batch_size - g.partial; ++i) { if (state_->requests[i]) { + auto& r = state_->requests[i]; if (state_->h_finished[i]) { // Interrupt finished sequences and move the request handle into the signal closure signals.push_back(Interrupt(i)); ++g.finished_count; } - else if (state_->requests[i]->stream_output) { - // Create signals by copying the request handles for non-finished streaming requests - signals.push_back([this, r = state_->requests[i]] { - if (rank_ == 0) { + else if (r->stream_output && rank_ == 0) { + const auto seq_len = r->outputs.getVal("sequence_length"); + const auto v = r->flag->load(std::memory_order_relaxed) < 1; + if (v) { + r->flag->fetch_add(1, std::memory_order_relaxed); + // Create signals by copying the request handles for non-finished streaming requests + signals.push_back([this, r, seq_len] { try { - r->forward_cb({Request::kOk, r->outputs.getVal("sequence_length")}); + r->forward_cb({Request::kOk, seq_len}); } catch (const 
std::bad_function_call& e) { TM_LOG_ERROR("Null stream callback for (%s)", std::to_string(r->id).c_str()); @@ -1414,8 +1421,8 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector TM_LOG_ERROR("Unknown exception invoking stream callback for (%s)", std::to_string(r->id).c_str()); } - } - }); + }); + } } } } diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index bd9c4cc136..b0394bdcbe 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -28,12 +28,12 @@ struct SharedState { std::vector> infer_requests; std::vector> stop_requests; RequestQueue request_queue; + std::atomic tok_per_tick{1}; std::shared_ptr barrier; bool abort; std::atomic free_size{std::numeric_limits::max()}; }; - struct BatchState { int* h_prompt_length; // history + input, ignore generated int* h_context_length; diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h index 5f2be68923..359f03c47a 100644 --- a/src/turbomind/models/llama/Request.h +++ b/src/turbomind/models/llama/Request.h @@ -67,19 +67,19 @@ struct RequestState { struct AtomicRequestState { - AtomicRequestState(): data(std::make_shared()) {} + std::atomic data_; - void update(RequestState state) + static_assert(std::atomic::is_always_lock_free); + + ~AtomicRequestState() { - std::atomic_store_explicit(&data, std::make_shared(std::move(state)), std::memory_order_release); + auto data = exchange(nullptr); } - std::shared_ptr load() + std::unique_ptr exchange(RequestState* data) { - return std::atomic_load_explicit(&data, std::memory_order_acquire); + return std::unique_ptr{data_.exchange(data, std::memory_order_acq_rel)}; } - - std::shared_ptr data; }; struct Request { @@ -100,8 +100,12 @@ struct Request { std::function forward_cb; - enum - { + // std::atomic_flag* flag; + std::atomic* flag; + + std::atomic* seq_len; + + enum { kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set kConflict = 2, // Concurrent requests to the same sequence diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 1977b0cffa..833b73547b 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -335,8 +335,8 @@ PYBIND11_MODULE(_turbomind, m) .def_readonly("status", &ft::RequestState::status) .def_readonly("seq_len", &ft::RequestState::seq_len); - py::class_>(m, "AtomicRequestState") - .def("load", [](ft::AtomicRequestState& s) { return s.load(); }); + // py::class_>(m, "AtomicRequestState") + // .def("load", [](ft::AtomicRequestState& s) { return s.load(); }); // data type py::enum_(m, "DataType") @@ -492,7 +492,11 @@ PYBIND11_MODULE(_turbomind, m) model_request->End(std::move(cb)); // }, py::call_guard(), - "cb"_a); + "cb"_a) + .def( + "report_tokens_per_tick", + [](ModelRequest* model_request, int tok_per_tick) { model_request->ReportTokensPerTick(tok_per_tick); }, + "tokens_per_tick"_a); // transformer model using ft::AbstractTransformerModel; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 87eff96a58..6ae51b03eb 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -417,8 +417,10 @@ std::unique_ptr LlamaTritonModel::createModelInstance(int devic FT_CHECK(engines_[device_id] != nullptr); - return std::make_unique( - &shared_state_->request_queue, engine_param_.session_len, model_param_.vocab_size); + 
return std::make_unique(&shared_state_->request_queue, + &shared_state_->tok_per_tick, + engine_param_.session_len, + model_param_.vocab_size); } template diff --git a/src/turbomind/triton_backend/model_request.cpp b/src/turbomind/triton_backend/model_request.cpp index d233dc0e11..10697ddb53 100644 --- a/src/turbomind/triton_backend/model_request.cpp +++ b/src/turbomind/triton_backend/model_request.cpp @@ -1,6 +1,7 @@ #include +#include #include #include #include @@ -54,8 +55,8 @@ static T get(const std::unordered_map& m, const std: return fallback; } -ModelRequest::ModelRequest(RequestQueue* queue, int session_len, int vocab_size): - queue_{queue}, session_len_{session_len}, vocab_size_{vocab_size} +ModelRequest::ModelRequest(RequestQueue* queue, std::atomic* tok_per_tick, int session_len, int vocab_size): + queue_{queue}, tok_per_tick_{tok_per_tick}, session_len_{session_len}, vocab_size_{vocab_size} { } @@ -144,10 +145,44 @@ auto ModelRequest::Forward(InputParam param, std::function c r->gen_cfg = param.gen_cfg; r->stream_output = param.stream_output; r->forward_cb = std::move(cb); + r->flag = &flag_; + + // flag_.clear(std::memory_order_release); + flag_.store(0); queue_->enqueue({std::move(r)}); return OutputParam{outputs_}; } +void ModelRequest::ReportTokensPerTick(int observed) +{ + // flag_.clear(std::memory_order_release); + + flag_.fetch_sub(1, std::memory_order_relaxed); + +#if 0 + constexpr float decay = 0.525; + + float value = (float)observed; + // value -= std::max(0.f, std::min(decay, value - 1.f)); + + float old = tok_per_tick_->load(); + float cur{}; + auto update = [&]() mutable { + float alpha = old > value ? 0.001 : 0.002; + cur = old * (1 - alpha) + value * alpha; + }; + update(); + while (!tok_per_tick_->compare_exchange_weak(old, cur)) { + update(); + } + + static int count = 0; + if (++count % 100 == 0) { + std::cerr << cur << std::endl; + } +#endif +} + } // namespace turbomind diff --git a/src/turbomind/triton_backend/model_request.h b/src/turbomind/triton_backend/model_request.h index e40e2fd48e..f90ad81d23 100644 --- a/src/turbomind/triton_backend/model_request.h +++ b/src/turbomind/triton_backend/model_request.h @@ -13,7 +13,7 @@ class ModelRequest { public: virtual ~ModelRequest() = default; - ModelRequest(RequestQueue* queue, int session_len, int vocab_size); + ModelRequest(RequestQueue* queue, std::atomic* tok_per_tick, int session_len, int vocab_size); // Cancel running request, calls `cb` when done void Cancel(bool end, std::function cb); @@ -38,8 +38,16 @@ class ModelRequest { OutputParam Forward(InputParam param, std::function cb); + void ReportTokensPerTick(int observed); + protected: - RequestQueue* queue_; + RequestQueue* queue_; + std::atomic* tok_per_tick_; + + // std::atomic_flag flag_; + std::atomic flag_; + + std::atomic seq_len_; uint64_t session_id_; diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 8311e6eb9e..3f54401cff 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -306,7 +306,8 @@ inline std::string getDeviceName() return std::string(props.name); } -inline int div_up(int a, int n) +template +inline T div_up(T a, T n) { return (a + n - 1) / n; } From aa5573dbb44b68e65a6ec93371286e45356d94d0 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 12 Dec 2024 18:08:38 +0800 Subject: [PATCH 04/40] wait-free --- benchmark/profile_throughput.py | 58 +++++++++---------- lmdeploy/turbomind/chat.py | 13 +++-- lmdeploy/turbomind/turbomind.py | 42 ++++---------- 
src/turbomind/models/llama/LlamaBatch.cc | 19 +++--- src/turbomind/models/llama/Request.h | 9 +-- src/turbomind/python/bind.cpp | 24 ++++---- .../triton_backend/model_request.cpp | 9 +-- src/turbomind/triton_backend/model_request.h | 8 +-- 8 files changed, 82 insertions(+), 100 deletions(-) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 55cb67ff22..34f31b4137 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -89,7 +89,7 @@ def __init__(self, model_path: str, async def _inference(self, req_queue: Queue, session_id: int, temperature: float, top_p: float, top_k: int, - stream_output: bool, pretokenize: bool, + stream_output: bool, skip_tokenize: bool, skip_detokenize: bool): model_inst = self.tm_model.create_instance() counters = [] @@ -99,13 +99,16 @@ async def _inference(self, req_queue: Queue, session_id: int, ts = [time.perf_counter()] ns = [0] - if pretokenize: + if skip_tokenize: input_ids = prompt else: input_ids = self.tokenizer(prompt).input_ids state = DetokenizeState(len(input_ids)) + prev_len = 0 + token_ids = input_ids.copy() + async for outputs in model_inst.async_stream_infer( session_id, input_ids=input_ids, @@ -117,16 +120,15 @@ async def _inference(self, req_queue: Queue, session_id: int, sequence_start=True, sequence_end=True, stream_output=stream_output): - res, n_token = input_ids + outputs.token_ids, outputs.num_token - if not skip_detokenize: - _, state = self.tokenizer.detokenize_incrementally( - res, state) - # The following does not help - # await asyncio.sleep(0) - # _, state = await loop.run_in_executor(None, self.tokenizer.detokenize_incrementally, res, state) - - ts.append(time.perf_counter()) - ns.append(n_token) + n_token = outputs.num_token + if n_token > prev_len: + token_ids += outputs.token_ids[prev_len - n_token:] + if not skip_detokenize: + _, state = self.tokenizer.detokenize_incrementally( + token_ids, state) + ts.append(time.perf_counter()) + ns.append(n_token) + prev_len = n_token # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): @@ -138,22 +140,18 @@ async def _inference(self, req_queue: Queue, session_id: int, return counters def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output, pretokenize, skip_detokenize): + stream_output, skip_tokenize, skip_detokenize): req_queue = Queue() - self.pbar = tqdm(total=len(requests)) - # feed request to q for req in requests: - if pretokenize: + if skip_tokenize: req_queue.put((self.tokenizer.encode(req[0]), *req[1:])) else: req_queue.put(req) for i in range(concurrency): req_queue.put([None, None, None]) - start = time.time() - event_loop = asyncio.new_event_loop() asyncio.set_event_loop(event_loop) @@ -161,18 +159,23 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, tasks = [] for i in range(concurrency): task = self._inference(req_queue, i, temperature, top_p, top_k, - stream_output, pretokenize, skip_detokenize) + stream_output, skip_tokenize, + skip_detokenize) tasks.append(task) async def _gather_tasks(tasks): return await asyncio.gather(*tasks) - counters = asyncio.run(_gather_tasks(tasks)) + self.pbar = tqdm(total=len(requests)) - self.pbar.close() + start = time.time() + + counters = asyncio.run(_gather_tasks(tasks)) elapsed_time = time.time() - start + self.pbar.close() + ttfts: List[float] = [] tpots: List[float] = [] e2es: List[float] = [] @@ -183,9 +186,6 @@ async def _gather_tasks(tasks): total_input = 0 for ts, ns, input_len in 
itertools.chain.from_iterable(counters): - # print (ts) - # print (ns) - # assert 0 total_output += ns[-1] total_input += input_len e2es.append(ts[-1] - ts[0]) @@ -197,7 +197,7 @@ async def _gather_tasks(tasks): t_dif = np.subtract(ts[1:], ts[:-1]) n_dif = np.subtract(ns[1:], ns[:-1]) itls.extend(t_dif[1:]) - tpts.extend(n_dif[1:]) + tpts.extend(n_dif) output_throughput = total_output / elapsed_time input_throughput = total_input / elapsed_time @@ -232,8 +232,8 @@ def fmt(x): tab_row('Total requests', len(requests)) tab_row('Concurrency', concurrency) tab_row('Stream output', str(stream_output).lower()) - tab_row('Pre-tokenization', str(pretokenize).lower()) - tab_row('Skip detokenization', str(skip_detokenize).lower()) + tab_row('Skip tokenize', str(skip_tokenize).lower()) + tab_row('Skip detokenize', str(skip_detokenize).lower()) tab_row('Total input tokens', total_input) tab_row('Total generated tokens', total_output) tab_row('Input token throughput (tok/s)', input_throughput) @@ -275,7 +275,7 @@ def parse_args(): parser.add_argument('--no-stream-output', action='store_true', help='Use stream output') - parser.add_argument('--pre-tokenize', + parser.add_argument('--skip-tokenize', action='store_true', help='Pre-tokenize input prompts before starting') parser.add_argument('--skip-detokenize', @@ -366,7 +366,7 @@ def main(): top_k=args.top_k, concurrency=args.concurrency, stream_output=not args.no_stream_output, - pretokenize=args.pre_tokenize, + skip_tokenize=args.skip_tokenize, skip_detokenize=args.skip_detokenize) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index 7dc8778957..46108eaaf7 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -48,6 +48,8 @@ def infer(generator, session_id, input_ids, gen_config, sequence_start, async def async_infer(generator, session_id, input_ids, gen_config, sequence_start, sequence_end, step, stream_output, tokenizer, state): + token_ids = input_ids.copy() + prev_len = 0 async for output in generator.async_stream_infer( session_id=session_id, input_ids=input_ids, @@ -56,10 +58,13 @@ async def async_infer(generator, session_id, input_ids, gen_config, sequence_end=sequence_end, step=step, stream_output=stream_output): - res, tokens = input_ids + output.token_ids, output.num_token - # decode res - response, state = tokenizer.detokenize_incrementally(res, state=state) - print(response, end='', flush=True) + tokens = output.num_token + if tokens > prev_len: + token_ids += output.token_ids[prev_len - tokens:] + response, state = tokenizer.detokenize_incrementally(token_ids, + state=state) + prev_len = tokens + print(response, end='', flush=True) return tokens diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 849c9f3b7b..984fd80659 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -132,9 +132,6 @@ def __init__(self, self.session_len = self.config.session_len self.eos_id = self.tokenizer.eos_token_id - self.pending_num = 0 - self.pending_cond = asyncio.Condition() - def _create_weight(self, model_comm): """Allocate weight buffer, load params if from_workspace.""" @@ -510,22 +507,15 @@ def prepare_inputs(self, return inputs, input_lengths - async def async_signal(self, state): + async def async_signal(self): async with self.cond: - self.flag, self.state = 1, state + self.flag = 1 self.cond.notify() - # self.flag, self.state = 1, state - # self.state_ready.set() - def async_signal_cb(self, state): - coro = self.async_signal(state) + def 
async_signal_cb(self): + coro = self.async_signal() asyncio.run_coroutine_threadsafe(coro, self.event_loop) - def add_pending(self, n: int = 1): - # self.tm_model.pending_event.clear() - # self.tm_model.pending_num += n - self.tm_model.pending_num += n - async def async_stream_infer(self, session_id, input_ids, @@ -577,41 +567,29 @@ async def async_stream_infer(self, inputs = _np_dict_to_tm_dict(inputs) - outputs = self.model_inst.forward(inputs, session, gen_cfg, - stream_output, self.async_signal_cb) + outputs, shared_state = self.model_inst.forward( + inputs, session, gen_cfg, stream_output, self.async_signal_cb) outputs = _tm_dict_to_torch_dict(outputs) output_ids_buf = outputs['output_ids'] - # seq_start = step + input_length[0] - out_logprobs = None finish = False - # async with self.tm_model.pending_cond: - # self.tm_model.pending_num -= 1 - # if self.tm_model.pending_num == 0: - # self.tm_model.pending_cond.notify_all() - - # self.tm_model.pending_num -= 1 - # if self.tm_model.pending_num == 0: - # self.tm_model.pending_event.set() output_ids = [] output_len = 0 prev_len = step + input_length[0] try: # generator while True: - # async with self.tm_model.pending_cond: - # while self.tm_model.pending_num > 0: - # await self.tm_model.pending_cond.wait() - # await self.tm_model.pending_event.wait() async with self.cond: while not self.flag: await self.cond.wait() - state, self.flag = self.state, 0 + self.flag = 0 + + state = shared_state.consume() status, seq_len = state.status, state.seq_len @@ -625,7 +603,7 @@ async def async_stream_infer(self, if seq_len == prev_len and not finish: continue - output_ids += output_ids_buf[prev_len:seq_len] + output_ids += output_ids_buf[prev_len:seq_len].tolist() output_len += seq_len - prev_len self.model_inst.report_tokens_per_tick(seq_len - prev_len) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 2fb132a40e..07839721e9 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1406,13 +1406,15 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } else if (r->stream_output && rank_ == 0) { const auto seq_len = r->outputs.getVal("sequence_length"); - const auto v = r->flag->load(std::memory_order_relaxed) < 1; - if (v) { - r->flag->fetch_add(1, std::memory_order_relaxed); + if (true) { // Create signals by copying the request handles for non-finished streaming requests signals.push_back([this, r, seq_len] { try { - r->forward_cb({Request::kOk, seq_len}); + auto new_state = new RequestState{Request::kOk, seq_len}; + auto old_state = r->state->exchange(new_state); + if (!old_state) { + r->forward_cb(); + } } catch (const std::bad_function_call& e) { TM_LOG_ERROR("Null stream callback for (%s)", std::to_string(r->id).c_str()); @@ -1494,11 +1496,14 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig auto ec = std::exchange(state_->errors[index], Request::kOk); + const auto len = state_->requests[index]->outputs.getVal("sequence_length"); // move the request handle into the signal - return [this, ec, r = std::move(state_->requests[index])] { + return [this, ec, len, r = std::move(state_->requests[index])] { if (rank_ == 0) { - if (r->forward_cb) { - r->forward_cb({Request::kFinish, r->outputs.getVal("sequence_length")}); + auto new_state = new RequestState{Request::kFinish, len}; + auto old_state = r->state->exchange(new_state); + if (!old_state) { + r->forward_cb(); } } }; diff --git 
a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h index 359f03c47a..b231aff40d 100644 --- a/src/turbomind/models/llama/Request.h +++ b/src/turbomind/models/llama/Request.h @@ -98,12 +98,9 @@ struct Request { std::function cancel_cb; std::function end_cb; - std::function forward_cb; + std::function forward_cb; - // std::atomic_flag* flag; - std::atomic* flag; - - std::atomic* seq_len; + std::shared_ptr state; enum { kOk = 0, @@ -115,8 +112,6 @@ struct Request { kTooLong = 6, // history + prompt > session_len, kFinish = 7, }; - - // std::promise signal; }; class RequestQueue { diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 833b73547b..2d316a114e 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -331,12 +331,12 @@ PYBIND11_MODULE(_turbomind, m) return oss.str(); }); - py::class_>(m, "RequestState") + py::class_>(m, "RequestState") .def_readonly("status", &ft::RequestState::status) .def_readonly("seq_len", &ft::RequestState::seq_len); - // py::class_>(m, "AtomicRequestState") - // .def("load", [](ft::AtomicRequestState& s) { return s.load(); }); + py::class_>(m, "AtomicRequestState") + .def("consume", [](ft::AtomicRequestState& s) { return s.exchange(nullptr); }); // data type py::enum_(m, "DataType") @@ -451,26 +451,26 @@ PYBIND11_MODULE(_turbomind, m) py::class_(m, "ModelRequest") .def( "forward", - [](ModelRequest* model_request, - std::shared_ptr input_tensors, - const ft::SessionParam& session, - const ft::GenerationConfig& gen_cfg, - bool stream_output, - std::function cb) { + [](ModelRequest* model_request, + std::shared_ptr input_tensors, + const ft::SessionParam& session, + const ft::GenerationConfig& gen_cfg, + bool stream_output, + std::function cb) { ModelRequest::InputParam param{}; param.tensors = std::move(input_tensors); param.session = session; param.gen_cfg = gen_cfg; param.stream_output = stream_output; - auto ret = model_request->Forward(std::move(param), [cb = std::move(cb)](ft::RequestState s) { + auto ret = model_request->Forward(std::move(param), [cb = std::move(cb)]() { try { - cb(s); + cb(); } catch (const py::error_already_set& e) { std::cerr << e.what() << std::endl; } }); - return ret.tensors; + return std::make_tuple(std::move(ret.tensors), std::move(ret.state)); }, py::call_guard(), "input_tensors"_a, diff --git a/src/turbomind/triton_backend/model_request.cpp b/src/turbomind/triton_backend/model_request.cpp index 10697ddb53..1132bc61bb 100644 --- a/src/turbomind/triton_backend/model_request.cpp +++ b/src/turbomind/triton_backend/model_request.cpp @@ -84,7 +84,7 @@ void ModelRequest::End(std::function cb) queue_->enqueue({std::move(r)}); } -auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam +auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam { inputs_ = std::make_shared(); outputs_ = std::make_shared(); @@ -140,19 +140,20 @@ auto ModelRequest::Forward(InputParam param, std::function c r->outputs.insert(k, *v); } + auto state = std::make_shared(); + r->id = param.session.id; r->session = param.session; r->gen_cfg = param.gen_cfg; r->stream_output = param.stream_output; r->forward_cb = std::move(cb); - r->flag = &flag_; + r->state = state; - // flag_.clear(std::memory_order_release); flag_.store(0); queue_->enqueue({std::move(r)}); - return OutputParam{outputs_}; + return OutputParam{outputs_, state}; } void ModelRequest::ReportTokensPerTick(int observed) diff --git a/src/turbomind/triton_backend/model_request.h 
b/src/turbomind/triton_backend/model_request.h index f90ad81d23..2efb42134f 100644 --- a/src/turbomind/triton_backend/model_request.h +++ b/src/turbomind/triton_backend/model_request.h @@ -33,10 +33,11 @@ class ModelRequest { }; struct OutputParam { - std::shared_ptr tensors; + std::shared_ptr tensors; + std::shared_ptr state; }; - OutputParam Forward(InputParam param, std::function cb); + OutputParam Forward(InputParam param, std::function cb); void ReportTokensPerTick(int observed); @@ -44,11 +45,8 @@ class ModelRequest { RequestQueue* queue_; std::atomic* tok_per_tick_; - // std::atomic_flag flag_; std::atomic flag_; - std::atomic seq_len_; - uint64_t session_id_; int session_len_; From 6378aaac12c3e836265a71e0593a21dcaa3a9b39 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 19 Dec 2024 19:58:37 +0800 Subject: [PATCH 05/40] refactor gateway --- lmdeploy/turbomind/chat.py | 28 +- lmdeploy/turbomind/turbomind.py | 114 ++--- src/turbomind/CMakeLists.txt | 1 + src/turbomind/engine/CMakeLists.txt | 7 + src/turbomind/engine/gateway.cc | 40 ++ src/turbomind/engine/gateway.h | 61 +++ .../model_request.cc} | 70 +-- .../model_request.h | 17 +- .../llama/Request.h => engine/request.h} | 90 +--- src/turbomind/engine/request_queue.cc | 93 ++++ src/turbomind/engine/request_queue.h | 46 ++ src/turbomind/engine/signal_buffer.h | 61 +++ src/turbomind/models/llama/CMakeLists.txt | 1 + src/turbomind/models/llama/LlamaBatch.cc | 440 +++++++----------- src/turbomind/models/llama/LlamaBatch.h | 46 +- src/turbomind/models/llama/LlamaV2.cc | 24 +- src/turbomind/models/llama/LlamaV2.h | 5 - src/turbomind/python/bind.cpp | 68 +-- src/turbomind/triton_backend/CMakeLists.txt | 2 +- .../triton_backend/llama/CMakeLists.txt | 1 - .../triton_backend/llama/LlamaTritonModel.cc | 79 +--- .../triton_backend/llama/LlamaTritonModel.h | 26 +- .../llama/LlamaTritonModelInstance.cc | 224 --------- .../llama/LlamaTritonModelInstance.h | 86 ---- .../transformer_triton_backend.hpp | 3 +- 25 files changed, 689 insertions(+), 944 deletions(-) create mode 100644 src/turbomind/engine/CMakeLists.txt create mode 100644 src/turbomind/engine/gateway.cc create mode 100644 src/turbomind/engine/gateway.h rename src/turbomind/{triton_backend/model_request.cpp => engine/model_request.cc} (71%) rename src/turbomind/{triton_backend => engine}/model_request.h (70%) rename src/turbomind/{models/llama/Request.h => engine/request.h} (55%) create mode 100644 src/turbomind/engine/request_queue.cc create mode 100644 src/turbomind/engine/request_queue.h create mode 100644 src/turbomind/engine/signal_buffer.h delete mode 100644 src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc delete mode 100644 src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index 46108eaaf7..c45c2ac793 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -29,13 +29,13 @@ def input_prompt(model_name): return '\n'.join(iter(input, sentinel)) -def infer(generator, session_id, input_ids, gen_config, sequence_start, - sequence_end, step, stream_output, tokenizer, state): +def infer(generator, session_id, input_ids, gen_config, sequence_start, step, + stream_output, tokenizer, state): for outputs in generator.stream_infer(session_id=session_id, input_ids=input_ids, gen_config=gen_config, sequence_start=sequence_start, - sequence_end=sequence_end, + sequence_end=False, step=step, stream_output=stream_output): res, tokens = input_ids + outputs.token_ids, outputs.num_token 
@@ -46,8 +46,7 @@ def infer(generator, session_id, input_ids, gen_config, sequence_start, async def async_infer(generator, session_id, input_ids, gen_config, - sequence_start, sequence_end, step, stream_output, - tokenizer, state): + sequence_start, step, stream_output, tokenizer, state): token_ids = input_ids.copy() prev_len = 0 async for output in generator.async_stream_infer( @@ -55,7 +54,7 @@ async def async_infer(generator, session_id, input_ids, gen_config, input_ids=input_ids, gen_config=gen_config, sequence_start=sequence_start, - sequence_end=sequence_end, + sequence_end=False, step=step, stream_output=stream_output): tokens = output.num_token @@ -65,6 +64,8 @@ async def async_infer(generator, session_id, input_ids, gen_config, state=state) prev_len = tokens print(response, end='', flush=True) + # if 'I' in response: + # await generator.async_cancel(0, blocking=False) return tokens @@ -179,7 +180,10 @@ def main(model_path: str, if prompt == 'exit': exit(0) elif prompt == 'end': - generator.end(session_id) + if use_async: + asyncio.run(generator.async_end(session_id)) + else: + generator.end(session_id) nth_round = 1 step = 0 seed = random.getrandbits(64) @@ -190,10 +194,8 @@ def main(model_path: str, if model.capability == 'chat': sequence_start = (nth_round == 1) - sequence_end = False else: sequence_start = True - sequence_end = True step = 0 if step + len( @@ -207,13 +209,13 @@ def main(model_path: str, if use_async: coro = async_infer(generator, session_id, input_ids, - gen_config, sequence_start, sequence_end, - step, stream_output, tokenizer, state) + gen_config, sequence_start, step, + stream_output, tokenizer, state) tokens = asyncio.run(coro) else: tokens = infer(generator, session_id, input_ids, gen_config, - sequence_start, sequence_end, step, - stream_output, tokenizer, state) + sequence_start, step, stream_output, tokenizer, + state) # update step step += len(input_ids) + tokens diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 984fd80659..e33c54c718 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -384,38 +384,43 @@ def _get_logprobs(self, out_logprobs.append(tok_res) return out_logprobs - def end(self, session_id: int): - """End the given session.""" - input_ids = [self.tm_model.tokenizer.eos_token_id] - end_generator = self.tm_model.create_instance() - for outputs in end_generator.stream_infer( - session_id, - input_ids, - sequence_start=False, - sequence_end=True, - gen_config=GenerationConfig(max_new_tokens=0)): - pass - - async def async_end(self, session_id: int): - """End the given session.""" - self.end(session_id) - - def cancel(self, session_id: int): - """Stop current streaming inference.""" - input_ids = [self.tm_model.tokenizer.eos_token_id] - stop_generator = self.tm_model.create_instance() - for outputs in stop_generator.stream_infer( - session_id, - input_ids, - sequence_start=False, - sequence_end=False, - stop=True, - gen_config=GenerationConfig(max_new_tokens=0)): - pass - - async def async_cancel(self, session_id: int): + def async_end_cb(self, status: int): + + async def _signal(): + print(f'session ended, status = {status}') + self.end_event.set() + + asyncio.run_coroutine_threadsafe(_signal(), self.event_loop) + + async def async_end(self, session_id): + await self.done_event.wait() + self.end_event = asyncio.Event() + self.event_loop = asyncio.get_running_loop() + self.model_inst.end(self.async_end_cb) + await self.end_event.wait() + + def end_cb(self, status: int): + 
print(f'session ended, status = {status}') + self.end_event.set() + + def end(self): + self.done_event.wait() + self.end_event = threading.Event() + self.model_inst.end(self.end_cb) + self.end_event.wait() + + def cancel(self, session_id: int, blocking: bool = True): + self.model_inst.cancel() + if blocking: + self.done_event.wait() + + async def async_cancel(self, session_id: int, blocking: bool = True): """End the given session.""" - self.cancel(session_id) + if not self.is_canceled: + self.model_inst.cancel() + self.is_canceled = True + if blocking: + await self.done_event.wait() def prepare_embeddings(self, input_embeddings=None, @@ -524,7 +529,6 @@ async def async_stream_infer(self, sequence_start: bool = True, sequence_end: bool = False, step=0, - stop=False, gen_config: GenerationConfig = None, stream_output=False, **kwargs): @@ -544,11 +548,10 @@ async def async_stream_infer(self, stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ - # start forward thread - self.event_loop = asyncio.get_running_loop() self.cond = asyncio.Condition() - # self.state_ready = asyncio.Event() + self.done_event = asyncio.Event() + self.is_canceled = False self.flag = 0 gen_cfg = self._get_generation_config(gen_config) @@ -562,8 +565,7 @@ async def async_stream_infer(self, session = _tm.SessionParam(id=session_id, step=step, start=sequence_start, - end=sequence_end, - stop=stop) + end=sequence_end) inputs = _np_dict_to_tm_dict(inputs) @@ -576,6 +578,7 @@ async def async_stream_infer(self, out_logprobs = None finish = False + state = None output_ids = [] output_len = 0 @@ -590,7 +593,6 @@ async def async_stream_infer(self, self.flag = 0 state = shared_state.consume() - status, seq_len = state.status, state.seq_len if status == 7: @@ -606,8 +608,6 @@ async def async_stream_infer(self, output_ids += output_ids_buf[prev_len:seq_len].tolist() output_len += seq_len - prev_len - self.model_inst.report_tokens_per_tick(seq_len - prev_len) - status = ResponseType.FINISH if finish else ResponseType.SUCCESS output = EngineOutput(status, output_ids, output_len.item(), out_logprobs) @@ -630,7 +630,9 @@ async def async_stream_infer(self, while not state or state.status == 0: while not self.flag: await self.cond.wait() - state = self.state + self.flag = 0 + state = shared_state.consume() + self.done_event.set() self.cond = None self.event_loop = None @@ -661,9 +663,9 @@ def _get_generation_config(self, cfg: GenerationConfig): # print (c) return c - def signal_cb(self, state): + def signal_cb(self): with self.cond: - self.flag, self.state = 1, state + self.flag = 1 self.cond.notify() def stream_infer(self, @@ -713,29 +715,32 @@ def stream_infer(self, self.cond = threading.Condition() self.flag = 0 + self.done_event = threading.Event() - outputs = self.model_inst.forward(inputs, session, gen_cfg, - stream_output, self.signal_cb) + outputs, shared_state = self.model_inst.forward( + inputs, session, gen_cfg, stream_output, self.signal_cb) outputs = _tm_dict_to_torch_dict(outputs) output_ids_buf = outputs['output_ids'] - seq_start = step + input_length[0] - out_logprobs = None finish = False state = None + output_ids = [] + output_len = 0 + prev_len = step + input_length[0] + try: # generator while True: with self.cond: while not self.flag: self.cond.wait() - state = self.state self.flag = 0 + state = shared_state.consume() status, seq_len = state.status, state.seq_len if status == 7: # TODO: use enum @@ -745,13 +750,18 @@ def stream_infer(self, yield 
self._get_error_output() break - output_ids = output_ids_buf[seq_start:seq_len] - gen_len = seq_len - seq_start + if seq_len == prev_len and not finish: + continue + + output_ids += output_ids_buf[prev_len:seq_len].tolist() + output_len += seq_len - prev_len status = ResponseType.FINISH if finish else ResponseType.SUCCESS - output = EngineOutput(status, output_ids.tolist(), gen_len, + output = EngineOutput(status, output_ids, output_len.item(), out_logprobs) + prev_len = seq_len + if out_logprobs: output_token_len = len(output.token_ids) output.logprobs = out_logprobs[:output_token_len] @@ -772,7 +782,7 @@ def stream_infer(self, while not state or state.status == 0: while not self.flag: self.cond.wait() - state = self.state + state = shared_state.consume() self.cond = None def decode(self, diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index aec443a1aa..62adb94e5a 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(utils) add_subdirectory(kernels) add_subdirectory(layers) add_subdirectory(models) +add_subdirectory(engine) if(BUILD_PYT) add_subdirectory(th_op) endif() diff --git a/src/turbomind/engine/CMakeLists.txt b/src/turbomind/engine/CMakeLists.txt new file mode 100644 index 0000000000..1d68116cf6 --- /dev/null +++ b/src/turbomind/engine/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +cmake_minimum_required(VERSION 3.8) + +add_library(engine STATIC gateway.cc request_queue.cc model_request.cc) +set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/engine/gateway.cc b/src/turbomind/engine/gateway.cc new file mode 100644 index 0000000000..8bc728072f --- /dev/null +++ b/src/turbomind/engine/gateway.cc @@ -0,0 +1,40 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request_queue.h" + +namespace turbomind { + +Gateway::Gateway(std::function()> ctx_factory): request_queue_{this}, ctx_factory_{ctx_factory} +{ + signal_thread_ = std::thread(&Gateway::signal_thread_entry, this); +} + +void Gateway::shutdown() +{ + request_queue_.close(); + signal_buffer_.close(); + + signal_thread_.join(); +} + +void Gateway::signal_thread_entry() noexcept +{ + while (true) { + bool abort{}; + std::vector signals = signal_buffer_.take_all(abort); + if (abort) { + break; + } + else { + auto ctx = ctx_factory_(); + for (const auto& s : signals) { + s(); + } + } + } +} + +} // namespace turbomind diff --git a/src/turbomind/engine/gateway.h b/src/turbomind/engine/gateway.h new file mode 100644 index 0000000000..d939c0bcc2 --- /dev/null +++ b/src/turbomind/engine/gateway.h @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved.
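
Gateway::signal_thread_entry above decouples user-facing callbacks from the batching loop: the engine only appends ready-to-fire signals to a buffer, and a dedicated thread drains and invokes them. The following is a minimal, self-contained sketch of that producer/consumer pattern; it assumes the signal type is a std::function<void()>-like callable, and MiniSignalBuffer is an illustrative stand-in rather than the SignalBuffer added by this patch.

#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

using Signal = std::function<void()>;  // assumption: callbacks are plain callables

class MiniSignalBuffer {
public:
    void push(std::vector<Signal> sigs)
    {
        {
            std::lock_guard lock{mutex_};
            for (auto& s : sigs) {
                buf_.push_back(std::move(s));
            }
        }
        cv_.notify_one();
    }
    // Blocks until signals arrive or the buffer is closed; `abort` mirrors the flag in gateway.cc
    std::vector<Signal> take_all(bool& abort)
    {
        std::unique_lock lock{mutex_};
        cv_.wait(lock, [this] { return !buf_.empty() || closed_; });
        abort = closed_ && buf_.empty();
        return std::exchange(buf_, {});
    }
    void close()
    {
        {
            std::lock_guard lock{mutex_};
            closed_ = true;
        }
        cv_.notify_all();
    }
private:
    std::vector<Signal>     buf_;
    std::mutex              mutex_;
    std::condition_variable cv_;
    bool                    closed_{false};
};

int main()
{
    MiniSignalBuffer buffer;
    // consumer: plays the role of Gateway::signal_thread_entry
    std::thread drain{[&] {
        while (true) {
            bool abort{};
            auto signals = buffer.take_all(abort);
            if (abort) {
                break;
            }
            for (auto& s : signals) {
                s();  // fire user callbacks outside the engine's critical path
            }
        }
    }};
    // producer: the engine publishes callbacks after a decoding step
    buffer.push({[] { /* e.g. UpdateState(req, Request::kOk, seq_len) */ }});
    buffer.close();
    drain.join();
}
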
+ +#pragma once + +#include +#include +#include + +#include "src/turbomind/engine/request_queue.h" +#include "src/turbomind/engine/signal_buffer.h" + +namespace turbomind { + +class Gateway { +public: + Gateway(std::function()> ctx_factory); + + void shutdown(); + + void push(std::vector> reqs) + { + return request_queue_.push(std::move(reqs)); + } + + void pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort) + { + return request_queue_.pop(infer_reqs, kill_reqs, max_infer_num, blocking, abort); + } + + void cancel(std::shared_ptr req) + { + return request_queue_.cancel(std::move(req)); + } + + void kill(std::shared_ptr req) + { + return request_queue_.kill(std::move(req)); + } + + void notify(std::vector signals) + { + return signal_buffer_.push(std::move(signals)); + } + +private: + void signal_thread_entry() noexcept; + +private: + RequestQueue request_queue_; + SignalBuffer signal_buffer_; + + std::function()> ctx_factory_; + + std::thread signal_thread_; +}; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/model_request.cpp b/src/turbomind/engine/model_request.cc similarity index 71% rename from src/turbomind/triton_backend/model_request.cpp rename to src/turbomind/engine/model_request.cc index 1132bc61bb..710513f079 100644 --- a/src/turbomind/triton_backend/model_request.cpp +++ b/src/turbomind/engine/model_request.cc @@ -1,16 +1,16 @@ #include -#include #include +#include #include #include #include #include #include -#include "src/turbomind/models/llama/Request.h" -#include "src/turbomind/triton_backend/model_request.h" +#include "src/turbomind/engine/request.h" +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/constant.h" #include "src/turbomind/utils/cuda_utils.h" @@ -31,7 +31,6 @@ static ManagedTensor create(DataType dtype, MemoryType where, const std::vector< ManagedTensor ret; ret.tensor = Tensor{where, dtype, std::vector(size.begin(), size.end()), data}; - // FT_CHECK_WITH_INFO(byte_size == ret.tensor.sizeBytes(), fmtstr("%ld vs %ld", byte_size, ret.tensor.sizeBytes())); ret.data_holder.reset((void*)nullptr, [data, where](auto) { // std::cerr << "turbomind tensor deallocate" << std::endl; if (where == MEMORY_GPU) { @@ -55,33 +54,29 @@ static T get(const std::unordered_map& m, const std: return fallback; } -ModelRequest::ModelRequest(RequestQueue* queue, std::atomic* tok_per_tick, int session_len, int vocab_size): - queue_{queue}, tok_per_tick_{tok_per_tick}, session_len_{session_len}, vocab_size_{vocab_size} +ModelRequest::ModelRequest(Gateway* gateway, int session_len, int vocab_size): + gateway_{gateway}, session_len_{session_len}, vocab_size_{vocab_size} { } -void ModelRequest::Cancel(bool end, std::function cb) +void ModelRequest::Cancel() { - auto r = std::make_shared(); - - r->id = session_id_; - // r->stop_flag = true; - - r->cancel_cb = std::move(cb); - - queue_->enqueue({std::move(r)}); + // request is finished if lock failed + if (auto r = request_.lock()) { + gateway_->cancel(std::move(r)); + } } void ModelRequest::End(std::function cb) { auto r = std::make_shared(); - r->id = session_id_; - // r->end_flag = true; + r->id = r->session.id = session_id_; + r->session.kill_flag = true; r->end_cb = std::move(cb); - queue_->enqueue({std::move(r)}); + gateway_->kill(std::move(r)); } auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam @@ -142,6 +137,10 @@ auto ModelRequest::Forward(InputParam 
param, std::function cb) -> Output auto state = std::make_shared(); + if (param.session.start_flag) { + session_id_ = param.session.id; + } + r->id = param.session.id; r->session = param.session; r->gen_cfg = param.gen_cfg; @@ -149,41 +148,12 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output r->forward_cb = std::move(cb); r->state = state; - flag_.store(0); + // Keep a weak reference for canceling the request + request_ = r; - queue_->enqueue({std::move(r)}); + gateway_->push({std::move(r)}); return OutputParam{outputs_, state}; } -void ModelRequest::ReportTokensPerTick(int observed) -{ - // flag_.clear(std::memory_order_release); - - flag_.fetch_sub(1, std::memory_order_relaxed); - -#if 0 - constexpr float decay = 0.525; - - float value = (float)observed; - // value -= std::max(0.f, std::min(decay, value - 1.f)); - - float old = tok_per_tick_->load(); - float cur{}; - auto update = [&]() mutable { - float alpha = old > value ? 0.001 : 0.002; - cur = old * (1 - alpha) + value * alpha; - }; - update(); - while (!tok_per_tick_->compare_exchange_weak(old, cur)) { - update(); - } - - static int count = 0; - if (++count % 100 == 0) { - std::cerr << cur << std::endl; - } -#endif -} - } // namespace turbomind diff --git a/src/turbomind/triton_backend/model_request.h b/src/turbomind/engine/model_request.h similarity index 70% rename from src/turbomind/triton_backend/model_request.h rename to src/turbomind/engine/model_request.h index 2efb42134f..c05824ed5b 100644 --- a/src/turbomind/triton_backend/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -4,7 +4,7 @@ #include -#include "src/turbomind/models/llama/Request.h" +#include "src/turbomind/engine/gateway.h" #include "src/turbomind/utils/Tensor.h" namespace turbomind { @@ -13,10 +13,10 @@ class ModelRequest { public: virtual ~ModelRequest() = default; - ModelRequest(RequestQueue* queue, std::atomic* tok_per_tick, int session_len, int vocab_size); + ModelRequest(Gateway* gateway, int session_len, int vocab_size); - // Cancel running request, calls `cb` when done - void Cancel(bool end, std::function cb); + // Cancel running request + void Cancel(); // Reset the channel to uninitailized state, calls `notify` when done void End(std::function cb); @@ -39,19 +39,16 @@ class ModelRequest { OutputParam Forward(InputParam param, std::function cb); - void ReportTokensPerTick(int observed); - protected: - RequestQueue* queue_; - std::atomic* tok_per_tick_; - - std::atomic flag_; + Gateway* gateway_; uint64_t session_id_; int session_len_; int vocab_size_; + std::weak_ptr request_; + std::shared_ptr inputs_; // owned by caller std::shared_ptr outputs_; // owned by `this` }; diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/engine/request.h similarity index 55% rename from src/turbomind/models/llama/Request.h rename to src/turbomind/engine/request.h index b231aff40d..aa2cba14d6 100644 --- a/src/turbomind/models/llama/Request.h +++ b/src/turbomind/engine/request.h @@ -3,10 +3,8 @@ #pragma once #include -#include #include #include -#include #include "src/turbomind/utils/Tensor.h" @@ -57,7 +55,7 @@ struct SessionParam { bool start_flag; bool end_flag; - bool stop_flag; + bool kill_flag; }; struct RequestState { @@ -95,13 +93,16 @@ struct Request { TensorMap inputs; TensorMap outputs; - std::function cancel_cb; std::function end_cb; + std::function forward_cb; - std::function forward_cb; - + std::atomic cancel_flag; std::shared_ptr state; + bool is_canceled{}; + + int ec; + enum { kOk = 0, kInvalid = 1, // 
Sequence not exist or both `start` & `stop` (instead of `end`) is set @@ -111,76 +112,25 @@ struct Request { kFail = 5, // Can't find sequence for `stop` request or internal error during inference kTooLong = 6, // history + prompt > session_len, kFinish = 7, + kCancel = 8, }; }; -class RequestQueue { -public: - void enqueue(std::vector> requests) - { - { - std::lock_guard lock(mutex_); - - if (closed_) { - throw std::runtime_error("Queue is closed"); - } - - for (auto& r : requests) { - // futures.push_back(r->signal.get_future()); - if (r->session.stop_flag) { - stop_queue_.push(std::move(r)); - } - else { - infer_queue_.push(std::move(r)); - } - } +inline void UpdateState(Request& r, int status, int seq_len) +{ + try { + auto new_state = new RequestState{status, seq_len}; + auto old_state = r.state->exchange(new_state); + if (!old_state && r.forward_cb) { + r.forward_cb(); } - cv_.notify_one(); } - - void dequeue(std::vector>& stop_requests, - std::vector>& infer_requests, - unsigned max_infer_count, - bool blocking, - bool& abort) - { - std::unique_lock lock(mutex_); - if (blocking) { - cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()) || closed_; }); - if (closed_) { - abort = true; - return; - } - } - - stop_requests.clear(); - while (!stop_queue_.empty()) { - stop_requests.push_back(std::move(stop_queue_.front())); - stop_queue_.pop(); - } - - infer_requests.clear(); - while (!infer_queue_.empty() && infer_requests.size() < max_infer_count) { - infer_requests.push_back(std::move(infer_queue_.front())); - infer_queue_.pop(); - } + catch (const std::exception& e) { + TM_LOG_ERROR("Error invoking callback for (%lu): %s", r.id, e.what()); } - - void close() - { - { - std::lock_guard lock(mutex_); - closed_ = true; - } - cv_.notify_all(); + catch (...) { + TM_LOG_ERROR("Unknown error invoking callback for (%lu)", r.id); } - -private: - std::queue> stop_queue_; - std::queue> infer_queue_; - std::mutex mutex_; - std::condition_variable cv_; - bool closed_{false}; -}; +} } // namespace turbomind diff --git a/src/turbomind/engine/request_queue.cc b/src/turbomind/engine/request_queue.cc new file mode 100644 index 0000000000..8c0b52b5bf --- /dev/null +++ b/src/turbomind/engine/request_queue.cc @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
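A note on the cancellation handshake used by the queue below: RequestQueue::cancel and RequestQueue::pop race on Request::cancel_flag with a single atomic exchange, where 0 means the request is still queued, 1 means the engine has claimed it, and -1 means it was cancelled before pickup. The standalone C++ sketch that follows illustrates that protocol; the Req struct and the two helper functions are illustrative stand-ins, not part of the TurboMind sources.

#include <atomic>
#include <cstdio>
#include <memory>
#include <thread>

// Minimal stand-in for turbomind::Request; only the cancel flag matters here.
struct Req {
    std::atomic<int> cancel_flag{0};  // 0 = queued, 1 = claimed by engine, -1 = cancelled
};

// Engine side (cf. RequestQueue::pop): claim by flipping 0 -> 1.
// A non-zero previous value means the request was already cancelled and is skipped.
bool try_claim(Req& r) {
    return r.cancel_flag.exchange(1, std::memory_order_acq_rel) == 0;
}

// Client side (cf. RequestQueue::cancel): flip 0 -> -1.
// A non-zero previous value means the engine already owns the request, so the
// cancellation has to be observed later (cf. BroadcastCancelFlags / is_canceled).
bool try_cancel(Req& r) {
    return r.cancel_flag.exchange(-1, std::memory_order_acq_rel) == 0;
}

int main() {
    auto r = std::make_shared<Req>();
    std::thread engine([r] {
        std::printf(try_claim(*r) ? "engine: runs the request\n"
                                  : "engine: drops a cancelled request\n");
    });
    std::thread client([r] {
        std::printf(try_cancel(*r) ? "client: cancelled while queued\n"
                                   : "client: cancellation deferred to the engine\n");
    });
    engine.join();
    client.join();
}

Whichever side wins the exchange, exactly one of them takes responsibility for finishing the request, which is what lets cancel() notify immediately for still-queued requests while active ones are interrupted from the engine loop.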
+ +#include "src/turbomind/engine/request_queue.h" +#include "src/turbomind/engine/gateway.h" + +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +void RequestQueue::push(std::vector> reqs) +{ + { + std::lock_guard lock(mutex_); + if (closed_) { + throw std::runtime_error("Queue is closed"); + } + for (auto& r : reqs) { + queue_.push(std::move(r)); + } + } + cv_.notify_one(); +} + +void RequestQueue::cancel(std::shared_ptr r) +{ + // -1 canceled + // 0 queued + // 1 active + if (r->cancel_flag.exchange(-1, std::memory_order_acq_rel) != 0) { + // request is picked up by engine + return; + } + else { + // not picked by engine yet, skip directly + gateway_->notify({[r = std::move(r)] { // + UpdateState(*r, Request::kCancel, 0); + }}); + } +} + +void RequestQueue::kill(std::shared_ptr r) +{ + { + std::lock_guard lock(mutex_); + if (closed_) { + throw std::runtime_error("Queue is closed"); + } + kill_.push_back(std::move(r)); + } + cv_.notify_one(); +} + +void RequestQueue::pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort) +{ + std::unique_lock lock(mutex_); + + if (blocking) { + cv_.wait(lock, [this] { return !queue_.empty() || !kill_.empty() || closed_; }); + if (closed_) { + abort = true; + return; + } + } + + infer_reqs.clear(); + while (!queue_.empty() && infer_reqs.size() <= max_infer_num) { + auto& r = queue_.front(); + if (r->cancel_flag.exchange(1, std::memory_order_acq_rel) == 0) { + infer_reqs.push_back(std::move(r)); + } + else { + // Canceled requests are simply ignored + } + queue_.pop(); + } + + kill_reqs = std::move(kill_); +} + +void RequestQueue::close() +{ + { + std::lock_guard lock(mutex_); + closed_ = true; + } + cv_.notify_all(); +} + +} // namespace turbomind diff --git a/src/turbomind/engine/request_queue.h b/src/turbomind/engine/request_queue.h new file mode 100644 index 0000000000..c029f38f4b --- /dev/null +++ b/src/turbomind/engine/request_queue.h @@ -0,0 +1,46 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +class Gateway; + +class RequestQueue { +public: + RequestQueue(Gateway* gateway): gateway_{gateway} {} + + void push(std::vector> reqs); + + void pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort); + + void cancel(std::shared_ptr r); + + void kill(std::shared_ptr r); + + void close(); + +private: + Gateway* gateway_; + + std::queue> queue_; + + std::vector> kill_; + + std::mutex mutex_; + std::condition_variable cv_; + + bool closed_{false}; +}; + +} // namespace turbomind diff --git a/src/turbomind/engine/signal_buffer.h b/src/turbomind/engine/signal_buffer.h new file mode 100644 index 0000000000..10a50e15f7 --- /dev/null +++ b/src/turbomind/engine/signal_buffer.h @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
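SignalBuffer, declared below, is the hand-off point between the engine ranks and the gateway's signal thread: producers append batches of completion callbacks, and a single consumer drains and runs them. The body of Gateway::signal_thread_entry is not shown in this hunk, so the consumer loop in the sketch below is an assumption about its shape; the push and take_all calls use the interface exactly as declared in this header.

#include <cstdio>
#include <thread>
#include <vector>

#include "src/turbomind/engine/signal_buffer.h"

int main() {
    turbomind::SignalBuffer buf;

    // Hypothetical consumer, mirroring what the gateway's signal thread is expected
    // to do: block in take_all(), then invoke every drained signal outside the lock.
    std::thread consumer([&] {
        bool abort    = false;
        int  executed = 0;
        while (!abort && executed < 3) {
            auto signals = buf.take_all(abort);
            for (auto& s : signals) {
                s();
                ++executed;
            }
        }
    });

    // Producer side: engine ranks only append and return, so potentially slow
    // callbacks (e.g. ones that need the Python GIL via ctx_factory) never block them.
    buf.push({[] { std::printf("request 1 finished\n"); },
              [] { std::printf("request 2 finished\n"); }});
    buf.push({[] { std::printf("request 3 cancelled\n"); }});

    consumer.join();
    return 0;
}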
+ +#pragma once + +#include +#include +#include + +namespace turbomind { + +using Signal = std::function; + +class SignalBuffer { +public: + void push(std::vector signals) + { + if (signals.empty()) { + return; + } + { + std::lock_guard lock{mutex_}; + signals_.insert(signals_.end(), std::move_iterator{signals.begin()}, std::move_iterator{signals.end()}); + } + cv_.notify_one(); + } + + void clsoe() + { + { + std::lock_guard lock{mutex_}; + aborted_ = true; + } + cv_.notify_all(); + } + + std::vector take_all(bool& abort) + { + std::vector signals; + { + std::unique_lock lock{mutex_}; + cv_.wait(lock, [&] { return !signals_.empty() || aborted_; }); + if (aborted_) { + abort = true; + } + else { + signals.swap(signals_); + } + } + return signals; + } + +private: + std::vector signals_; + + std::mutex mutex_; + std::condition_variable cv_; + + bool aborted_{false}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 3c714bd234..6c297e3d56 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(Llama STATIC set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart + engine gemm2 rms_norm cublasMMWrapper diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 07839721e9..0f8a0082f5 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1,19 +1,42 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/models/llama/LlamaBatch.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/turbomind/macro.h" + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request.h" + #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" -#include "src/turbomind/macro.h" + #include "src/turbomind/models/llama/BlockManager.h" +#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/copy.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" + #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/constant.h" @@ -21,21 +44,6 @@ #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/nccl_utils.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace turbomind { @@ -85,162 +93,88 @@ void DropEmbeddings(const Sequence& seq) } template -void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs) +void LlamaBatch::MarkConflictRequests(Requests& infer_reqs, Requests& kill_reqs) { - std::unordered_map occurrence; - - auto count_occurrence = [&occurrence](const Requests& rs) { - for 
(const auto& r : rs) { - ++occurrence[r->id]; - } - }; + std::pmr::monotonic_buffer_resource mbr; + std::pmr::unordered_map occur(&mbr); - auto reject = [](const char* type, std::shared_ptr& req, int ec) { - TM_LOG_WARNING( - "[RejectInvalidRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec); - /// FIXME: make these signals - if (req->cancel_cb) { - req->cancel_cb(ec); - } - else if (req->end_cb) { - req->end_cb(ec); + auto count = [&occur](const auto& reqs) { + for (const auto& r : reqs) { + ++occur[r->id]; } - else if (req->forward_cb) { - FT_CHECK(0); // not implemtented - } - req.reset(); }; - auto handle_conflict_or_invalid = [this, &occurrence, &reject](Requests& rs, const char* type) { - for (auto& r : rs) { - if (r) { - int ec = 0; - - const int input_length = r->inputs.at("input_ids").shape[0]; - const auto get_offset = [&](int token_count) { - const int step = r->session.step < 0 ? token_count : r->session.step; - return std::max(0, std::min(token_count, r->session.step)); - }; - - if (occurrence[r->id] != 1) { - ec = Request::kConflict; - } - else if (r->session.start_flag && r->session.stop_flag) { - ec = Request::kInvalid; - } - else if (input_length > session_len_) { - ec = Request::kTooLong; - } - else if (!r->session.start_flag) { - if (auto seq = sequence_manager_->Get(r->id); seq == nullptr) { - ec = Request::kInvalid; - } - else if (get_offset(seq->tokens.size()) + input_length > session_len_) { - ec = Request::kTooLong; - } - } - - if (ec) { - reject(type, r, ec); - } + auto validate = [&occur](auto& reqs, const char* type) { + for (const auto& r : reqs) { + if (occur[r->id] > 1) { + TM_LOG_ERROR("Skip conflicting %s request for ID %lu", type, r->id); + r->ec = Request::kConflict; } } }; - auto drop_invalid = [](Requests& rs) { - int count = 0; - for (int i = 0; i < rs.size(); ++i) { - if (rs[i]) { - rs[count++] = std::move(rs[i]); - } + for (int i = 0; i < state_->size; ++i) { + if (state_->requests[i]) { + ++occur[state_->requests[i]->id]; } - rs.resize(count); - }; + } - count_occurrence(stop_reqs); - count_occurrence(infer_reqs); + count(kill_reqs); + count(infer_reqs); - if (!stop_reqs.empty()) { - handle_conflict_or_invalid(stop_reqs, "stop"); + validate(kill_reqs, "kill"); + validate(infer_reqs, "infer"); +} - // invalidate stop-only requests for inactive sequences - for (auto& r : stop_reqs) { - if (r && r->session.end_flag == false) { - int ec = Request::kInactive; - for (int i = 0; i < state_->size; ++i) { - if (state_->requests[i] && state_->requests[i]->id == r->id) { - ec = 0; - break; - } - } - if (ec) { - reject("stop", r, ec); - } - } +template +void LlamaBatch::BroadcastCancelFlags() +{ + for (int i = 0; i < state_->size; ++i) { + const auto& r = state_->requests[i]; + if (r && r->cancel_flag.load(std::memory_order_acquire) == -1) { + r->is_canceled = true; } - - drop_invalid(stop_reqs); } +} - if (!infer_reqs.empty()) { - handle_conflict_or_invalid(infer_reqs, "infer"); - - // invalidate requests for busy sequences - for (auto& r : infer_reqs) { - if (r) { - for (int i = 0; i < state_->size; ++i) { - if (state_->requests[i] && state_->requests[i]->id == r->id) { - reject("infer", r, Request::kBusy); - break; - } - } - } +template +void LlamaBatch::ProcessCancelRequests(std::vector& signals) +{ + int count = 0; + for (int i = 0; i < state_->size; ++i) { + const auto& r = state_->requests[i]; + if (r && r->is_canceled) { + ++count; + signals.push_back(Interrupt(i, true)); } - - drop_invalid(infer_reqs); + } + if (count) 
{ + check_cuda_error(cudaStreamSynchronize(stream_)); } } -template -auto LlamaBatch::ProcessStopRequests(const Requests& requests) -> std::vector +template +void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) { - NvtxScope scope("stop_request"); - std::vector signals; - int count = 0; - for (const auto& r : requests) { - int ec = Request::kFail; - // find matching active sequence - for (int i = 0; i < state_->size; ++i) { - // stop & optionally erase active sequence - if (state_->requests[i] && state_->requests[i]->id == r->id) { - ec = 0; - signals.push_back(Interrupt(i, true, r->session.end_flag)); - ++count; - break; - } - } - // mismatch, try erase inactive sequence, in this case there is no active request to interrupt - if (ec && r->session.end_flag) { - if (sequence_manager_->Erase(r->id)) { - ec = 0; - } - } - signals.push_back([=] { - if (rank_ == 0) { - if (r->cancel_cb) { - r->cancel_cb(ec); + for (auto& r : kill_reqs) { + if (r) { + int ec = r->ec; + if (!ec) { + if (!sequence_manager_->Erase(r->id)) { + ec = Request::kInvalid; } } - }); - } - if (count) { - check_cuda_error(cudaStreamSynchronize(stream_)); + signals.push_back([=] { + if (r->end_cb) { + r->end_cb(ec); + } + }); + } } - return signals; } template -void LlamaBatch::ProcessInferRequests(const Requests& requests) +void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) { NvtxScope scope("infer_request"); auto& state = *incoming_; @@ -251,35 +185,63 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) std::vector existing_idx; int idx = 0; - for (const auto& r : requests) { - FT_CHECK(!state.requests[idx]); + for (const auto& r : reqs) { if (rank_ == 0) { TM_LOG_INFO("[ProcessInferRequests] Request for %ld received.", (long)r->id); } - state.requests[idx] = r; + if (r->ec) { + signals.push_back([r] { UpdateState(*r, r->ec, 0); }); + continue; + } - // get sequence for the request - state.sequences[idx] = r->session.start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); - FT_CHECK(state.sequences[idx]); + const int input_length = r->inputs.at("input_ids").shape[0]; - auto& seq = *state.sequences[idx]; + if (input_length > session_len_) { + signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); + continue; + } - if (int step = r->session.step; step >= 0) { - if (step <= seq.tokens.size()) { - seq.tokens.resize(step); - seq.cache_len = std::min(seq.cache_len, step); - DropEmbeddings(seq); + auto ptr = r->session.start_flag ? 
sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); + if (!ptr) { + signals.push_back([r] { UpdateState(*r, Request::kInvalid, 0); }); + continue; + } + + const int step = [&] { + int s = r->session.step; + if (s < 0) { + s = ptr->tokens.size(); } - else if (rank_ == 0) { - TM_LOG_WARNING( - "[ProcessInferRequests] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id); + else if (s > ptr->tokens.size()) { + if (rank_ == 0) { + TM_LOG_WARNING("[ProcessInferRequests] Skipping invalid step (%d) setting for ID %lu", s, ptr->id); + } + s = ptr->tokens.size(); } + return s; + }(); + + if (step + input_length > session_len_) { + signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); + continue; + } + + FT_CHECK(!state.requests[idx]); + + state.requests[idx] = r; + state.sequences[idx] = ptr; + + auto& seq = *state.sequences[idx]; + + if (step < seq.tokens.size()) { + seq.tokens.resize(step); + seq.cache_len = std::min(seq.cache_len, step); + DropEmbeddings(seq); } - const int input_length = r->inputs.at("input_ids").shape[0]; - const int* input_ids = r->inputs.getPtr("input_ids"); + const int* input_ids = r->inputs.getPtr("input_ids"); { // `output_ids` contains all token ids of the sequences @@ -403,7 +365,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) if (r->session.start_flag) { // prepare to initialize random state for new sequence - h_random_seed_[idx] = r->inputs.getVal("random_seed", 0); + h_random_seed_[idx] = r->gen_cfg.random_seed; } else { // Recover device states if not a new sequence @@ -948,19 +910,9 @@ template LlamaBatch::~LlamaBatch() { TM_LOG_DEBUG("~LlamaBatch()"); - shared_state_->request_queue.close(); internal_thread_.join(); - if (output_thread_.joinable()) { - { - std::lock_guard lock{output_mutex_}; - output_stop_token_ = true; - } - output_cv_.notify_one(); - output_thread_.join(); - } - // The dtor maybe called from unknown thread, set device id before CUDA calls check_cuda_error(cudaSetDevice(device_id_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -977,8 +929,10 @@ LlamaBatch::LlamaBatch(const EngineParam& param, std::unique_ptr> model, // ! This is moved std::unique_ptr> ctx, // ! This is moved std::shared_ptr state, + std::shared_ptr gateway, int device_id): param_(param), + gateway_(gateway), shared_state_(state), max_batch_size_(param.max_batch_size), max_forward_token_num_(param.max_prefill_token_num + param.max_batch_size), @@ -1393,9 +1347,7 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector std::vector signals; { - NvtxScope _("stream_and_completion_signal"); - const float tok_per_tick = shared_state_->tok_per_tick.load(); - const int tpt = std::min(std::max(1, (int)std::round(tok_per_tick)), 8); + NvtxScope _("stream_and_completion_signal"); for (int i = 0; i < batch_size - g.partial; ++i) { if (state_->requests[i]) { auto& r = state_->requests[i]; @@ -1406,25 +1358,10 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } else if (r->stream_output && rank_ == 0) { const auto seq_len = r->outputs.getVal("sequence_length"); - if (true) { - // Create signals by copying the request handles for non-finished streaming requests - signals.push_back([this, r, seq_len] { - try { - auto new_state = new RequestState{Request::kOk, seq_len}; - auto old_state = r->state->exchange(new_state); - if (!old_state) { - r->forward_cb(); - } - } - catch (const std::bad_function_call& e) { - TM_LOG_ERROR("Null stream callback for (%s)", std::to_string(r->id).c_str()); - } - catch (...) 
{ - TM_LOG_ERROR("Unknown exception invoking stream callback for (%s)", - std::to_string(r->id).c_str()); - } - }); - } + // Create signals by copying the request handles for non-finished streaming requests + signals.push_back([this, r, seq_len] { // + UpdateState(*r, Request::kOk, seq_len); + }); } } } @@ -1498,14 +1435,8 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig const auto len = state_->requests[index]->outputs.getVal("sequence_length"); // move the request handle into the signal - return [this, ec, len, r = std::move(state_->requests[index])] { - if (rank_ == 0) { - auto new_state = new RequestState{Request::kFinish, len}; - auto old_state = r->state->exchange(new_state); - if (!old_state) { - r->forward_cb(); - } - } + return [this, len, r = std::move(state_->requests[index])] { // + UpdateState(*r, Request::kFinish, len); }; } @@ -1518,33 +1449,27 @@ void LlamaBatch::InternalThreadEntry() // Initialize `AnomalyHandler` AnomalyHandler::instance().Init(rank_, model_->vocab_size_padded_, model_->end_id_, max_batch_size_, stream_); - auto& request_queue = shared_state_->request_queue; - auto& infer_requests = shared_state_->infer_requests; - auto& stop_requests = shared_state_->stop_requests; + // auto& request_queue = shared_state_->request_queue; + auto& infer_reqs = shared_state_->infer_reqs; + auto& kill_reqs = shared_state_->kill_reqs; GenerationState g{}; - constexpr int request_interval = 1; - long request_counter = 0; - while (1) { + if (rank_ == 0) { const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; const bool is_empty = (free_slot_count == max_batch_size_); - stop_requests.clear(); - infer_requests.clear(); - if (is_empty || request_counter % request_interval == 0) { - // Block if batch is empty - request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state_->abort); - if (!shared_state_->abort) { - RejectInvalidRequests(stop_requests, infer_requests); - } - } + // Block if batch is empty + gateway_->pop(infer_reqs, kill_reqs, free_slot_count, is_empty, shared_state_->abort); + // Mark reqs to the same session_id as invalid (which are dangerous to the engine) + MarkConflictRequests(infer_reqs, kill_reqs); } NvtxScope scope("mainloop"); - // wait while rank-0 is dequeueing + // 1. Wait while rank-0 is dequeueing + // 2. Broadcast `ec` from rank-0 shared_state_->barrier->wait(); if (shared_state_->abort) { @@ -1552,15 +1477,27 @@ void LlamaBatch::InternalThreadEntry() return; } - auto signals = ProcessStopRequests(stop_requests); + std::vector signals; + + ProcessKillRequests(kill_reqs, signals); // Shared `priority` field will be assigned by rank-0 - ProcessInferRequests(infer_requests); + ProcessInferRequests(infer_reqs, signals); - // Wait while shared `requests` is being used + // is_canceled <- cancel_flag.load() + if (rank_ == 0) { + BroadcastCancelFlags(); + } + + // 1. Wait while shared `requests` is being used + // 2. 
Broadcast modifcations from rank-0 shared_state_->barrier->wait(); - SendSignals(std::move(signals)); + ProcessCancelRequests(signals); + + if (rank_ == 0) { + gateway_->notify(std::move(signals)); + } Initialize(g); @@ -1575,67 +1512,22 @@ void LlamaBatch::InternalThreadEntry() // resources shared_state_->barrier->wait(); } - SendSignals(std::move(signals)); + if (rank_ == 0) { + gateway_->notify(std::move(signals)); + } } } - - ++request_counter; } + // Unreachable FT_CHECK(0); } -template -void LlamaBatch::SendSignals(std::vector signals) -{ - if (rank_ != 0 || signals.empty()) { - return; - } - { - std::lock_guard lock{output_mutex_}; - output_signals_.insert(output_signals_.end(), // - std::move_iterator{signals.begin()}, - std::move_iterator{signals.end()}); - } - output_cv_.notify_one(); -} - template void LlamaBatch::Start() { TM_LOG_INFO("LlamaBatch::Start()"); internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this); - if (rank_ == 0) { - output_thread_ = std::thread(&LlamaBatch::OutputThreadEntry, this); - } -} - -template -void LlamaBatch::OutputThreadEntry() -{ - while (true) { - std::vector signals; - { - // Wait for signals to come - std::unique_lock lock(output_mutex_); - output_cv_.wait(lock, [&] { return !output_signals_.empty() || output_stop_token_; }); - if (output_stop_token_) { - TM_LOG_INFO("[OutputThreadEntry] stop requested."); - return; - } - signals = std::move(output_signals_); - } - if (rank_ == 0 && ffi_lock_) { - ffi_lock_(1); - } - // send all bufferred signals - for (const auto& s : signals) { - s(); - } - if (rank_ == 0 && ffi_lock_) { - ffi_lock_(0); - } - } } template diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index b0394bdcbe..e2ffc0a230 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -2,33 +2,26 @@ #pragma once -#include -#include -#include -#include - #include +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request.h" + #include "src/turbomind/models/llama/Barrier.h" -#include "src/turbomind/models/llama/LlamaNcclGuard.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" + #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -using ffi_api_lock_ctrl_t = std::function; - namespace turbomind { struct SharedState { - std::vector> infer_requests; - std::vector> stop_requests; - RequestQueue request_queue; - std::atomic tok_per_tick{1}; + std::vector> infer_reqs; + std::vector> kill_reqs; std::shared_ptr barrier; bool abort; std::atomic free_size{std::numeric_limits::max()}; @@ -87,11 +80,11 @@ class LlamaBatch { using Requests = std::vector>; using Signal = std::function; - void RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs); + void MarkConflictRequests(Requests& infer_reqs, Requests& kill_reqs); - [[nodiscard]] auto ProcessStopRequests(const Requests& requests) -> std::vector; + void ProcessKillRequests(const Requests& reqs, std::vector& signals); - void ProcessInferRequests(const Requests& requests); + void ProcessInferRequests(const Requests& reqs, std::vector& signals); int AdjustMaxInputCount(GenerationState& g, const std::vector& sequences, @@ -116,17 +109,13 @@ class LlamaBatch { std::unique_ptr> 
model, std::unique_ptr> ctx, std::shared_ptr state, + std::shared_ptr gateway, int device_id); ~LlamaBatch(); void Start(); - void set_ffi_lock(ffi_api_lock_ctrl_t func) - { - ffi_lock_ = func; - } - LlamaV2& model() noexcept { return *model_; @@ -140,14 +129,16 @@ class LlamaBatch { void tune(); private: + void BroadcastCancelFlags(); + + void ProcessCancelRequests(std::vector& signals); + void InternalThreadEntry(); void OutputThreadEntry(); void CopyState(const std::vector>& desc); - void SendSignals(std::vector signals); - // analogs to `std::copy_n` template U* Copy(const U* src, size_t count, U* dst) @@ -207,6 +198,7 @@ class LlamaBatch { private: const EngineParam param_; + const std::shared_ptr gateway_; const std::shared_ptr shared_state_; const int max_batch_size_; @@ -319,14 +311,6 @@ class LlamaBatch { std::thread internal_thread_; - // async stream callback utils - std::thread output_thread_; - std::mutex output_mutex_; - std::condition_variable output_cv_; - std::vector output_signals_; - bool output_stop_token_{false}; - ffi_api_lock_ctrl_t ffi_lock_; - int* h_output_ids_{}; }; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 05b22deed5..6018ac5819 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -20,35 +20,31 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc -#include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/kernels/attention/attention_params.h" -#include "src/turbomind/kernels/decoding_kernels.h" -#include "src/turbomind/kernels/gemm/tuner/params.h" -#include "src/turbomind/kernels/gpt_kernels.h" + +#include +#include + #include "src/turbomind/macro.h" + +#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaWeight.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/unified_decoder.h" + +#include "src/turbomind/kernels/decoding_kernels.h" +#include "src/turbomind/kernels/gpt_kernels.h" + #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/memory_utils.h" -#include "src/turbomind/utils/monotonic.h" -#include -#include -#include -#include -#include -#include -#include namespace turbomind { diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index a0d35b887f..8101310812 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -21,14 +21,9 @@ #pragma once -#include -#include - #include "src/turbomind/layers/DynamicDecodeLayer.h" -#include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaWeight.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/unified_decoder.h" diff --git 
a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 2d316a114e..6ac820ce43 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -12,10 +12,9 @@ #include #include -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/model_request.h" +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" @@ -281,6 +280,26 @@ static void safe_memcpy(void* dst, const void* src, size_t size) } } +namespace { + +struct ScopedGIL { + ScopedGIL(const ScopedGIL&) = delete; + ScopedGIL& operator=(const ScopedGIL&) = delete; + ScopedGIL(ScopedGIL&&) = delete; + ScopedGIL& operator=(ScopedGIL&&) = delete; + ScopedGIL() + { + state = PyGILState_Ensure(); + } + ~ScopedGIL() + { + PyGILState_Release(state); + } + PyGILState_STATE state; +}; + +} // namespace + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -292,25 +311,25 @@ PYBIND11_MODULE(_turbomind, m) (void)py::class_>(m, "AbstractCustomComm"); py::class_(m, "SessionParam") - .def(py::init([](uint64_t id, int step, bool start, bool end, bool stop) { + .def(py::init([](uint64_t id, int step, bool start, bool end) { + if (!start && end) { + throw std::logic_error("unsupported arguments: start=false, end=true"); + } ft::SessionParam param{}; param.id = id; param.step = step; param.start_flag = start; param.end_flag = end; - param.stop_flag = stop; return param; }), "id"_a, "step"_a, "start"_a, - "end"_a, - "stop"_a) + "end"_a) .def_readwrite("id", &ft::SessionParam::id) .def_readwrite("step", &ft::SessionParam::step) .def_readwrite("start", &ft::SessionParam::start_flag) - .def_readwrite("end", &ft::SessionParam::end_flag) - .def_readwrite("stop", &ft::SessionParam::stop_flag); + .def_readwrite("end", &ft::SessionParam::end_flag); py::class_(m, "GenerationConfig") .def(py::init()) @@ -480,23 +499,15 @@ PYBIND11_MODULE(_turbomind, m) "cb"_a) .def( "cancel", - [](ModelRequest* model_request, bool end, std::function cb) { - model_request->Cancel(end, std::move(cb)); // - }, - py::call_guard(), - "end"_a, - "cb"_a) + [](ModelRequest* model_request) { model_request->Cancel(); }, + py::call_guard()) .def( "end", [](ModelRequest* model_request, std::function cb) { model_request->End(std::move(cb)); // }, py::call_guard(), - "cb"_a) - .def( - "report_tokens_per_tick", - [](ModelRequest* model_request, int tok_per_tick) { model_request->ReportTokensPerTick(tok_per_tick); }, - "tokens_per_tick"_a); + "cb"_a); // transformer model using ft::AbstractTransformerModel; @@ -510,25 +521,19 @@ PYBIND11_MODULE(_turbomind, m) size_t pipeline_para_size, int enable_custom_all_reduce, std::string data_type) -> std::shared_ptr { - auto gil_control = [state = PyGILState_STATE{}](int op) mutable { - if (op) { - state = PyGILState_Ensure(); - } - else { - PyGILState_Release(state); - } + auto gil_factory = [] { // + // erase the type + return std::static_pointer_cast(std::make_shared()); }; if (data_type == "half" || data_type == "fp16" || data_type == "float16" || data_type == "int4") { auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; } else if 
(data_type == "bf16" || data_type == "bfloat16") { #ifdef ENABLE_BF16 auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; #else throw std::runtime_error("Error: turbomind has not been built with bf16 support."); @@ -537,8 +542,7 @@ PYBIND11_MODULE(_turbomind, m) else { #ifdef ENABLE_FP32 auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; #else throw std::runtime_error("Error: turbomind has not been built with fp32 support."); diff --git a/src/turbomind/triton_backend/CMakeLists.txt b/src/turbomind/triton_backend/CMakeLists.txt index d6aec06990..4311d9d9be 100644 --- a/src/turbomind/triton_backend/CMakeLists.txt +++ b/src/turbomind/triton_backend/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required (VERSION 3.18) project(tritonturbomindbackend LANGUAGES C CXX) -add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp model_request.cpp) +add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp) target_link_libraries(TransformerTritonBackend PUBLIC nccl_utils) set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/src/turbomind/triton_backend/llama/CMakeLists.txt b/src/turbomind/triton_backend/llama/CMakeLists.txt index 26c580714a..7e193f6677 100644 --- a/src/turbomind/triton_backend/llama/CMakeLists.txt +++ b/src/turbomind/triton_backend/llama/CMakeLists.txt @@ -19,7 +19,6 @@ cmake_minimum_required(VERSION 3.8) set(llama_triton_backend_files LlamaTritonModel.cc - LlamaTritonModelInstance.cc ) find_package(CUDAToolkit REQUIRED) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 6ae51b03eb..c045d7baf9 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -24,16 +24,16 @@ #include #include +#include "src/turbomind/engine/gateway.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/triton_backend/model_request.h" +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" namespace turbomind { @@ -61,56 +61,6 @@ static std::optional get_moe_method() return value; } -std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string config_file) -{ - YAML::Node reader; - try { - reader = YAML::Load(config_file); - } - catch (const YAML::Exception& e) { - std::cerr << "Error reading YAML config: " << e.what() << std::endl; - FT_CHECK(false); - } - - const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; - const std::string data_type = 
ft_instance_hyperparameter["data_type"].as(); - int tensor_para_size = ft_instance_hyperparameter["tensor_para_size"].as(); - std::string model_dir = ft_instance_hyperparameter["model_dir"].as(); - - if (data_type == "half" || data_type == "fp16" || data_type == "float16") { - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); - } - else if (data_type == "bf16" || data_type == "bfloat16") { -#ifdef ENABLE_BF16 - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); -#else - TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); - FT_CHECK(false); -#endif - } - else { -#ifdef ENABLE_FP32 - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); -#else - TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); - FT_CHECK(false); -#endif - } - return nullptr; -} - template std::map> getLoraPattern(std::string pattern, T (*func)(const std::string& s)) { @@ -208,6 +158,9 @@ template LlamaTritonModel::~LlamaTritonModel() { FT_CHECK(weights_.size() == engines_.size()); + + gateway_->shutdown(); + for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { // Set device id before destructing CUDA resources check_cuda_error(cudaSetDevice(device_id)); @@ -217,11 +170,12 @@ LlamaTritonModel::~LlamaTritonModel() } template -LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, - size_t pipeline_para_size, - int enable_custom_all_reduce, - std::string model_dir, - std::string config): +LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), weights_(getDeviceCount()), @@ -334,6 +288,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, shared_state_ = std::make_shared(); shared_state_->barrier = std::make_shared(tensor_para_size); + gateway_ = std::make_shared(ffi_ctx_factory); + const auto device_count = getDeviceCount(); engines_.resize(device_count); @@ -399,6 +355,7 @@ LlamaTritonModel::createSharedModelInstance(int std::move(model), std::move(ctx), shared_state_, + gateway_, device_id); // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang @@ -417,10 +374,7 @@ std::unique_ptr LlamaTritonModel::createModelInstance(int devic FT_CHECK(engines_[device_id] != nullptr); - return std::make_unique(&shared_state_->request_queue, - &shared_state_->tok_per_tick, - engine_param_.session_len, - model_param_.vocab_size); + return std::make_unique(gateway_.get(), engine_param_.session_len, model_param_.vocab_size); } template @@ -478,7 +432,6 @@ void LlamaTritonModel::createEngine(int { auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); - engine->set_ffi_lock(ffi_lock_); engine->tune(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 24c252bae6..21b124e5a8 100644 --- 
a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -20,26 +20,25 @@ #pragma once +#include "src/turbomind/engine/gateway.h" #include "src/turbomind/models/llama/LlamaBatch.h" -#include "src/turbomind/models/llama/LlamaV2.h" +#include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/custom_ar_comm.h" #include "src/turbomind/utils/nccl_utils.h" #include -#include namespace turbomind { template struct LlamaTritonModel: public AbstractTransformerModel { - LlamaTritonModel(size_t tensor_para_size, - size_t pipeline_para_size, - int enable_custom_all_reduce, - std::string model_dir, - std::string config = ""); + LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory); ~LlamaTritonModel() override; @@ -61,11 +60,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { void handleMissingParams(); - void set_ffi_lock(ffi_api_lock_ctrl_t func) - { - ffi_lock_ = func; - } - std::string toString() override; int getTensorParaSize() override; int getPipelineParaSize() override; @@ -86,6 +80,8 @@ struct LlamaTritonModel: public AbstractTransformerModel { size_t pipeline_para_size_; std::shared_ptr shared_state_; + std::shared_ptr gateway_; + // Weights & engine instances for the ranks std::vector>> weights_; std::vector>> engines_; @@ -95,8 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { std::string model_name_; std::string model_dir_; - - ffi_api_lock_ctrl_t ffi_lock_ = nullptr; }; } // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc deleted file mode 100644 index 4e2b29d765..0000000000 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h - -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" -#include "src/turbomind/macro.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/constant.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -#if 0 - -template -void triton_stream_callback(std::unordered_map* outputs, void* ctx) -{ - LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); - model->stream_cb_(std::make_shared>(*outputs), model->stream_ctx_); -} - -template -LlamaTritonModelInstance::LlamaTritonModelInstance(Engine& instance, - std::unique_ptr> allocator, - int device_id): - device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) -{ -} - -template -std::string format_vector(const std::vector& vec) -{ - std::stringstream ss; - ss << "["; - bool first = true; - for (const auto& x : vec) { - ss << (first ? "" : ", ") << x; - first = false; - } - ss << "]"; - return ss.str(); -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> inputs) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - // In some cases, this is needed to trigger the creation of CUDA context, or later `cudaMallocAsync` will die - check_cuda_error(cudaSetDevice(device_id_)); - - FT_CHECK_WITH_INFO(inputs->at("input_ids").shape.size() == 2, "inputs->at(\"input_ids\").shape.size() == 2"); - FT_CHECK_WITH_INFO(inputs->at("input_lengths").shape.size() == 1, - "inputs->at(\"input_lengths\").shape.size() == 1"); - - const uint32_t request_batch_size = inputs->at("input_ids").shape[0]; - const uint32_t max_request_output_len = (size_t)*std::max_element((int*)inputs->at("request_output_len").data, - (int*)inputs->at("request_output_len").data - + inputs->at("request_output_len").shape[0]); - // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; - const uint32_t beam_width = inputs->count("beam_width") ? 
(size_t)(*(uint*)inputs->at("beam_width").data) : 1; - FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented"); - - h_total_output_lengths_ = - (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - - const size_t max_input_len = inputs->at("input_ids").shape[1]; - const bool is_return_logits = inputs->count("is_return_logits") && *(bool*)inputs->at("is_return_logits").data; - - const size_t vocab_size = instance_->model().vocab_size(); - - allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); - - std::unordered_map outputs{ - {"output_ids", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, - d_output_ids_}}, - {"sequence_length", - Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths_}}}; - - if (inputs->count("is_return_log_probs") && *((bool*)inputs->at("is_return_log_probs").data)) { - outputs.insert({"output_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - std::vector{request_batch_size, beam_width, max_request_output_len}, - d_output_log_probs_}}); - outputs.insert( - {"cum_log_probs", - Tensor{MEMORY_GPU, TYPE_FP32, std::vector{request_batch_size, beam_width}, d_cum_log_probs_}}); - } - - if (inputs->count("logprobs")) { - size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; - h_logprob_vals_ = (float*)std::realloc( - h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); - h_logprob_indexes_ = (uint32_t*)std::realloc( - h_logprob_indexes_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); - h_logprob_nums_ = (uint32_t*)std::realloc( - h_logprob_nums_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length); - - outputs.insert({{"logprob_vals", - Tensor{MEMORY_CPU, - TYPE_FP32, - std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, - h_logprob_vals_}}}); - - outputs.insert({{"logprob_indexes", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, - h_logprob_indexes_}}}); - - outputs.insert({{"logprob_nums", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length}, - h_logprob_nums_}}}); - } - - if (is_return_logits) { - outputs.insert( - {{"logits", {MEMORY_GPU, TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}}); - } - - try { - Request::Callback callback; - - if (stream_cb_) { - callback = [this](std::unordered_map* outputs) { - triton_stream_callback(outputs, this); - }; - } - - check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); - - instance_->Submit(&outputs, inputs.get(), {callback}); - // ! stream synced by the model before returning - } - catch (...) 
{ - h_exception_ = std::current_exception(); - outputs.insert({"error_message", Tensor{MEMORY_CPU, TYPE_BYTES, {1}, &h_exception_}}); - } - - return std::make_shared>(std::move(outputs)); -} - -template -LlamaTritonModelInstance::~LlamaTritonModelInstance() -{ - freeBuffer(); -} - -template -void LlamaTritonModelInstance::allocateBuffer(const size_t request_batch_size, - const size_t max_input_len, - const size_t beam_width, - const size_t session_len, - const bool is_return_logits) -{ - d_output_ids_ = (int*)std::realloc(d_output_ids_, sizeof(int) * request_batch_size * beam_width * session_len); - d_sequence_lengths_ = (int*)std::realloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width); - - if (is_return_logits) { - d_output_logits_ = (float*)allocator_->reMalloc(d_output_logits_, - sizeof(float) * request_batch_size * max_input_len - * instance_->model().vocab_size(), - false); - } -} - -template -void LlamaTritonModelInstance::freeBuffer() -{ - std::free(d_output_ids_); - std::free(d_sequence_lengths_); - allocator_->free((void**)(&d_output_log_probs_)); - allocator_->free((void**)(&d_cum_log_probs_)); - std::free(h_total_output_lengths_); - std::free(h_logprob_vals_); - std::free(h_logprob_indexes_); - std::free(h_logprob_nums_); -} - -#ifdef ENABLE_FP32 -template struct LlamaTritonModelInstance; -#endif -template struct LlamaTritonModelInstance; -#ifdef ENABLE_BF16 -template struct LlamaTritonModelInstance<__nv_bfloat16>; -#endif - -#endif - - - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h deleted file mode 100644 index 38b1ade7f0..0000000000 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h - -#pragma once - -#include -#include - -#include "src/turbomind/models/llama/LlamaBatch.h" -#include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -// #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" - -namespace turbomind { - -#if 0 -template -struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - - LlamaTritonModelInstance(Engine& instance, - std::unique_ptr> allocator, - int device_id); - ~LlamaTritonModelInstance() override; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; - -private: - Engine* instance_; - const std::unique_ptr> allocator_; - - void allocateBuffer(const size_t request_batch_size, - const size_t max_input_len, - const size_t beam_width, - const size_t session_len, - const bool is_return_logits); - void freeBuffer(); - - int device_id_; - - int* d_input_ids_ = nullptr; - int* d_input_lengths_ = nullptr; - int* d_input_bad_words_ = nullptr; - int* d_input_stop_words_ = nullptr; - int* d_request_prompt_lengths_ = nullptr; - T* d_request_prompt_embedding_ = nullptr; - float* d_top_p_decay_ = nullptr; - float* d_top_p_min_ = nullptr; - int* d_top_p_reset_ids_ = nullptr; - - int* d_output_ids_ = nullptr; - int* d_sequence_lengths_ = nullptr; - float* d_output_log_probs_ = nullptr; - float* d_cum_log_probs_ = nullptr; - float* d_output_logits_ = nullptr; - - float* h_logprob_vals_ = nullptr; - uint32_t* h_logprob_indexes_ = nullptr; - uint32_t* h_logprob_nums_ = nullptr; - - uint32_t* h_total_output_lengths_ = nullptr; - std::exception_ptr h_exception_ = nullptr; -}; - -#endif - - - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index c283568ab7..7e1e235160 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -32,7 +32,7 @@ #include "src/turbomind/utils/custom_ar_comm.h" #include "src/turbomind/utils/nccl_utils.h" -#include "src/turbomind/triton_backend/model_request.h" +#include "src/turbomind/engine/model_request.h" namespace turbomind { @@ -64,7 +64,6 @@ struct AbstractTransformerModelInstance { }; struct AbstractTransformerModel { - static std::shared_ptr createLlamaModel(std::string model_dir); virtual ~AbstractTransformerModel() = default; From 9812538d95f3a6f095a02375a2054d82b2a79c1d Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sat, 21 Dec 2024 20:04:01 +0800 Subject: [PATCH 06/40] optimize throughput --- lmdeploy/messages.py | 2 +- lmdeploy/turbomind/turbomind.py | 16 +- src/turbomind/engine/model_request.cc | 5 +- src/turbomind/engine/request.h | 4 + src/turbomind/kernels/gpt_kernels.cu | 57 +++++ src/turbomind/kernels/gpt_kernels.h | 15 ++ .../kernels/sampling_penalty_kernels.cu | 77 +++++++ .../kernels/sampling_penalty_kernels.h | 8 + .../sampling_layers/LogitsProcessorLayer.cc | 2 +- src/turbomind/models/llama/LlamaBatch.cc | 206 +++++++++++------- src/turbomind/models/llama/LlamaBatch.h | 6 +- 11 files changed, 309 insertions(+), 89 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 90823598ea..813a6acd21 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -124,7 +124,7 @@ def __post_init__(self): """Check input validation.""" assert type( 
self.n) == int and self.n > 0, 'n is not a positive integer' - assert self.top_p > 0 and self.top_p <= 1 # (0, 1] + assert self.top_p >= 0 and self.top_p <= 1 # [0, 1] assert self.top_k >= 0, 'top_k can not be a negative integer' assert self.temperature >= 0 and self.temperature <= 2 # [0,2] assert 0 <= self.min_p <= 1, \ diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index e33c54c718..baa2757432 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -347,6 +347,9 @@ def __init__(self, self.config = config + self.cond = None + self.done_event = None + def _create_model_instance(self, device_id): model_inst = self.tm_model.model_comm.create_model_instance(device_id) return model_inst @@ -414,7 +417,7 @@ def cancel(self, session_id: int, blocking: bool = True): if blocking: self.done_event.wait() - async def async_cancel(self, session_id: int, blocking: bool = True): + async def async_cancel(self, session_id: int, blocking: bool = False): """End the given session.""" if not self.is_canceled: self.model_inst.cancel() @@ -549,8 +552,14 @@ async def async_stream_infer(self, kwargs (dict): kwargs for backward compatibility """ self.event_loop = asyncio.get_running_loop() - self.cond = asyncio.Condition() - self.done_event = asyncio.Event() + + if self.done_event is not None: + await self.done_event.wait() + self.done_event.clear() + else: + self.cond = asyncio.Condition() + self.done_event = asyncio.Event() + self.is_canceled = False self.flag = 0 @@ -633,7 +642,6 @@ async def async_stream_infer(self, self.flag = 0 state = shared_state.consume() self.done_event.set() - self.cond = None self.event_loop = None def _get_error_output(self): diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index 710513f079..cd95f788f7 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -9,8 +9,8 @@ #include #include -#include "src/turbomind/engine/request.h" #include "src/turbomind/engine/model_request.h" +#include "src/turbomind/engine/request.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/constant.h" #include "src/turbomind/utils/cuda_utils.h" @@ -148,6 +148,9 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output r->forward_cb = std::move(cb); r->state = state; + r->output_ids = *outputs_->at("output_ids"); + r->sequence_length = *outputs_->at("sequence_length"); + // Keep a weak reference for canceling the request request_ = r; diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index aa2cba14d6..e0af543185 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -103,6 +103,10 @@ struct Request { int ec; + // fast path for accessing common output buffers + Tensor output_ids; + Tensor sequence_length; + enum { kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set diff --git a/src/turbomind/kernels/gpt_kernels.cu b/src/turbomind/kernels/gpt_kernels.cu index 4f47631fa5..a0c47fff09 100644 --- a/src/turbomind/kernels/gpt_kernels.cu +++ b/src/turbomind/kernels/gpt_kernels.cu @@ -269,4 +269,61 @@ void invokeTransposeAxis01( template void invokeTransposeAxis01( int* out, int* in, const int* in_skipping_dim1, const int dim0, const int dim1, cudaStream_t stream); +template +__global__ void transpose_2d_kernel(T* __restrict__ dst, const T* __restrict__ src, int rows, int cols, bool swap_xy) +{ + __shared__ T smem[TILE_DIM][TILE_DIM + 
1]; + + const int block_idx_x = swap_xy ? blockIdx.y : blockIdx.x; + const int block_idx_y = swap_xy ? blockIdx.x : blockIdx.y; + + { + const int j = block_idx_x * TILE_DIM + threadIdx.x; + const int i = block_idx_y * TILE_DIM + threadIdx.y; + +#pragma unroll + for (int y = 0; y < TILE_DIM; y += BLOCK_ROWS) { + if (i + y < rows && j < cols) { + smem[threadIdx.y + y][threadIdx.x] = src[(i + y) * cols + j]; + } + } + } + + __syncthreads(); + + { + const int j = block_idx_y * TILE_DIM + threadIdx.x; + const int i = block_idx_x * TILE_DIM + threadIdx.y; + +#pragma unroll + for (int y = 0; y < TILE_DIM; y += BLOCK_ROWS) { + if (i + y < cols && j < rows) { + dst[(i + y) * rows + j] = smem[threadIdx.x][threadIdx.y + y]; + } + } + } +} + +template +void invokeTranspose2D_(T* dst, const T* src, int rows, int cols, cudaStream_t st) +{ + constexpr int TILE_DIM = 32; // warp size + constexpr int BLOCK_ROWS = 8; + + const dim3 block(TILE_DIM, BLOCK_ROWS); + + dim3 grid((cols + TILE_DIM - 1) / TILE_DIM, // + (rows + TILE_DIM - 1) / TILE_DIM); + bool swap_xy = false; + + if (grid.y > 65535) { // max dim for grid.y + std::swap(grid.x, grid.y); + swap_xy = true; + } + + transpose_2d_kernel<<>>(dst, src, rows, cols, swap_xy); +} + +template void invokeTranspose2D_(uint32_t*, const uint32_t*, int, int, cudaStream_t); + } // namespace turbomind diff --git a/src/turbomind/kernels/gpt_kernels.h b/src/turbomind/kernels/gpt_kernels.h index 4e1dc49be8..a351473332 100644 --- a/src/turbomind/kernels/gpt_kernels.h +++ b/src/turbomind/kernels/gpt_kernels.h @@ -238,4 +238,19 @@ void invokeSumLengthDimension(float* out_buf, const size_t hidden_dim, cudaStream_t stream = 0); +template +void invokeTranspose2D_(T* dst, const T* src, int rows, int cols, cudaStream_t st); + +template +void invokeTranspose2D(T* dst, const T* src, int rows, int cols, cudaStream_t st) +{ + if constexpr (sizeof(T) == 4) { + // FT_CHECK(0); + invokeTranspose2D_((uint32_t*)dst, (const uint32_t*)src, rows, cols, st); + } + else { + FT_CHECK(0); + } +} + } // namespace turbomind diff --git a/src/turbomind/kernels/sampling_penalty_kernels.cu b/src/turbomind/kernels/sampling_penalty_kernels.cu index 1d4cfe24b0..cf360580b9 100644 --- a/src/turbomind/kernels/sampling_penalty_kernels.cu +++ b/src/turbomind/kernels/sampling_penalty_kernels.cu @@ -17,6 +17,8 @@ #include #include +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h" namespace turbomind { @@ -221,6 +223,81 @@ template void invokeBatchApplyTemperaturePenalty(half* logits, const int vocab_size_padd, cudaStream_t stream); #endif + +template +__global__ void batchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padded) +{ + const int vi = blockIdx.x * blockDim.x + threadIdx.x; + const int bi = blockIdx.y; + + __shared__ float shared_scale; + + if (threadIdx.x == 0) { + shared_scale = fdividef(1.f, temperatures[bi] + 1e-6f); + } + + __syncthreads(); + + const float scale = shared_scale; + + logits += (size_t)bi * vocab_size_padded; + + const int step = gridDim.x * blockDim.x * vec_size; + + for (int i = vi * vec_size; i < vocab_size_padded; i += step) { + Array vec; + Load(vec, logits + i); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + if (i + c < vocab_size) { + vec[c] *= scale; + } + else { + vec[c] = -FLT_MAX; + } + } + Store(logits + i, vec); + } +} + +void 
invokeBatchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padded, + cudaStream_t stream) +{ + + auto invoke = [&](auto vec_size) { + constexpr int threads = 256; + const int blocks_per_tok = (vocab_size_padded + threads * vec_size - 1) / (threads * vec_size); + const dim3 blocks(blocks_per_tok, batch_size); + batchApplyTemperaturePenalty_v2<<>>( // + logits, + bias, + temperatures, + batch_size, + vocab_size, + vocab_size_padded); + }; + + if (vocab_size_padded % 4 == 0) { + invoke(std::integral_constant{}); + } + else if (vocab_size_padded % 2 == 0) { + invoke(std::integral_constant{}); + } + else { + invoke(std::integral_constant{}); + } +} + template __global__ void applyRepetitionPenalty(T* logits, const float penalty, diff --git a/src/turbomind/kernels/sampling_penalty_kernels.h b/src/turbomind/kernels/sampling_penalty_kernels.h index e12698cdf7..1f26b7d352 100644 --- a/src/turbomind/kernels/sampling_penalty_kernels.h +++ b/src/turbomind/kernels/sampling_penalty_kernels.h @@ -69,6 +69,14 @@ void invokeBatchApplyTemperaturePenalty(T* logits, const int vocab_size_padd, cudaStream_t stream); +void invokeBatchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padd, + cudaStream_t stream); + template void invokeMinLengthPenalty(T* logits, const int* min_lengths, diff --git a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc index b588d8b6f5..c458998031 100644 --- a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc +++ b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc @@ -178,7 +178,7 @@ void LogitsProcessorLayer::forward(TensorMap* output_tensors, TensorMap* inpu // temperature { if (!ALL_OF(temperature_.begin(), batch_size, float, 1.f)) { - invokeBatchApplyTemperaturePenalty( + invokeBatchApplyTemperaturePenalty_v2( logits, (T*)nullptr, temperature_buf_, batch_size, args_.vocab_size, args_.vocab_size_padded, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 0f8a0082f5..343baf21e5 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -93,8 +93,10 @@ void DropEmbeddings(const Sequence& seq) } template -void LlamaBatch::MarkConflictRequests(Requests& infer_reqs, Requests& kill_reqs) +void LlamaBatch::DisableConflictRequests(Requests& infer_reqs, Requests& kill_reqs) { + NvtxScope _("disable conflict"); + std::pmr::monotonic_buffer_resource mbr; std::pmr::unordered_map occur(&mbr); @@ -146,6 +148,8 @@ void LlamaBatch::ProcessCancelRequests(std::vector& signals) if (r && r->is_canceled) { ++count; signals.push_back(Interrupt(i, true)); + // Interrupt should reset r + FT_CHECK(!r); } } if (count) { @@ -236,6 +240,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector::ProcessInferRequests(const Requests& reqs, std::vectoroutput_ids.getPtr(); // copy history tokens if (!seq.tokens.empty()) { - output_ids = Copy(seq.tokens.data(), seq.tokens.size(), output_ids); + d_output_ids = Copy(seq.tokens.data(), seq.tokens.size(), d_output_ids); + h_output_ids = std::copy_n(seq.tokens.data(), seq.tokens.size(), h_output_ids); } // copy input tokens if (input_length) { - output_ids = Copy(input_ids, input_length, output_ids); + 
d_output_ids = Copy(input_ids, input_length, d_output_ids); + h_output_ids = std::copy_n(input_ids, input_length, h_output_ids); } // total context length (history + input) - state.h_prompt_length[idx] = output_ids - output_ids_base; - state.h_context_length[idx] = output_ids - output_ids_base; + state.h_prompt_length[idx] = d_output_ids - output_ids_base; + state.h_context_length[idx] = d_output_ids - output_ids_base; state.h_finished[idx] = false; } @@ -1029,7 +1037,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) sync_check_cuda_error(); Clear(token_ids_buf_, batch_size * session_len_); - invokeTransposeAxis01(token_ids_buf_, state_->output_ids, batch_size, session_len_, 1, stream_); + invokeTranspose2D(token_ids_buf_, state_->output_ids, batch_size, session_len_, stream_); sync_check_cuda_error(); // token_ids_buf_[s, b] @@ -1247,11 +1255,13 @@ void LlamaBatch::OutputContextLogits(T* cont } template -auto LlamaBatch::Finish(GenerationState& g) -> std::vector +void LlamaBatch::Finish(GenerationState& g, std::vector& signals) { NvtxScope scope("Finish"); const int batch_size = state_->active_size; + signals.reserve(batch_size); + if (batch_size - g.partial) { FT_CHECK(g.step >= 0); @@ -1267,13 +1277,22 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector sync_check_cuda_error(); } - Copy(state_->output_ids, batch_size * session_len_, h_output_ids_); + Copy(token_ids_buf_ + (g.step - 1) * (batch_size - g.partial), batch_size - g.partial, h_output_ids_); Copy(finished_buf_, batch_size, state_->h_finished); Copy(sequence_lengths_, batch_size, state_->h_context_length); - Copy(sampled_logprobs_, batch_size * kMaxLogProb, h_sampled_logprobs_); - Copy(sampled_indexes_, batch_size * kMaxLogProb, h_sampled_indexes_); - Copy(sampled_nums_, batch_size, h_sampled_nums_); + bool output_logprobs = false; + for (int i = 0; i < batch_size - g.partial; ++i) { + if (state_->requests[i]->gen_cfg.output_logprobs) { + output_logprobs = true; + break; + } + } + if (output_logprobs) { + Copy(sampled_logprobs_, batch_size * kMaxLogProb, h_sampled_logprobs_); + Copy(sampled_indexes_, batch_size * kMaxLogProb, h_sampled_indexes_); + Copy(sampled_nums_, batch_size, h_sampled_nums_); + } check_cuda_error(cudaStreamSynchronize(stream_)); @@ -1284,13 +1303,14 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } // ! Only rank-0 writes to output - if (rank_ == 0) { + if (rank_ == 0 && output_logprobs) { + NvtxScope scope("logprobs"); // output logprobs, should be set before sequence_length float* sampled_logprobs_ptr = h_sampled_logprobs_; uint32_t* sampled_indexes_ptr = h_sampled_indexes_; uint32_t* sampled_nums_ptr = h_sampled_nums_; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && state_->requests[i]->inputs.isExist("logprobs")) { + if (state_->requests[i] && state_->requests[i]->gen_cfg.output_logprobs) { auto logprob_vals = state_->requests[i]->outputs.getPtr("logprob_vals"); auto logprob_indexes = state_->requests[i]->outputs.getPtr("logprob_indexes"); auto logprob_nums = state_->requests[i]->outputs.getPtr("logprob_nums"); @@ -1312,19 +1332,37 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector // ! 
Only rank-0 writes to output if (rank_ == 0) { - // set output tokens ids and sequence length - int* output_ptr = h_output_ids_; - for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && (state_->requests[i]->stream_output || state_->h_finished[i])) { - auto output_ids = state_->requests[i]->outputs.getPtr("output_ids"); - auto output_len = state_->requests[i]->outputs.getPtr("sequence_length"); - const int count = state_->h_context_length[i]; - FT_CHECK(state_->requests[i]->outputs.at("output_ids").shape[0] >= count); - // TODO: sync history output tokens at when receiving the request and copy the last token here - std::copy(output_ptr, output_ptr + count, output_ids); - *output_len = count; + NvtxScope scope("output_ids"); + if constexpr (0) { + // set output tokens ids and sequence length + int* output_ptr = h_output_ids_; + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = static_cast(r->output_ids.data); + auto output_len = static_cast(r->sequence_length.data); + const int count = state_->h_context_length[i]; + if (r->stream_output) { + output_ids[count - 1] = output_ptr[count - 1]; + *output_len = count; + } + else if (state_->h_finished[i]) { + std::copy(output_ptr, output_ptr + count, output_ids); + *output_len = count; + } + } + output_ptr += session_len_; + } + } + else { + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = static_cast(r->output_ids.data); + auto output_len = static_cast(r->sequence_length.data); + const int count = state_->h_context_length[i]; + output_ids[count - 1] = h_output_ids_[i]; + *output_len = count; + } } - output_ptr += session_len_; } } @@ -1345,39 +1383,53 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } } - std::vector signals; { - NvtxScope _("stream_and_completion_signal"); + NvtxScope _("count and sync"); + bool need_sync = false; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i]) { - auto& r = state_->requests[i]; - if (state_->h_finished[i]) { - // Interrupt finished sequences and move the request handle into the signal closure - signals.push_back(Interrupt(i)); - ++g.finished_count; - } - else if (r->stream_output && rank_ == 0) { - const auto seq_len = r->outputs.getVal("sequence_length"); - // Create signals by copying the request handles for non-finished streaming requests - signals.push_back([this, r, seq_len] { // - UpdateState(*r, Request::kOk, seq_len); - }); + if (state_->h_finished[i]) { + ++g.finished_count; + if (!state_->requests[i]->session.end_flag) { + need_sync = true; } } } - if (g.finished_count) { - // synchronize for interrupted sequences - check_cuda_error(cudaStreamSynchronize(stream_)); + if (need_sync) { + // Release updates on request output buffers to all ranks (`Interrupt` will use it) + shared_state_->barrier->wait(); + } + } + + { + NvtxScope _("stream_and_completion_signal"); + for (int i = 0; i < batch_size - g.partial; ++i) { + auto& r = state_->requests[i]; + if (state_->h_finished[i]) { + // Interrupt finished sequences and move the request handle into the signal closure + signals.push_back(Interrupt(i)); + // Interrupt should reset r + FT_CHECK(!r); + } + else if (r->stream_output && rank_ == 0) { + const auto seq_len = r->sequence_length.getVal(); + // Create signals by copying the request handles for non-finished streaming requests + signals.push_back([this, r, seq_len] { // + UpdateState(*r, Request::kOk, seq_len); + }); + 
} } } + if (g.finished_count) { + // synchronize for interrupted sequences + check_cuda_error(cudaStreamSynchronize(stream_)); + } + if (g.partial) { const int i = batch_size - 1; // recover full context length of partial state_->h_context_length[i] = g.partial_context_legnth; } - - return signals; } template @@ -1408,17 +1460,10 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig // Update token IDs seq.tokens.resize(output_len); - const auto output_ids_data = [&] { - if (force_stop) { - // `h_output_ids_` is UNDEFINED at `ProcessStopRequests` - return state_->requests[index]->outputs.at("output_ids").getPtr(); - } - else { - // `h_output_ids_` just updated by `Finish`, but `outputs` is NOT synced atm - return h_output_ids_ + index * (size_t)session_len_; - } - }(); - std::copy_n(output_ids_data, output_len, seq.tokens.data()); + + // output_ids is updated & synced in `Finish` + const auto output_ids = state_->requests[index]->output_ids.getPtr(); + std::copy_n(output_ids, output_len, seq.tokens.data()); // Save random state in host memory seq.random_state.resize(sizeof(curandState_t)); @@ -1433,7 +1478,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig auto ec = std::exchange(state_->errors[index], Request::kOk); - const auto len = state_->requests[index]->outputs.getVal("sequence_length"); + const auto len = state_->requests[index]->sequence_length.getVal(); // move the request handle into the signal return [this, len, r = std::move(state_->requests[index])] { // UpdateState(*r, Request::kFinish, len); @@ -1458,12 +1503,15 @@ void LlamaBatch::InternalThreadEntry() while (1) { if (rank_ == 0) { - const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; - const bool is_empty = (free_slot_count == max_batch_size_); - // Block if batch is empty - gateway_->pop(infer_reqs, kill_reqs, free_slot_count, is_empty, shared_state_->abort); + { + NvtxScope _("pop"); + const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; + const bool is_empty = (free_slot_count == max_batch_size_); + // Block if batch is empty + gateway_->pop(infer_reqs, kill_reqs, free_slot_count, is_empty, shared_state_->abort); + } // Mark reqs to the same session_id as invalid (which are dangerous to the engine) - MarkConflictRequests(infer_reqs, kill_reqs); + DisableConflictRequests(infer_reqs, kill_reqs); } NvtxScope scope("mainloop"); @@ -1503,18 +1551,19 @@ void LlamaBatch::InternalThreadEntry() if (state_->active_size) { // - (void)Forward(g); - // - if (auto signals = Finish(g); !signals.empty()) { - if (g.finished_count) { - // Finished requests and corresponding output tensors will be released when notified - // wait for all ranks to ensure no rank (except for output thread) will access related - // resources - shared_state_->barrier->wait(); - } - if (rank_ == 0) { - gateway_->notify(std::move(signals)); - } + Forward(g); + + Finish(g, signals); + + if (g.finished_count) { + // Finished requests and corresponding output tensors will be released when notified + // wait for all ranks to ensure no rank (except for output thread) will access related + // resources + shared_state_->barrier->wait(); + } + + if (rank_ == 0) { + gateway_->notify(std::move(signals)); } } } @@ -1661,10 +1710,9 @@ bool LlamaBatch::Forward(GenerationState& g) // `SequenceManager` needs real-time value of cache length for (int i = 0; i < active_size; ++i) { - if (state_->requests[i]) { - FT_CHECK(state_->sequences[i]); - 
state_->sequences[i]->cache_len += state_->sequences[i]->input_length; - } + FT_CHECK((bool)state_->requests[i]); + FT_CHECK(state_->sequences[i]); + state_->sequences[i]->cache_len += state_->sequences[i]->input_length; } if (active_size > g.partial) { diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index e2ffc0a230..144463c225 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -80,7 +80,7 @@ class LlamaBatch { using Requests = std::vector>; using Signal = std::function; - void MarkConflictRequests(Requests& infer_reqs, Requests& kill_reqs); + void DisableConflictRequests(Requests& infer_reqs, Requests& kill_reqs); void ProcessKillRequests(const Requests& reqs, std::vector& signals); @@ -94,9 +94,9 @@ class LlamaBatch { void InitializeSampling(const GenerationState& g); - [[nodiscard]] bool Forward(GenerationState& g); + bool Forward(GenerationState& g); - [[nodiscard]] auto Finish(GenerationState& g) -> std::vector; + void Finish(GenerationState& g, std::vector& signals); [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); From 8baa78483703317710f71432489154df14f0c13a Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sun, 22 Dec 2024 00:17:50 +0800 Subject: [PATCH 07/40] add cancel cb --- lmdeploy/turbomind/chat.py | 2 +- lmdeploy/turbomind/turbomind.py | 17 +++++++++++++---- src/turbomind/engine/model_request.cc | 4 +++- src/turbomind/engine/model_request.h | 2 +- src/turbomind/engine/request.h | 17 +++++++++-------- src/turbomind/engine/request_queue.cc | 3 +++ src/turbomind/models/llama/LlamaBatch.cc | 3 +++ src/turbomind/python/bind.cpp | 9 ++++++--- 8 files changed, 39 insertions(+), 18 deletions(-) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index c45c2ac793..a897922bfd 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -65,7 +65,7 @@ async def async_infer(generator, session_id, input_ids, gen_config, prev_len = tokens print(response, end='', flush=True) # if 'I' in response: - # await generator.async_cancel(0, blocking=False) + # await generator.async_cancel() return tokens diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index baa2757432..ce3368c33b 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -417,13 +417,22 @@ def cancel(self, session_id: int, blocking: bool = True): if blocking: self.done_event.wait() - async def async_cancel(self, session_id: int, blocking: bool = False): + def async_cancel_cb(self, status: int): + + async def _signal(): + print(f'session canceled, status = {status}') + self.cancel_event.set() + + asyncio.run_coroutine_threadsafe(_signal(), self.event_loop) + + + async def async_cancel(self, session_id: int = None): """End the given session.""" if not self.is_canceled: - self.model_inst.cancel() + self.cancel_event = asyncio.Event() + self.model_inst.cancel(self.async_cancel_cb) self.is_canceled = True - if blocking: - await self.done_event.wait() + await self.cancel_event.wait() def prepare_embeddings(self, input_embeddings=None, diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index cd95f788f7..0061618f7f 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -59,10 +59,12 @@ ModelRequest::ModelRequest(Gateway* gateway, int session_len, int vocab_size): { } -void ModelRequest::Cancel() +void 
ModelRequest::Cancel(std::function cb) { // request is finished if lock failed if (auto r = request_.lock()) { + // the cb will be synced to engine via release-acquire semantics + r->cancel_cb = std::move(cb); gateway_->cancel(std::move(r)); } } diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h index c05824ed5b..96870cf1ef 100644 --- a/src/turbomind/engine/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -16,7 +16,7 @@ class ModelRequest { ModelRequest(Gateway* gateway, int session_len, int vocab_size); // Cancel running request - void Cancel(); + void Cancel(std::function cb); // Reset the channel to uninitailized state, calls `notify` when done void End(std::function cb); diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index e0af543185..ceec020d46 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -92,20 +92,21 @@ struct Request { // reference to IO tensors TensorMap inputs; TensorMap outputs; + // fast path for accessing common output buffers + Tensor output_ids; + Tensor sequence_length; std::function end_cb; - std::function forward_cb; - std::atomic cancel_flag; - std::shared_ptr state; + std::function cancel_cb; + std::atomic cancel_flag; + bool is_canceled{}; - bool is_canceled{}; + std::function forward_cb; - int ec; + std::shared_ptr state; - // fast path for accessing common output buffers - Tensor output_ids; - Tensor sequence_length; + int ec; // set when disabling conflicting requests enum { kOk = 0, diff --git a/src/turbomind/engine/request_queue.cc b/src/turbomind/engine/request_queue.cc index 8c0b52b5bf..c39a23b0c2 100644 --- a/src/turbomind/engine/request_queue.cc +++ b/src/turbomind/engine/request_queue.cc @@ -34,6 +34,9 @@ void RequestQueue::cancel(std::shared_ptr r) // not picked by engine yet, skip directly gateway_->notify({[r = std::move(r)] { // UpdateState(*r, Request::kCancel, 0); + if (r->cancel_cb) { + r->cancel_cb(0); + } }}); } } diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 343baf21e5..5f3b5660ed 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1482,6 +1482,9 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig // move the request handle into the signal return [this, len, r = std::move(state_->requests[index])] { // UpdateState(*r, Request::kFinish, len); + if (r->cancel_cb) { + r->cancel_cb(0); + } }; } diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 6ac820ce43..c4f67cdb6d 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -12,9 +12,9 @@ #include #include +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/engine/model_request.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" @@ -499,8 +499,11 @@ PYBIND11_MODULE(_turbomind, m) "cb"_a) .def( "cancel", - [](ModelRequest* model_request) { model_request->Cancel(); }, - py::call_guard()) + [](ModelRequest* model_request, std::function cb) { + model_request->Cancel(std::move(cb)); // + }, + py::call_guard(), + "cb"_a) .def( "end", [](ModelRequest* model_request, std::function cb) { From 1bc68d11cbbc4f325bf42d948ee3191dd22c1960 Mon Sep 17 00:00:00 2001 From: Li Zhang 
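
The cancel/end bindings in PATCH 07 above hand a Python callable to the engine, which invokes it from its own signaling thread, so the Python side has to hop back onto the asyncio event loop before touching futures or events. PATCH 07 does this with asyncio.run_coroutine_threadsafe; later patches in this series switch to loop.call_soon_threadsafe on a plain future. A minimal, self-contained sketch of the latter bridge; `fake_engine_cancel` is a stand-in for the `cancel(cb)` binding, not an lmdeploy API:

    import asyncio
    import threading


    def fake_engine_cancel(cb):
        # Stand-in: the real engine invokes `cb(status)` from its signaling thread.
        threading.Thread(target=cb, args=(0, ), daemon=True).start()


    async def cancel_and_wait():
        loop = asyncio.get_running_loop()
        fut = loop.create_future()

        def _cb(status: int):
            # asyncio objects are not thread-safe; resolve the future on its own loop.
            loop.call_soon_threadsafe(fut.set_result, status)

        fake_engine_cancel(_cb)
        return await fut


    print(asyncio.run(cancel_and_wait()))  # prints 0
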
Date: Sun, 22 Dec 2024 03:17:39 +0800 Subject: [PATCH 08/40] simplify async engine --- lmdeploy/serve/async_engine.py | 36 ++++++++++++++------------------- lmdeploy/turbomind/turbomind.py | 3 +-- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index f3c3432328..572a54aca4 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -180,10 +180,10 @@ def __init__(self, self.tokenizer = self.engine.tokenizer self.id2step = {} self.id2generator = {} - self.running_session_ids = set() - self.gens_set = set() - for i in range(self.instance_num): - self.gens_set.add(self.engine.create_instance()) + self.free_gens = None + self.instances = [ + self.engine.create_instance() for _ in range(self.instance_num) + ] self._session_id = count(0) self.request_logger = RequestLogger(max_log_len) @@ -247,18 +247,12 @@ async def stop_session(self, session_id: int): """Stop a session by a session_id.""" if str(session_id) in self.id2generator: await self.id2generator[str(session_id)].async_cancel(session_id) - self.gens_set.add(self.id2generator[str(session_id)]) - - self.running_session_ids.discard(session_id) async def end_session(self, session_id: int): """Clear a session by a session_id.""" if str(session_id) in self.id2generator: await self.id2generator[str(session_id)].async_end(session_id) self.id2step[str(session_id)] = 0 - self.gens_set.add(self.id2generator[str(session_id)]) - - self.running_session_ids.discard(session_id) @asynccontextmanager async def safe_run(self, session_id: Optional[int] = None): @@ -266,23 +260,17 @@ async def safe_run(self, session_id: Optional[int] = None): try: yield except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa + print(f'exception caught: {e}') # TODO: find out why await would block the coroutine here - _get_event_loop().create_task(self.stop_session(session_id)) - raise e + await self.stop_session(session_id) if str(session_id) in self.id2generator: - self.gens_set.add(self.id2generator[str(session_id)]) - self.running_session_ids.discard(session_id) + self.free_gens.put_nowait(self.id2generator[str(session_id)]) async def get_generator(self, stop: bool, session_id: int): """Only return the model instance if it is available.""" - if stop: - return self.engine.create_instance() - # waiting no generator is available or the same session_id is running - while self.gens_set == set() or session_id in self.running_session_ids: - await asyncio.sleep(0.1) - generator = self.gens_set.pop() + assert not stop, 'not implemented' + generator = await self.free_gens.get() self.id2generator[str(session_id)] = generator - self.running_session_ids.add(session_id) return generator def batch_infer(self, @@ -568,6 +556,12 @@ def is_error(status): ResponseType.SUCCESS, ResponseType.FINISH ] + if self.free_gens is None: + # `asyncio.Queue` must be created in an async context + self.free_gens = asyncio.Queue() + for inst in self.instances: + self.free_gens.put_nowait(inst) + generator = await self.get_generator(False, session_id) async with self.safe_run(session_id): state = DetokenizeState(len(input_ids)) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index ce3368c33b..fb8330debf 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -425,7 +425,6 @@ async def _signal(): asyncio.run_coroutine_threadsafe(_signal(), self.event_loop) - async def async_cancel(self, session_id: int = None): """End the given 
session.""" if not self.is_canceled: @@ -568,7 +567,7 @@ async def async_stream_infer(self, else: self.cond = asyncio.Condition() self.done_event = asyncio.Event() - + self.is_canceled = False self.flag = 0 From f22076214a25f894070bea051e947ee470b0c1eb Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sun, 22 Dec 2024 16:02:15 +0800 Subject: [PATCH 09/40] simplify async engine --- lmdeploy/serve/async_engine.py | 103 ++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 572a54aca4..06f3185555 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -245,33 +245,38 @@ def __call__(self, async def stop_session(self, session_id: int): """Stop a session by a session_id.""" - if str(session_id) in self.id2generator: - await self.id2generator[str(session_id)].async_cancel(session_id) + generator = self.id2generator.get(self.id2generator) + if generator: + await generator.async_cancel(session_id) + # else it's not running at all async def end_session(self, session_id: int): - """Clear a session by a session_id.""" - if str(session_id) in self.id2generator: - await self.id2generator[str(session_id)].async_end(session_id) - self.id2step[str(session_id)] = 0 + """For ending a session that is not running.""" + # TODO: wait for generator to finish `await generator.async_done()` + assert session_id not in self.id2generator + generator = await self.free_gens.get() + try: + await generator.async_end(session_id) + self.id2step[session_id] = 0 + except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa + print(f'[end_session] exception caught: {e}') + finally: + self.free_gens.put_nowait(generator) @asynccontextmanager - async def safe_run(self, session_id: Optional[int] = None): + async def safe_run(self, session_id: int): """A context manager to make sure server's safe running.""" + generator = await self.free_gens.get() + assert session_id not in self.id2generator + self.id2generator[session_id] = generator try: - yield + yield generator except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - print(f'exception caught: {e}') - # TODO: find out why await would block the coroutine here - await self.stop_session(session_id) - if str(session_id) in self.id2generator: - self.free_gens.put_nowait(self.id2generator[str(session_id)]) - - async def get_generator(self, stop: bool, session_id: int): - """Only return the model instance if it is available.""" - assert not stop, 'not implemented' - generator = await self.free_gens.get() - self.id2generator[str(session_id)] = generator - return generator + print(f'[safe_run] exception caught: {e}') + await generator.async_cancel(session_id) + finally: + self.id2generator.pop(session_id) + self.free_gens.put_nowait(generator) def batch_infer(self, prompts: Union[List[str], str, List[Dict], @@ -478,10 +483,10 @@ async def generate( do_preprocess (bool): whether pre-process the messages. Default to True, which means chat_template will be applied. 
""" - if str(session_id) not in self.id2step: - self.id2step[str(session_id)] = 0 + if session_id not in self.id2step: + self.id2step[session_id] = 0 if step != 0: - self.id2step[str(session_id)] = step + self.id2step[session_id] = step if gen_config is None: gen_config = GenerationConfig() else: @@ -524,7 +529,7 @@ async def generate( gen_config=gen_config, adapter_name=adapter_name) logger.info(f'session_id={session_id}, ' - f'history_tokens={self.id2step[str(session_id)]}, ' + f'history_tokens={self.id2step[session_id]}, ' f'input_tokens={len(input_ids)}, ' f'max_new_tokens={gen_config.max_new_tokens}, ' f'seq_start={sequence_start}, seq_end={sequence_end}, ' @@ -533,19 +538,19 @@ async def generate( if gen_config.max_new_tokens is None: # for interactive endpoint, will try maximum possible token num gen_config.max_new_tokens = max( - 128, self.session_len - self.id2step[str(session_id)] - - len(input_ids)) - elif self.id2step[str(session_id)] + len( + 128, + self.session_len - self.id2step[session_id] - len(input_ids)) + elif self.id2step[session_id] + len( input_ids) + gen_config.max_new_tokens > self.session_len: gen_config.max_new_tokens = max( - self.session_len - self.id2step[str(session_id)] - - len(input_ids), 128) + self.session_len - self.id2step[session_id] - len(input_ids), + 128) logger.error( f'Truncate max_new_tokens to {gen_config.max_new_tokens}') - if self.id2step[str(session_id)] + len( + if self.id2step[session_id] + len( input_ids) + gen_config.max_new_tokens > self.session_len: logger.error(f'run out of tokens. session_id={session_id}.') - yield GenOut('', self.id2step[str(session_id)], len(input_ids), 0, + yield GenOut('', self.id2step[session_id], len(input_ids), 0, 'length') if sequence_end is True and sequence_start is False: await self.end_session(session_id) @@ -562,9 +567,10 @@ def is_error(status): for inst in self.instances: self.free_gens.put_nowait(inst) - generator = await self.get_generator(False, session_id) - async with self.safe_run(session_id): + async with self.safe_run(session_id) as generator: state = DetokenizeState(len(input_ids)) + res = input_ids.copy() + prev_len = 0 start_ids_offset = state.ids_offset response = '' async for outputs in generator.async_stream_infer( @@ -575,12 +581,15 @@ def is_error(status): stream_output=stream_response, sequence_start=sequence_start, sequence_end=sequence_end, - step=self.id2step[str(session_id)]): + step=self.id2step[session_id]): # decode res if is_error(outputs.status): tokens = 0 break - res, tokens = input_ids + outputs.token_ids, outputs.num_token # noqa + tokens = outputs.num_token + res += outputs.token_ids[prev_len - tokens:] + prev_len = tokens + if len(res) <= state.ids_offset: continue @@ -598,7 +607,7 @@ def is_error(status): # response, history token len, # input token len, gen token len - yield GenOut(response, self.id2step[str(session_id)], + yield GenOut(response, self.id2step[session_id], len(input_ids), tokens, finish_reason, res, logprobs) if not is_error(outputs.status): @@ -609,24 +618,22 @@ def is_error(status): if not response.endswith('�'): # avaid returning the last response twice response = '' - yield GenOut(response, self.id2step[str(session_id)], + yield GenOut(response, self.id2step[session_id], len(input_ids), tokens, finish_reason) else: - yield GenOut( - response='internal error happened', - history_token_len=self.id2step[str(session_id)], - input_token_len=len(input_ids), - generate_token_len=0, - finish_reason='error', - token_ids=[]) + yield GenOut(response='internal 
error happened', + history_token_len=self.id2step[session_id], + input_token_len=len(input_ids), + generate_token_len=0, + finish_reason='error', + token_ids=[]) # update step - self.id2step[str(session_id)] += len(input_ids) + tokens + self.id2step[session_id] += len(input_ids) + tokens if sequence_end: - self.id2step[str(session_id)] = 0 + self.id2step[session_id] = 0 # manually end pytorch session - # TODO modify pytorch or turbomind api if self.backend == 'pytorch' and sequence_end: - await self.end_session(session_id) + await generator.async_end(session_id) def parse_tool_response(self, text, tools, **kwargs): """Parse model response containing tool information. From 31c62233436fe31f30db15d5a1d3c9cde7b2fdcf Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sun, 22 Dec 2024 19:10:29 +0800 Subject: [PATCH 10/40] fix end session --- lmdeploy/turbomind/turbomind.py | 2 +- src/turbomind/engine/model_request.cc | 4 ++-- src/turbomind/engine/model_request.h | 2 +- src/turbomind/python/bind.cpp | 7 ++++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index fb8330debf..8189ee52e9 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -399,7 +399,7 @@ async def async_end(self, session_id): await self.done_event.wait() self.end_event = asyncio.Event() self.event_loop = asyncio.get_running_loop() - self.model_inst.end(self.async_end_cb) + self.model_inst.end(self.async_end_cb, session_id) await self.end_event.wait() def end_cb(self, status: int): diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index 0061618f7f..61e9a4fef8 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -69,11 +69,11 @@ void ModelRequest::Cancel(std::function cb) } } -void ModelRequest::End(std::function cb) +void ModelRequest::End(std::function cb, uint64_t session_id) { auto r = std::make_shared(); - r->id = r->session.id = session_id_; + r->id = r->session.id = session_id; r->session.kill_flag = true; r->end_cb = std::move(cb); diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h index 96870cf1ef..7866d8d9b7 100644 --- a/src/turbomind/engine/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -19,7 +19,7 @@ class ModelRequest { void Cancel(std::function cb); // Reset the channel to uninitailized state, calls `notify` when done - void End(std::function cb); + void End(std::function cb, uint64_t session_id); using TensorMap_ = std::unordered_map; diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index c4f67cdb6d..139fc2111c 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -506,11 +506,12 @@ PYBIND11_MODULE(_turbomind, m) "cb"_a) .def( "end", - [](ModelRequest* model_request, std::function cb) { - model_request->End(std::move(cb)); // + [](ModelRequest* model_request, std::function cb, uint64_t session_id) { + model_request->End(std::move(cb), session_id); // }, py::call_guard(), - "cb"_a); + "cb"_a, + "session_id"_a); // transformer model using ft::AbstractTransformerModel; From b3d15b1deaf2c434da2f44c1cf27e88c070272f3 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Mon, 23 Dec 2024 15:02:02 +0800 Subject: [PATCH 11/40] faster synchronization --- lmdeploy/serve/async_engine.py | 15 +- lmdeploy/turbomind/chat.py | 7 +- lmdeploy/turbomind/turbomind.py | 233 +++++++++++++++----------------- 3 files changed, 123 insertions(+), 132 
deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 06f3185555..8def542201 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -252,9 +252,13 @@ async def stop_session(self, session_id: int): async def end_session(self, session_id: int): """For ending a session that is not running.""" - # TODO: wait for generator to finish `await generator.async_done()` - assert session_id not in self.id2generator - generator = await self.free_gens.get() + generator = self.id2generator.get(session_id) + if generator: + fut = generator._fut + await fut + assert session_id not in self.id2generator + else: + generator = await self.free_gens.get() try: await generator.async_end(session_id) self.id2step[session_id] = 0 @@ -266,8 +270,9 @@ async def end_session(self, session_id: int): @asynccontextmanager async def safe_run(self, session_id: int): """A context manager to make sure server's safe running.""" - generator = await self.free_gens.get() assert session_id not in self.id2generator + generator = await self.free_gens.get() + generator._fut = asyncio.get_running_loop().create_future() self.id2generator[session_id] = generator try: yield generator @@ -276,6 +281,8 @@ async def safe_run(self, session_id: int): await generator.async_cancel(session_id) finally: self.id2generator.pop(session_id) + generator._fut.set_result(None) + generator._fut = None self.free_gens.put_nowait(generator) def batch_infer(self, diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index a897922bfd..e5fdf802df 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -172,6 +172,9 @@ def main(model_path: str, repetition_penalty=repetition_penalty, stop_token_ids=stop_words) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + nth_round = 1 step = 0 seed = random.getrandbits(64) @@ -181,7 +184,7 @@ def main(model_path: str, exit(0) elif prompt == 'end': if use_async: - asyncio.run(generator.async_end(session_id)) + loop.run_until_complete(generator.async_end(session_id)) else: generator.end(session_id) nth_round = 1 @@ -211,7 +214,7 @@ def main(model_path: str, coro = async_infer(generator, session_id, input_ids, gen_config, sequence_start, step, stream_output, tokenizer, state) - tokens = asyncio.run(coro) + tokens = loop.run_until_complete(coro) else: tokens = infer(generator, session_id, input_ids, gen_config, sequence_start, step, stream_output, tokenizer, diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 8189ee52e9..875c662926 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -8,6 +8,7 @@ from collections.abc import Sequence from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict +from functools import partial from itertools import repeat from queue import Queue from typing import Dict, Iterable, List @@ -319,6 +320,13 @@ def create_instance(self, cuda_stream_id=0): return TurboMindInstance(self, self.config, cuda_stream_id) +class StreamingSignal: + + def __init__(self): + self.loop = asyncio.get_running_loop() + self.fut = self.loop.create_future() + + class TurboMindInstance: """Instance of TurboMind. 
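
The class added in the hunk above is the bridge between the engine's signaling thread and the coroutine that streams output: the thread wakes the consumer by resolving an asyncio future via call_soon_threadsafe, and a fresh future is armed for the next wake-up. PATCH 13 below reworks this into a StreamingSemaphore that latches a pending wake-up, so a signal arriving while the consumer is still busy is not dropped. A runnable sketch of that latched hand-off; `WakeupLatch`, `engine_thread`, and the step counter are illustrative stand-ins, not lmdeploy code:

    import asyncio
    import threading
    import time


    class WakeupLatch:
        """Level-triggered wake-up: a release() before acquire() is not lost."""

        def __init__(self):
            self.loop = asyncio.get_running_loop()
            self.fut = None
            self.flag = False

        def release(self):
            # Runs on the event loop (scheduled via call_soon_threadsafe below).
            self.flag = True
            if self.fut is not None and not self.fut.done():
                self.fut.set_result(None)

        async def acquire(self):
            if not self.flag:
                self.fut = self.loop.create_future()
                await self.fut
                self.fut = None
            self.flag = False


    async def main():
        latch = WakeupLatch()
        steps, progress = 5, []

        def engine_thread():
            for i in range(steps):
                time.sleep(0.01)                      # one decoding step
                progress.append(i)                    # publish new tokens
                latch.loop.call_soon_threadsafe(latch.release)

        threading.Thread(target=engine_thread, daemon=True).start()

        consumed = 0
        while consumed < steps:
            await latch.acquire()                     # coalesces bursts of wake-ups
            consumed = len(progress)                  # re-read shared progress
            print('tokens so far:', consumed)


    asyncio.run(main())
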
@@ -346,9 +354,7 @@ def __init__(self, self.model_inst = self._create_model_instance(0) self.config = config - - self.cond = None - self.done_event = None + self.lock = None def _create_model_instance(self, device_id): model_inst = self.tm_model.model_comm.create_model_instance(device_id) @@ -387,52 +393,6 @@ def _get_logprobs(self, out_logprobs.append(tok_res) return out_logprobs - def async_end_cb(self, status: int): - - async def _signal(): - print(f'session ended, status = {status}') - self.end_event.set() - - asyncio.run_coroutine_threadsafe(_signal(), self.event_loop) - - async def async_end(self, session_id): - await self.done_event.wait() - self.end_event = asyncio.Event() - self.event_loop = asyncio.get_running_loop() - self.model_inst.end(self.async_end_cb, session_id) - await self.end_event.wait() - - def end_cb(self, status: int): - print(f'session ended, status = {status}') - self.end_event.set() - - def end(self): - self.done_event.wait() - self.end_event = threading.Event() - self.model_inst.end(self.end_cb) - self.end_event.wait() - - def cancel(self, session_id: int, blocking: bool = True): - self.model_inst.cancel() - if blocking: - self.done_event.wait() - - def async_cancel_cb(self, status: int): - - async def _signal(): - print(f'session canceled, status = {status}') - self.cancel_event.set() - - asyncio.run_coroutine_threadsafe(_signal(), self.event_loop) - - async def async_cancel(self, session_id: int = None): - """End the given session.""" - if not self.is_canceled: - self.cancel_event = asyncio.Event() - self.model_inst.cancel(self.async_cancel_cb) - self.is_canceled = True - await self.cancel_event.wait() - def prepare_embeddings(self, input_embeddings=None, input_embedding_ranges=None): @@ -493,7 +453,7 @@ def prepare_inputs(self, assert isinstance(input_ids, Sequence) input_ids = torch.IntTensor(input_ids) - input_lengths = torch.IntTensor([len(input_ids)]) + input_len = len(input_ids) inputs = dict(input_ids=input_ids, ) @@ -521,16 +481,36 @@ def prepare_inputs(self, if bad_words is not None: inputs['bad_words_list'] = bad_words - return inputs, input_lengths + return inputs, input_len - async def async_signal(self): - async with self.cond: - self.flag = 1 - self.cond.notify() + def async_cancel_cb(self, fut: asyncio.Future, status: int): + """executing on engine's signaling thread.""" + print(f'session canceled, status = {status}') + fut.get_loop().call_soon_threadsafe(fut.set_result, status) + + async def async_cancel(self, session_id: int = None): + fut = asyncio.get_running_loop().create_future() + self.model_inst.cancel(partial(self.async_cancel_cb, fut)) + return await fut + + def async_end_cb(self, fut: asyncio.Future, status: int): + """executing on engine's signaling thread.""" + print(f'session ended, status = {status}') + fut.get_loop().call_soon_threadsafe(fut.set_result, status) + + async def async_end(self, session_id): + fut = asyncio.get_running_loop().create_future() + self.model_inst.end(partial(self.async_end_cb, fut), session_id) + await fut + + def async_signal_cb(self, s: StreamingSignal): + """executing on engine's signaling thread.""" - def async_signal_cb(self): - coro = self.async_signal() - asyncio.run_coroutine_threadsafe(coro, self.event_loop) + def _signal(): + fut, s.fut = s.fut, s.loop.create_future() # exchange + fut.set_result(None) + + s.loop.call_soon_threadsafe(_signal) async def async_stream_infer(self, session_id, @@ -559,98 +539,84 @@ async def async_stream_infer(self, stream_output (bool): indicator for stream output 
kwargs (dict): kwargs for backward compatibility """ - self.event_loop = asyncio.get_running_loop() - if self.done_event is not None: - await self.done_event.wait() - self.done_event.clear() - else: - self.cond = asyncio.Condition() - self.done_event = asyncio.Event() + if self.lock is None: + self.lock = asyncio.Lock() - self.is_canceled = False - self.flag = 0 + async with self.lock: # reentrant proof - gen_cfg = self._get_generation_config(gen_config) + self.flag = 0 - inputs, input_length = self.prepare_inputs( - input_ids=input_ids, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - gen_config=gen_config) + gen_cfg = self._get_generation_config(gen_config) - session = _tm.SessionParam(id=session_id, - step=step, - start=sequence_start, - end=sequence_end) + inputs, input_len = self.prepare_inputs( + input_ids=input_ids, + input_embeddings=input_embeddings, + input_embedding_ranges=input_embedding_ranges, + gen_config=gen_config) - inputs = _np_dict_to_tm_dict(inputs) + session = _tm.SessionParam(id=session_id, + step=step, + start=sequence_start, + end=sequence_end) - outputs, shared_state = self.model_inst.forward( - inputs, session, gen_cfg, stream_output, self.async_signal_cb) + inputs = _np_dict_to_tm_dict(inputs) - outputs = _tm_dict_to_torch_dict(outputs) + signal = StreamingSignal() + signal_cb = partial(self.async_signal_cb, signal) - output_ids_buf = outputs['output_ids'] + outputs, shared_state = self.model_inst.forward( + inputs, session, gen_cfg, stream_output, signal_cb) - out_logprobs = None - finish = False - state = None + outputs = _tm_dict_to_torch_dict(outputs) - output_ids = [] - output_len = 0 - prev_len = step + input_length[0] - try: - # generator - while True: + output_ids_buf = outputs['output_ids'] - async with self.cond: - while not self.flag: - await self.cond.wait() - self.flag = 0 + out_logprobs = None + finish = False + state = None - state = shared_state.consume() - status, seq_len = state.status, state.seq_len + output_ids = [] + output_len = 0 + prev_len = step + input_len + try: + while True: + await signal.fut - if status == 7: - finish = True - status = 0 - elif status: - yield self._get_error_output() - break + state = shared_state.consume() + status, seq_len = state.status, state.seq_len - if seq_len == prev_len and not finish: - continue + if status == 7: + finish, status = True, 0 + elif status: + yield self._get_error_output() + break - output_ids += output_ids_buf[prev_len:seq_len].tolist() - output_len += seq_len - prev_len + if seq_len == prev_len and not finish: + continue - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - output = EngineOutput(status, output_ids, output_len.item(), - out_logprobs) + output_ids += output_ids_buf[prev_len:seq_len].tolist() + output_len += seq_len - prev_len + status = ResponseType.FINISH if finish else ResponseType.SUCCESS + output = EngineOutput(status, output_ids, output_len, + out_logprobs) + prev_len = seq_len - prev_len = seq_len + yield output - yield output - - if finish: - break + if finish: + break - except Exception as e: - logger.error(e) - yield self._get_error_output() + except Exception as e: + logger.error(e) + yield self._get_error_output() - finally: - async with self.cond: - # Contract: `notfiy` won't be called again if status is non-zero + finally: + # Contract: `cb` won't be called again if status is non-zero # wait for status to be set as `finish` or `error` while not state or state.status == 0: - while not self.flag: - await 
self.cond.wait() - self.flag = 0 + await signal.fut state = shared_state.consume() - self.done_event.set() - self.event_loop = None def _get_error_output(self): return EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, @@ -684,6 +650,21 @@ def signal_cb(self): self.flag = 1 self.cond.notify() + def end_cb(self, status: int): + print(f'session ended, status = {status}') + self.end_event.set() + + def end(self): + self.done_event.wait() + self.end_event = threading.Event() + self.model_inst.end(self.end_cb) + self.end_event.wait() + + def cancel(self, session_id: int, blocking: bool = True): + self.model_inst.cancel() + if blocking: + self.done_event.wait() + def stream_infer(self, session_id, input_ids, From c6fd2600b6230f75edcf1ec30bc52300fd066ccd Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Mon, 23 Dec 2024 18:49:32 +0800 Subject: [PATCH 12/40] fix async engine --- lmdeploy/serve/async_engine.py | 27 ++++++++++++++------------- lmdeploy/turbomind/turbomind.py | 14 +++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 8def542201..38ab5b2c3b 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -180,7 +180,7 @@ def __init__(self, self.tokenizer = self.engine.tokenizer self.id2step = {} self.id2generator = {} - self.free_gens = None + self.free_gens: asyncio.Queue = None self.instances = [ self.engine.create_instance() for _ in range(self.instance_num) ] @@ -245,7 +245,7 @@ def __call__(self, async def stop_session(self, session_id: int): """Stop a session by a session_id.""" - generator = self.id2generator.get(self.id2generator) + generator = self.id2generator.get(session_id) if generator: await generator.async_cancel(session_id) # else it's not running at all @@ -263,7 +263,7 @@ async def end_session(self, session_id: int): await generator.async_end(session_id) self.id2step[session_id] = 0 except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - print(f'[end_session] exception caught: {e}') + logger.error(f'[end_session] exception caught: {e}') finally: self.free_gens.put_nowait(generator) @@ -277,7 +277,7 @@ async def safe_run(self, session_id: int): try: yield generator except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - print(f'[safe_run] exception caught: {e}') + logger.error(f'[safe_run] exception caught: {e}') await generator.async_cancel(session_id) finally: self.id2generator.pop(session_id) @@ -576,7 +576,7 @@ def is_error(status): async with self.safe_run(session_id) as generator: state = DetokenizeState(len(input_ids)) - res = input_ids.copy() + token_ids = input_ids.copy() prev_len = 0 start_ids_offset = state.ids_offset response = '' @@ -594,19 +594,19 @@ def is_error(status): tokens = 0 break tokens = outputs.num_token - res += outputs.token_ids[prev_len - tokens:] + token_ids += outputs.token_ids[prev_len - tokens:] prev_len = tokens - if len(res) <= state.ids_offset: + if len(token_ids) <= state.ids_offset: continue ids_offset = state.ids_offset response, state = self.tokenizer.detokenize_incrementally( - res, + token_ids, state, skip_special_tokens=gen_config.skip_special_tokens) + res = token_ids[ids_offset:] - res = res[ids_offset:] logprobs = None if outputs.logprobs: log_offset = ids_offset - start_ids_offset @@ -617,6 +617,7 @@ def is_error(status): yield GenOut(response, self.id2step[session_id], len(input_ids), tokens, finish_reason, res, logprobs) + # end of generator loop if not 
is_error(outputs.status): finish_reason = 'length' \ if tokens >= gen_config.max_new_tokens else 'stop' @@ -635,12 +636,12 @@ def is_error(status): finish_reason='error', token_ids=[]) # update step - self.id2step[session_id] += len(input_ids) + tokens if sequence_end: self.id2step[session_id] = 0 - # manually end pytorch session - if self.backend == 'pytorch' and sequence_end: - await generator.async_end(session_id) + if self.backend == 'pytorch': # manually end pytorch session + await generator.async_end(session_id) + else: + self.id2step[session_id] += len(input_ids) + tokens def parse_tool_response(self, text, tools, **kwargs): """Parse model response containing tool information. diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 875c662926..a077f0762e 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -485,7 +485,7 @@ def prepare_inputs(self, def async_cancel_cb(self, fut: asyncio.Future, status: int): """executing on engine's signaling thread.""" - print(f'session canceled, status = {status}') + logger.info(f'[async_cancel_cb] session canceled, status = {status}') fut.get_loop().call_soon_threadsafe(fut.set_result, status) async def async_cancel(self, session_id: int = None): @@ -495,7 +495,7 @@ async def async_cancel(self, session_id: int = None): def async_end_cb(self, fut: asyncio.Future, status: int): """executing on engine's signaling thread.""" - print(f'session ended, status = {status}') + logger.info(f'[async_end_cb] session ended, status = {status}') fut.get_loop().call_soon_threadsafe(fut.set_result, status) async def async_end(self, session_id): @@ -543,10 +543,9 @@ async def async_stream_infer(self, if self.lock is None: self.lock = asyncio.Lock() - async with self.lock: # reentrant proof - - self.flag = 0 - + # may remove when proved not possible + async with self.lock: + logger.info(f'[async_stream_infer] session {session_id} start') gen_cfg = self._get_generation_config(gen_config) inputs, input_len = self.prepare_inputs( @@ -608,7 +607,7 @@ async def async_stream_infer(self, break except Exception as e: - logger.error(e) + logger.error(f'[async_stream_infer] {e}') yield self._get_error_output() finally: @@ -617,6 +616,7 @@ async def async_stream_infer(self, while not state or state.status == 0: await signal.fut state = shared_state.consume() + logger.info(f'[async_stream_infer] session {session_id} done') def _get_error_output(self): return EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, From 8fa85dcf2cce3b494da520d5a8d14b2e48dccb55 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 24 Dec 2024 21:39:23 +0800 Subject: [PATCH 13/40] refactor async engine --- benchmark/profile_throughput.py | 80 +++++---- lmdeploy/serve/async_engine.py | 115 ++++++------ lmdeploy/turbomind/turbomind.py | 163 +++++++++--------- src/turbomind/engine/model_request.cc | 4 +- src/turbomind/engine/model_request.h | 2 +- src/turbomind/engine/request.h | 5 +- src/turbomind/engine/request_queue.cc | 3 - src/turbomind/models/llama/LlamaBatch.cc | 7 +- src/turbomind/python/bind.cpp | 7 +- .../triton_backend/llama/LlamaTritonModel.cc | 7 +- 10 files changed, 207 insertions(+), 186 deletions(-) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 34f31b4137..d41f886227 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -93,8 +93,8 @@ async def _inference(self, req_queue: Queue, session_id: int, skip_detokenize: bool): model_inst = 
self.tm_model.create_instance() counters = [] - for prompt, input_seqlen, output_seqlen in iter( - req_queue.get_nowait, [None, None, None]): + for prompt, input_seqlen, output_seqlen, cancel_after in iter( + req_queue.get_nowait, None): ts = [time.perf_counter()] ns = [0] @@ -109,26 +109,32 @@ async def _inference(self, req_queue: Queue, session_id: int, prev_len = 0 token_ids = input_ids.copy() - async for outputs in model_inst.async_stream_infer( - session_id, - input_ids=input_ids, - gen_config=GenerationConfig(max_new_tokens=output_seqlen, - temperature=temperature, - top_p=top_p, - top_k=top_k, - ignore_eos=True), - sequence_start=True, - sequence_end=True, - stream_output=stream_output): - n_token = outputs.num_token - if n_token > prev_len: - token_ids += outputs.token_ids[prev_len - n_token:] - if not skip_detokenize: - _, state = self.tokenizer.detokenize_incrementally( - token_ids, state) - ts.append(time.perf_counter()) - ns.append(n_token) - prev_len = n_token + generator = model_inst.async_stream_infer( + session_id, + input_ids=input_ids, + gen_config=GenerationConfig(max_new_tokens=output_seqlen, + temperature=temperature, + top_p=top_p, + top_k=top_k, + ignore_eos=True), + sequence_start=True, + sequence_end=True, + stream_output=stream_output) + try: + async for outputs in generator: + n_token = outputs.num_token + if n_token > prev_len: + token_ids += outputs.token_ids[prev_len - n_token:] + if not skip_detokenize: + _, state = self.tokenizer.detokenize_incrementally( + token_ids, state) + ts.append(time.perf_counter()) + ns.append(n_token) + prev_len = n_token + if n_token > cancel_after: + break + finally: + await generator.aclose() # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): @@ -140,20 +146,23 @@ async def _inference(self, req_queue: Queue, session_id: int, return counters def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output, skip_tokenize, skip_detokenize): + stream_output, skip_tokenize, skip_detokenize, + cancel_rate): req_queue = Queue() # feed request to q - for req in requests: + for prompt, input_len, output_len in requests: + cancel_after = output_len + 1 + if cancel_rate > 0: + if random.random() < cancel_rate: + cancel_after = random.randint(0, cancel_after) if skip_tokenize: - req_queue.put((self.tokenizer.encode(req[0]), *req[1:])) + req_queue.put((self.tokenizer.encode(prompt), input_len, + output_len, cancel_after)) else: - req_queue.put(req) + req_queue.put((prompt, input_len, output_len, cancel_after)) for i in range(concurrency): - req_queue.put([None, None, None]) - - event_loop = asyncio.new_event_loop() - asyncio.set_event_loop(event_loop) + req_queue.put(None) # start threads tasks = [] @@ -170,6 +179,9 @@ async def _gather_tasks(tasks): start = time.time() + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + counters = asyncio.run(_gather_tasks(tasks)) elapsed_time = time.time() - start @@ -231,6 +243,7 @@ def fmt(x): tab_row('Benchmark duration', elapsed_time) tab_row('Total requests', len(requests)) tab_row('Concurrency', concurrency) + tab_row('Cancel rate', cancel_rate) tab_row('Stream output', str(stream_output).lower()) tab_row('Skip tokenize', str(skip_tokenize).lower()) tab_row('Skip detokenize', str(skip_detokenize).lower()) @@ -281,6 +294,10 @@ def parse_args(): parser.add_argument('--skip-detokenize', action='store_true', help='Skip detokenizing output tokens') + parser.add_argument('--cancel-rate', + type=float, + 
help='Possibility of a request being canceled', + default=0) parser.add_argument('--csv', type=str, help='Where to save the result.', @@ -367,7 +384,8 @@ def main(): concurrency=args.concurrency, stream_output=not args.no_stream_output, skip_tokenize=args.skip_tokenize, - skip_detokenize=args.skip_detokenize) + skip_detokenize=args.skip_detokenize, + cancel_rate=args.cancel_rate) if __name__ == '__main__': diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 38ab5b2c3b..cb775ab2dc 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -267,24 +267,6 @@ async def end_session(self, session_id: int): finally: self.free_gens.put_nowait(generator) - @asynccontextmanager - async def safe_run(self, session_id: int): - """A context manager to make sure server's safe running.""" - assert session_id not in self.id2generator - generator = await self.free_gens.get() - generator._fut = asyncio.get_running_loop().create_future() - self.id2generator[session_id] = generator - try: - yield generator - except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - logger.error(f'[safe_run] exception caught: {e}') - await generator.async_cancel(session_id) - finally: - self.id2generator.pop(session_id) - generator._fut.set_result(None) - generator._fut = None - self.free_gens.put_nowait(generator) - def batch_infer(self, prompts: Union[List[str], str, List[Dict], List[List[Dict]]], @@ -463,6 +445,33 @@ async def _get_prompt_input(self, input_ids = self.tokenizer.encode(prompt, add_bos=sequence_start) return {'prompt': prompt, 'input_ids': input_ids} + @asynccontextmanager + async def model_inst(self, session_id: int): + """A context manager to make sure server's safe running.""" + assert session_id not in self.id2generator + inst = await self.free_gens.get() + inst._fut = asyncio.get_running_loop().create_future() + self.id2generator[session_id] = inst + try: + yield inst + finally: + self.id2generator.pop(session_id) + inst._fut.set_result(None) + inst._fut = None + self.free_gens.put_nowait(inst) + + @asynccontextmanager + async def safe_run(self, inst, session_id, **kwargs): + generator = inst.async_stream_infer(session_id, **kwargs) + try: + yield generator + except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa + logger.error(f'[safe_run] exception caught: {e}') + # TODO: remove session_id from async cancel + await inst.async_cancel(session_id) + finally: + await generator.aclose() + async def generate( self, messages, @@ -561,34 +570,33 @@ async def generate( 'length') if sequence_end is True and sequence_start is False: await self.end_session(session_id) - else: - - def is_error(status): - return status not in [ - ResponseType.SUCCESS, ResponseType.FINISH - ] - - if self.free_gens is None: - # `asyncio.Queue` must be created in an async context - self.free_gens = asyncio.Queue() - for inst in self.instances: - self.free_gens.put_nowait(inst) - - async with self.safe_run(session_id) as generator: - state = DetokenizeState(len(input_ids)) - token_ids = input_ids.copy() - prev_len = 0 - start_ids_offset = state.ids_offset - response = '' - async for outputs in generator.async_stream_infer( - session_id=session_id, - **prompt_input, - gen_config=gen_config, - adapter_name=adapter_name, - stream_output=stream_response, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=self.id2step[session_id]): + return + + def is_error(status): + return status not in [ResponseType.SUCCESS, ResponseType.FINISH] + + if 
self.free_gens is None: + # `asyncio.Queue` must be created in an async context + self.free_gens = asyncio.Queue() + for inst in self.instances: + self.free_gens.put_nowait(inst) + + async with self.model_inst(session_id) as inst: + state = DetokenizeState(len(input_ids)) + token_ids = input_ids.copy() + prev_len = 0 + start_ids_offset = state.ids_offset + response = '' + async with self.safe_run(inst, + session_id=session_id, + **prompt_input, + gen_config=gen_config, + adapter_name=adapter_name, + stream_output=stream_response, + sequence_start=sequence_start, + sequence_end=sequence_end, + step=self.id2step[session_id]) as gen: + async for outputs in gen: # decode res if is_error(outputs.status): tokens = 0 @@ -635,13 +643,14 @@ def is_error(status): generate_token_len=0, finish_reason='error', token_ids=[]) - # update step - if sequence_end: - self.id2step[session_id] = 0 - if self.backend == 'pytorch': # manually end pytorch session - await generator.async_end(session_id) - else: - self.id2step[session_id] += len(input_ids) + tokens + # update step + if sequence_end: + self.id2step[session_id] = 0 + if self.backend == 'pytorch': + # manually end pytorch session + await inst.async_end(session_id) + else: + self.id2step[session_id] += len(input_ids) + tokens def parse_tool_response(self, text, tools, **kwargs): """Parse model response containing tool information. diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index a077f0762e..470178f57a 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -320,11 +320,26 @@ def create_instance(self, cuda_stream_id=0): return TurboMindInstance(self, self.config, cuda_stream_id) -class StreamingSignal: +class StreamingSemaphore: def __init__(self): self.loop = asyncio.get_running_loop() + self.fut = None + self.val = 0 + + async def acquire(self): + if self.val: + return self.fut = self.loop.create_future() + await self.fut + self.fut = None + self.val = 0 + + def release(self): + if not self.val: + self.val = 1 + if self.fut: + self.fut.set_result(None) class TurboMindInstance: @@ -483,34 +498,22 @@ def prepare_inputs(self, return inputs, input_len - def async_cancel_cb(self, fut: asyncio.Future, status: int): - """executing on engine's signaling thread.""" - logger.info(f'[async_cancel_cb] session canceled, status = {status}') - fut.get_loop().call_soon_threadsafe(fut.set_result, status) - async def async_cancel(self, session_id: int = None): - fut = asyncio.get_running_loop().create_future() - self.model_inst.cancel(partial(self.async_cancel_cb, fut)) - return await fut + self.model_inst.cancel() def async_end_cb(self, fut: asyncio.Future, status: int): """executing on engine's signaling thread.""" logger.info(f'[async_end_cb] session ended, status = {status}') fut.get_loop().call_soon_threadsafe(fut.set_result, status) - async def async_end(self, session_id): + def async_end(self, session_id): fut = asyncio.get_running_loop().create_future() self.model_inst.end(partial(self.async_end_cb, fut), session_id) - await fut + return fut - def async_signal_cb(self, s: StreamingSignal): + def async_signal_cb(self, s: StreamingSemaphore): """executing on engine's signaling thread.""" - - def _signal(): - fut, s.fut = s.fut, s.loop.create_future() # exchange - fut.set_result(None) - - s.loop.call_soon_threadsafe(_signal) + s.loop.call_soon_threadsafe(s.release) async def async_stream_infer(self, session_id, @@ -539,83 +542,79 @@ async def async_stream_infer(self, stream_output (bool): indicator 
for stream output kwargs (dict): kwargs for backward compatibility """ + logger.info(f'[async_stream_infer] session {session_id} start') + gen_cfg = self._get_generation_config(gen_config) - if self.lock is None: - self.lock = asyncio.Lock() - - # may remove when proved not possible - async with self.lock: - logger.info(f'[async_stream_infer] session {session_id} start') - gen_cfg = self._get_generation_config(gen_config) - - inputs, input_len = self.prepare_inputs( - input_ids=input_ids, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - gen_config=gen_config) - - session = _tm.SessionParam(id=session_id, - step=step, - start=sequence_start, - end=sequence_end) + inputs, input_len = self.prepare_inputs( + input_ids=input_ids, + input_embeddings=input_embeddings, + input_embedding_ranges=input_embedding_ranges, + gen_config=gen_config) - inputs = _np_dict_to_tm_dict(inputs) + session = _tm.SessionParam(id=session_id, + step=step, + start=sequence_start, + end=sequence_end) - signal = StreamingSignal() - signal_cb = partial(self.async_signal_cb, signal) + inputs = _np_dict_to_tm_dict(inputs) - outputs, shared_state = self.model_inst.forward( - inputs, session, gen_cfg, stream_output, signal_cb) + sem = StreamingSemaphore() + signal_cb = partial(self.async_signal_cb, sem) - outputs = _tm_dict_to_torch_dict(outputs) + outputs, shared_state = self.model_inst.forward( + inputs, session, gen_cfg, stream_output, signal_cb) - output_ids_buf = outputs['output_ids'] + outputs = _tm_dict_to_torch_dict(outputs) - out_logprobs = None - finish = False - state = None + output_ids_buf = outputs['output_ids'] - output_ids = [] - output_len = 0 - prev_len = step + input_len - try: - while True: - await signal.fut + out_logprobs = None + finish = False + state = None - state = shared_state.consume() - status, seq_len = state.status, state.seq_len + output_ids = [] + output_len = 0 + prev_len = step + input_len + try: + while True: + await sem.acquire() - if status == 7: - finish, status = True, 0 - elif status: - yield self._get_error_output() - break + state = shared_state.consume() + status, seq_len = state.status, state.seq_len - if seq_len == prev_len and not finish: - continue + if status == 7: + finish, status = True, 0 + elif status: + yield self._get_error_output() + break - output_ids += output_ids_buf[prev_len:seq_len].tolist() - output_len += seq_len - prev_len - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - output = EngineOutput(status, output_ids, output_len, - out_logprobs) - prev_len = seq_len + if seq_len == prev_len and not finish: + continue - yield output + output_ids += output_ids_buf[prev_len:seq_len].tolist() + output_len += seq_len - prev_len + status = ResponseType.FINISH if finish else ResponseType.SUCCESS # noqa + output = EngineOutput(status, output_ids, output_len, + out_logprobs) + prev_len = seq_len - if finish: - break + yield output - except Exception as e: - logger.error(f'[async_stream_infer] {e}') - yield self._get_error_output() + if finish: + break - finally: - # Contract: `cb` won't be called again if status is non-zero - # wait for status to be set as `finish` or `error` - while not state or state.status == 0: - await signal.fut - state = shared_state.consume() + except GeneratorExit: + logger.info(f'[async_stream_infer] GeneratorExit {session_id}') + await self.async_cancel(session_id) + except BaseException as e: + logger.error(f'[async_stream_infer] {type(e).__name__} {e}') + yield self._get_error_output() + 
finally: + # Contract: `cb` won't be called again if status is non-zero + # wait for status to be set as `finish` or `error` + while not state or state.status == 0: + await sem.acquire() + state = shared_state.consume() logger.info(f'[async_stream_infer] session {session_id} done') def _get_error_output(self): @@ -740,7 +739,7 @@ def stream_infer(self, state = shared_state.consume() status, seq_len = state.status, state.seq_len - if status == 7: # TODO: use enum + if status in [7, 8]: # TODO: use enum finish = True status = 0 elif status: @@ -753,7 +752,7 @@ def stream_infer(self, output_ids += output_ids_buf[prev_len:seq_len].tolist() output_len += seq_len - prev_len - status = ResponseType.FINISH if finish else ResponseType.SUCCESS + status = ResponseType.FINISH if finish else ResponseType.SUCCESS # noqa output = EngineOutput(status, output_ids, output_len.item(), out_logprobs) @@ -774,7 +773,7 @@ def stream_infer(self, finally: with self.cond: - # Contract: `notfiy` won't be called again if status is non-zero + # Contract: `cb` won't be called again if status is non-zero # wait for status to be set as `finish` or `error` while not state or state.status == 0: while not self.flag: diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index 61e9a4fef8..fae9a7b309 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -59,12 +59,10 @@ ModelRequest::ModelRequest(Gateway* gateway, int session_len, int vocab_size): { } -void ModelRequest::Cancel(std::function cb) +void ModelRequest::Cancel() { // request is finished if lock failed if (auto r = request_.lock()) { - // the cb will be synced to engine via release-acquire semantics - r->cancel_cb = std::move(cb); gateway_->cancel(std::move(r)); } } diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h index 7866d8d9b7..b2d7dcea33 100644 --- a/src/turbomind/engine/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -16,7 +16,7 @@ class ModelRequest { ModelRequest(Gateway* gateway, int session_len, int vocab_size); // Cancel running request - void Cancel(std::function cb); + void Cancel(); // Reset the channel to uninitailized state, calls `notify` when done void End(std::function cb, uint64_t session_id); diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index ceec020d46..b89483472c 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -98,9 +98,8 @@ struct Request { std::function end_cb; - std::function cancel_cb; - std::atomic cancel_flag; - bool is_canceled{}; + std::atomic cancel_flag; + bool is_canceled{}; std::function forward_cb; diff --git a/src/turbomind/engine/request_queue.cc b/src/turbomind/engine/request_queue.cc index c39a23b0c2..8c0b52b5bf 100644 --- a/src/turbomind/engine/request_queue.cc +++ b/src/turbomind/engine/request_queue.cc @@ -34,9 +34,6 @@ void RequestQueue::cancel(std::shared_ptr r) // not picked by engine yet, skip directly gateway_->notify({[r = std::move(r)] { // UpdateState(*r, Request::kCancel, 0); - if (r->cancel_cb) { - r->cancel_cb(0); - } }}); } } diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 5f3b5660ed..526f919130 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1480,11 +1480,8 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig const auto len = 
state_->requests[index]->sequence_length.getVal(); // move the request handle into the signal - return [this, len, r = std::move(state_->requests[index])] { // - UpdateState(*r, Request::kFinish, len); - if (r->cancel_cb) { - r->cancel_cb(0); - } + return [this, len, force_stop, r = std::move(state_->requests[index])] { // + UpdateState(*r, force_stop ? Request::kCancel : Request::kFinish, len); }; } diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 139fc2111c..e72d9eff06 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -499,11 +499,10 @@ PYBIND11_MODULE(_turbomind, m) "cb"_a) .def( "cancel", - [](ModelRequest* model_request, std::function cb) { - model_request->Cancel(std::move(cb)); // + [](ModelRequest* model_request) { + model_request->Cancel(); // }, - py::call_guard(), - "cb"_a) + py::call_guard()) .def( "end", [](ModelRequest* model_request, std::function cb, uint64_t session_id) { diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index c045d7baf9..e0cf1a02bc 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -25,11 +25,11 @@ #include #include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/engine/model_request.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" @@ -176,6 +176,11 @@ LlamaTritonModel::LlamaTritonModel(size_t ten std::string model_dir, std::string config, std::function()> ffi_ctx_factory): + model_param_{}, + attn_param_{}, + moe_param_{}, + lora_param_{}, + engine_param_{}, tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), weights_(getDeviceCount()), From 3f07733baebbd60871a22025eed9d2da460760c6 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 25 Dec 2024 13:17:44 +0800 Subject: [PATCH 14/40] fix semaphore --- lmdeploy/turbomind/turbomind.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 470178f57a..8476828a37 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -329,6 +329,7 @@ def __init__(self): async def acquire(self): if self.val: + self.val = 0 return self.fut = self.loop.create_future() await self.fut @@ -578,11 +579,11 @@ async def async_stream_infer(self, try: while True: await sem.acquire() - state = shared_state.consume() + status, seq_len = state.status, state.seq_len - if status == 7: + if status in [7, 8]: # finish / canceled finish, status = True, 0 elif status: yield self._get_error_output() From 2382d7e0e7f43ab532127e57f77f3ae667a2bcfd Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 27 Dec 2024 19:27:39 +0800 Subject: [PATCH 15/40] refactor inference API --- benchmark/profile_pipeline_api.py | 22 +- lmdeploy/messages.py | 1 - lmdeploy/serve/async_engine.py | 468 +++++++++++++++++---------- src/turbomind/kernels/gpt_kernels.cu | 2 +- 4 files changed, 311 insertions(+), 182 deletions(-) diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index 764f78399c..917867832c 100644 --- a/benchmark/profile_pipeline_api.py +++ 
b/benchmark/profile_pipeline_api.py @@ -69,14 +69,14 @@ def __init__(self, model_path: str, engine_config, csv: str): def process_request(self, requests, concurrency, temperature, top_p, top_k, stream_output): - stats = OrderedDict( - (session_id, None) for session_id in range(len(requests))) + stats = OrderedDict((index, None) for index in range(len(requests))) prompts = [prompt for prompt, _, _ in requests] gen_configs = [ GenerationConfig(temperature=temperature, top_p=top_p, top_k=top_k, ignore_eos=True, + do_sample=True, max_new_tokens=output_len) for _, _, output_len in requests ] @@ -87,10 +87,10 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, for output in self.pipe.stream_infer(prompts, gen_configs, do_preprocess=False): - session_id = output.session_id + index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[session_id] = (n_token, finish_reason) + stats[index] = (n_token, finish_reason) if finish_reason is not None: pbar.update(1) else: @@ -98,20 +98,20 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, gen_configs, do_preprocess=False, use_tqdm=True): - session_id = output.session_id + index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[session_id] = (n_token, finish_reason) + stats[index] = (n_token, finish_reason) elapsed_time = time.perf_counter() - start completion_tokens = 0 - for session_id, (n_token, finish_reason) in stats.items(): + for index, (n_token, finish_reason) in stats.items(): assert finish_reason == 'length', \ - f'unexpected finish_reason of session_id={session_id}, ' \ - f'prompt={requests[session_id][0]}' - assert n_token - 1 <= requests[session_id][-1] <= n_token, \ - f'request to generate {requests[session_id][-1]} tokens, ' \ + f'unexpected finish_reason of index={index}, ' \ + f'prompt={requests[index][0]}' + assert n_token - 1 <= requests[index][-1] <= n_token, \ + f'request to generate {requests[index][-1]} tokens, ' \ f'but got {n_token} tokens' completion_tokens += n_token diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 813a6acd21..4f04906f12 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -335,7 +335,6 @@ class Response: text: str generate_token_len: int input_token_len: int - session_id: int finish_reason: Optional[Literal['stop', 'length']] = None token_ids: List[int] = field(default_factory=list) logprobs: List[Dict[int, float]] = None diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index cb775ab2dc..e9083fc110 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -1,22 +1,27 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import asyncio +import concurrent.futures import dataclasses import json import os import random import re -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager, closing from copy import deepcopy +from functools import partial from itertools import count -from queue import Empty, Queue +from queue import Queue from threading import Thread -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import (Any, AsyncIterator, Dict, Iterator, List, Literal, + Optional, Tuple, Union) + +import tqdm from lmdeploy.logger import RequestLogger from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig) from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model -from lmdeploy.serve.utils import LogitsMixin, _get_event_loop +from lmdeploy.serve.utils import LogitsMixin from lmdeploy.tokenizer import DetokenizeState from lmdeploy.utils import _get_and_verify_max_len, _stop_words, get_logger @@ -52,6 +57,33 @@ class GenOut: logprobs: List[Dict[int, float]] = None +def _gen_out_to_response(out: GenOut, index) -> Response: + return Response(text=out.response, + generate_token_len=out.generate_token_len, + input_token_len=out.input_token_len, + finish_reason=out.finish_reason, + token_ids=out.token_ids, + logprobs=out.logprobs, + index=index) + + +def _append_response(dst: Response, src: Response): + """dst += src.""" + if not dst: + return src + dst.text += src.text + dst.generate_token_len = src.generate_token_len + dst.input_token_len = src.input_token_len + dst.finish_reason = src.finish_reason + dst.index = src.index + if src.token_ids: + dst.token_ids += src.token_ids + if src.logprobs: + dst.logprobs = dst.logprobs or [] + dst.logprobs += src.logprobs + return dst + + class Session: """Session for AsyncEngine.chat. @@ -63,14 +95,17 @@ class Session: _engine (Any): engine for internal use. history (List[Any, str]): chat history. 
""" - _ids = count(0) - def __init__(self): - self._id: int = next(self._ids) + def __init__(self, + session_id: int, + engine: Any, + gen_config: GenerationConfig = None): + self._id: int = session_id + self._engine = engine self._step: int = 0 self._prompt: Any = None self._response: Response = None - self._engine: Any = None + self._gen_config = gen_config self.history: List[Tuple[Any, str]] = [] def _merge_response(self, resp: Response, step: Union[Response, GenOut]): @@ -89,8 +124,8 @@ def response(self) -> Response: def close(self): """release engine storage for this session.""" if self._engine: - inst = self._engine.create_instance() - inst.end(self._id) + self._engine._run(coro=self._engine.end_session(self._id)).result() + self._engine = None def __repr__(self) -> str: res = '' @@ -100,6 +135,60 @@ def __repr__(self) -> str: res += f'USER:\n{user}\nASSISTANT:\n{assistant}\n' return res + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def __call__( + self, + prompt: str, + gen_config: Optional[GenerationConfig] = None, + stream_response: bool = True, + do_preprocess: bool = True) -> Union[Response, Iterator[Response]]: + self._engine.chat(prompt=prompt, + gen_config=gen_config or self._gen_config, + stream_response=stream_response, + do_preprocess=do_preprocess, + session=self) + if stream_response: + return self.generator + else: + return self.response + + +class _EventLoopThread: + + def __init__(self): + fut = concurrent.futures.Future() + self.thread = Thread( + target=partial(_EventLoopThread._thread_entry, fut)) + self.thread.start() + self.loop: asyncio.AbstractEventLoop = fut.result() + self.closed = False + + @staticmethod + def _thread_entry(fut): + loop = asyncio.new_event_loop() + fut.set_result(loop) + try: + loop.run_forever() + except BaseException as e: + logger.error(f'[internal_thread] {type(e).__name__} {e}') + finally: + loop.close() + + def close(self): + if self.closed: + return + self.closed = True + self.loop.call_soon_threadsafe(self.loop.stop) + self.thread.join() + + def __del__(self): + self.close() + class AsyncEngine(LogitsMixin): """Async inference engine. Maintaining a bunch of tm_model instances. 
@@ -179,13 +268,26 @@ def __init__(self, self.instance_num = self.backend_config.max_batch_size self.tokenizer = self.engine.tokenizer self.id2step = {} - self.id2generator = {} - self.free_gens: asyncio.Queue = None + self.id2inst = {} + self.free_insts: asyncio.Queue = None self.instances = [ self.engine.create_instance() for _ in range(self.instance_num) ] self._session_id = count(0) self.request_logger = RequestLogger(max_log_len) + self.internal_thread = _EventLoopThread() + self.limiter: asyncio.Semaphore = None + + def close(self): + self.internal_thread.close() + + def _get_free_insts(self): + if self.free_insts is None: + # `asyncio.Queue` must be created in an async context + self.free_insts = asyncio.Queue() + for inst in self.instances: + self.free_insts.put_nowait(inst) + return self.free_insts def _build_turbomind( self, @@ -245,27 +347,117 @@ def __call__(self, async def stop_session(self, session_id: int): """Stop a session by a session_id.""" - generator = self.id2generator.get(session_id) + generator = self.id2inst.get(session_id) if generator: await generator.async_cancel(session_id) # else it's not running at all async def end_session(self, session_id: int): """For ending a session that is not running.""" - generator = self.id2generator.get(session_id) - if generator: - fut = generator._fut - await fut - assert session_id not in self.id2generator - else: - generator = await self.free_gens.get() + inst = self.id2inst.get(session_id) + if inst: + await inst._active.wait() + assert session_id not in self.id2inst + inst = await self._get_free_insts().get() try: - await generator.async_end(session_id) + await inst.async_end(session_id) self.id2step[session_id] = 0 except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa logger.error(f'[end_session] exception caught: {e}') finally: - self.free_gens.put_nowait(generator) + self._get_free_insts().put_nowait(inst) + + def _get_limiter(self): + if not self.limiter: + self.limiter = asyncio.Semaphore(self.instance_num) + return self.limiter + + async def _async_infer(self, requests: AsyncIterator[Dict], + **kwargs) -> AsyncIterator[AsyncIterator[Response]]: + async for req in requests: + gen = self.generate(**req, **kwargs) + yield gen + + def _infer(self, + requests: Iterator[Dict], + multiplex: bool, + pbar=None, + loop=None) -> Iterator[Iterator[Response]]: + + async def _sync_resp(g, que: Queue, idx: int, sem: asyncio.Semaphore): + async for out in g: + que.put(_gen_out_to_response(out, idx)) + sem.release() + if not multiplex: + que.put(None) # sentinel of inner generator + if pbar: + pbar.update(1) + + que = Queue() + + async def _infer(): + sem = self._get_limiter() + tasks = [] + for idx, req in enumerate(requests): + await sem.acquire() + gen = self.generate(**req) + dst = que if multiplex else Queue() + if not multiplex: + que.put(iter(dst.get, None)) + # create a task to send the responses + task = asyncio.create_task(_sync_resp(gen, dst, idx, sem)) + tasks.append(task) + if not multiplex: # sentinel of outer generator + que.put(None) + await asyncio.gather(*tasks) + if multiplex: + que.put(None) # sentinel of inner generator + + loop = loop or self.internal_thread.loop + # submit the coroutine to async world + asyncio.run_coroutine_threadsafe( + _infer(), loop).add_done_callback(lambda x: x.result()) + + return iter(que.get, None) + + @staticmethod + def _is_single(prompts): + return isinstance(prompts, str) or isinstance(prompts[0], Dict) + + def infer(self, + prompts: Union[List[str], str, List[Dict], 
List[List[Dict]]], + gen_config: Optional[Union[GenerationConfig, + List[GenerationConfig]]] = None, + do_preprocess: bool = True, + adapter_name: Optional[str] = None, + stream_response: bool = False, + multiplex: bool = False, + pbar: Optional[tqdm.tqdm] = None, + **kwargs): + + prompts = [prompts] if AsyncEngine._is_single(prompts) else prompts + assert isinstance(prompts, List), 'prompts should be a list' + gen_config = gen_config or GenerationConfig() + if not isinstance(gen_config, List): + gen_config = [gen_config] * len(prompts) + assert len(prompts) == len(gen_config), \ + 'input gen_confg length differs from the length of prompts' # noqa + + def requests(): + for prompt, gen_cfg in zip(prompts, gen_config): + r = dict(messages=prompt, + gen_config=gen_cfg, + do_preprocess=do_preprocess, + adapter_name=adapter_name, + stream_response=stream_response, + **kwargs) + r.setdefault('sequence_start', True) + r.setdefault('sequence_end', True) + if 'session_id' not in r: + r['session_id'] = next(self._session_id) + yield r + + return self._infer(requests(), multiplex, pbar) def batch_infer(self, prompts: Union[List[str], str, List[Dict], @@ -290,59 +482,26 @@ def batch_infer(self, Pick one from adapters. Default to None, using the base model. use_tqdm (bool): Whether use the progress bar. Default to False """ - need_list_wrap = isinstance(prompts, str) or isinstance( - prompts[0], Dict) - prompts = [prompts] if need_list_wrap else prompts - assert isinstance(prompts, List), 'prompts should be a list' - if gen_config is None: - gen_config = GenerationConfig() - if not isinstance(gen_config, List): - gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config), \ - 'input gen_confg length differs from the length of prompts' # noqa - prompt_num = len(prompts) - session_ids = [next(self._session_id) for _ in range(prompt_num)] - outputs = [ - Response('', 0, 0, session_ids[i], index=i) - for i in range(prompt_num) - ] - generators = [] - if use_tqdm: - import tqdm - pbar = tqdm.tqdm(total=len(prompts)) - for i, prompt in enumerate(prompts): - generators.append( - self.generate(prompt, - session_ids[i], - gen_config=gen_config[i], - stream_response=True, - sequence_start=True, - sequence_end=True, - do_preprocess=do_preprocess, - adapter_name=adapter_name, - **kwargs)) - - async def _inner_call(i, generator): - async for out in generator: - outputs[i].text += out.response - outputs[i].generate_token_len = out.generate_token_len - outputs[i].input_token_len = out.input_token_len - outputs[i].finish_reason = out.finish_reason - if out.token_ids: - outputs[i].token_ids.extend(out.token_ids) - if out.logprobs: - if outputs[i].logprobs is None: - outputs[i].logprobs = [] - outputs[i].logprobs.extend(out.logprobs) - if use_tqdm and out.finish_reason is not None: - pbar.update(1) - - async def gather(): - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(prompts))]) - - _get_event_loop().run_until_complete(gather()) - outputs = outputs[0] if need_list_wrap else outputs + is_single = AsyncEngine._is_single(prompts) + outputs = [] + pbar = tqdm.tqdm( + total=1 if is_single else len(prompts)) if use_tqdm else None + try: + for g in self.infer(prompts, + gen_config, + do_preprocess, + adapter_name, + stream_response=False, + pbar=pbar, + **kwargs): + res = None + for out in g: + res = _append_response(res, out) + outputs.append(res) + finally: + if pbar: pbar.close() # noqa + if is_single: + return outputs[0] return outputs def stream_infer( @@ -352,6 +511,7 
@@ def stream_infer( List[GenerationConfig]]] = None, do_preprocess: bool = True, adapter_name: Optional[str] = None, + stream_response: bool = True, **kwargs): """Inference a batch of prompts with stream mode. @@ -366,62 +526,13 @@ def stream_infer( adapter_name (str): the adapter name of slora for pytorch backend. Pick one from adapters. Default to None, using the base model. """ - need_list_wrap = isinstance(prompts, str) or isinstance( - prompts[0], Dict) - prompts = [prompts] if need_list_wrap else prompts - assert isinstance(prompts, List), 'prompts should be a list' - if gen_config is None: - gen_config = GenerationConfig() - if not isinstance(gen_config, List): - gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config), \ - 'input gen_confg length differs from the length of prompts' # noqa - session_ids = [next(self._session_id) for _ in range(len(prompts))] - outputs = Queue() - generators = [] - for i, prompt in enumerate(prompts): - generators.append( - self.generate(prompt, - session_ids[i], - gen_config=gen_config[i], - stream_response=True, - sequence_start=True, - sequence_end=True, - do_preprocess=do_preprocess, - adapter_name=adapter_name, - **kwargs)) - - async def _inner_call(i, generator): - async for out in generator: - outputs.put( - Response(out.response, - out.generate_token_len, - out.input_token_len, - session_ids[i], - out.finish_reason, - out.token_ids, - out.logprobs, - index=i)) - - async def gather(): - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(prompts))]) - outputs.put(None) - - loop = _get_event_loop() - proc = Thread(target=lambda: loop.run_until_complete(gather())) - proc.start() - - while True: - try: - out = outputs.get(timeout=0.001) - if out is None: - break - yield out - except Empty: - pass - - proc.join() + return self.infer(prompts, + gen_config, + do_preprocess, + adapter_name, + stream_response, + multiplex=True, + **kwargs) async def _get_prompt_input(self, prompt: str, @@ -448,17 +559,17 @@ async def _get_prompt_input(self, @asynccontextmanager async def model_inst(self, session_id: int): """A context manager to make sure server's safe running.""" - assert session_id not in self.id2generator - inst = await self.free_gens.get() - inst._fut = asyncio.get_running_loop().create_future() - self.id2generator[session_id] = inst + assert session_id not in self.id2inst + free_insts = self._get_free_insts() + inst = await free_insts.get() + inst._active = asyncio.Event() + self.id2inst[session_id] = inst try: yield inst finally: - self.id2generator.pop(session_id) - inst._fut.set_result(None) - inst._fut = None - self.free_gens.put_nowait(inst) + self.id2inst.pop(session_id) + inst._active.set() + free_insts.put_nowait(inst) @asynccontextmanager async def safe_run(self, inst, session_id, **kwargs): @@ -575,12 +686,6 @@ async def generate( def is_error(status): return status not in [ResponseType.SUCCESS, ResponseType.FINISH] - if self.free_gens is None: - # `asyncio.Queue` must be created in an async context - self.free_gens = asyncio.Queue() - for inst in self.instances: - self.free_gens.put_nowait(inst) - async with self.model_inst(session_id) as inst: state = DetokenizeState(len(input_ids)) token_ids = input_ids.copy() @@ -699,12 +804,28 @@ def parse_tool_response(self, text, tools, **kwargs): for call_info in call_info_list] return text, call_info_list + def _run(self, fn=None, coro=None, loop=None): + assert (fn or coro) and not (fn and coro) + loop = loop or self.internal_thread.loop + if 
fn: + + async def _coro(): + return fn() + + coro = _coro() + return asyncio.run_coroutine_threadsafe(coro, loop) + + def session(self, gen_config: GenerationConfig = None): + return Session(self._run(fn=lambda: next(self._session_id)).result(), + engine=self, + gen_config=gen_config) + def chat(self, prompt: str, session=None, gen_config: Optional[GenerationConfig] = None, - do_preprocess: bool = True, - **kwargs) -> Session: + stream_response=False, + **kwargs) -> Union[Session, Iterator]: """Chat. Args: @@ -717,8 +838,7 @@ def chat(self, **kwargs (dict): ad hoc parametrization of `gen_config """ if session is None: - session = Session() - session._engine = self.engine + session = self.session() # sync & init session._prompt = prompt @@ -726,25 +846,35 @@ def chat(self, sequence_start = session._step == 0 - async def _work(): - resp = Response('', -1, -1, session._id) - async for output in self.generate(prompt, - session_id=session._id, - gen_config=gen_config, - stream_response=False, - sequence_start=sequence_start, - sequence_end=False, - step=session._step, - do_preprocess=do_preprocess, - **kwargs): - resp = session._merge_response(resp, output) - return resp - - from lmdeploy.pytorch.engine.request import _run_until_complete - resp = _run_until_complete(_work()) - - session._response = resp - session._step += resp.generate_token_len + resp.input_token_len - session.history.append((session._prompt, resp.text)) + generator = self.infer(prompt, + gen_config, + sequence_start=sequence_start, + sequence_end=False, + session_id=session._id, + stream_response=stream_response, + multiplex=True) + + def _gen(): + resp = None + try: + for out in generator: + resp = _append_response(resp, out) + yield out + except: # noqa + self._run(coro=self.stop_session(session._id)).result() + raise + else: + session._response = resp + session._step += resp.generate_token_len + resp.input_token_len + session.history.append((session._prompt, resp.text)) + + if stream_response: + session.generator = _gen() + else: + # run the generator until finish + with closing(_gen()) as gen: + for _ in gen: + pass + session.generator = None return session diff --git a/src/turbomind/kernels/gpt_kernels.cu b/src/turbomind/kernels/gpt_kernels.cu index a0c47fff09..d611cfab43 100644 --- a/src/turbomind/kernels/gpt_kernels.cu +++ b/src/turbomind/kernels/gpt_kernels.cu @@ -315,7 +315,7 @@ void invokeTranspose2D_(T* dst, const T* src, int rows, int cols, cudaStream_t s dim3 grid((cols + TILE_DIM - 1) / TILE_DIM, // (rows + TILE_DIM - 1) / TILE_DIM); bool swap_xy = false; - + if (grid.y > 65535) { // max dim for grid.y std::swap(grid.x, grid.y); swap_xy = true; From 747252cca01fa0d6e4ba38acbf583819b1cad083 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 27 Dec 2024 20:04:20 +0800 Subject: [PATCH 16/40] remove turbomind sync interface --- lmdeploy/turbomind/chat.py | 33 +------- lmdeploy/turbomind/turbomind.py | 138 -------------------------------- 2 files changed, 3 insertions(+), 168 deletions(-) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index e5fdf802df..6985e3dc27 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -29,22 +29,6 @@ def input_prompt(model_name): return '\n'.join(iter(input, sentinel)) -def infer(generator, session_id, input_ids, gen_config, sequence_start, step, - stream_output, tokenizer, state): - for outputs in generator.stream_infer(session_id=session_id, - input_ids=input_ids, - gen_config=gen_config, - sequence_start=sequence_start, - 
sequence_end=False, - step=step, - stream_output=stream_output): - res, tokens = input_ids + outputs.token_ids, outputs.num_token - # decode res - response, state = tokenizer.detokenize_incrementally(res, state=state) - print(response, end='', flush=True) - return tokens - - async def async_infer(generator, session_id, input_ids, gen_config, sequence_start, step, stream_output, tokenizer, state): token_ids = input_ids.copy() @@ -64,8 +48,6 @@ async def async_infer(generator, session_id, input_ids, gen_config, state=state) prev_len = tokens print(response, end='', flush=True) - # if 'I' in response: - # await generator.async_cancel() return tokens @@ -88,7 +70,6 @@ def main(model_path: str, stream_output: bool = True, request_output_len: int = 1024, chat_template_config: ChatTemplateConfig = None, - use_async: bool = True, **kwargs): """An example to perform model inference through the command line interface. @@ -183,10 +164,7 @@ def main(model_path: str, if prompt == 'exit': exit(0) elif prompt == 'end': - if use_async: - loop.run_until_complete(generator.async_end(session_id)) - else: - generator.end(session_id) + loop.run_until_complete(generator.async_end(session_id)) nth_round = 1 step = 0 seed = random.getrandbits(64) @@ -210,15 +188,10 @@ def main(model_path: str, print(f'{prompt}', end='', flush=True) state = DetokenizeState(len(input_ids)) - if use_async: - coro = async_infer(generator, session_id, input_ids, - gen_config, sequence_start, step, - stream_output, tokenizer, state) - tokens = loop.run_until_complete(coro) - else: - tokens = infer(generator, session_id, input_ids, gen_config, + coro = async_infer(generator, session_id, input_ids, gen_config, sequence_start, step, stream_output, tokenizer, state) + tokens = loop.run_until_complete(coro) # update step step += len(input_ids) + tokens diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 8476828a37..64a8cab89e 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -4,7 +4,6 @@ import json import os.path as osp import sys -import threading from collections.abc import Sequence from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict @@ -645,143 +644,6 @@ def _get_generation_config(self, cfg: GenerationConfig): # print (c) return c - def signal_cb(self): - with self.cond: - self.flag = 1 - self.cond.notify() - - def end_cb(self, status: int): - print(f'session ended, status = {status}') - self.end_event.set() - - def end(self): - self.done_event.wait() - self.end_event = threading.Event() - self.model_inst.end(self.end_cb) - self.end_event.wait() - - def cancel(self, session_id: int, blocking: bool = True): - self.model_inst.cancel() - if blocking: - self.done_event.wait() - - def stream_infer(self, - session_id, - input_ids, - input_embeddings=None, - input_embedding_ranges=None, - sequence_start: bool = True, - sequence_end: bool = False, - step=0, - stop=False, - gen_config: GenerationConfig = None, - stream_output=False, - **kwargs): - """Perform model inference. 
- - Args: - session_id (int): the id of a session - input_ids (numpy.ndarray): the token ids of a prompt - input_embeddings (List[numpy.ndarray]): embeddings features - input_embedding_ranges (List[Tuple[int,int]]): the begin/end - offsets of input_embeddings to input_ids - sequence_start (bool): indicator for starting a sequence - sequence_end (bool): indicator for ending a sequence - step (int): the offset of the k/v cache - stop (bool): indicator for cancelling the session - gen_config (GenerationConfig): generation config - stream_output (bool): indicator for stream output - kwargs (dict): kwargs for backward compatibility - """ - - gen_cfg = self._get_generation_config(gen_config) - - inputs, input_length = self.prepare_inputs( - input_ids=input_ids, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - gen_config=gen_config) - - inputs = _np_dict_to_tm_dict(inputs) - - session = _tm.SessionParam(id=session_id, - step=step, - start=sequence_start, - end=sequence_end, - stop=stop) - - self.cond = threading.Condition() - self.flag = 0 - self.done_event = threading.Event() - - outputs, shared_state = self.model_inst.forward( - inputs, session, gen_cfg, stream_output, self.signal_cb) - - outputs = _tm_dict_to_torch_dict(outputs) - - output_ids_buf = outputs['output_ids'] - - out_logprobs = None - finish = False - state = None - - output_ids = [] - output_len = 0 - prev_len = step + input_length[0] - - try: - # generator - while True: - with self.cond: - while not self.flag: - self.cond.wait() - self.flag = 0 - - state = shared_state.consume() - status, seq_len = state.status, state.seq_len - - if status in [7, 8]: # TODO: use enum - finish = True - status = 0 - elif status: - yield self._get_error_output() - break - - if seq_len == prev_len and not finish: - continue - - output_ids += output_ids_buf[prev_len:seq_len].tolist() - output_len += seq_len - prev_len - - status = ResponseType.FINISH if finish else ResponseType.SUCCESS # noqa - output = EngineOutput(status, output_ids, output_len.item(), - out_logprobs) - - prev_len = seq_len - - if out_logprobs: - output_token_len = len(output.token_ids) - output.logprobs = out_logprobs[:output_token_len] - - yield output - - if finish: - break - - except Exception as e: - logger.error(e) - yield self._get_error_output() - - finally: - with self.cond: - # Contract: `cb` won't be called again if status is non-zero - # wait for status to be set as `finish` or `error` - while not state or state.status == 0: - while not self.flag: - self.cond.wait() - state = shared_state.consume() - self.cond = None - def decode(self, input_ids, steps: List[int] = None, From 5266f27617152ad8fa475929474fdc18c52ea410 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 1 Jan 2025 18:11:53 +0800 Subject: [PATCH 17/40] fix msvc build --- src/turbomind/engine/request.h | 5 ++++- src/turbomind/kernels/sampling_topp_kernels.cu | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index b89483472c..4d6e60dfe4 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include "src/turbomind/utils/Tensor.h" @@ -107,7 +109,8 @@ struct Request { int ec; // set when disabling conflicting requests - enum { + enum + { kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set kConflict = 2, // Concurrent requests to the same sequence diff --git 
a/src/turbomind/kernels/sampling_topp_kernels.cu b/src/turbomind/kernels/sampling_topp_kernels.cu index cf7faf95b4..e904722900 100644 --- a/src/turbomind/kernels/sampling_topp_kernels.cu +++ b/src/turbomind/kernels/sampling_topp_kernels.cu @@ -22,6 +22,7 @@ #include "3rdparty/cub/cub.cuh" #endif +#include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/utils/constant.h" @@ -216,9 +217,9 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream) size_t topp_id_val_buf_size = sizeof(int) * params.batch_size * params.vocab_size_padded; size_t begin_offset_buf_size = sizeof(int) * params.batch_size; size_t end_offset_buf_size = sizeof(int) * params.batch_size; - topp_id_val_buf_size = div_up(topp_id_val_buf_size, 256UL) * 256; - begin_offset_buf_size = div_up(begin_offset_buf_size, 256UL) * 256; - end_offset_buf_size = div_up(end_offset_buf_size, 256UL) * 256; + topp_id_val_buf_size = div_up(topp_id_val_buf_size, 256) * 256; + begin_offset_buf_size = div_up(begin_offset_buf_size, 256) * 256; + end_offset_buf_size = div_up(end_offset_buf_size, 256) * 256; if (params.workspace == nullptr) { size_t cub_temp_storage_size; From 33ad2be2b82402d108f6d94415c21c5b18fc3618 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 1 Jan 2025 18:36:20 +0800 Subject: [PATCH 18/40] fix msvc build --- src/turbomind/kernels/sampling_topp_kernels.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/turbomind/kernels/sampling_topp_kernels.cu b/src/turbomind/kernels/sampling_topp_kernels.cu index e904722900..9eaa429fa2 100644 --- a/src/turbomind/kernels/sampling_topp_kernels.cu +++ b/src/turbomind/kernels/sampling_topp_kernels.cu @@ -217,9 +217,9 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream) size_t topp_id_val_buf_size = sizeof(int) * params.batch_size * params.vocab_size_padded; size_t begin_offset_buf_size = sizeof(int) * params.batch_size; size_t end_offset_buf_size = sizeof(int) * params.batch_size; - topp_id_val_buf_size = div_up(topp_id_val_buf_size, 256) * 256; - begin_offset_buf_size = div_up(begin_offset_buf_size, 256) * 256; - end_offset_buf_size = div_up(end_offset_buf_size, 256) * 256; + topp_id_val_buf_size = cdiv(topp_id_val_buf_size, 256) * 256; + begin_offset_buf_size = cdiv(begin_offset_buf_size, 256) * 256; + end_offset_buf_size = cdiv(end_offset_buf_size, 256) * 256; if (params.workspace == nullptr) { size_t cub_temp_storage_size; From 1c20608a3549138400ab604671750bed01a1e18c Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 1 Jan 2025 18:56:53 +0800 Subject: [PATCH 19/40] fix msvc build --- src/turbomind/kernels/sampling_topp_kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turbomind/kernels/sampling_topp_kernels.cu b/src/turbomind/kernels/sampling_topp_kernels.cu index 9eaa429fa2..4d4cff464c 100644 --- a/src/turbomind/kernels/sampling_topp_kernels.cu +++ b/src/turbomind/kernels/sampling_topp_kernels.cu @@ -237,7 +237,7 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream) 0, // begin_bit sizeof(T) * 8, // end_bit = sizeof(KeyT) * 8 stream)); // cudaStream_t - cub_temp_storage_size = div_up(cub_temp_storage_size, 256UL) * 256; + cub_temp_storage_size = cdiv(cub_temp_storage_size, 256) * 256; params.workspace_size = topp_id_val_buf_size + begin_offset_buf_size + end_offset_buf_size + cub_temp_storage_size; return; From 43020b53c79700e6a829b1e6da7fde3ffe57dab5 Mon Sep 17 
00:00:00 2001 From: Li Zhang Date: Mon, 6 Jan 2025 22:27:21 +0800 Subject: [PATCH 20/40] add extra outputs --- lmdeploy/messages.py | 6 + lmdeploy/serve/async_engine.py | 25 ++-- lmdeploy/turbomind/turbomind.py | 119 ++++++++++++------ src/turbomind/engine/model_request.cc | 33 +++-- src/turbomind/engine/model_request.h | 13 +- src/turbomind/engine/request.h | 13 +- src/turbomind/models/llama/LlamaBatch.cc | 24 +++- src/turbomind/models/llama/LlamaBatch.h | 13 +- src/turbomind/python/bind.cpp | 2 +- .../triton_backend/llama/LlamaTritonModel.cc | 6 +- 10 files changed, 177 insertions(+), 77 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index b54aa95330..fc495629cd 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -97,6 +97,8 @@ class GenerationConfig: logprobs: int = None response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None + output_logits: bool = None + output_last_hidden_state: bool = None def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer): """convert stop_words/bad_sords to ids and append the ids to @@ -341,6 +343,8 @@ class Response: finish_reason: Optional[Literal['stop', 'length']] = None token_ids: List[int] = field(default_factory=list) logprobs: List[Dict[int, float]] = None + logits: torch.Tensor = None + last_hidden_state: torch.Tensor = None index: int = 0 @@ -360,6 +364,8 @@ class EngineOutput: token_ids: List[int] num_token: int logprobs: List[Dict[int, float]] = None + logits: torch.Tensor = None + last_hidden_state: torch.Tensor = None @dataclass diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 0e1a01f4f4..90093b6154 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -55,6 +55,8 @@ class GenOut: finish_reason: Optional[Literal['stop', 'length', 'error']] = None token_ids: List[int] = None logprobs: List[Dict[int, float]] = None + logits: Any = None + last_hidden_state: Any = None def _gen_out_to_response(out: GenOut, index) -> Response: @@ -64,6 +66,8 @@ def _gen_out_to_response(out: GenOut, index) -> Response: finish_reason=out.finish_reason, token_ids=out.token_ids, logprobs=out.logprobs, + last_hidden_state=out.last_hidden_state, + logits=out.logits, index=index) @@ -580,7 +584,8 @@ async def safe_run(self, inst, session_id, **kwargs): try: yield generator except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - logger.error(f'[safe_run] exception caught: {e}') + logger.error( + f'[safe_run] exception caught: {type(e).__name__} {e}') # TODO: remove session_id from async cancel await inst.async_cancel(session_id) finally: @@ -723,16 +728,18 @@ def is_error(status): skip_special_tokens=gen_config.skip_special_tokens) res = token_ids[ids_offset:] - logprobs = None - if outputs.logprobs: + out = GenOut(response, self.id2step[session_id], + len(input_ids), tokens, finish_reason, res) + if outputs.logprobs is not None: log_offset = ids_offset - start_ids_offset - logprobs = outputs.logprobs[log_offset:] + out.logprobs = outputs.logprobs[log_offset:] + if outputs.last_hidden_state is not None: + out.last_hidden_state = outputs.last_hidden_state + if outputs.logits is not None: + out.logits = outputs.logits + + yield out - # response, history token len, - # input token len, gen token len - yield GenOut(response, self.id2step[session_id], - len(input_ids), tokens, finish_reason, res, - logprobs) # end of generator loop if not is_error(outputs.status): finish_reason = 'length' \ diff --git 
a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 64a8cab89e..91ab1014e4 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -319,6 +319,69 @@ def create_instance(self, cuda_stream_id=0): return TurboMindInstance(self, self.config, cuda_stream_id) +def _get_logits(outputs): + logits = outputs['logits'] + + def _func(out: EngineOutput, step: int): + out.logits = logits[:step - 1, :] + + return _func + + +def _get_last_hidden_state(outputs): + last_hidden_state = outputs['last_hidden_state'] + + def _func(out: EngineOutput, step: int): + out.last_hidden_state = last_hidden_state[:step - 1, :] + + return _func + + +def _get_logprobs_impl(logprob_vals: torch.Tensor, + logprob_idxs: torch.Tensor, + logprob_nums: torch.Tensor, + output_ids: List[int], + logprobs: int, + out_logprobs: List[Dict[int, float]] = None): + length = len(output_ids) + offset = len(out_logprobs) + if length == offset: + return out_logprobs + for (pos, idx, val, n) in zip(range(offset, + length), logprob_idxs[offset:length], + logprob_vals[offset:length], + logprob_nums[offset:length]): + topn = min(n.item(), logprobs) + tok_res = {idx[i].item(): val[i].item() for i in range(topn)} + token_id = output_ids[pos] + if token_id not in tok_res: + print(token_id, tok_res) + valid_n = n.item() + tok_res[token_id] = \ + val[:valid_n][idx[:valid_n] == token_id].item() + ids = list(tok_res.keys()) + for k in ids: + if tok_res[k] == float('-inf'): + tok_res.pop(k) + out_logprobs.append(tok_res) + return out_logprobs + + +def _get_logprobs(outputs, output_logprobs: int): + logprob_vals = outputs['logprob_vals'] + logprob_idxs = outputs['logprob_indexes'] + logprob_nums = outputs['logprob_nums'] + + logprobs = [] + + def _func(out: EngineOutput, step: int): + _get_logprobs_impl(logprob_vals, logprob_idxs, logprob_nums, + out.token_ids, output_logprobs, logprobs) + out.logprobs = logprobs + + return _func + + class StreamingSemaphore: def __init__(self): @@ -375,38 +438,16 @@ def _create_model_instance(self, device_id): model_inst = self.tm_model.model_comm.create_model_instance(device_id) return model_inst - def _get_logprobs(self, - logprob_vals: torch.Tensor, - logprob_indexes: torch.Tensor, - logprob_nums: torch.Tensor, - output_ids: torch.Tensor, - logprobs: int = None, - length: int = None, - out_logprobs: List[Dict[int, float]] = None, - session_id: int = None): - if logprobs is None: - return None - if out_logprobs is None: - out_logprobs = [] - if len(output_ids) <= len(out_logprobs): - return out_logprobs - offset = len(out_logprobs) - for (token_id, idx, val, n) in zip(output_ids[offset:length], - logprob_indexes[offset:length], - logprob_vals[offset:length], - logprob_nums[offset:length]): - topn = min(n.item(), logprobs) - tok_res = {idx[i].item(): val[i].item() for i in range(topn)} - if token_id.item() not in tok_res: - valid_n = n.item() - tok_res[token_id.item()] = \ - val[:valid_n][idx[:valid_n] == token_id].item() - ids = list(tok_res.keys()) - for k in ids: - if tok_res[k] == float('-inf'): - tok_res.pop(k) - out_logprobs.append(tok_res) - return out_logprobs + def _get_extra_output_processors(self, outputs: Dict[str, torch.Tensor], + gen_config: GenerationConfig): + fs = [] + if gen_config.output_logits: + fs.append(_get_logits(outputs)) + if gen_config.output_last_hidden_state: + fs.append(_get_last_hidden_state(outputs)) + if gen_config.logprobs: + fs.append(_get_logprobs(outputs, gen_config.logprobs)) + return fs def prepare_embeddings(self, 
input_embeddings=None, @@ -566,9 +607,10 @@ async def async_stream_infer(self, outputs = _tm_dict_to_torch_dict(outputs) + extra_fs = self._get_extra_output_processors(outputs, gen_config) + output_ids_buf = outputs['output_ids'] - out_logprobs = None finish = False state = None @@ -594,8 +636,11 @@ async def async_stream_infer(self, output_ids += output_ids_buf[prev_len:seq_len].tolist() output_len += seq_len - prev_len status = ResponseType.FINISH if finish else ResponseType.SUCCESS # noqa - output = EngineOutput(status, output_ids, output_len, - out_logprobs) + output = EngineOutput(status, output_ids, output_len) + + for f in extra_fs: + f(output, seq_len) + prev_len = seq_len yield output @@ -632,6 +677,10 @@ def _get_generation_config(self, cfg: GenerationConfig): c.repetition_penalty = cfg.repetition_penalty if cfg.min_new_tokens: c.min_new_tokens = cfg.min_new_tokens + if cfg.output_last_hidden_state: + c.output_last_hidden_state = cfg.output_last_hidden_state + if cfg.output_logits: + c.output_logits = cfg.output_logits if cfg.logprobs: if cfg.logprobs > MAX_LOGPROBS: cfg.logprobs = MAX_LOGPROBS diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index fae9a7b309..5d1e75c965 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -54,8 +54,12 @@ static T get(const std::unordered_map& m, const std: return fallback; } -ModelRequest::ModelRequest(Gateway* gateway, int session_len, int vocab_size): - gateway_{gateway}, session_len_{session_len}, vocab_size_{vocab_size} +ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim): + gateway_{gateway}, + data_type_{data_type}, + session_len_{session_len}, + vocab_size_{vocab_size}, + hidden_dim_{hidden_dim} { } @@ -107,23 +111,30 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output const int input_len = inputs.at("input_ids")->shape[0]; const int output_len = input_len + param.gen_cfg.max_new_tokens; + const int max_seq_len = std::min(input_len + output_len, session_len_) + 1; + const int max_out_len = std::min(output_len, session_len_) + 1; + for (auto& [k, v] : *param.tensors) { inputs_->emplace(k, v); } - add(outputs_, "output_ids", TYPE_INT32, MEMORY_CPU, session_len_); + add(outputs_, "output_ids", TYPE_INT32, MEMORY_CPU, max_seq_len); add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); - if (param.gen_cfg.output_logprobs) { - const int max_logprob_len = std::min(output_len, session_len_) + 1; - add(outputs_, "logprob_vals", TYPE_FP32, MEMORY_CPU, max_logprob_len, kMaxLogProb); - add(outputs_, "logprob_indexes", TYPE_INT32, MEMORY_CPU, max_logprob_len, kMaxLogProb); - add(outputs_, "logprob_nums", TYPE_INT32, MEMORY_CPU, max_logprob_len); - } - if (param.gen_cfg.output_logits) { /// TODO: allow output logits on GPU - add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, output_len, vocab_size_); + add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, max_seq_len, vocab_size_); + } + + if (param.gen_cfg.output_last_hidden_state) { + /// TODO: allow hidden states on GPU + add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, max_seq_len, hidden_dim_); + } + + if (param.gen_cfg.output_logprobs) { + add(outputs_, "logprob_vals", TYPE_FP32, MEMORY_CPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_indexes", TYPE_INT32, MEMORY_CPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_nums", TYPE_INT32, MEMORY_CPU, max_out_len); } auto r = std::make_shared(); diff --git 
a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h index b2d7dcea33..aea889e856 100644 --- a/src/turbomind/engine/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -13,7 +13,7 @@ class ModelRequest { public: virtual ~ModelRequest() = default; - ModelRequest(Gateway* gateway, int session_len, int vocab_size); + ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim); // Cancel running request void Cancel(); @@ -40,12 +40,15 @@ class ModelRequest { OutputParam Forward(InputParam param, std::function cb); protected: - Gateway* gateway_; + Gateway* const gateway_; - uint64_t session_id_; + const DataType data_type_; + + const int session_len_; + const int hidden_dim_; + const int vocab_size_; - int session_len_; - int vocab_size_; + uint64_t session_id_; std::weak_ptr request_; diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index 4d6e60dfe4..86e98a5a9a 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -25,11 +25,9 @@ struct GenerationConfig { uint64_t random_seed = 0; - int output_logprobs = 0; - - // placeholders that are not implemented yet - bool output_hidden_states = false; - bool output_logits = false; + int output_logprobs = 0; + bool output_last_hidden_state = false; + bool output_logits = false; }; inline std::ostream& operator<<(std::ostream& os, const GenerationConfig& c) @@ -44,7 +42,7 @@ inline std::ostream& operator<<(std::ostream& os, const GenerationConfig& c) os << ", repetition_penalty=" << c.repetition_penalty; os << ", random_seed=" << c.random_seed; os << ", output_logprobs=" << c.output_logprobs; - os << ", output_hidden_states=" << c.output_hidden_states; + os << ", output_hidden_states=" << c.output_last_hidden_state; os << ", output_logits=" << c.output_logits; os << " }"; return os; @@ -109,8 +107,7 @@ struct Request { int ec; // set when disabling conflicting requests - enum - { + enum { kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set kConflict = 2, // Concurrent requests to the same sequence diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 526f919130..7d2a2d8a1b 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1163,7 +1163,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) TensorMap outputs; for (int i = 0; i < batch_size; i++) { - if (state_->requests[i]->inputs.isExist("logprobs")) { + if (state_->requests[i]->gen_cfg.output_logprobs) { outputs.insert( {"sampled_logprobs", {MEMORY_GPU, TYPE_FP32, {(size_t)batch_size, 1, kMaxLogProb}, sampled_logprobs_}}); outputs.insert( @@ -1178,7 +1178,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) } template -void LlamaBatch::OutputContextLogits(T* context_decoder_output, +void LlamaBatch::OutputLogits(T* context_decoder_output, const std::vector& indices, const std::vector& lengths, const std::vector& sequences) @@ -1254,6 +1254,23 @@ void LlamaBatch::OutputContextLogits(T* cont } } +template +void LlamaBatch::OutputLastHiddenState(T* context_decoder_output, + const std::vector& idxs, + const std::vector& input_lens, + const std::vector& sequences) +{ + for (int i = 0; i < idxs.size(); ++i) { + auto& r = state_->requests[idxs[i]]; + if (r->gen_cfg.output_last_hidden_state) { + auto dst = r->outputs.getPtr("last_hidden_state"); + dst += sequences[i]->cache_len * model_->hidden_units_; + 
Copy(context_decoder_output, (int64_t)input_lens[i] * model_->hidden_units_, dst); + } + context_decoder_output += (int64_t)input_lens[i] * model_->hidden_units_; + } +} + template void LlamaBatch::Finish(GenerationState& g, std::vector& signals) { @@ -1703,7 +1720,8 @@ bool LlamaBatch::Forward(GenerationState& g) sequences.data()); // compute logits of inputs if requested - OutputContextLogits(context_decoder_output_buf_, decode_indices, decode_lengths, sequences); + OutputLogits(context_decoder_output_buf_, decode_indices, decode_lengths, sequences); + OutputLastHiddenState(context_decoder_output_buf_, decode_indices, decode_lengths, sequences); } std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 144463c225..3fe181bb94 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -100,10 +100,15 @@ class LlamaBatch { [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); - void OutputContextLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences); + void OutputLogits(T* context_decoder_output, + const std::vector& indices, + const std::vector& lengths, + const std::vector& sequences); + + void OutputLastHiddenState(T* context_decoder_output, + const std::vector& indices, + const std::vector& lengths, + const std::vector& sequences); explicit LlamaBatch(const EngineParam& param, std::unique_ptr> model, diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index e72d9eff06..bbf16e11b8 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -342,7 +342,7 @@ PYBIND11_MODULE(_turbomind, m) .def_readwrite("repetition_penalty", &ft::GenerationConfig::repetition_penalty) .def_readwrite("random_seed", &ft::GenerationConfig::random_seed) .def_readwrite("output_logprobs", &ft::GenerationConfig::output_logprobs) - .def_readwrite("output_hidden_states", &ft::GenerationConfig::output_hidden_states) + .def_readwrite("output_last_hidden_state", &ft::GenerationConfig::output_last_hidden_state) .def_readwrite("output_logits", &ft::GenerationConfig::output_logits) .def("__repr__", [](const ft::GenerationConfig& c) { std::ostringstream oss; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index e0cf1a02bc..c4c3c00d6c 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -379,7 +379,11 @@ std::unique_ptr LlamaTritonModel::createModelInstance(int devic FT_CHECK(engines_[device_id] != nullptr); - return std::make_unique(gateway_.get(), engine_param_.session_len, model_param_.vocab_size); + return std::make_unique(gateway_.get(), + getTensorType(), + engine_param_.session_len, + model_param_.vocab_size, + model_param_.hidden_units); } template From 841251852d5fdae912d47de9790be232cb5f7c17 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 7 Jan 2025 15:17:20 +0800 Subject: [PATCH 21/40] skip stop tokens --- lmdeploy/serve/async_engine.py | 55 ++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 90093b6154..2490d83cc2 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -603,6 +603,8 @@ async def generate( 
step: int = 0, do_preprocess: bool = True, adapter_name: Optional[str] = None, + skip_stop_tokens: bool = True, + rewind_stop_tokens: bool = False, **kwargs): """Generate responses. @@ -695,9 +697,13 @@ def is_error(status): return status not in [ResponseType.SUCCESS, ResponseType.FINISH] async with self.model_inst(session_id) as inst: - state = DetokenizeState(len(input_ids)) + stop_token_ids = gen_config.stop_token_ids \ + if skip_stop_tokens else [] token_ids = input_ids.copy() - prev_len = 0 + history_len = self.id2step[session_id] + input_len = len(input_ids) + output_len, gen_len = 0, 0 + state = DetokenizeState(len(input_ids)) start_ids_offset = state.ids_offset response = '' async with self.safe_run(inst, @@ -708,17 +714,32 @@ def is_error(status): stream_output=stream_response, sequence_start=sequence_start, sequence_end=sequence_end, - step=self.id2step[session_id]) as gen: + step=history_len) as gen: + prev_len = 0 + hit_stop_token = 0 async for outputs in gen: # decode res if is_error(outputs.status): - tokens = 0 break - tokens = outputs.num_token - token_ids += outputs.token_ids[prev_len - tokens:] - prev_len = tokens - if len(token_ids) <= state.ids_offset: + output_len = outputs.num_token + + if hit_stop_token: + continue + + # This assumes the engine will stop when stop token is hit + if output_len and outputs.token_ids[-1] in stop_token_ids: + hit_stop_token = 1 + + mask = slice(prev_len - output_len, + output_len - hit_stop_token) + + token_ids += outputs.token_ids[mask] + gen_len = len(token_ids) + + prev_len = output_len + + if gen_len <= state.ids_offset: continue ids_offset = state.ids_offset @@ -728,8 +749,9 @@ def is_error(status): skip_special_tokens=gen_config.skip_special_tokens) res = token_ids[ids_offset:] - out = GenOut(response, self.id2step[session_id], - len(input_ids), tokens, finish_reason, res) + out = GenOut(response, history_len, input_len, gen_len, + finish_reason, res) + if outputs.logprobs is not None: log_offset = ids_offset - start_ids_offset out.logprobs = outputs.logprobs[log_offset:] @@ -739,18 +761,18 @@ def is_error(status): out.logits = outputs.logits yield out + # end of generator loop - # end of generator loop if not is_error(outputs.status): finish_reason = 'length' \ - if tokens >= gen_config.max_new_tokens else 'stop' + if gen_len >= gen_config.max_new_tokens else 'stop' # utf-8 char at the end means it's a potential unfinished # byte sequence if not response.endswith('�'): - # avaid returning the last response twice + # avoid returning the last response twice response = '' yield GenOut(response, self.id2step[session_id], - len(input_ids), tokens, finish_reason) + len(input_ids), gen_len, finish_reason) else: yield GenOut(response='internal error happened', history_token_len=self.id2step[session_id], @@ -765,7 +787,10 @@ def is_error(status): # manually end pytorch session await inst.async_end(session_id) else: - self.id2step[session_id] += len(input_ids) + tokens + if rewind_stop_tokens: + # rewind the step to the token before the stop token + output_len = gen_len + self.id2step[session_id] += input_len + output_len def parse_tool_response(self, text, tools, **kwargs): """Parse model response containing tool information. 
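[Editor note on the stop-token handling in the hunk above] The patch trims stop words by slicing the cumulative `outputs.token_ids` with `slice(prev_len - output_len, output_len - hit_stop_token)`. The helper below is a minimal sketch of that masking, not part of the patch; names are illustrative, and it assumes the engine reports cumulative generated ids and stops right after emitting a stop token.

# Sketch only: mirrors the slice-based masking used in generate() above.
def select_new_tokens(all_token_ids, prev_len, output_len, stop_ids):
    # output_len == len(all_token_ids); prev_len is how many were consumed before
    hit_stop_token = 1 if output_len and all_token_ids[-1] in stop_ids else 0
    # negative start indexes from the end: keep only this iteration's tokens,
    # dropping the trailing stop token when one was hit
    mask = slice(prev_len - output_len, output_len - hit_stop_token)
    return all_token_ids[mask], hit_stop_token

# e.g. ids [5, 6, 7, 2] with 2 already consumed and stop id 2 at the tail:
new_ids, hit = select_new_tokens([5, 6, 7, 2], prev_len=2, output_len=4,
                                 stop_ids=[2])
assert new_ids == [7] and hit == 1  # the stop token is skipped, not emitted

Once `hit_stop_token` is set, later iterations are ignored, which is why the loop in the patch `continue`s instead of breaking: the generator is still drained so the engine can finish the request cleanly.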
From 3409742a5bcfdad66e1e975de401bef8a876fccd Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 7 Jan 2025 16:26:45 +0800 Subject: [PATCH 22/40] exit gracefully --- lmdeploy/serve/async_engine.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 2490d83cc2..cc6db9c31a 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio +import atexit import concurrent.futures import dataclasses import json @@ -166,15 +167,18 @@ class _EventLoopThread: def __init__(self): fut = concurrent.futures.Future() - self.thread = Thread( - target=partial(_EventLoopThread._thread_entry, fut)) + self.thread = Thread(target=partial(_EventLoopThread._thread_entry, + fut), + daemon=True) self.thread.start() self.loop: asyncio.AbstractEventLoop = fut.result() self.closed = False + atexit.register(self.close) @staticmethod def _thread_entry(fut): loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) fut.set_result(loop) try: loop.run_forever() @@ -190,9 +194,6 @@ def close(self): self.loop.call_soon_threadsafe(self.loop.stop) self.thread.join() - def __del__(self): - self.close() - class AsyncEngine(LogitsMixin): """Async inference engine. Maintaining a bunch of tm_model instances. From 21a75532533df252279a4c01e0bbf86632497174 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 7 Jan 2025 17:20:01 +0800 Subject: [PATCH 23/40] cancel all tasks atexit --- lmdeploy/serve/async_engine.py | 47 +++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index cc6db9c31a..2e2742767a 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -165,18 +165,17 @@ def __call__( class _EventLoopThread: - def __init__(self): + def __init__(self, daemon=False): fut = concurrent.futures.Future() - self.thread = Thread(target=partial(_EventLoopThread._thread_entry, - fut), - daemon=True) + self.thread = Thread(target=partial(self._thread_entry, fut), + daemon=daemon) self.thread.start() self.loop: asyncio.AbstractEventLoop = fut.result() self.closed = False - atexit.register(self.close) + if daemon: + atexit.register(self.close) - @staticmethod - def _thread_entry(fut): + def _thread_entry(self, fut): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) fut.set_result(loop) @@ -185,7 +184,37 @@ def _thread_entry(fut): except BaseException as e: logger.error(f'[internal_thread] {type(e).__name__} {e}') finally: - loop.close() + try: + self._cancel_all_tasks() + loop.run_until_complete(loop.shutdown_asyncgens()) + finally: + asyncio.set_event_loop(None) + loop.close() + + def _cancel_all_tasks(self): + """Modified from asyncio/runners.py.""" + to_cancel = asyncio.all_tasks(self.loop) + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + async def _gather(): + await asyncio.gather(*to_cancel, return_exceptions=True) + + self.loop.run_until_complete(_gather()) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + self.loop.call_exception_handler({ + 'message': + 'unhandled exception during worker thread shutdown', + 'exception': task.exception(), + 'task': task, + }) def close(self): if self.closed: @@ -280,7 +309,7 @@ def __init__(self, ] self._session_id = count(0) self.request_logger = RequestLogger(max_log_len) - 
self.internal_thread = _EventLoopThread() + self.internal_thread = _EventLoopThread(daemon=True) self.limiter: asyncio.Semaphore = None def close(self): From 49701df20c3723647c422dd7b15e2952cdf84e41 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 7 Jan 2025 22:24:40 +0800 Subject: [PATCH 24/40] refactor profiler --- benchmark/profile_throughput.py | 141 +++++++++---------------------- lmdeploy/profiler.py | 144 ++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+), 102 deletions(-) create mode 100644 lmdeploy/profiler.py diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 3a343e8f5b..854917f560 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -1,21 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import asyncio -import csv -import itertools import json import os import random -import time from queue import Queue from typing import List, Tuple, Union -import numpy as np from tqdm import tqdm from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) +from lmdeploy.profiler import Profiler, Session from lmdeploy.pytorch.engine import EngineInstance from lmdeploy.tokenizer import DetokenizeState, Tokenizer from lmdeploy.utils import get_logger @@ -72,7 +69,7 @@ class Engine: def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, - TurbomindEngineConfig], csv: str): + TurbomindEngineConfig]): if isinstance(engine_config, TurbomindEngineConfig): from lmdeploy.turbomind import TurboMind tm_model = TurboMind.from_pretrained(model_path, @@ -84,7 +81,6 @@ def __init__(self, model_path: str, self.tm_model = tm_model self.tokenizer = tm_model.tokenizer - self.csv = csv self.pbar = None async def _inference(self, req_queue: Queue, session_id: int, @@ -92,12 +88,11 @@ async def _inference(self, req_queue: Queue, session_id: int, stream_output: bool, skip_tokenize: bool, skip_detokenize: bool): model_inst = self.tm_model.create_instance() - counters = [] - for prompt, input_seqlen, output_seqlen, cancel_after in iter( + sess: Session = None + for prompt, _, output_seqlen, cancel_after, sess in iter( req_queue.get_nowait, None): - ts = [time.perf_counter()] - ns = [0] + sess.tick(0) if skip_tokenize: input_ids = prompt @@ -128,11 +123,11 @@ async def _inference(self, req_queue: Queue, session_id: int, if not skip_detokenize: _, state = self.tokenizer.detokenize_incrementally( token_ids, state) - ts.append(time.perf_counter()) - ns.append(n_token) + sess.tick(n_token) prev_len = n_token if n_token > cancel_after: break + sess.finish(Session.SUCCESS) finally: await generator.aclose() @@ -140,14 +135,11 @@ async def _inference(self, req_queue: Queue, session_id: int, if isinstance(model_inst, EngineInstance): await model_inst.async_end(session_id) - counters.append((ts, ns, input_seqlen)) self.pbar.update(1) - return counters - - def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output, skip_tokenize, skip_detokenize, - cancel_rate): + def process_request(self, requests, profiler: Profiler, concurrency, + temperature, top_p, top_k, stream_output, + skip_tokenize, skip_detokenize, cancel_rate): req_queue = Queue() # feed request to q @@ -156,11 +148,11 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, if cancel_rate > 0: if random.random() < cancel_rate: cancel_after = random.randint(0, cancel_after) + 
sess = profiler.new_session(input_len, output_len) + req = [prompt, input_len, output_len, cancel_after, sess] if skip_tokenize: - req_queue.put((self.tokenizer.encode(prompt), input_len, - output_len, cancel_after)) - else: - req_queue.put((prompt, input_len, output_len, cancel_after)) + req[0] = self.tokenizer.encode(prompt) + req_queue.put(req) for i in range(concurrency): req_queue.put(None) @@ -177,91 +169,16 @@ async def _gather_tasks(tasks): self.pbar = tqdm(total=len(requests)) - start = time.time() - event_loop = asyncio.new_event_loop() asyncio.set_event_loop(event_loop) - counters = asyncio.run(_gather_tasks(tasks)) + profiler.start() - elapsed_time = time.time() - start + asyncio.run(_gather_tasks(tasks)) - self.pbar.close() + profiler.finish() - ttfts: List[float] = [] - tpots: List[float] = [] - e2es: List[float] = [] - itls: List[float] = [] - tpts: List[int] = [] - - total_output = 0 - total_input = 0 - - for ts, ns, input_len in itertools.chain.from_iterable(counters): - total_output += ns[-1] - total_input += input_len - e2es.append(ts[-1] - ts[0]) - ttfts.append(ts[1] - ts[0]) - if ns[-1] > ns[1]: - tpots.append((ts[-1] - ts[1]) / (ns[-1] - ns[1])) - else: # no-stream-output - tpots.append((ts[-1] - ts[0]) / (ns[-1] - ns[0])) - t_dif = np.subtract(ts[1:], ts[:-1]) - n_dif = np.subtract(ns[1:], ns[:-1]) - itls.extend(t_dif[1:]) - tpts.extend(n_dif) - - output_throughput = total_output / elapsed_time - input_throughput = total_input / elapsed_time - - qs = (50, 75, 90, 99) - - tpot_ms_mean = np.mean(tpots) - tpot_ms_stat = tuple(np.percentile(tpots, qs)) - e2e_mean = np.mean(e2es) - e2e_stat = tuple(np.percentile(e2es, qs)) - - if stream_output: - ttft_ms_mean = np.mean(ttfts) - ttft_ms_stat = tuple(np.percentile(ttfts, qs)) - itls_ms_mean = np.mean(itls) - itls_ms_stat = tuple(np.percentile(itls, qs)) - tpts_ms_mean = np.mean(tpts) - tpts_ms_stat = tuple(np.percentile(tpts, qs).astype(int)) - - rps = len(requests) / elapsed_time - - def tab_row(name, *items): - - def fmt(x): - return '{:>10.3f}'.format(x) if isinstance( - x, float) else '{:>10}'.format(x) - - print('{:<35}{}'.format(name, ''.join([fmt(x) for x in items]))) - - print('\n{s:{c}^{n}}'.format(s=' Profile Throughtput ', n=85, c='=')) - tab_row('Benchmark duration', elapsed_time) - tab_row('Total requests', len(requests)) - tab_row('Concurrency', concurrency) - tab_row('Cancel rate', cancel_rate) - tab_row('Stream output', str(stream_output).lower()) - tab_row('Skip tokenize', str(skip_tokenize).lower()) - tab_row('Skip detokenize', str(skip_detokenize).lower()) - tab_row('Total input tokens', total_input) - tab_row('Total generated tokens', total_output) - tab_row('Input token throughput (tok/s)', input_throughput) - tab_row('Output token throughput (tok/s)', output_throughput) - tab_row('Request throughput (req/s)', rps) - print('-' * 85) - tab_row('', 'mean', *(f'P{q}' for q in qs)) - tab_row('End-to-end Latency', e2e_mean, *e2e_stat) - if stream_output: - tab_row('Time to First Token (TTFT)', ttft_ms_mean, *ttft_ms_stat) - tab_row('Time per Output Token (TPOT)', tpot_ms_mean, *tpot_ms_stat) - if stream_output: - tab_row('Inter-token Latency (ITL)', itls_ms_mean, *itls_ms_stat) - tab_row('Tokens per Tick', tpts_ms_mean, *tpts_ms_stat) - print('=' * 85) + self.pbar.close() def parse_args(): @@ -298,6 +215,7 @@ def parse_args(): type=float, help='Possibility of a request being canceled', default=0) + parser.add_argument('--use-uvloop', action='store_true') parser.add_argument('--csv', type=str, help='Where to 
save the result.', @@ -372,13 +290,22 @@ def main(): dtype=args.dtype, ) - engine = Engine(args.model_path, engine_config, csv=args.csv) + if args.use_uvloop: + import uvloop + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + + engine = Engine(args.model_path, engine_config) requests = sample_requests(args.dataset, args.num_prompts, engine.tokenizer) + stream_output = not args.no_stream_output + + profiler = Profiler(stream_output, [50, 75, 95, 99]) + engine.process_request( requests, + profiler, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, @@ -389,6 +316,16 @@ def main(): skip_detokenize=args.skip_detokenize, cancel_rate=args.cancel_rate) + hyperparams = [('Concurrency', args.concurrency), + ('Cancel rate', args.cancel_rate), + ('Stream output', str(stream_output).lower()), + ('Skip tokenize', str(args.skip_tokenize).lower()), + ('Skip detokenize', str(args.skip_detokenize).lower())] + profiler.compute_metrics() + profiler.summarize(title='Profile Throughput', hyperparams=hyperparams) + if args.csv: + profiler.save_csv(args.csv) + if __name__ == '__main__': main() diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py new file mode 100644 index 0000000000..7acde89613 --- /dev/null +++ b/lmdeploy/profiler.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time +from typing import List + +import numpy as np + + +class Session: + + UNKNOWN = 0 + SUCCESS = 1 + FAIL = 2 + + def __init__(self, input_len, req_output_len): + self.ts = [] + self.ns = [] + self.input_len = input_len + self.req_output_len = req_output_len + self.status = Session.UNKNOWN + + def tick(self, n_token): + self.ts.append(time.perf_counter()) + self.ns.append(n_token) + + def finish(self, status): + self.status = status + + +class Profiler: + + def __init__(self, stream_output: bool, percentages: List[int]): + self.sessions: List[Session] = [] + self.stream_output = stream_output + self.percentages = percentages + + def new_session(self, *args, **kwargs): + sess = Session(*args, **kwargs) + self.sessions.append(sess) + return sess + + def start(self): + self.t_start = time.perf_counter() + + def finish(self): + self.elapsed_time = time.perf_counter() - self.t_start + + def compute_metrics(self): + self.ttfts: List[float] = [] + self.tpots: List[float] = [] + self.e2es: List[float] = [] + self.itls: List[float] = [] + self.tpts: List[int] = [] + self.total_output = 0 + self.total_input = 0 + self.success = 0 + + for sess in self.sessions: + if sess.status != Session.SUCCESS: + continue + ns = sess.ns + ts = sess.ts + if ns[-1] < sess.req_output_len: + continue + self.success += 1 + self.total_output += ns[-1] + self.total_input += sess.input_len + self.e2es.append(ts[-1] - ts[0]) + self.ttfts.append(ts[1] - ts[0]) + if ns[-1] > ns[1]: + self.tpots.append((ts[-1] - ts[1]) / (ns[-1] - ns[1])) + else: # no-stream-output + self.tpots.append((ts[-1] - ts[0]) / (ns[-1] - ns[0])) + t_dif = np.subtract(ts[1:], ts[:-1]) + n_dif = np.subtract(ns[1:], ns[:-1]) + self.itls.extend(t_dif[1:]) + self.tpts.extend(n_dif) + + self.output_throughput = self.total_output / self.elapsed_time + self.input_throughput = self.total_input / self.elapsed_time + + qs = self.percentages + + self.tpot_mean = np.mean(self.tpots) + self.tpot_stat = tuple(np.percentile(self.tpots, qs)) + self.e2e_mean = np.mean(self.e2es) + self.e2e_stat = tuple(np.percentile(self.e2es, qs)) + + if self.stream_output: + self.ttft_mean = np.mean(self.ttfts) + self.ttft_stat = tuple(np.percentile(self.ttfts, qs)) + 
self.itls_mean = np.mean(self.itls) + self.itls_stat = tuple(np.percentile(self.itls, qs)) + self.tpts_mean = np.mean(self.tpts) + self.tpts_stat = tuple(np.percentile(self.tpts, qs).astype(int)) + + self.rps = self.success / self.elapsed_time + + def summarize(self, + title: str, + hyperparams: List = None, + header=40, + digits=10): + + width = header + digits * (1 + len(self.percentages)) + + def tab_row(name, *items): + + def fmt(x): + return '{:>{d}.3f}'.format(x, d=digits) if isinstance( + x, float) else '{:>{d}}'.format(x, d=digits) + + print('{:<{p}}{}'.format(name, + ''.join([fmt(x) for x in items]), + p=header)) + + print('\n{s:{c}^{n}}'.format(s=f' {title} ', n=width, c='=')) + tab_row('Benchmark duration', self.elapsed_time) + tab_row('Total requests', len(self.sessions)) + tab_row('Successful requests', self.success) + if hyperparams: + for k, v in hyperparams: + tab_row(k, v) + tab_row('Total input tokens', self.total_input) + tab_row('Total generated tokens', self.total_output) + tab_row('Input throughput (tok/s)', self.input_throughput) + tab_row('Output throughput (tok/s)', self.output_throughput) + tab_row('Request throughput (req/s)', self.rps) + print('-' * width) + tab_row('', 'mean', *(f'P{q}' for q in self.percentages)) + tab_row('End-to-end Latency', self.e2e_mean, *self.e2e_stat) + if self.stream_output: + tab_row('Time to First Token (TTFT)', self.ttft_mean, + *self.ttft_stat) + tab_row('Time per Output Token (TPOT)', self.tpot_mean, + *self.tpot_stat) + if self.stream_output: + tab_row('Inter-token Latency (ITL)', self.itls_mean, + *self.itls_stat) + tab_row('Tokens per Tick', self.tpts_mean, *self.tpts_stat) + print('=' * width) + + def save_csv(self, csv: str): + # TODO + pass From f4b37af24ec1ea2a2476dd64232fe0540286d578 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 7 Jan 2025 22:28:22 +0800 Subject: [PATCH 25/40] fix id2step for api server --- lmdeploy/serve/openai/api_server.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index b23ef3018d..315a4a8889 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -340,8 +340,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret - if VariableInterface.async_engine.id2step.get(str(request.session_id), - 0) != 0: + if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: return create_error_response( HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') @@ -596,8 +595,7 @@ async def completions_v1(request: CompletionRequest, error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret - if VariableInterface.async_engine.id2step.get(str(request.session_id), - 0) != 0: + if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: return create_error_response( HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') @@ -865,7 +863,7 @@ async def chat_interactive_v1(request: GenerateRequest, request.session_id = VariableInterface.session_id async_engine = VariableInterface.async_engine - sequence_start = async_engine.id2step.get(str(request.session_id), 0) == 0 + sequence_start = async_engine.id2step.get(request.session_id, 0) == 0 sequence_end = not request.interactive_mode if isinstance(request.stop, str): request.stop = [request.stop] From 
2644fb7276a70ded5f82e6a077cbc54e671382da Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 13:37:06 +0800 Subject: [PATCH 26/40] save csv --- lmdeploy/profiler.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py index 7acde89613..2b3a3242b2 100644 --- a/lmdeploy/profiler.py +++ b/lmdeploy/profiler.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import csv import time from typing import List @@ -139,6 +140,17 @@ def fmt(x): tab_row('Tokens per Tick', self.tpts_mean, *self.tpts_stat) print('=' * width) - def save_csv(self, csv: str): - # TODO - pass + def save_csv(self, csv_file: str): + """Export legacy metrics to CSV.""" + with open(csv_file, 'w') as csvfile: + writer = csv.writer(csvfile) + writer.writerow([ + 'RPS', 'RPM', 'FTL(ave)(s)', 'throughput(out tok/s)', + 'throughput(total tok/s)' + ]) + ttft_mean = f'{self.ttft_mean:.3f}' if self.stream_output else '-' + writer.writerow([ + f'{self.rps:.3f}', f'{(self.rps * 60):.3f}', ttft_mean, + f'{self.output_throughput:.3f}', + f'{(self.input_throughput + self.output_throughput):.3f}' + ]) From 6029a2ee81ab757afdd21a3e95472733f7837b0b Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 15:29:36 +0800 Subject: [PATCH 27/40] fix interactive --- lmdeploy/serve/openai/api_server.py | 11 +++++++++++ src/turbomind/engine/model_request.cc | 15 +++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 315a4a8889..c37f7572a1 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -868,6 +868,17 @@ async def chat_interactive_v1(request: GenerateRequest, if isinstance(request.stop, str): request.stop = [request.stop] + end_session = sequence_end and not sequence_start \ + and request.prompt == '' and request.request_output_len == 0 + if end_session: + await async_engine.end_session(request.session_id) + return JSONResponse( + dict(text='', + tokens=0, + input_tokens=0, + history_tokens=0, + finish_reason=None)) + random_seed = request.seed if request.seed else None gen_config = GenerationConfig( diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index 5d1e75c965..fa789e4923 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -103,16 +103,17 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output auto& inputs = *param.tensors; - const int batch_size = 1; - const int beam_width = 1; - FT_CHECK(inputs.at("input_ids")->shape.size() == 1); const int input_len = inputs.at("input_ids")->shape[0]; const int output_len = input_len + param.gen_cfg.max_new_tokens; - const int max_seq_len = std::min(input_len + output_len, session_len_) + 1; + // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` + // is used instead + const int max_seq_len = session_len_ + 1; const int max_out_len = std::min(output_len, session_len_) + 1; + // This does not include histroy length in interactive mode + const int max_in_out_len = std::min(input_len + output_len, session_len_) + 1; for (auto& [k, v] : *param.tensors) { inputs_->emplace(k, v); @@ -122,13 +123,11 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); if (param.gen_cfg.output_logits) { - /// TODO: allow output logits 
on GPU - add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, max_seq_len, vocab_size_); + add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, max_in_out_len, vocab_size_); } if (param.gen_cfg.output_last_hidden_state) { - /// TODO: allow hidden states on GPU - add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, max_seq_len, hidden_dim_); + add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, max_in_out_len, hidden_dim_); } if (param.gen_cfg.output_logprobs) { From 50fdb68367bcbba8af7725d55bb14e250793ca9a Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 15:34:01 +0800 Subject: [PATCH 28/40] fix lint --- src/turbomind/engine/request.h | 3 ++- src/turbomind/models/llama/LlamaBatch.cc | 6 +++--- src/turbomind/models/llama/LlamaV2.cc | 3 +-- src/turbomind/python/bind.cpp | 4 ++-- src/turbomind/utils/cuda_utils.h | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index 86e98a5a9a..f50bd18eca 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -107,7 +107,8 @@ struct Request { int ec; // set when disabling conflicting requests - enum { + enum + { kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set kConflict = 2, // Concurrent requests to the same sequence diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 7d2a2d8a1b..a9a6cc75b0 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1179,9 +1179,9 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) template void LlamaBatch::OutputLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences) + const std::vector& indices, + const std::vector& lengths, + const std::vector& sequences) { std::vector output_logits; int num_token = 0; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 6018ac5819..b25a8eac60 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -20,16 +20,15 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc - #include #include #include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" +#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index bbf16e11b8..042e858b09 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -283,10 +283,10 @@ static void safe_memcpy(void* dst, const void* src, size_t size) namespace { struct ScopedGIL { - ScopedGIL(const ScopedGIL&) = delete; + ScopedGIL(const ScopedGIL&) = delete; ScopedGIL& operator=(const ScopedGIL&) = delete; ScopedGIL(ScopedGIL&&) = delete; - ScopedGIL& operator=(ScopedGIL&&) = delete; + ScopedGIL& operator=(ScopedGIL&&) = delete; ScopedGIL() { state = PyGILState_Ensure(); diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 3f54401cff..ac1664bdd2 100644 --- a/src/turbomind/utils/cuda_utils.h +++ 
b/src/turbomind/utils/cuda_utils.h @@ -306,7 +306,7 @@ inline std::string getDeviceName() return std::string(props.name); } -template +template inline T div_up(T a, T n) { return (a + n - 1) / n; From e2ed1a23823e960b7fd267f2d63395a8d5911d14 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 16:25:36 +0800 Subject: [PATCH 29/40] fix generate_token_len --- lmdeploy/serve/async_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 2e2742767a..e00477fbbd 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -765,11 +765,11 @@ def is_error(status): output_len - hit_stop_token) token_ids += outputs.token_ids[mask] - gen_len = len(token_ids) + gen_len = len(token_ids) - input_len prev_len = output_len - if gen_len <= state.ids_offset: + if len(token_ids) <= state.ids_offset: continue ids_offset = state.ids_offset From 21432bf11c6ac12847f76974648e813dc9421f65 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 16:40:34 +0800 Subject: [PATCH 30/40] fix async_end --- lmdeploy/turbomind/turbomind.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 91ab1014e4..6b1b5c8de6 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -547,10 +547,10 @@ def async_end_cb(self, fut: asyncio.Future, status: int): logger.info(f'[async_end_cb] session ended, status = {status}') fut.get_loop().call_soon_threadsafe(fut.set_result, status) - def async_end(self, session_id): + async def async_end(self, session_id): fut = asyncio.get_running_loop().create_future() self.model_inst.end(partial(self.async_end_cb, fut), session_id) - return fut + await fut def async_signal_cb(self, s: StreamingSemaphore): """executing on engine's signaling thread.""" From ad0e07cc5643e88ea75d6bb55a675f2b932afe7b Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 16:46:26 +0800 Subject: [PATCH 31/40] update pipeline ut --- autotest/interface/pipeline/test_pipeline_func.py | 2 +- autotest/utils/pipeline_chat.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 87a0719bcb..0696684890 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -408,7 +408,7 @@ def run_pipeline_testcase(config, model, backend, file_name): result = True for i in range(2): result &= response[i].finish_reason == 'length' - result &= response[i].session_id == i + result &= response[i].index == i save_pipeline_common_log(config, file_name, result, response) del pipe torch.cuda.empty_cache() diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 8f03e4e406..5dcb358319 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -235,7 +235,7 @@ def assert_pipeline_single_stream_return(output, logprobs_num: int = 0): def assert_pipeline_batch_stream_return(output, size: int = 1): for i in range(size): - output_list = [item for item in output if item.session_id == i] + output_list = [item for item in output if item.index == i] result, msg = assert_pipeline_single_stream_return(output_list) if not result: return result, msg @@ -249,7 +249,7 @@ def assert_pipeline_single_element(output, result = True result &= output.generate_token_len > 0 result &= 
output.input_token_len > 0 - result &= output.session_id >= 0 + result &= output.index >= 0 if is_last: result &= len(output.text) >= 0 result &= output.finish_reason in ['stop', 'length'] From 4186da5aa565759696f3be8914a063177a85bf8a Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 17:18:07 +0800 Subject: [PATCH 32/40] fix ignore eos --- lmdeploy/serve/async_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index e00477fbbd..c54ce0b328 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -728,7 +728,7 @@ def is_error(status): async with self.model_inst(session_id) as inst: stop_token_ids = gen_config.stop_token_ids \ - if skip_stop_tokens else [] + if skip_stop_tokens and not gen_config.ignore_eos else [] token_ids = input_ids.copy() history_len = self.id2step[session_id] input_len = len(input_ids) From bee78b6797090a896ebab94852af8441da4e5e24 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 20:28:13 +0800 Subject: [PATCH 33/40] minor --- benchmark/profile_pipeline_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index 917867832c..820f11042a 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -108,7 +108,8 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, completion_tokens = 0 for index, (n_token, finish_reason) in stats.items(): assert finish_reason == 'length', \ - f'unexpected finish_reason of index={index}, ' \ + f'unexpected finish_reason {finish_reason}, ' \ + f'index={index}, ' \ f'prompt={requests[index][0]}' assert n_token - 1 <= requests[index][-1] <= n_token, \ f'request to generate {requests[index][-1]} tokens, ' \ From 5f02cad1627ac2e97d334ce16b89078f5ef1753f Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Jan 2025 21:46:03 +0800 Subject: [PATCH 34/40] refactor profile pipeline api --- benchmark/profile_pipeline_api.py | 110 ++++++++++++++---------------- lmdeploy/profiler.py | 6 ++ lmdeploy/turbomind/turbomind.py | 9 +-- 3 files changed, 61 insertions(+), 64 deletions(-) diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index 820f11042a..d524be2992 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -1,11 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import csv import json import os import random -import time -from collections import OrderedDict from typing import List, Tuple from tqdm import tqdm @@ -14,6 +11,10 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter +from lmdeploy.profiler import Profiler, Session +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') def sample_requests(dataset_path: str, num_requests: int, @@ -66,10 +67,9 @@ def __init__(self, model_path: str, engine_config, csv: str): self.csv = csv - def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output): + def process_request(self, requests, profiler: Profiler, temperature, top_p, + top_k, stream_output): - stats = OrderedDict((index, None) for index in range(len(requests))) prompts = [prompt for prompt, _, _ in requests] gen_configs = [ GenerationConfig(temperature=temperature, @@ -81,7 +81,21 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, for _, _, output_len in requests ] - start = time.perf_counter() + sess: List[Session] = [] + for _, input_len, output_len in requests: + sess.append(profiler.new_session(input_len, output_len)) + + def _to_status(finish_reason): + if finish_reason == 'length': + return Session.SUCCESS + else: + return Session.FAIL + + profiler.start() + + for s in sess: + s.tick(0) + if stream_output: pbar = tqdm(total=len(requests)) for output in self.pipe.stream_infer(prompts, @@ -90,9 +104,11 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[index] = (n_token, finish_reason) + sess[index].tick(n_token) if finish_reason is not None: + sess[index].finish(_to_status(finish_reason)) pbar.update(1) + pbar.close() else: for output in self.pipe(prompts, gen_configs, @@ -101,57 +117,20 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[index] = (n_token, finish_reason) - - elapsed_time = time.perf_counter() - start - - completion_tokens = 0 - for index, (n_token, finish_reason) in stats.items(): - assert finish_reason == 'length', \ - f'unexpected finish_reason {finish_reason}, ' \ - f'index={index}, ' \ - f'prompt={requests[index][0]}' - assert n_token - 1 <= requests[index][-1] <= n_token, \ - f'request to generate {requests[index][-1]} tokens, ' \ - f'but got {n_token} tokens' - completion_tokens += n_token - - prompt_tokens = 0 - for _, input_len, _ in requests: - prompt_tokens += input_len - - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = (prompt_tokens + - completion_tokens) / elapsed_time - rps = len(requests) / elapsed_time - rpm = rps * 60 - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') - - print( - f'number of prompts: {len(requests)}\n' - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - 
with open(self.csv, 'w') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_promts', 'RPS', 'RPM', - 'throughput(out tok/s)', 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) + sess[index].tick(n_token) + sess[index].finish(_to_status(finish_reason)) + + profiler.finish() + + # report first failure + for i, s in enumerate(sess): + if s.status != Session.SUCCESS or s.ns[-1] < s.req_output_len: + logger.error( + f'Request {i} failed with {s.ns[-1]}/{s.req_output_len} tokens generated' # noqa: E501 + ) + logger.error(f'Prompt: {prompts[i]}') + logger.warning('Got failed requests, metrics may be invalid') + break def parse_args(): @@ -253,13 +232,24 @@ def main(): requests = sample_requests(args.dataset, args.num_prompts, engine.tokenizer) + profiler = Profiler(args.stream_output, [50, 75, 95, 99]) + engine.process_request(requests, + profiler, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, - concurrency=args.concurrency, stream_output=args.stream_output) + hyperparams = [('Concurrency', args.concurrency), + ('Stream output', str(args.stream_output).lower())] + + profiler.compute_metrics() + profiler.summarize(title='Profile Pipeline API', hyperparams=hyperparams) + + if args.csv: + profiler.save_csv(args.csv) + if __name__ == '__main__': main() diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py index 2b3a3242b2..6d99ae99ee 100644 --- a/lmdeploy/profiler.py +++ b/lmdeploy/profiler.py @@ -81,6 +81,12 @@ def compute_metrics(self): qs = self.percentages + self.e2es = self.e2es or [float('inf')] + self.tpots = self.tpots or [float('inf')] + self.ttfts = self.ttfts or [float('inf')] + self.itls = self.itls or [float('inf')] + self.tpts = self.tpts or [0] + self.tpot_mean = np.mean(self.tpots) self.tpot_stat = tuple(np.percentile(self.tpots, qs)) self.e2e_mean = np.mean(self.e2es) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 6b1b5c8de6..7d6c395a84 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -648,11 +648,12 @@ async def async_stream_infer(self, if finish: break - except GeneratorExit: - logger.info(f'[async_stream_infer] GeneratorExit {session_id}') - await self.async_cancel(session_id) - except BaseException as e: + except (GeneratorExit, asyncio.CancelledError) as e: + logger.info(f'[async_stream_infer] {type(e).__name__}') + self.model_inst.cancel() + except Exception as e: logger.error(f'[async_stream_infer] {type(e).__name__} {e}') + self.model_inst.cancel() yield self._get_error_output() finally: # Contract: `cb` won't be called again if status is non-zero From 196532732aae66bcb6efab6684f8d8a3437530e3 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 9 Jan 2025 14:10:57 +0800 Subject: [PATCH 35/40] fix stop ids --- lmdeploy/serve/async_engine.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index c54ce0b328..9d2097f262 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -726,9 +726,14 @@ async def generate( def is_error(status): return status not in [ResponseType.SUCCESS, ResponseType.FINISH] + # used to skip / rewind stop words in interactive mode + stop_ids = [] + if skip_stop_tokens and not gen_config.ignore_eos: + stop_ids = gen_config.stop_token_ids or [] + if self.tokenizer.eos_token_id not 
in stop_ids: + stop_ids.append(self.tokenizer.eos_token_id) + async with self.model_inst(session_id) as inst: - stop_token_ids = gen_config.stop_token_ids \ - if skip_stop_tokens and not gen_config.ignore_eos else [] token_ids = input_ids.copy() history_len = self.id2step[session_id] input_len = len(input_ids) @@ -758,7 +763,7 @@ def is_error(status): continue # This assumes the engine will stop when stop token is hit - if output_len and outputs.token_ids[-1] in stop_token_ids: + if output_len and outputs.token_ids[-1] in stop_ids: hit_stop_token = 1 mask = slice(prev_len - output_len, From 7b513cb0b87d346de5460d21fb6b4c3c95d7dc72 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 9 Jan 2025 17:23:41 +0800 Subject: [PATCH 36/40] fix duplication --- lmdeploy/serve/async_engine.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 9d2097f262..2cf914d11a 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -759,7 +759,7 @@ def is_error(status): output_len = outputs.num_token - if hit_stop_token: + if hit_stop_token or prev_len == output_len: continue # This assumes the engine will stop when stop token is hit @@ -774,9 +774,6 @@ def is_error(status): prev_len = output_len - if len(token_ids) <= state.ids_offset: - continue - ids_offset = state.ids_offset response, state = self.tokenizer.detokenize_incrementally( token_ids, From 2e3a17d4a2b76b23a60cbba7749b07ceda6cdb07 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 10 Jan 2025 17:00:45 +0800 Subject: [PATCH 37/40] control output range of logits & last hidden states --- lmdeploy/messages.py | 4 +- lmdeploy/serve/async_engine.py | 5 + lmdeploy/turbomind/turbomind.py | 31 ++- src/turbomind/engine/model_request.cc | 8 +- src/turbomind/engine/request.h | 15 +- src/turbomind/models/llama/LlamaBatch.cc | 230 +++++++++++++---------- src/turbomind/models/llama/LlamaBatch.h | 14 +- 7 files changed, 183 insertions(+), 124 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index fc495629cd..d4e6571b79 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -97,8 +97,8 @@ class GenerationConfig: logprobs: int = None response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None - output_logits: bool = None - output_last_hidden_state: bool = None + output_logits: Literal['all', 'generation'] = None + output_last_hidden_state: Literal['all', 'generation'] = None def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer): """convert stop_words/bad_sords to ids and append the ids to diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 2cf914d11a..b8239a455c 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -789,8 +789,13 @@ def is_error(status): out.logprobs = outputs.logprobs[log_offset:] if outputs.last_hidden_state is not None: out.last_hidden_state = outputs.last_hidden_state + if hit_stop_token: + out.last_hidden_state = \ + out.last_hidden_state[:-hit_stop_token] if outputs.logits is not None: out.logits = outputs.logits + if hit_stop_token: + out.logits = out.logits[:-hit_stop_token] yield out # end of generator loop diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 7d6c395a84..4a15dd6841 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -319,20 +319,21 @@ def create_instance(self, cuda_stream_id=0): return TurboMindInstance(self, 
self.config, cuda_stream_id) -def _get_logits(outputs): +def _get_logits(outputs, offset: int): logits = outputs['logits'] def _func(out: EngineOutput, step: int): - out.logits = logits[:step - 1, :] + out.logits = logits[:step - offset - 1, :] return _func -def _get_last_hidden_state(outputs): +def _get_last_hidden_state(outputs, offset: int): last_hidden_state = outputs['last_hidden_state'] + print(f'last_hidden_state.shape = {last_hidden_state.shape}') def _func(out: EngineOutput, step: int): - out.last_hidden_state = last_hidden_state[:step - 1, :] + out.last_hidden_state = last_hidden_state[:step - offset - 1, :] return _func @@ -439,12 +440,19 @@ def _create_model_instance(self, device_id): return model_inst def _get_extra_output_processors(self, outputs: Dict[str, torch.Tensor], - gen_config: GenerationConfig): + gen_config: GenerationConfig, + input_len: int): + + def _get_offset(type): + return input_len - 1 if type == 'generation' else 0 + fs = [] if gen_config.output_logits: - fs.append(_get_logits(outputs)) + offset = _get_offset(gen_config.output_logits) + fs.append(_get_logits(outputs, offset)) if gen_config.output_last_hidden_state: - fs.append(_get_last_hidden_state(outputs)) + offset = _get_offset(gen_config.output_last_hidden_state) + fs.append(_get_last_hidden_state(outputs, offset)) if gen_config.logprobs: fs.append(_get_logprobs(outputs, gen_config.logprobs)) return fs @@ -607,7 +615,8 @@ async def async_stream_infer(self, outputs = _tm_dict_to_torch_dict(outputs) - extra_fs = self._get_extra_output_processors(outputs, gen_config) + extra_fs = self._get_extra_output_processors(outputs, gen_config, + input_len) output_ids_buf = outputs['output_ids'] @@ -678,10 +687,12 @@ def _get_generation_config(self, cfg: GenerationConfig): c.repetition_penalty = cfg.repetition_penalty if cfg.min_new_tokens: c.min_new_tokens = cfg.min_new_tokens + output_type = dict(all=1, generation=2) if cfg.output_last_hidden_state: - c.output_last_hidden_state = cfg.output_last_hidden_state + c.output_last_hidden_state = output_type[ + cfg.output_last_hidden_state] if cfg.output_logits: - c.output_logits = cfg.output_logits + c.output_logits = output_type[cfg.output_logits] if cfg.logprobs: if cfg.logprobs > MAX_LOGPROBS: cfg.logprobs = MAX_LOGPROBS diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index fa789e4923..6ba355e896 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -106,7 +106,7 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output FT_CHECK(inputs.at("input_ids")->shape.size() == 1); const int input_len = inputs.at("input_ids")->shape[0]; - const int output_len = input_len + param.gen_cfg.max_new_tokens; + const int output_len = param.gen_cfg.max_new_tokens; // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` // is used instead @@ -123,11 +123,13 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); if (param.gen_cfg.output_logits) { - add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, max_in_out_len, vocab_size_); + const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? 
max_in_out_len : max_out_len; + add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, len, vocab_size_); } if (param.gen_cfg.output_last_hidden_state) { - add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, max_in_out_len, hidden_dim_); + const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? max_in_out_len : max_out_len; + add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, len, hidden_dim_); } if (param.gen_cfg.output_logprobs) { diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index f50bd18eca..c1ab52f079 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -25,9 +25,15 @@ struct GenerationConfig { uint64_t random_seed = 0; - int output_logprobs = 0; - bool output_last_hidden_state = false; - bool output_logits = false; + int output_logprobs = 0; + + enum OutType { + kNone = 0, + kAll = 1, + kGeneration = 2 + }; + int output_last_hidden_state = 0; + int output_logits = 0; }; inline std::ostream& operator<<(std::ostream& os, const GenerationConfig& c) @@ -107,8 +113,7 @@ struct Request { int ec; // set when disabling conflicting requests - enum - { + enum { kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set kConflict = 2, // Concurrent requests to the same sequence diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index a9a6cc75b0..a593ef6b68 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -1177,97 +1178,148 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) sync_check_cuda_error(); } -template -void LlamaBatch::OutputLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences) +template +void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last) { - std::vector output_logits; - int num_token = 0; - { - bool is_return_logits = false; - for (int k = 0; k < indices.size(); ++k) { - auto& request = state_->requests[indices[k]]; - auto logits = request->outputs.getPtr("logits", nullptr); - if (logits && sequences[k]->cache_len + lengths[k] <= sequences[k]->tokens.size()) { - logits = nullptr; - } - output_logits.push_back(logits); - num_token += lengths[k]; - if (output_logits.back()) { - is_return_logits = true; + int token_num = 0; + bool found = false; + for (int i = first; i < last; ++i) { + if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { + const auto& s = *state_->sequences[i]; + // Skip when the seq is filling missed cache only + if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { + found = true; } } - if (!is_return_logits) { - return; - } + token_num += h_input_length_buf_[i]; } - { - context_logits_buf_ = (float*)allocator_->reMalloc( - context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * num_token, false); - const auto tp = model_->tensor_para_.world_size_; - if (tp > 1) { - NcclGuard guard(model_->tensor_para_, stream_, true); - FT_CHECK(model_->vocab_size_padded_ % tp == 0); - const auto local_vocab_size = model_->vocab_size_padded_ / tp; - local_context_logits_buf_ = (float*)peer_allocator_->reMalloc( - local_context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * num_token, false); - } + if (!found) { + return; } - model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, 
context_decoder_output, num_token); + context_logits_buf_ = (float*)allocator_->reMalloc( + context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * token_num, false); + const auto tp = model_->tensor_para_.world_size_; - auto logits = context_logits_buf_; + if (tp > 1) { + NcclGuard guard(model_->tensor_para_, stream_, true); + FT_CHECK(model_->vocab_size_padded_ % tp == 0); + const auto local_vocab_size = model_->vocab_size_padded_ / tp; + local_context_logits_buf_ = (float*)peer_allocator_->reMalloc( + local_context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * token_num, false); + } + + model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, hidden_states, token_num); - // Only rank-0 writes to output if (rank_ != 0) { return; } - for (int k = 0; k < indices.size(); ++k) { - if (output_logits[k]) { - auto src_ptr = logits; - auto dst_ptr = output_logits[k]; - int num_new_token = 0; - if (sequences[k]->cache_len < sequences[k]->tokens.size()) { - num_new_token = sequences[k]->cache_len + lengths[k] - sequences[k]->tokens.size(); - src_ptr += (lengths[k] - num_new_token) * model_->vocab_size_padded_; - } - else { - num_new_token = lengths[k]; - dst_ptr += (sequences[k]->cache_len - sequences[k]->tokens.size()) * model_->vocab_size_; - } - if (model_->vocab_size_padded_ == model_->vocab_size_) { - Copy(src_ptr, model_->vocab_size_ * num_new_token, dst_ptr); + OutputLogits(context_logits_buf_, first, last, GenerationConfig::kAll); +} + +template +void LlamaBatch::OutputLogits(const float* logits, int first, int last, GenerationConfig::OutType out_type) +{ + // when `is_all` is true, logits only contains last token of the sequences + const bool is_all = out_type == GenerationConfig::kAll; + + for (int i = first; i < last; ++i) { + + const int input_len = h_input_length_buf_[i]; // input lenght for this iter + const float* src_ptr = logits; + + logits += (is_all ? input_len : 1) * model_->vocab_size_padded_; + + if (state_->requests[i]->gen_cfg.output_logits == out_type) { + + auto dst_ptr = state_->requests[i]->outputs.getPtr("logits"); + + const int cache_len = state_->sequences[i]->cache_len; + const int history_len = state_->sequences[i]->tokens.size(); + + // ----------H------I-------P----------- + // C C C C + + // offset to the last token prompt + const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1;
+
+            int diff = (history_len + offset) - cache_len;
+
+            const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len);
+
+            // TM_LOG_ERROR("%d %d %d %d %d %d %d",
+            //              history_len,
+            //              offset,
+            //              cache_len,
+            //              input_len,
+            //              valid_len,
+            //              std::max(0, diff),
+            //              std::max(0, -diff));
+
+            if (valid_len <= 0) {
+                continue;
             }
-            else {
-                for (int tok = 0; tok < num_new_token; tok++) {
-                    Copy(src_ptr, model_->vocab_size_, dst_ptr);
-                    src_ptr += model_->vocab_size_padded_;
-                    dst_ptr += model_->vocab_size_;
-                }
+
+            if (is_all) {
+                // Skip invalid tokens caused by cache miss
+                src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->vocab_size_padded_;
             }
+            // Skip previous chunks
+            dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->vocab_size_;
+
+            check_cuda_error(cudaMemcpy2DAsync(dst_ptr,
+                                               sizeof(float) * model_->vocab_size_,
+                                               src_ptr,
+                                               sizeof(float) * model_->vocab_size_padded_,
+                                               sizeof(float) * model_->vocab_size_,
+                                               valid_len,
+                                               cudaMemcpyDefault,
+                                               stream_));
         }
-        logits += model_->vocab_size_padded_ * lengths[k];
     }
 }

-template
-void LlamaBatch::OutputLastHiddenState(T* context_decoder_output,
-                                       const std::vector& idxs,
-                                       const std::vector& input_lens,
-                                       const std::vector& sequences)
+template
+void LlamaBatch::OutputLastHiddenState(const T* hidden_states,
+                                       int first,
+                                       int last)
 {
-    for (int i = 0; i < idxs.size(); ++i) {
-        auto& r = state_->requests[idxs[i]];
-        if (r->gen_cfg.output_last_hidden_state) {
-            auto dst = r->outputs.getPtr("last_hidden_state");
-            dst += sequences[i]->cache_len * model_->hidden_units_;
-            Copy(context_decoder_output, (int64_t)input_lens[i] * model_->hidden_units_, dst);
+    for (int i = first; i < last; ++i) {
+
+        const int input_len = h_input_length_buf_[i];  // input length for this iter
+        const T* src_ptr = hidden_states;
+
+        hidden_states += input_len * model_->hidden_units_;
+
+        if (auto out_type = state_->requests[i]->gen_cfg.output_last_hidden_state) {
+
+            const bool is_all = out_type == GenerationConfig::kAll;
+
+            T* dst_ptr = state_->requests[i]->outputs.getPtr("last_hidden_state");
+
+            const int cache_len = state_->sequences[i]->cache_len;
+            const int history_len = state_->sequences[i]->tokens.size();
+
+            // offset to the last prompt token
+            const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + + const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); + + // TM_LOG_ERROR("%d %d %d %d %d", history_len, offset, cache_len, input_len, valid_len); + + if (valid_len <= 0) { + continue; + } + + // Skip invalid tokens caused by cache miss + src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->hidden_units_; + // Skip previous chunks + dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->hidden_units_; + + Copy(src_ptr, valid_len * model_->hidden_units_, dst_ptr); } - context_decoder_output += (int64_t)input_lens[i] * model_->hidden_units_; } } @@ -1662,20 +1714,11 @@ bool LlamaBatch::Forward(GenerationState& g) const int last = offsets[p + 1]; const int mini_batch_size = last - first; int* input_ids = context_decoder_ids_buf_; - // - std::vector decode_indices{}; - std::vector decode_lengths{}; - - std::vector sequences; BatchedCopy batched_copy; int sum_k = 0; for (int i = first; i < last; ++i) { input_ids = batched_copy.Add(input_d_ptrs[i], h_input_length_buf_[i], input_ids); - dbg(i, h_input_length_buf_[i]); - decode_indices.push_back(i); - decode_lengths.push_back(h_input_length_buf_[i]); - sequences.push_back(state_->sequences[i]); if (h_input_length_buf_[i] > 1) { sum_k += state_->h_context_length[i]; } @@ -1717,20 +1760,10 @@ bool LlamaBatch::Forward(GenerationState& g) dc_batch_size, pf_batch_size, lora_mask_buf_, - sequences.data()); + state_->sequences.data() + first); - // compute logits of inputs if requested - OutputLogits(context_decoder_output_buf_, decode_indices, decode_lengths, sequences); - OutputLastHiddenState(context_decoder_output_buf_, decode_indices, decode_lengths, sequences); - } - - std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); - - // `SequenceManager` needs real-time value of cache length - for (int i = 0; i < active_size; ++i) { - FT_CHECK((bool)state_->requests[i]); - FT_CHECK(state_->sequences[i]); - state_->sequences[i]->cache_len += state_->sequences[i]->input_length; + ComputeAndOutputLogits(context_decoder_output_buf_, first, last); + OutputLastHiddenState(context_decoder_output_buf_, first, last); } if (active_size > g.partial) { @@ -1738,12 +1771,10 @@ bool LlamaBatch::Forward(GenerationState& g) AnomalyHandler::instance().FixLogits(logits_buf_, active_size - g.partial, 1); - // count_and_fix(logits_buf_, (active_size - g.partial) * model_->vocab_size_padded_, "logits", 1); + OutputLogits(logits_buf_, 0, active_size - g.partial, GenerationConfig::kGeneration); FT_CHECK(g.step >= 0); - // TM_LOG_INFO("dyn decode bsz %d, partial %d", active_size, g.partial); - if (!g.skip_init_sampling) { InitializeSampling(g); } @@ -1767,6 +1798,15 @@ bool LlamaBatch::Forward(GenerationState& g) active_size - g.partial); } + std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); + + // `SequenceManager` needs real-time value of cache length + for (int i = 0; i < active_size; ++i) { + FT_CHECK((bool)state_->requests[i]); + FT_CHECK(state_->sequences[i]); + state_->sequences[i]->cache_len += state_->sequences[i]->input_length; + } + AnomalyHandler::instance().Summarize([&](const int* is_anomaly, int batch_size) { for (int i = 0; i < batch_size; ++i) { if (is_anomaly[i]) { diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 3fe181bb94..44a20bebe1 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -100,15 +100,11 
@@ class LlamaBatch { [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); - void OutputLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences); - - void OutputLastHiddenState(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences); + void ComputeAndOutputLogits(T* hidden_states, int first, int last); + + void OutputLogits(const float* logits, int first, int last, GenerationConfig::OutType out_type); + + void OutputLastHiddenState(const T* hidden_states, int first, int last); explicit LlamaBatch(const EngineParam& param, std::unique_ptr> model, From 80108dfe76e6a6ff4a8dd48f5f5d850056da6c71 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 10 Jan 2025 18:10:14 +0800 Subject: [PATCH 38/40] fix lint & typo --- src/turbomind/engine/gateway.cc | 2 +- src/turbomind/engine/request.h | 6 ++++-- src/turbomind/engine/signal_buffer.h | 2 +- src/turbomind/models/llama/LlamaBatch.cc | 4 +--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/turbomind/engine/gateway.cc b/src/turbomind/engine/gateway.cc index 8bc728072f..e949ec7cd3 100644 --- a/src/turbomind/engine/gateway.cc +++ b/src/turbomind/engine/gateway.cc @@ -15,7 +15,7 @@ Gateway::Gateway(std::function()> ctx_factory): request_qu void Gateway::shutdown() { request_queue_.close(); - signal_buffer_.clsoe(); + signal_buffer_.close(); signal_thread_.join(); } diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index c1ab52f079..6bf706c9b8 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -27,7 +27,8 @@ struct GenerationConfig { int output_logprobs = 0; - enum OutType { + enum OutType + { kNone = 0, kAll = 1, kGeneration = 2 @@ -113,7 +114,8 @@ struct Request { int ec; // set when disabling conflicting requests - enum { + enum + { kOk = 0, kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set kConflict = 2, // Concurrent requests to the same sequence diff --git a/src/turbomind/engine/signal_buffer.h b/src/turbomind/engine/signal_buffer.h index 10a50e15f7..cb09be7909 100644 --- a/src/turbomind/engine/signal_buffer.h +++ b/src/turbomind/engine/signal_buffer.h @@ -24,7 +24,7 @@ class SignalBuffer { cv_.notify_one(); } - void clsoe() + void close() { { std::lock_guard lock{mutex_}; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index a593ef6b68..e37af1bb76 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1282,9 +1282,7 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener } template -void LlamaBatch::OutputLastHiddenState(const T* hidden_states, - int first, - int last) +void LlamaBatch::OutputLastHiddenState(const T* hidden_states, int first, int last) { for (int i = first; i < last; ++i) { From 6c2f9019149f7d6e99e24b1b4a70925e63cc3e26 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 10 Jan 2025 18:35:52 +0800 Subject: [PATCH 39/40] fix blank response --- lmdeploy/serve/async_engine.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index b8239a455c..d7366c654b 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -765,6 +765,9 @@ def is_error(status): # This assumes the engine will stop when stop token is hit if output_len and 
outputs.token_ids[-1] in stop_ids: hit_stop_token = 1 + # one token and it's been skipped + if output_len == prev_len + 1: + continue mask = slice(prev_len - output_len, output_len - hit_stop_token) From 31b01f1f3ffcfdcf52e32bba818049c13e2e6ebc Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 10 Jan 2025 18:54:46 +0800 Subject: [PATCH 40/40] export batch & num prompts --- benchmark/profile_pipeline_api.py | 3 ++- benchmark/profile_throughput.py | 3 ++- lmdeploy/profiler.py | 18 +++++++++++++----- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index d524be2992..334be7fa14 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -248,7 +248,8 @@ def main(): profiler.summarize(title='Profile Pipeline API', hyperparams=hyperparams) if args.csv: - profiler.save_csv(args.csv) + profiler.save_csv(args.csv, (('batch', args.concurrency), + ('num_prompts', args.num_prompts))) if __name__ == '__main__': diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 854917f560..2e4d2a3b8c 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -324,7 +324,8 @@ def main(): profiler.compute_metrics() profiler.summarize(title='Profile Throughput', hyperparams=hyperparams) if args.csv: - profiler.save_csv(args.csv) + profiler.save_csv(args.csv, (('batch', args.concurrency), + ('num_prompts', args.num_prompts))) if __name__ == '__main__': diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py index 6d99ae99ee..c1bf6b3875 100644 --- a/lmdeploy/profiler.py +++ b/lmdeploy/profiler.py @@ -146,17 +146,25 @@ def fmt(x): tab_row('Tokens per Tick', self.tpts_mean, *self.tpts_stat) print('=' * width) - def save_csv(self, csv_file: str): + def save_csv(self, csv_file: str, hyperparams): """Export legacy metrics to CSV.""" with open(csv_file, 'w') as csvfile: writer = csv.writer(csvfile) + keys, vals = zip(*hyperparams) writer.writerow([ - 'RPS', 'RPM', 'FTL(ave)(s)', 'throughput(out tok/s)', - 'throughput(total tok/s)' + *keys, + 'RPS', + 'RPM', + 'FTL(ave)(s)', + 'throughput(out tok/s)', + 'throughput(total tok/s)', ]) ttft_mean = f'{self.ttft_mean:.3f}' if self.stream_output else '-' writer.writerow([ - f'{self.rps:.3f}', f'{(self.rps * 60):.3f}', ttft_mean, + *vals, + f'{self.rps:.3f}', + f'{(self.rps * 60):.3f}', + ttft_mean, f'{self.output_throughput:.3f}', - f'{(self.input_throughput + self.output_throughput):.3f}' + f'{(self.input_throughput + self.output_throughput):.3f}', ])
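
For reference, the reworked `Profiler.save_csv(csv_file, hyperparams)` expects `hyperparams` as a tuple of `(name, value)` pairs, which are unzipped into extra leading CSV columns ahead of the metric columns. Below is a minimal, self-contained sketch of that mechanism; the concrete values (`batch`, `num_prompts`, the RPS figure) and the output file name are illustrative assumptions, not results from an actual run.

    # Sketch only: assumed example values, reduced to the hyperparameter-column mechanics.
    import csv

    hyperparams = (('batch', 128), ('num_prompts', 1000))  # (name, value) pairs, as passed by the benchmark scripts

    with open('profile_throughput.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        keys, vals = zip(*hyperparams)             # ('batch', 'num_prompts') and (128, 1000)
        writer.writerow([*keys, 'RPS'])            # hyperparameter columns precede the metric columns
        writer.writerow([*vals, f'{12.345:.3f}'])  # -> batch,num_prompts,RPS / 128,1000,12.345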