Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions cpp/serve/function_table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ void FunctionTable::Init(TVMArgValue reload_lib, Device device, picojson::object
}
}
this->model_config = model_config;
this->cached_buffers = Map<String, ObjectRef>();

if (num_shards > 1) {
String lib_path{nullptr};
Expand Down Expand Up @@ -255,7 +254,7 @@ ObjectRef FunctionTable::CopyToWorker0(const NDArray& host_array, String buffer_
buffer = Downcast<DRef>((*it).second);
} else {
buffer = Downcast<DRef>(this->Empty(max_reserved_shape, host_array.DataType(), null_device));
this->cached_buffers.Set(buffer_cache_key, buffer);
this->cached_buffers.emplace(buffer_cache_key, buffer);
}
ShapeTuple real_shape = host_array.Shape();
DRef buffer_view = nd_view_func_(buffer, real_shape);
Expand All @@ -268,7 +267,7 @@ ObjectRef FunctionTable::CopyToWorker0(const NDArray& host_array, String buffer_
buffer = Downcast<NDArray>((*it).second);
} else {
buffer = NDArray::Empty(max_reserved_shape, host_array->dtype, local_gpu_device);
this->cached_buffers.Set(buffer_cache_key, buffer);
this->cached_buffers.emplace(buffer_cache_key, buffer);
}
buffer = buffer.CreateView(host_array.Shape(), host_array->dtype);
DLTensor copy_dst = *(buffer.operator->());
Expand Down
2 changes: 1 addition & 1 deletion cpp/serve/function_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ struct FunctionTable {
Device local_gpu_device;
Session sess{nullptr};
DRef disco_mod{nullptr};
Map<String, ObjectRef> cached_buffers{nullptr};
std::unordered_map<String, ObjectRef> cached_buffers;
tvm::runtime::Module local_vm{nullptr};
picojson::object model_config;

Expand Down
28 changes: 8 additions & 20 deletions tests/python/serve/test_serve_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
)
from mlc_llm.serve.engine import ModelInfo

model_path = "dist/Llama-2-7b-chat-hf-q0f16-MLC"
model_lib_path = "dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we revert the changes in this file? Since this test is not really a standard one (e.g., it is not among those run in CI), I personally prefer finer-grained control of the paths in each function.

prompts = [
"What is the meaning of life?",
"Introduce the history of Pittsburgh to me. Please elaborate in detail.",
Expand Down Expand Up @@ -68,10 +71,7 @@ def test_engine_basic():
"""

# Initialize model loading info and KV cache config
model = ModelInfo(
"dist/Llama-2-7b-chat-hf-q0f16-MLC",
model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
)
model = ModelInfo(model_path, model_lib_path=model_lib_path)
kv_cache_config = KVCacheConfig(page_size=16)

# Hyperparameters for tests (you can try different combinations).
Expand Down Expand Up @@ -129,10 +129,7 @@ def test_engine_continuous_batching_1():
"""

# Initialize model loading info and KV cache config
model = ModelInfo(
"dist/Llama-2-7b-chat-hf-q0f16-MLC",
model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
)
model = ModelInfo(model_path, model_lib_path=model_lib_path)
kv_cache_config = KVCacheConfig(page_size=16)

# Hyperparameters for tests (you can try different combinations)
Expand Down Expand Up @@ -208,10 +205,7 @@ def test_engine_continuous_batching_2():
"""

# Initialize model loading info and KV cache config
model = ModelInfo(
"dist/Llama-2-7b-chat-hf-q0f16-MLC",
model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
)
model = ModelInfo(model_path, model_lib_path=model_lib_path)
kv_cache_config = KVCacheConfig(page_size=16)

# Hyperparameters for tests (you can try different combinations)
Expand Down Expand Up @@ -288,10 +282,7 @@ def test_engine_continuous_batching_3():
"""

# Initialize model loading info and KV cache config
model = ModelInfo(
"dist/Llama-2-7b-chat-hf-q0f16-MLC",
model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
)
model = ModelInfo(model_path, model_lib_path=model_lib_path)
kv_cache_config = KVCacheConfig(page_size=16)

# Hyperparameters for tests (you can try different combinations)
Expand Down Expand Up @@ -368,10 +359,7 @@ def all_finished(self) -> bool:

def test_engine_generate():
# Initialize model loading info and KV cache config
model = ModelInfo(
"dist/Llama-2-7b-chat-hf-q0f16-MLC",
model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
)
model = ModelInfo(model_path, model_lib_path=model_lib_path)
kv_cache_config = KVCacheConfig(page_size=16, max_total_sequence_length=4096)
# Create engine
engine = Engine(model, kv_cache_config)
Expand Down