mlc-ai · Ubospica · Mar 15, 2024 · MasterJH5574 · Mar 15, 2024
diff --git a/cpp/serve/function_table.cc b/cpp/serve/function_table.cc
@@ -55,7 +55,6 @@ void FunctionTable::Init(TVMArgValue reload_lib, Device device, picojson::object
     }
   }
   this->model_config = model_config;
-  this->cached_buffers = Map<String, ObjectRef>();
 
   if (num_shards > 1) {
     String lib_path{nullptr};
@@ -255,7 +254,7 @@ ObjectRef FunctionTable::CopyToWorker0(const NDArray& host_array, String buffer_
       buffer = Downcast<DRef>((*it).second);
     } else {
       buffer = Downcast<DRef>(this->Empty(max_reserved_shape, host_array.DataType(), null_device));
-      this->cached_buffers.Set(buffer_cache_key, buffer);
+      this->cached_buffers.emplace(buffer_cache_key, buffer);
     }
     ShapeTuple real_shape = host_array.Shape();
     DRef buffer_view = nd_view_func_(buffer, real_shape);
@@ -268,7 +267,7 @@ ObjectRef FunctionTable::CopyToWorker0(const NDArray& host_array, String buffer_
       buffer = Downcast<NDArray>((*it).second);
     } else {
       buffer = NDArray::Empty(max_reserved_shape, host_array->dtype, local_gpu_device);
-      this->cached_buffers.Set(buffer_cache_key, buffer);
+      this->cached_buffers.emplace(buffer_cache_key, buffer);
     }
     buffer = buffer.CreateView(host_array.Shape(), host_array->dtype);
     DLTensor copy_dst = *(buffer.operator->());

diff --git a/cpp/serve/function_table.h b/cpp/serve/function_table.h
@@ -56,7 +56,7 @@ struct FunctionTable {
   Device local_gpu_device;
   Session sess{nullptr};
   DRef disco_mod{nullptr};
-  Map<String, ObjectRef> cached_buffers{nullptr};
+  std::unordered_map<String, ObjectRef> cached_buffers;
   tvm::runtime::Module local_vm{nullptr};
   picojson::object model_config;
 

diff --git a/tests/python/serve/test_serve_engine.py b/tests/python/serve/test_serve_engine.py
@@ -14,6 +14,9 @@
 )
 from mlc_llm.serve.engine import ModelInfo
 
+model_path = "dist/Llama-2-7b-chat-hf-q0f16-MLC"  # model directory handed to ModelInfo in the tests below
+model_lib_path = f"{model_path}/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so"  # library file inside model_path
+
 prompts = [
     "What is the meaning of life?",
     "Introduce the history of Pittsburgh to me. Please elaborate in detail.",
@@ -68,10 +71,7 @@ def test_engine_basic():
     """
 
     # Initialize model loading info and KV cache config
-    model = ModelInfo(
-        "dist/Llama-2-7b-chat-hf-q0f16-MLC",
-        model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
-    )
+    model = ModelInfo(model_path, model_lib_path=model_lib_path)
     kv_cache_config = KVCacheConfig(page_size=16)
 
     # Hyperparameters for tests (you can try different combinations).
@@ -129,10 +129,7 @@ def test_engine_continuous_batching_1():
     """
 
     # Initialize model loading info and KV cache config
-    model = ModelInfo(
-        "dist/Llama-2-7b-chat-hf-q0f16-MLC",
-        model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
-    )
+    model = ModelInfo(model_path, model_lib_path=model_lib_path)
     kv_cache_config = KVCacheConfig(page_size=16)
 
     # Hyperparameters for tests (you can try different combinations)
@@ -208,10 +205,7 @@ def test_engine_continuous_batching_2():
     """
 
     # Initialize model loading info and KV cache config
-    model = ModelInfo(
-        "dist/Llama-2-7b-chat-hf-q0f16-MLC",
-        model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
-    )
+    model = ModelInfo(model_path, model_lib_path=model_lib_path)
     kv_cache_config = KVCacheConfig(page_size=16)
 
     # Hyperparameters for tests (you can try different combinations)
@@ -288,10 +282,7 @@ def test_engine_continuous_batching_3():
     """
 
     # Initialize model loading info and KV cache config
-    model = ModelInfo(
-        "dist/Llama-2-7b-chat-hf-q0f16-MLC",
-        model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
-    )
+    model = ModelInfo(model_path, model_lib_path=model_lib_path)
     kv_cache_config = KVCacheConfig(page_size=16)
 
     # Hyperparameters for tests (you can try different combinations)
@@ -368,10 +359,7 @@ def all_finished(self) -> bool:
 
 def test_engine_generate():
     # Initialize model loading info and KV cache config
-    model = ModelInfo(
-        "dist/Llama-2-7b-chat-hf-q0f16-MLC",
-        model_lib_path="dist/Llama-2-7b-chat-hf-q0f16-MLC/Llama-2-7b-chat-hf-q0f16-MLC-cuda.so",
-    )
+    model = ModelInfo(model_path, model_lib_path=model_lib_path)
     kv_cache_config = KVCacheConfig(page_size=16, max_total_sequence_length=4096)
     # Create engine
     engine = Engine(model, kv_cache_config)