Skip to content

Commit ac8fa45

Browse files
[Backend] Add ROCm support (#652)
Depending on this PR: apache/tvm#15464 On 7900 xtx. ROCm 5.6 ``` ~/mlc-llm (rocm ✔) ./build/mlc_chat_cli --local-id Llama-2-7b-chat-hf-q4f16_1 Use MLC config: "/home/bohan/mlc-llm/dist/Llama-2-7b-chat-hf-q4f16_1/params/mlc-chat-config.json" Use model weights: "/home/bohan/mlc-llm/dist/Llama-2-7b-chat-hf-q4f16_1/params/ndarray-cache.json" Use model library: "/home/bohan/mlc-llm/dist/Llama-2-7b-chat-hf-q4f16_1/Llama-2-7b-chat-hf-q4f16_1-rocm.so" You can use the following special commands: /help print the special commands /exit quit the cli /stats print out the latest stats (token/sec) /reset restart a fresh chat /reload [local_id] reload model `local_id` from disk, or reload the current model if `local_id` is not specified Loading model... Loading finished Running system prompts... System prompts finished [INST]: Hi [/INST]: Hello! It's nice to meet you. I'm here to help you with any questions or tasks you may have, while always being safe and respectful. Is there something specific you would like to know or discuss? Please feel free to ask me anything, and I will do my best to provide a helpful and positive response. [INST]: /stats prefill: 507.3 tok/s, decode: 92.0 tok/s ``` ``` ~/mlc-llm (rocm ✗) ./build/mlc_chat_cli --local-id Llama-2-13b-chat-hf-q4f16_1 Use MLC config: "/home/bohan/mlc-llm/dist/Llama-2-13b-chat-hf-q4f16_1/params/mlc-chat-config.json" Use model weights: "/home/bohan/mlc-llm/dist/Llama-2-13b-chat-hf-q4f16_1/params/ndarray-cache.json" Use model library: "/home/bohan/mlc-llm/dist/Llama-2-13b-chat-hf-q4f16_1/Llama-2-13b-chat-hf-q4f16_1-rocm.so" You can use the following special commands: /help print the special commands /exit quit the cli /stats print out the latest stats (token/sec) /reset restart a fresh chat /reload [local_id] reload model `local_id` from disk, or reload the current model if `local_id` is not specified Loading model... Loading finished Running system prompts... System prompts finished [INST]: Hi [/INST]: Hello! 
I'm here to assist you with any questions you may have. Please keep in mind that I strive to provide safe and positive responses that are free of harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. If a question does not make sense or is not factually coherent, I will do my best to explain why instead of providing an incorrect answer. If I don't know the answer to a question, I will not provide false information. Is there anything specific you would like to know or discuss? [INST]: /stats prefill: 495.7 tok/s, decode: 69.0 tok/s ```
1 parent 3c53eeb commit ac8fa45

File tree

3 files changed

+24
-3
lines changed

3 files changed

+24
-3
lines changed

cpp/cli_main.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ std::string DetectDeviceName(std::string device_name) {
4242
if (DeviceAPI::Get(DLDevice{kDLMetal, 0}, allow_missing)) {
4343
return "metal";
4444
}
45+
if (DeviceAPI::Get(DLDevice{kDLROCM, 0}, allow_missing)) {
46+
return "rocm";
47+
}
4548
if (DeviceAPI::Get(DLDevice{kDLVulkan, 0}, allow_missing)) {
4649
return "vulkan";
4750
}
@@ -56,6 +59,7 @@ std::string DetectDeviceName(std::string device_name) {
5659
DLDevice GetDevice(const std::string& device_name, int device_id) {
5760
if (device_name == "cuda") return DLDevice{kDLCUDA, device_id};
5861
if (device_name == "metal") return DLDevice{kDLMetal, device_id};
62+
if (device_name == "rocm") return DLDevice{kDLROCM, device_id};
5963
if (device_name == "vulkan") return DLDevice{kDLVulkan, device_id};
6064
if (device_name == "opencl") return DLDevice{kDLOpenCL, device_id};
6165
LOG(FATAL) << "Do not recognize device name " << device_name;

mlc_llm/utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,20 @@ def _detect_local_cuda():
308308
)
309309

310310

311+
def _detect_local_rocm():
312+
dev = tvm.rocm()
313+
if not dev.exist:
314+
return None
315+
return tvm.target.Target(
316+
{
317+
"kind": "rocm",
318+
"max_shared_memory_per_block": dev.max_shared_memory_per_block,
319+
"max_threads_per_block": dev.max_threads_per_block,
320+
"thread_warp_size": dev.warp_size,
321+
}
322+
)
323+
324+
311325
def _detect_local_vulkan():
312326
dev = tvm.vulkan()
313327
if not dev.exist:
@@ -336,6 +350,7 @@ def _detect_local_opencl():
336350
def detect_local_target():
337351
for method in [
338352
_detect_local_metal,
353+
_detect_local_rocm,
339354
_detect_local_cuda,
340355
_detect_local_vulkan,
341356
_detect_local_opencl,

tests/debug/compare_lib.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ def compare(
5252
super().compare(name, ref_args, new_args, ret_indices)
5353

5454
if self.time_eval and name not in self.time_eval_results:
55-
res = self.mod.time_evaluator(name, self.device, number=100, repeat=3)(
56-
*new_args
57-
)
55+
res = self.mod.time_evaluator(
56+
name, self.device, number=20, repeat=3#, cache_flush_bytes=256 * 10**6
57+
)(*new_args)
5858
self.time_eval_results[name] = (res.mean, 1)
5959
print(f"Time-eval result {name} on {self.device}: {res}")
6060

@@ -212,6 +212,8 @@ def _parse_args():
212212
parsed.primary_device = "cuda"
213213
elif tvm.metal().exist:
214214
parsed.primary_device = "metal"
215+
elif tvm.rocm().exist:
216+
parsed.primary_device = "rocm"
215217
else:
216218
raise ValueError("Cannot auto deduce device-name, please set it")
217219
return parsed

0 commit comments

Comments (0)