Commit 0bfb6c0

Support Mixtral on macOS (#1558)
A follow-up to my previous PR (#1529). This PR makes Mixtral work on the Metal GPUs that macOS ships with. Honestly, not much change was needed, except that Metal doesn't support fp64 data types.

A Python script to run Mixtral:

```python
from mlc_chat import ChatConfig, ChatModule, callback
from mlc_chat.support import logging

logging.enable_logging()

MODEL = "HF://junrushao/Mixtral-8x7B-Instruct-v0.1-q4f16_1-MLC"
NUM_GPU = 1


def main():
    cm = ChatModule(
        MODEL,
        chat_config=ChatConfig(
            sliding_window_size=1024,
            tensor_parallel_shards=NUM_GPU,
        ),
    )
    cm.generate(
        "What is the meaning of life?",
        progress_callback=callback.StreamToStdout(callback_interval=2),
    )


if __name__ == "__main__":
    main()
```

Quantization formats:

- 3-bit (19.662 GB): ["HF://junrushao/Mixtral-8x7B-Instruct-v0.1-q3f16_1-MLC"](https://huggingface.co/junrushao/Mixtral-8x7B-Instruct-v0.1-q3f16_1-MLC)
- 4-bit (24.466 GB): ["HF://junrushao/Mixtral-8x7B-Instruct-v0.1-q4f16_1-MLC"](https://huggingface.co/junrushao/Mixtral-8x7B-Instruct-v0.1-q4f16_1-MLC)
1 parent e32c6c9 commit 0bfb6c0

File tree

1 file changed: +1 −1 lines changed


python/mlc_chat/op/moe_misc.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -186,7 +186,7 @@ def moe_cumsum(expert_indices: Tensor, num_local_experts: int) -> Tensor:
         .permute_dims(1, 0)
         .reshape(batch_size * num_local_experts)
     )
-    with Target(
+    with Target.current(allow_none=True) or Target(
         {
             "kind": "cuda",
             "max_num_threads": 1024,
```
