vllm-project · pcmoritz · Apr 8, 2024 · Apr 8, 2024 · Apr 8, 2024 · Apr 8, 2024
diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -25,7 +25,7 @@ def run_grid(bs, method):
     d_model = 4096
     num_total_experts = 8
     top_k = 2
-    tp_size = 2
+    tp_size = 4
     model_intermediate_size = 14336
     num_layers = 32
     num_calls = 100
@@ -48,13 +48,13 @@ def run_grid(bs, method):
     for block_size_n in [32, 64, 128, 256]:
         for block_size_m in BLOCK_SIZES_M:
             for block_size_k in [64, 128, 256]:
-                for group_size_m in [1, 16, 32, 64]:
+                for split_k_size in [1, 2, 4]:
                     for num_warps in [4, 8]:
                         configs.append({
                             "BLOCK_SIZE_M": block_size_m,
                             "BLOCK_SIZE_N": block_size_n,
                             "BLOCK_SIZE_K": block_size_k,
-                            "GROUP_SIZE_M": group_size_m,
+                            "SPLIT_K_SIZE": split_k_size,
                             "num_warps": num_warps,
                             "num_stages": 4,
                         })
@@ -134,20 +134,27 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
     hidden_states = torch.rand(
         (bs, d_model),
         device="cuda:0",
-        dtype=torch.bfloat16,
+        dtype=torch.float16,
     )
 
     ws = torch.rand(
         (num_total_experts, 2 * shard_intermediate_size, d_model),
         device=hidden_states.device,
         dtype=hidden_states.dtype,
-    )
+    ).to(torch.float8_e4m3fn)
 
     w2s = torch.rand(
         (num_total_experts, d_model, shard_intermediate_size),
         device=hidden_states.device,
         dtype=hidden_states.dtype,
-    )
+    ).to(torch.float8_e4m3fn)
+
+    s = torch.rand(d_model,
+                   device=hidden_states.device,
+                   dtype=torch.float16)
+    s2 = torch.rand(shard_intermediate_size,
+                    device=hidden_states.device,
+                    dtype=torch.float16)
 
     gating_output = F.softmax(torch.rand(
         (num_calls, bs, num_total_experts),
@@ -165,6 +172,8 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
             hidden_states=hidden_states,
             w1=ws,
             w2=w2s,
+            s=s,
+            s2=s2,
             gating_output=gating_output[i],
             topk=2,
             renormalize=True,

@@ -1,145 +1,145 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "SPLIT_K_SIZE": 2,
+        "num_warps": 8,
         "num_stages": 4
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "SPLIT_K_SIZE": 2,
         "num_warps": 4,
         "num_stages": 4
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "SPLIT_K_SIZE": 4,
+        "num_warps": 8,
         "num_stages": 4
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "SPLIT_K_SIZE": 2,
         "num_warps": 4,
         "num_stages": 4
     },
     "16": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "SPLIT_K_SIZE": 2,
         "num_warps": 4,
         "num_stages": 4
     },
     "24": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "SPLIT_K_SIZE": 4,
         "num_warps": 4,
         "num_stages": 4
     },
     "32": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 4,
         "num_stages": 4
     },
     "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "SPLIT_K_SIZE": 2,
         "num_warps": 4,
         "num_stages": 4
     },
     "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "SPLIT_K_SIZE": 1,
+        "num_warps": 8,
         "num_stages": 4
     },
     "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 4
-    },
-    "128": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "SPLIT_K_SIZE": 2,
         "num_warps": 4,
         "num_stages": 4
     },
-    "256": {
+    "128": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 8,
         "num_stages": 4
     },
-    "512": {
+    "256": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 4,
         "num_stages": 4
     },
-    "1024": {
+    "512": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
+        "BLOCK_SIZE_K": 128,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 4,
         "num_stages": 4
     },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "SPLIT_K_SIZE": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
     "1536": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
+        "BLOCK_SIZE_K": 128,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 8,
         "num_stages": 4
     },
     "2048": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
+        "BLOCK_SIZE_K": 128,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 8,
         "num_stages": 4
     },
     "3072": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
+        "BLOCK_SIZE_K": 128,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 8,
         "num_stages": 4
     },
     "4096": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_K": 128,
+        "SPLIT_K_SIZE": 1,
         "num_warps": 8,
         "num_stages": 4
     }