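Update: make batch_sizes a parameter of main() in the group_per_split_token_cast_to_fp8 example and run its test with a smaller configuration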

yyttt6 · yyttt6 · commit 4d1c63d226e4 · 2025-11-02T06:23:12.000Z
diff --git a/examples/cast/example_group_per_split_token_cast_to_fp8.py b/examples/cast/example_group_per_split_token_cast_to_fp8.py
@@ -161,7 +161,7 @@ def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> \
     return x_fp8
 
 
-def main(M=8192, N=8192, BG=2, blk_m=8):
+def main(M=8192, N=8192, BG=2, blk_m=8, batch_sizes=[2048, 6144]):
     if dtype == "float":
         x = torch.randn(M, N, device="cuda", dtype=torch.float32)
     elif dtype == "float16":
@@ -170,7 +170,7 @@ def main(M=8192, N=8192, BG=2, blk_m=8):
         x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16)
     else:
         raise ValueError(f"Unsupported dtype: {dtype}")
-    batch_sizes = torch.tensor([2048, 6144], device="cuda", dtype=torch.int32)
+    batch_sizes = torch.tensor(batch_sizes, device="cuda", dtype=torch.int32)
     M_max = int(ceil_div(batch_sizes.max(), 128) * 128)
 
     print("batch_sizes:", batch_sizes)
diff --git a/examples/cast/test_example_cast.py b/examples/cast/test_example_cast.py
@@ -4,7 +4,7 @@
 
 
 def test_example_group_per_split_token_cast_to_fp8():
-    example_group_per_split_token_cast_to_fp8.main(M=8192, N=2048, BG=1, blk_m=4)
+    example_group_per_split_token_cast_to_fp8.main(M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896])
 
 
 def test_example_per_token_cast_to_fp8():