diff --git a/hgemm/README.md b/hgemm/README.md
index ea0e25c7..fc313709 100755
--- a/hgemm/README.md
+++ b/hgemm/README.md
@@ -29,8 +29,14 @@
 ## Current Performance
+- NVIDIA L20
+
+The current best implementation reaches roughly 95%~98% of cuBLAS performance on the L20 (theoretical Tensor Cores FP16 throughput: 119.5 TFLOPS), i.e. 105-113 TFLOPS vs 105-115 TFLOPS, and beats cuBLAS in some cases. Known issues: bank conflicts are not fully eliminated, and the current padding-based mitigation wastes shared memory and also hurts SM occupancy. Warp swizzle is not hand-implemented yet (limited by the flexibility of the WMMA API and by my own ability); a follow-up will try to implement warp swizzle via MMA PTX.
+- NVIDIA GeForce RTX 3080 Laptop
+
+Tested on an NVIDIA GeForce RTX 3080 Laptop, mma4x4_warp4x4 (16 MMA m16n16k16 ops, warp tile 64x64) combined with thread block swizzle matches or even beats cuBLAS in most cases.
+
 ## Shared Memory Bank Conflicts
 
 Definition: a bank conflict occurs when multiple threads access different addresses that fall into the same shared memory bank, causing the concurrent reads/writes to degrade into serialized accesses.
@@ -229,7 +235,9 @@ nsys profile --stats=true -t cuda,osrt,nvtx -o hgemm.prof --force-overwrite true
 ```bash
 # Build only for the Ada architecture; if not set, all architectures are compiled by default, which takes much longer: Volta, Ampere, Ada, Hopper, ...
 export TORCH_CUDA_ARCH_LIST=Ada
-python3 hgemm.py
+python3 hgemm.py # default, test some wmma kernels for all MNK
+python3 hgemm.py --wmma # test all wmma kernels for all MNK
+python3 hgemm.py --M 16384 --N 16384 --K 8192 --i 10 --wmma # test all wmma kernels for specific MNK
 ```
 
 Output:
@@ -723,3 +731,251 @@ python3 hgemm.py
 f16_th: ['-67.375 ', '14.9609375'], time:38.53211ms, swizzle: NOOP, TFLOPS: 114.14(+0.46%)
 ----------------------------------------------------------------------------------------------------------------------------------
 ```
+
+- NVIDIA GeForce RTX 3080 Laptop
+```bash
+python3 hgemm.py --wmma --no-default
+----------------------------------------------------------------------------------------------------------------------------------
+                              M=4096, N=4096, K=2048, Warmup=5, Iters=20, 1/27
+----------------------------------------------------------------------------------------------------------------------------------
+           (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:1.397085ms, swizzle: NOOP, TFLOPS: 49.19 (+0.00%)
+           (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:1.632452ms, swizzle: NOOP, TFLOPS: 42.10
+   (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:1.392316ms, swizzle: 1024, TFLOPS: 49.36 (+0.34%)
+   (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:1.537656ms, swizzle: 1024, TFLOPS: 44.69
+                                (cublas): ['-34.90625 ', '2.21875 '], time:1.072788ms, swizzle: NOOP, TFLOPS: 64.06 (+29.78%)
+----------------------------------------------------------------------------------------------------------------------------------
+----------------------------------------------------------------------------------------------------------------------------------
+                              M=4096, N=4096, K=4096, Warmup=5, Iters=20, 2/27
+----------------------------------------------------------------------------------------------------------------------------------
+           (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:3.154301ms, swizzle: NOOP, TFLOPS: 43.57 (+0.00%)
+           (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:3.152799ms, swizzle: NOOP, TFLOPS: 43.59 (+0.05%)
+   (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:2.640366ms, swizzle: 1024, TFLOPS: 52.05 (+19.41%)
+   (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:3.021883ms, swizzle: 1024, TFLOPS: 45.48
+                                (cublas): ['10.8515625', '9.4140625 '], time:2.330613ms, swizzle: NOOP, TFLOPS: 58.97 (+13.29%)
+---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=4096, N=4096, K=8192, Warmup=5, Iters=20, 3/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:5.776286ms, swizzle: NOOP, TFLOPS: 47.59 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:6.212115ms, swizzle: NOOP, TFLOPS: 44.25 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:5.236458ms, swizzle: 1024, TFLOPS: 52.49 (+10.31%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:5.674219ms, swizzle: 1024, TFLOPS: 48.44 + (cublas): ['68.375 ', '-2.234375 '], time:5.311441ms, swizzle: NOOP, TFLOPS: 51.75 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=4096, N=8192, K=2048, Warmup=5, Iters=20, 4/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:3.303718ms, swizzle: NOOP, TFLOPS: 41.60 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:3.193497ms, swizzle: NOOP, TFLOPS: 43.04 (+3.45%) + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:2.624654ms, swizzle: 2048, TFLOPS: 52.36 (+21.67%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:2.863550ms, swizzle: 2048, TFLOPS: 48.00 + (cublas): ['-34.90625 ', '2.21875 '], time:2.649235ms, swizzle: NOOP, TFLOPS: 51.88 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=4096, N=8192, K=4096, Warmup=5, Iters=20, 5/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:5.747509ms, swizzle: NOOP, TFLOPS: 47.83 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:6.356692ms, swizzle: NOOP, TFLOPS: 43.24 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:5.048251ms, swizzle: 2048, TFLOPS: 54.45 (+13.85%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:5.489063ms, swizzle: 2048, TFLOPS: 50.08 + (cublas): ['10.8515625', '9.4140625 '], time:6.013441ms, swizzle: NOOP, TFLOPS: 45.71 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=4096, N=8192, K=8192, Warmup=5, Iters=20, 6/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:11.15694ms, swizzle: NOOP, TFLOPS: 
49.27 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:12.09821ms, swizzle: NOOP, TFLOPS: 45.44 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:9.958195ms, swizzle: 2048, TFLOPS: 55.21 (+12.04%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:10.67364ms, swizzle: 2048, TFLOPS: 51.51 + (cublas): ['68.375 ', '-2.234375 '], time:12.02430ms, swizzle: NOOP, TFLOPS: 45.72 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=4096, N=16384, K=2048, Warmup=5, Iters=20, 7/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:6.608533ms, swizzle: NOOP, TFLOPS: 41.59 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:6.812095ms, swizzle: NOOP, TFLOPS: 40.35 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:5.446910ms, swizzle: 4096, TFLOPS: 50.46 (+21.33%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:5.769944ms, swizzle: 4096, TFLOPS: 47.64 + (cublas): ['-34.90625 ', '2.21875 '], time:6.295609ms, swizzle: NOOP, TFLOPS: 43.66 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=4096, N=16384, K=4096, Warmup=5, Iters=20, 8/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:11.90752ms, swizzle: NOOP, TFLOPS: 46.17 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:12.66958ms, swizzle: NOOP, TFLOPS: 43.39 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:10.72070ms, swizzle: 4096, TFLOPS: 51.28 (+11.07%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:11.09249ms, swizzle: 4096, TFLOPS: 49.56 + (cublas): ['10.8515625', '9.4140625 '], time:9.910416ms, swizzle: NOOP, TFLOPS: 55.47 (+8.18%) +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=4096, N=16384, K=8192, Warmup=5, Iters=20, 9/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:23.75357ms, swizzle: NOOP, TFLOPS: 46.29 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:25.33891ms, swizzle: NOOP, TFLOPS: 43.39 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:20.78440ms, swizzle: 4096, TFLOPS: 52.90 (+14.29%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:22.58212ms, swizzle: 4096, TFLOPS: 48.69 + (cublas): ['68.375 ', '-2.234375 '], time:23.13928ms, swizzle: NOOP, TFLOPS: 47.52 
+---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=4096, K=2048, Warmup=5, Iters=20, 10/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:3.206682ms, swizzle: NOOP, TFLOPS: 42.86 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:3.255009ms, swizzle: NOOP, TFLOPS: 42.22 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:2.551007ms, swizzle: 1024, TFLOPS: 53.88 (+25.70%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:2.943944ms, swizzle: 1024, TFLOPS: 46.69 + (cublas): ['-34.90625 ', '2.21875 '], time:2.616691ms, swizzle: NOOP, TFLOPS: 52.52 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=4096, K=4096, Warmup=5, Iters=20, 11/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:5.581545ms, swizzle: NOOP, TFLOPS: 49.25 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:5.918717ms, swizzle: NOOP, TFLOPS: 46.44 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:5.013823ms, swizzle: 1024, TFLOPS: 54.82 (+11.32%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:5.475091ms, swizzle: 1024, TFLOPS: 50.21 + (cublas): ['10.8515625', '9.4140625 '], time:5.620026ms, swizzle: NOOP, TFLOPS: 48.91 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=4096, K=8192, Warmup=5, Iters=20, 12/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:10.63799ms, swizzle: NOOP, TFLOPS: 51.68 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:11.95423ms, swizzle: NOOP, TFLOPS: 45.99 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:10.08455ms, swizzle: 1024, TFLOPS: 54.51 (+5.49%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:10.80915ms, swizzle: 1024, TFLOPS: 50.86 + (cublas): ['68.375 ', '-2.234375 '], time:12.14854ms, swizzle: NOOP, TFLOPS: 45.25 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=8192, K=2048, Warmup=5, Iters=20, 13/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:6.046414ms, swizzle: NOOP, TFLOPS: 45.46 
(+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:6.623530ms, swizzle: NOOP, TFLOPS: 41.50 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:5.341410ms, swizzle: 2048, TFLOPS: 51.46 (+13.20%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:5.689215ms, swizzle: 2048, TFLOPS: 48.32 + (cublas): ['-34.90625 ', '2.21875 '], time:6.602764ms, swizzle: NOOP, TFLOPS: 41.63 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=8192, K=4096, Warmup=5, Iters=20, 14/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:11.54751ms, swizzle: NOOP, TFLOPS: 47.61 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:12.49833ms, swizzle: NOOP, TFLOPS: 43.99 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:10.34743ms, swizzle: 2048, TFLOPS: 53.13 (+11.60%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:10.89727ms, swizzle: 2048, TFLOPS: 50.45 + (cublas): ['10.8515625', '9.4140625 '], time:11.89055ms, swizzle: NOOP, TFLOPS: 46.23 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=8192, K=8192, Warmup=5, Iters=20, 15/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:23.22742ms, swizzle: NOOP, TFLOPS: 47.34 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:25.00588ms, swizzle: NOOP, TFLOPS: 43.97 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:20.04830ms, swizzle: 2048, TFLOPS: 54.84 (+15.86%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:21.89767ms, swizzle: 2048, TFLOPS: 50.21 + (cublas): ['68.375 ', '-2.234375 '], time:23.18794ms, swizzle: NOOP, TFLOPS: 47.42 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=16384, K=2048, Warmup=5, Iters=20, 16/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:12.24069ms, swizzle: NOOP, TFLOPS: 44.91 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:13.07930ms, swizzle: NOOP, TFLOPS: 42.03 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:10.82205ms, swizzle: 4096, TFLOPS: 50.80 (+13.11%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:11.43186ms, swizzle: 4096, TFLOPS: 48.09 + (cublas): ['-34.90625 ', '2.21875 '], time:13.87636ms, swizzle: NOOP, TFLOPS: 39.62 
+---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=16384, K=4096, Warmup=5, Iters=20, 17/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:23.84941ms, swizzle: NOOP, TFLOPS: 46.10 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:31.07695ms, swizzle: NOOP, TFLOPS: 35.38 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:23.16045ms, swizzle: 4096, TFLOPS: 47.47 (+2.97%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:25.17983ms, swizzle: 4096, TFLOPS: 43.67 + (cublas): ['10.8515625', '9.4140625 '], time:20.92361ms, swizzle: NOOP, TFLOPS: 52.55 (+10.69%) +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=8192, N=16384, K=8192, Warmup=5, Iters=20, 18/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:48.17764ms, swizzle: NOOP, TFLOPS: 45.64 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:51.66683ms, swizzle: NOOP, TFLOPS: 42.56 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:42.50290ms, swizzle: 4096, TFLOPS: 51.74 (+13.35%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:46.67718ms, swizzle: 4096, TFLOPS: 47.11 + (cublas): ['68.375 ', '-2.234375 '], time:45.62001ms, swizzle: NOOP, TFLOPS: 48.20 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=4096, K=2048, Warmup=5, Iters=20, 19/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:5.999112ms, swizzle: NOOP, TFLOPS: 45.82 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:6.952166ms, swizzle: NOOP, TFLOPS: 39.54 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:5.714607ms, swizzle: 1024, TFLOPS: 48.10 (+4.98%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:5.846762ms, swizzle: 1024, TFLOPS: 47.01 + (cublas): ['-34.9375 ', '2.25585938'], time:5.578041ms, swizzle: NOOP, TFLOPS: 49.28 (+2.45%) +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=4096, K=4096, Warmup=5, Iters=20, 20/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:11.36004ms, 
swizzle: NOOP, TFLOPS: 48.39 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:12.24460ms, swizzle: NOOP, TFLOPS: 44.90 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:10.57424ms, swizzle: 1024, TFLOPS: 51.99 (+7.43%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:11.31019ms, swizzle: 1024, TFLOPS: 48.61 + (cublas): ['10.8515625', '9.4140625 '], time:10.14137ms, swizzle: NOOP, TFLOPS: 54.21 (+4.27%) +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=4096, K=8192, Warmup=5, Iters=20, 21/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:21.54934ms, swizzle: NOOP, TFLOPS: 51.02 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:25.34153ms, swizzle: NOOP, TFLOPS: 43.39 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:21.18096ms, swizzle: 1024, TFLOPS: 51.91 (+1.74%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:22.19107ms, swizzle: 1024, TFLOPS: 49.55 + (cublas): ['68.375 ', '-2.234375 '], time:23.78721ms, swizzle: NOOP, TFLOPS: 46.22 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=8192, K=2048, Warmup=5, Iters=20, 22/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:12.14342ms, swizzle: NOOP, TFLOPS: 45.27 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:13.07780ms, swizzle: NOOP, TFLOPS: 42.04 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:10.68298ms, swizzle: 2048, TFLOPS: 51.46 (+13.67%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:11.51511ms, swizzle: 2048, TFLOPS: 47.74 + (cublas): ['-34.9375 ', '2.25585938'], time:12.36820ms, swizzle: NOOP, TFLOPS: 44.45 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=8192, K=4096, Warmup=5, Iters=20, 23/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:23.26002ms, swizzle: NOOP, TFLOPS: 47.27 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:25.28347ms, swizzle: NOOP, TFLOPS: 43.49 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:20.98624ms, swizzle: 2048, TFLOPS: 52.39 (+10.83%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:22.29118ms, swizzle: 2048, TFLOPS: 49.32 + (cublas): ['10.8515625', '9.4140625 '], time:23.58868ms, swizzle: NOOP, TFLOPS: 46.61 
+---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=8192, K=8192, Warmup=5, Iters=20, 24/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:46.57695ms, swizzle: NOOP, TFLOPS: 47.21 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:50.11103ms, swizzle: NOOP, TFLOPS: 43.88 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:39.97759ms, swizzle: 2048, TFLOPS: 55.01 (+16.51%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:45.07379ms, swizzle: 2048, TFLOPS: 48.79 + (cublas): ['68.375 ', '-2.234375 '], time:46.13645ms, swizzle: NOOP, TFLOPS: 47.66 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=16384, K=2048, Warmup=5, Iters=20, 25/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['-34.9375 ', '2.25585938'], time:24.82917ms, swizzle: NOOP, TFLOPS: 44.28 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['-34.9375 ', '2.25585938'], time:26.81517ms, swizzle: NOOP, TFLOPS: 41.00 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:22.22962ms, swizzle: 4096, TFLOPS: 49.46 (+11.69%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['-34.9375 ', '2.25585938'], time:23.27709ms, swizzle: 4096, TFLOPS: 47.24 + (cublas): ['-34.90625 ', '2.21875 '], time:25.84185ms, swizzle: NOOP, TFLOPS: 42.55 +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=16384, K=4096, Warmup=5, Iters=20, 26/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['10.8515625', '9.4140625 '], time:48.43459ms, swizzle: NOOP, TFLOPS: 45.40 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['10.8515625', '9.4140625 '], time:52.00080ms, swizzle: NOOP, TFLOPS: 42.29 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:43.28680ms, swizzle: 4096, TFLOPS: 50.80 (+11.89%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['10.8515625', '9.4140625 '], time:47.73476ms, swizzle: 4096, TFLOPS: 46.07 + (cublas): ['10.8515625', '9.4140625 '], time:40.64793ms, swizzle: NOOP, TFLOPS: 54.10 (+6.49%) +---------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------------------------------------- + M=16384, N=16384, K=8192, Warmup=5, Iters=20, 27/27 +---------------------------------------------------------------------------------------------------------------------------------- + (mma4x4+warp4x4+stage3+dsmem): ['68.375 ', '-2.234375 '], time:96.91984ms, swizzle: 
NOOP, TFLOPS: 45.38 (+0.00%) + (mma4x4+warp4x4+stage2+dsmem): ['68.375 ', '-2.234375 '], time:102.8722ms, swizzle: NOOP, TFLOPS: 42.75 + (mma4x4+warp4x4+stage3+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:85.65800ms, swizzle: 4096, TFLOPS: 51.34 (+13.15%) + (mma4x4+warp4x4+stage2+dsmem+swizzle): ['68.375 ', '-2.234375 '], time:95.70884ms, swizzle: 4096, TFLOPS: 45.95 + (cublas): ['68.375 ', '-2.234375 '], time:104.2092ms, swizzle: NOOP, TFLOPS: 42.20 +---------------------------------------------------------------------------------------------------------------------------------- +``` diff --git a/hgemm/hgemm.cu b/hgemm/hgemm.cu index f46cade8..c4997035 100644 --- a/hgemm/hgemm.cu +++ b/hgemm/hgemm.cu @@ -1235,11 +1235,14 @@ void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages(torch::Tensor a, torch::Tensor b int stages, bool swizzle, int swizzle_stride); void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem(torch::Tensor a, torch::Tensor b, torch::Tensor c, int stages, bool swizzle, int swizzle_stride); +void hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem(torch::Tensor a, torch::Tensor b, torch::Tensor c, + int stages, bool swizzle, int swizzle_stride); void hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem(torch::Tensor a, torch::Tensor b, torch::Tensor c, int stages, bool swizzle, int swizzle_stride); void hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem(torch::Tensor a, torch::Tensor b, torch::Tensor c, - int stages, bool swizzle, int swizzle_stride); - + int stages, bool swizzle, int swizzle_stride); +void hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem(torch::Tensor a, torch::Tensor b, torch::Tensor c, + int stages, bool swizzle, int swizzle_stride); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // CUDA Cores FP16 @@ -1285,7 +1288,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // stage, thread block swizzle, dsmem TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages) TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem) + TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem) TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem) TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem) + TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem) } diff --git a/hgemm/hgemm.py b/hgemm/hgemm.py index 21c0f4a9..4372a5d8 100644 --- a/hgemm/hgemm.py +++ b/hgemm/hgemm.py @@ -7,7 +7,26 @@ torch.set_grad_enabled(False) +def get_args(): + parser = argparse.ArgumentParser(description="hgemm benchmark") + parser.add_argument("--M", type=int, default=None, help="Matrix M size") + parser.add_argument("--N", type=int, default=None, help="Matrix N size") + parser.add_argument("--K", type=int, default=None, help="Matrix K size") + parser.add_argument("--warmup", "--w", type=int, default=5, help="Warmup iters") + parser.add_argument("--iters", "--i", type=int, default=20, help="Benchmark iters") + parser.add_argument("--enable-mma-all", "--mma", action="store_true", help="Enable all MMA kernel tests") + parser.add_argument("--enable-wmma-all", "--wmma", action="store_true", help="Enable all WMMA kernel tests") + parser.add_argument("--enable-cuda-all", "--cuda", action="store_true", help="Enable all CUDA kernel tests") + parser.add_argument("--enable-torch", "--torch", action="store_true", help="Enable torch matmul") + parser.add_argument("--enable-cublas", "--cublas", action="store_true", default=True, help="Enable cublas hgemm") + 
parser.add_argument("--disable-default", "--no-default", action="store_true", default=False, help="Disable default tests") + return parser.parse_args() + +args = get_args() +print(args) + # Load the CUDA kernel as a python module +print("Loading hgemm lib ...") lib = load(name='hgemm_lib', sources=['hgemm.cu', 'hgemm_async.cu', 'hgemm_wmma.cu', 'hgemm_wmma_stage.cu', 'hgemm_cublas.cu'], @@ -23,6 +42,7 @@ ], extra_cflags=['-std=c++17']) + MAX_TFLOPS = -1 def run_benchmark(perf_func: callable, @@ -30,7 +50,8 @@ def run_benchmark(perf_func: callable, tag: str, out: Optional[torch.Tensor] = None, stages: int = -1, swizzle: bool = False, swizzle_stride: int = 1, - warmup: int = 5, iters: int = 20, + warmup: int = args.warmup, + iters: int = args.iters, show_all: bool = False): global MAX_TFLOPS @@ -77,6 +98,7 @@ def run_benchmark(perf_func: callable, for i in range(iters): out = perf_func(a, b) torch.cuda.synchronize() + end = time.time() total_time = (end - start) * 1000 # ms mean_time = total_time / iters @@ -96,83 +118,96 @@ def run_benchmark(perf_func: callable, else: improve = 0 MAX_TFLOPS = TFLOPS - print(f"{out_info:>35}: {out_val}, time:{mean_time}ms, " + print(f"{out_info:>40}: {out_val}, time:{mean_time}ms, " f"swizzle: {swizzle_stride:<4}, TFLOPS: {TFLOPS:<6.2f}(+{improve:.2f}%)") else: - print(f"{out_info:>35}: {out_val}, time:{mean_time}ms, " + print(f"{out_info:>40}: {out_val}, time:{mean_time}ms, " f"swizzle: {swizzle_stride:<4}, TFLOPS: {TFLOPS:<6.2f}") if show_all: print(out) return out, mean_time -def get_args(): - parser = argparse.ArgumentParser(description="hgemm benchmark") - parser.add_argument("--enable-mma-all", "-ma", action="store_true") - parser.add_argument("--enable-wmma-all", "-wa", action="store_true") - parser.add_argument("--enable-cuda-all", "-ca", action="store_true") - return parser.parse_args() - - -args = get_args() Ms = [4096, 8192, 16384] Ns = [4096, 8192, 16384] Ks = [2048, 4096, 8192] -MAX_M, MAX_N, MAX_K = 16384, 16384, 8192 +if args.M and args.N and args.K: + Ms = [args.M] + Ns = [args.N] + Ks = [args.K] +MAX_M, MAX_N, MAX_K = max(Ms), max(Ns), max(Ks) # pre allocate for fast profiling. +torch.cuda.synchronize() +start = time.time() +print(f"pre allocate for fast profiling start, MAX_M={MAX_M}, MAX_N={MAX_N}, MAX_K={MAX_K}") A = torch.randn((MAX_M, MAX_K), dtype=torch.half).cuda() B = torch.randn((MAX_K, MAX_N), dtype=torch.half).cuda() C = torch.randn((MAX_M, MAX_N), dtype=torch.half).cuda() torch.cuda.synchronize() - +end = time.time() +print(f"pre allocate for fast profiling done, time: {(end - start) * 1000} ms") MNKs = [(M, N, K) for M in Ms for N in Ns for K in Ks] + +PERF_COUNT = 0 for (M, N, K) in MNKs: MAX_TFLOPS = -1 + PERF_COUNT += 1 + print("-" * 130) + print(" " * 30 + f"M={M}, N={N}, K={K}, Warmup={args.warmup}, Iters={args.iters}, {PERF_COUNT}/{len(MNKs)}") print("-" * 130) - print(" " * 55 + f"M={M}, N={N}, K={K}") a = A[:M, :K].contiguous() b = B[:K, :N].contiguous() c = C[:M, :N].contiguous() torch.cuda.synchronize() - - if args.enable_cuda_all: + if args.enable_cuda_all: # more cuda cores kernel tests. 
# CUDA Cores FP16 - run_benchmark(lib.hgemm_naive_f16, a, b, "f16(naive)", c) - run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf, a, b, "f16x8pack(t8x8+bcf)", c) - - run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf, a, b, "f16x8pack(t8x8+dbuf)", c) - run_benchmark(lib.hgemm_t_8x8_sliced_k16_f16x8_pack_dbuf, a, b, "f16x8pack(t8x8+k16+dbuf)", c) - - print("-" * 68 + "WMMA" + "-" * 58) - # wmma api, stages, dsmem, swizzle - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2, a, b, "(mma4x2)", c) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4, a, b, "(mma4x2+warp2x4)", c) - - # prefer on NVIDIA L20 device. - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma2x4+warp2x4+stage3)", c, stages=3) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma2x4+warp2x4+stage2)", c, stages=2) - - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(mma2x4+...+stage3+dsmem)", c, stages=3) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(mma2x4+...+stage2+dsmem)", c, stages=2) - - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma2x4+...+stage3+swizzle)", c, stages=3, swizzle=True) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma2x4+...+stage2+swizzle)", c, stages=2, swizzle=True) - - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(...+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(...+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) - - if args.enable_wmma_all: + run_benchmark(lib.hgemm_naive_f16, a, b, "(naive)", c) + run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf, a, b, "(f16x8pack+t8x8+bcf)", c) + if not args.disable_default: + run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf, a, b, "(f16x8pack+t8x8+dbuf)", c) + run_benchmark(lib.hgemm_t_8x8_sliced_k16_f16x8_pack_dbuf, a, b, "(f16x8pack+t8x8+k16+dbuf)", c) + print("-" * 68 + "WMMA" + "-" * 58) + # wmma api, stages, dsmem, swizzle + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2, a, b, "(mma4x2)", c) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4, a, b, "(mma4x2+warp2x4)", c) + # prefer on NVIDIA L20 device. + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma4x2+warp2x4+stage3)", c, stages=3) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma4x2+warp2x4+stage2)", c, stages=2) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(mma4x2+warp2x4+stage3+dsmem)", c, stages=3) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(mma4x2+warp2x4+stage2+dsmem)", c, stages=2) + # swizzle + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma4x2+warp2x4+stage3+swizzle)", c, stages=3, swizzle=True) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages, a, b, "(mma4x2+warp2x4+stage2+swizzle)", c, stages=2, swizzle=True) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(mma4x2+warp2x4+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem, a, b, "(mma4x2+warp2x4+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) + # TODO: add MMA PTX kernel tests. + if args.enable_wmma_all: # more wmma kernel tests. + # TODO: add more stages tests for mma2x4/mma4x4, 4,5 etc. # prefer on NVIDIA TRX 3080 Laptop 16GB GDDR6 device. 
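+        # (Likely because mma4x4+warp4x4 uses a larger 256x256 block tile, i.e. more shared
+        # memory reuse per global load, which suits GPUs with less memory bandwidth than the L20.)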
- run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+...+stage3+dsmem)", c, stages=3) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+...+stage2+dsmem)", c, stages=2) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(warp2x4x2+...+stage3+dsmem)", c, stages=3) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(warp2x4x2+...+stage2+dsmem)", c, stages=2) - - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(warp2x4x2+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) - run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(warp2x4x2+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) - - run_benchmark(lib.hgemm_cublas_tensor_op, a, b, "f16(cublas)", c) - run_benchmark(partial(torch.matmul, out=c), a, b, "f16_th") + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+warp4x4+stage3+dsmem)", c, stages=3) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+warp4x4+stage2+dsmem)", c, stages=2) + # may not get good performance for warp_tile_k, e.g. warp2x2x2, warp2x4x2 etc. + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem, a, b, "(mma4x2+warp4x4+stage3+dsmem)", c, stages=3) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem, a, b, "(mma4x2+warp4x4+stage2+dsmem)", c, stages=2) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(mma4x2+warp2x4x2+stage3+dsmem)", c, stages=3) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(mma4x2+warp2x4x2+stage2+dsmem)", c, stages=2) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem, a, b, "(mma4x4+warp2x2x2+stage3+dsmem)", c, stages=3) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem, a, b, "(mma4x4+warp2x2x2+stage2+dsmem)", c, stages=2) + # swizzle + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+warp4x4+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) + run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem, a, b, "(mma4x4+warp4x4+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) + # may not get good performance for warp_tile_k, e.g. warp2x2x2, warp2x4x2 etc. 
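+        # (One plausible reason: warp_tile_k splits K inside each warp, so BK doubles to 32 and
+        # per-stage shared memory doubles, while the reuse of the C tile stays the same.)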
+ # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem, a, b, "(mma4x2+warp4x4+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem, a, b, "(mma4x2+warp4x4+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(mma4x2+warp2x4x2+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem, a, b, "(mma4x2+warp2x4x2+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem, a, b, "(mma4x4+warp2x2x2+stage3+dsmem+swizzle)", c, stages=3, swizzle=True) + # run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem, a, b, "(mma4x4+warp2x2x2+stage2+dsmem+swizzle)", c, stages=2, swizzle=True) + if args.enable_mma_all: # more mma kernel tests. + print("-" * 68 + "MMA" + "-" * 59) + pass + if args.enable_cublas: + run_benchmark(lib.hgemm_cublas_tensor_op, a, b, "(cublas)", c) + if args.enable_torch: + run_benchmark(partial(torch.matmul, out=c), a, b, "(torch)") torch.cuda.synchronize() print("-" * 130) diff --git a/hgemm/hgemm_wmma_stage.cu b/hgemm/hgemm_wmma_stage.cu index cb635439..b583f685 100644 --- a/hgemm/hgemm_wmma_stage.cu +++ b/hgemm/hgemm_wmma_stage.cu @@ -34,7 +34,7 @@ using namespace nvcuda; HOST_DEVICE_INLINE int div_ceil(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } -// stage2/3/4 (stage2=double buffers+copy async) +// stage2/3/4 (stage2=double buffers+copy async), 128x128, warp2x4(32,64,16) // 1. When using shared memory exceeds 48 KB, dynamic shared memory needs to be used, // i.e., declare a block of dynamic shared memory with extern shared half smem[];. // When calling the kernel, the size of the dynamic shared memory needs to be specified, @@ -249,7 +249,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel( } } -// stage2/3/4 (stage2=double buffers+copy async) +// stage2/3/4 (stage2=double buffers+copy async), 128x128, warp2x4(32,64,16) // 1. When using shared memory exceeds 48 KB, dynamic shared memory needs to be used, // i.e., declare a block of dynamic shared memory with extern shared half smem[];. // When calling the kernel, the size of the dynamic shared memory needs to be specified, @@ -480,7 +480,7 @@ hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel( } } -// stage with 256x256 block, dynamic smem +// stage with 256x256 block, warp4x4(64,64,16), dynamic smem // __launch_bounds__: avoid error 'too many resources required for launch' // reference: https://blog.csdn.net/feng__shuai/article/details/124395023 template +__global__ void __launch_bounds__(512) +hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem_kernel( + half* A, half* B, half* C, int M, int N, int K) { + // 512 threads(16 warps) per block. + // const int bx = blockIdx.x; + // BLOCK_SWIZZLE 0/1 control use block swizzle or not. 
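+  // When BLOCK_SWIZZLE is true the launcher (same pattern as the LAUNCH_*_SWIZZLE macros in
+  // this file) splits the N-tile dimension into N_SWIZZLE = ceil(N / swizzle_stride) groups
+  // along gridDim.z, and bx below rebuilds the global N-tile index from (blockIdx.z, blockIdx.x);
+  // blocks that are resident at the same time then touch a narrower slice of B columns, which
+  // is intended to improve L2 reuse.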
+ const int bx = ((int) BLOCK_SWIZZLE) * blockIdx.z * gridDim.x + blockIdx.x; + const int by = blockIdx.y; + const int NUM_K_TILES = div_ceil(K, WMMA_K * WARP_TILE_K); + constexpr int BM = WMMA_M * WMMA_TILE_M * WARP_TILE_M; // 16x4*2=128 + constexpr int BN = WMMA_N * WMMA_TILE_N * WARP_TILE_N; // 16x4*2=128 + constexpr int BK = WMMA_K * WARP_TILE_K; // 16*2=32 + // s2: 2*128*(32)*2=16KB, 2*32*(128+16)*2=18KB, ~42KB + // s3: 3*128*(32)*2=24KB, 3*32*(128+16)*2=27KB, ~51KB + // s4: 4*128*(32)*2=32KB, 4*32*(128+16)*2=36KB, ~68KB + // s5: 5*128*(32)*2=40KB, 5*32*(128+16)*2=45KB, ~85KB + extern __shared__ half smem[]; + half* s_a = smem; + half* s_b = smem + K_STAGE * BM * (BK + A_PAD); + constexpr int s_a_stage_offset = BM * (BK + A_PAD); + constexpr int s_b_stage_offset = BK * (BN + B_PAD); -// --------------------- PyTorch bindings for custom kernel ----------------------- -#define STRINGFY(str) #str -#define TORCH_BINDING_COMMON_EXTENSION(func) \ - m.def(STRINGFY(func), &func, STRINGFY(func)); + // 要保证相同的warp下thread执行相同的指令 + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0~7 warp_id within block + const int warp_m = warp_id / 4; // 0,1,2,3 + const int warp_n = warp_id % 4; // 0,1,2,3 + + // 先计算shared memory中的索引 + // tid和需要加载的smem s_a[BM][BK] 之间的索引关系 BM=128 BK=32 按行读取 A行主序 + // 对于s_a每行32个数据,每个线程读取8个,需要4个线程;总共128行,需要128x4刚好512线程 + int load_smem_a_m = tid / 4; // row 0~127 + int load_smem_a_k = (tid % 4) * 8; // col 0,8,16,24 + // tid和需要加载的smem s_b[BK][BN] 之间的索引关系 BK=32 BN=128 按行读取 B行主序 + // 对于s_b每行128个数据,每个线程读8个数据,需要16个线程;总共32行,需要32x16=256个线程 + int load_smem_b_k = tid / 16; // row 0~31 + int load_smem_b_n = (tid % 16) * 8; // col 0,8,...,120 + // 再计算全局内存中的索引 + // 要加载到s_a中的元素对应到A全局内存中的行数 每个block负责出C中大小为BM*BN的块 + int load_gmem_a_m = by * BM + load_smem_a_m; // global row of a and c + int load_gmem_b_n = bx * BN + load_smem_b_n; // global col of b and c -#define CHECK_TORCH_TENSOR_DTYPE(T, th_type) \ -if(((T).options().dtype() != (th_type))) { \ - std::cout << "Tensor Info:" << (T).options() << std::endl; \ - throw std::runtime_error("values must be "#th_type); \ -} + wmma::fragment + C_frag[WARP_TILE_M][WARP_TILE_N]; + + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + wmma::fill_fragment(C_frag[i][j], 0.0); + } + } -#define CHECK_TORCH_TENSOR_SHAPE(T, S0, S1) \ -if (((T).size(0) != (S0)) || ((T).size(1) != (S1))) { \ - throw std::runtime_error("Tensor size mismatch!"); \ -} + // only cvta smem base ptr once for cp.async. 
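+  // cp.async needs a destination address in the shared state space, so the generic smem
+  // pointers are converted once here; per-stage byte offsets are added for each K tile below.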
+ uint32_t smem_a_base_ptr = __cvta_generic_to_shared(s_a); + uint32_t smem_b_base_ptr = __cvta_generic_to_shared(s_b); -// 128x128 w/o dynamic smem -#define LAUNCH_161616_STAGE_SWIZZLE_KERNEL(stages, stride) \ -{ \ - const int N_SWIZZLE = (N + (stride) - 1) / (stride); \ - dim3 block(NUM_THREADS); \ - dim3 grid((div_ceil(N, BN) + N_SWIZZLE - 1) / N_SWIZZLE, \ - div_ceil(M, BM), \ - N_SWIZZLE); \ - hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel< \ - WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ - WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, \ - (stages), true><<>>( \ - reinterpret_cast(a.data_ptr()), \ - reinterpret_cast(b.data_ptr()), \ - reinterpret_cast(c.data_ptr()), \ - M, N, K \ - ); \ -} + #pragma unroll + for (int k = 0; k < (K_STAGE - 1); ++k) { // 0, 1 + // k * WMMA_K, WMMA_K=16 -> (k << 4) + int load_gmem_a_k = k * (WMMA_K * WARP_TILE_K) + load_smem_a_k; // global col of a + int load_gmem_a_addr = load_gmem_a_m * K + load_gmem_a_k; + int load_gmem_b_k = k * (WMMA_K * WARP_TILE_K) + load_smem_b_k; // global row of b + int load_gmem_b_addr = load_gmem_b_k * N + load_gmem_b_n; + + uint32_t load_smem_a_ptr = ( + smem_a_base_ptr + (k * s_a_stage_offset + + load_smem_a_m * (BK + A_PAD) + + load_smem_a_k) * sizeof(half) + ); -#define LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(stages) \ -{ \ - dim3 block(NUM_THREADS); \ - dim3 grid(div_ceil(N, BN), div_ceil(M, BM)); \ - hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel< \ - WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ - WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, \ - (stages), false><<>>( \ - reinterpret_cast(a.data_ptr()), \ - reinterpret_cast(b.data_ptr()), \ - reinterpret_cast(c.data_ptr()), \ - M, N, K \ - ); \ -} + uint32_t load_smem_b_ptr = ( + smem_b_base_ptr + (k * s_b_stage_offset + + load_smem_b_k * (BN + B_PAD) + + load_smem_b_n) * sizeof(half) + ); -// 128x128 stage 2/3/4 w/o block swizzle across N dim, static smem < 48KB -void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages( - torch::Tensor a, torch::Tensor b, torch::Tensor c, - int stages, bool swizzle, int swizzle_stride) { - CHECK_TORCH_TENSOR_DTYPE(a, torch::kHalf) - CHECK_TORCH_TENSOR_DTYPE(b, torch::kHalf) - CHECK_TORCH_TENSOR_DTYPE(c, torch::kHalf) - const int M = a.size(0); - const int K = a.size(1); - const int N = b.size(1); - CHECK_TORCH_TENSOR_SHAPE(a, M, K) - CHECK_TORCH_TENSOR_SHAPE(b, K, N) - CHECK_TORCH_TENSOR_SHAPE(c, M, N) - constexpr int WMMA_M = 16; - constexpr int WMMA_N = 16; - constexpr int WMMA_K = 16; - constexpr int WMMA_TILE_M = 4; - constexpr int WMMA_TILE_N = 2; - constexpr int WARP_TILE_M = 2; - constexpr int WARP_TILE_N = 4; - // s_a 4 ways bank conflicts within warp, after pad 8 -> 4 ways bank conflicts. - // s_b 16 ways bank conflicts within warp, after pad 8 -> 8 ways bank conflicts. - // s_b 16 ways bank conflicts within warp, after pad 16 -> 4 ways bank conflicts. - // so, the best padding policy for s_a and s_b is A_PAD=0/8, B_PAD=16. Thus, - // improve B_PAD consume 8x~ less smem than A_PAD, 16xB_PAD vs 128xA_PAD. 
- constexpr int A_PAD = 0; // 0,8,16 - constexpr int B_PAD = 16; // 0,8,16 - constexpr int NUM_THREADS= ( - WMMA_TILE_M * WMMA_TILE_N * WARP_SIZE); // 2 * 4 * 32 = 256 - constexpr int BM = WMMA_M * WMMA_TILE_M * WARP_TILE_M; - constexpr int BN = WMMA_N * WMMA_TILE_N * WARP_TILE_N; - constexpr int BK = WMMA_K; - // s2: 2*128*(16)*2=8KB, 2*16*(128+16)*2=9KB, ~17KB - // s3: 3*128*(16)*2=12KB, 3*16*(128+16)*2=13.5KB, ~26KB - // s4: 4*128*(16)*2=16KB, 4*16*(128+16)*2=18KB, ~34KB - // s5: 5*128*(16)*2=20KB, 5*16*(128+16)*2=22.5KB, ~43KB - if (swizzle) { - assert(swizzle_stride % 256 == 0); - switch (stages) - { - case 2: // ~17KB - LAUNCH_161616_STAGE_SWIZZLE_KERNEL(2, swizzle_stride); - break; - case 3: // ~26KB - LAUNCH_161616_STAGE_SWIZZLE_KERNEL(3, swizzle_stride); - break; - case 4: // ~34KB - LAUNCH_161616_STAGE_SWIZZLE_KERNEL(4, swizzle_stride); - break; - case 5: // ~43KB - LAUNCH_161616_STAGE_SWIZZLE_KERNEL(5, swizzle_stride); - break; - default: - LAUNCH_161616_STAGE_SWIZZLE_KERNEL(2, swizzle_stride); - break; - } - } else { - switch (stages) - { - case 2: - LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(2); - break; - case 3: - LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(3); - break; - case 4: - LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(4); - break; - default: - LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(2); - break; - } + CP_ASYNC_CG(load_smem_a_ptr, &A[load_gmem_a_addr], 16); + CP_ASYNC_CG(load_smem_b_ptr, &B[load_gmem_b_addr], 16); + + CP_ASYNC_COMMIT_GROUP(); } -} -// 128x128 w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... -#define LAUNCH_161616_STAGE_SWIZZLE_DSMEM_KERNEL(stages, stride) \ -{ \ - const int smem_max_size = ( \ - (stages) * BM * (BK + A_PAD) * sizeof(half) + \ - (stages) * BK * (BN + B_PAD) * sizeof(half)); \ - cudaFuncSetAttribute( \ - hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ - WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ - WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), true>, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - 98304); \ - const int N_SWIZZLE = (N + (stride) - 1) / (stride); \ - dim3 block(NUM_THREADS); \ - dim3 grid((div_ceil(N, BN) + N_SWIZZLE - 1) / N_SWIZZLE, \ - div_ceil(M, BM), \ - N_SWIZZLE); \ - hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ - WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ - WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), true><<< \ - grid, block, smem_max_size>>>( \ - reinterpret_cast(a.data_ptr()), \ - reinterpret_cast(b.data_ptr()), \ - reinterpret_cast(c.data_ptr()), \ - M, N, K \ - ); \ -} + CP_ASYNC_WAIT_GROUP(K_STAGE-2); // s2->0, s3->1, s4->2 + __syncthreads(); -#define LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_KERNEL(stages) \ -{ \ - const int smem_max_size = ( \ - (stages) * BM * (BK + A_PAD) * sizeof(half) + \ - (stages) * BK * (BN + B_PAD) * sizeof(half)); \ - cudaFuncSetAttribute( \ - hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ - WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ - WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), false>,\ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - 98304); \ - dim3 block(NUM_THREADS); \ - dim3 grid(div_ceil(N, BN), div_ceil(M, BM)); \ - hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ - WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ - WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), false><<<\ - grid, block, smem_max_size>>>( \ - reinterpret_cast(a.data_ptr()), \ - reinterpret_cast(b.data_ptr()), \ + #pragma unroll + for (int k = (K_STAGE - 1); k < NUM_K_TILES; k++) { + // s2/4 can use bitwise ops but s3 can not, 
so, we use mod + // ops for all stages kernel. s2: (k + 1)&1, s4: (k + 1)&3 + // s3: (k + 1) % 3 + int smem_sel = (k + 1) % K_STAGE; // s3 k 2->0, k 3->1, k 4->2... + int smem_sel_next = k % K_STAGE; // s3 k 2->2, k 3->0, k 4->1... + + // k * WMMA_K, WMMA_K=16 -> (k << 4) + int load_gmem_a_k = k * (WMMA_K * WARP_TILE_K) + load_smem_a_k; // global col of a + int load_gmem_a_addr = load_gmem_a_m * K + load_gmem_a_k; + int load_gmem_b_k = k * (WMMA_K * WARP_TILE_K) + load_smem_b_k; // global row of b + int load_gmem_b_addr = load_gmem_b_k * N + load_gmem_b_n; + + // load stage 2, k start from 2 + uint32_t load_smem_a_ptr = ( + smem_a_base_ptr + (smem_sel_next * s_a_stage_offset + + load_smem_a_m * (BK + A_PAD) + + load_smem_a_k) * sizeof(half) + ); + + uint32_t load_smem_b_ptr = ( + smem_b_base_ptr + (smem_sel_next * s_b_stage_offset + + load_smem_b_k * (BN + B_PAD) + + load_smem_b_n) * sizeof(half) + ); + + CP_ASYNC_CG(load_smem_a_ptr, &A[load_gmem_a_addr], 16); + CP_ASYNC_CG(load_smem_b_ptr, &B[load_gmem_b_addr], 16); + + CP_ASYNC_COMMIT_GROUP(); + + // WARP_TILE_K=2 + for (int warp_k = 0; warp_k < WARP_TILE_K; ++warp_k) { + wmma::fragment A_frag[WARP_TILE_M]; + wmma::fragment B_frag[WARP_TILE_N]; + const int warp_smem_k = warp_k * WMMA_K; // 0,16 + + // compute stage 0 + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + // load 2 tiles -> reg, smem a -> frags a, warp_m 0~3 + int warp_smem_a_m = warp_m * (WMMA_M * WARP_TILE_M) + i * WMMA_M; + half* load_smem_a_frag_ptr = (s_a + smem_sel * s_a_stage_offset + + warp_smem_a_m * (BK + A_PAD) + + warp_smem_k); + wmma::load_matrix_sync(A_frag[i], load_smem_a_frag_ptr, BK + A_PAD); + } + + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + // load 4 tiles -> reg, smem b -> frags b, warp_n 0~2 + int warp_smem_b_n = warp_n * (WMMA_N * WARP_TILE_N) + j * WMMA_N; + half* load_smem_b_frag_ptr = (s_b + smem_sel * s_b_stage_offset + + warp_smem_k * (BN + B_PAD) + + warp_smem_b_n); + wmma::load_matrix_sync(B_frag[j], load_smem_b_frag_ptr, BN + B_PAD); + } + + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + wmma::mma_sync(C_frag[i][j], A_frag[i], B_frag[j], C_frag[i][j]); + } + } + } + + CP_ASYNC_WAIT_GROUP(K_STAGE-2); + __syncthreads(); + } + + // make sure all memory issues ready. + if ((K_STAGE - 2) > 0) { + CP_ASYNC_WAIT_GROUP(0); + __syncthreads(); + } + + // processing last (K_STAGE-1) k iters. 
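+  // The main loop above always prefetches tile k while computing tile k-(K_STAGE-1), so the
+  // final (K_STAGE-1) K tiles are already resident in shared memory at this point and are
+  // consumed below without issuing any further cp.async.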
+ { + #pragma unroll + for (int k = 0; k < (K_STAGE - 1); k++) { + const int stage_sel = ((NUM_K_TILES - (K_STAGE - 1) + k) % K_STAGE); + + #pragma unroll + for (int warp_k = 0; warp_k < WARP_TILE_K; ++warp_k) { + wmma::fragment A_frag[WARP_TILE_M]; + wmma::fragment B_frag[WARP_TILE_N]; + const int warp_smem_k = warp_k * WMMA_K; // 0,16 + + // compute stage 0 + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + // load 2 tiles -> reg, smem a -> frags a, warp_m 0~3 + int warp_smem_a_m = warp_m * (WMMA_M * WARP_TILE_M) + i * WMMA_M; + half* load_smem_a_frag_ptr = (s_a + stage_sel * s_a_stage_offset + + warp_smem_a_m * (BK + A_PAD) + + warp_smem_k); + wmma::load_matrix_sync(A_frag[i], load_smem_a_frag_ptr, BK + A_PAD); + } + + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + // load 4 tiles -> reg, smem b -> frags b, warp_n 0~2 + int warp_smem_b_n = warp_n * (WMMA_N * WARP_TILE_N) + j * WMMA_N; + half* load_smem_b_frag_ptr = (s_b + stage_sel * s_b_stage_offset + + warp_smem_k * (BN + B_PAD) + + warp_smem_b_n); + wmma::load_matrix_sync(B_frag[j], load_smem_b_frag_ptr, BN + B_PAD); + } + + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + wmma::mma_sync(C_frag[i][j], A_frag[i], B_frag[j], C_frag[i][j]); + } + } + } + } + } + + // finally, store back to C matrix. + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + const int store_gmem_a_m = by * BM + warp_m * (WMMA_M * WARP_TILE_M) + i * WMMA_M; + const int store_gmem_a_n = bx * BN + warp_n * (WMMA_N * WARP_TILE_N) + j * WMMA_N; + wmma::store_matrix_sync(C + store_gmem_a_m * N + store_gmem_a_n, C_frag[i][j], N, + wmma::mem_row_major); + } + } +} + +// TODO: 256x128, Stages + K32 + Reg Buffers, mma4x2, warp4x4x2(64,64,16) +template +__global__ void __launch_bounds__(256) +hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem_kernel( + half* A, half* B, half* C, int M, int N, int K) { + // 256 threads(8 warps) per block. + // const int bx = blockIdx.x; + // BLOCK_SWIZZLE 0/1 control use block swizzle or not. 
+ const int bx = ((int) BLOCK_SWIZZLE) * blockIdx.z * gridDim.x + blockIdx.x; + const int by = blockIdx.y; + const int NUM_K_TILES = div_ceil(K, WMMA_K * WARP_TILE_K); + constexpr int BM = WMMA_M * WMMA_TILE_M * WARP_TILE_M; // 16x4*4=256 + constexpr int BN = WMMA_N * WMMA_TILE_N * WARP_TILE_N; // 16x2*4=128 + constexpr int BK = WMMA_K * WARP_TILE_K; // 16*2=32 + // s2: 2*128*(32)*2=16KB, 2*32*(128+16)*2=18KB, ~42KB + // s3: 3*128*(32)*2=24KB, 3*32*(128+16)*2=27KB, ~51KB + // s4: 4*128*(32)*2=32KB, 4*32*(128+16)*2=36KB, ~68KB + // s4: 5*128*(32)*2=40KB, 5*32*(128+16)*2=45KB, ~85KB + extern __shared__ half smem[]; + half* s_a = smem; + half* s_b = smem + K_STAGE * BM * (BK + A_PAD); + constexpr int s_a_stage_offset = BM * (BK + A_PAD); + constexpr int s_b_stage_offset = BK * (BN + B_PAD); + + // 要保证相同的warp下thread执行相同的指令 + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0~7 warp_id within block + const int warp_m = warp_id / 2; // 0,1,2,3 + const int warp_n = warp_id % 2; // 0,1 + + // 先计算shared memory中的索引 + // tid和需要加载的smem s_a[BM][BK] 之间的索引关系 BM=256 BK=32 按行读取 A行主序 + // 对于s_a每行16个数据,每个线程读取16个,需要1个线程;总共256行,刚好256线程 + int load_smem_a_m = tid; // row 0~255 + int load_smem_a_k = 0; // col 0,16 + // tid和需要加载的smem s_b[BK][BN] 之间的索引关系 BK=16 BN=128 按行读取 B行主序 + // 对于s_b每行128个数据,每个线程读8个数据,需要16个线程;总共16行,需要16x16=256个线程 + int load_smem_b_k = tid / 16; // row 0~15 + int load_smem_b_n = (tid % 16) * 8; // col 0,8,...,120 + // 再计算全局内存中的索引 + // 要加载到s_a中的元素对应到A全局内存中的行数 每个block负责出C中大小为BM*BN的块 + int load_gmem_a_m = by * BM + load_smem_a_m; // global row of a and c + int load_gmem_b_n = bx * BN + load_smem_b_n; // global col of b and c + + wmma::fragment + C_frag[WARP_TILE_M][WARP_TILE_N]; + + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + wmma::fill_fragment(C_frag[i][j], 0.0); + } + } + + // only cvta smem base ptr once for cp.async. + uint32_t smem_a_base_ptr = __cvta_generic_to_shared(s_a); + uint32_t smem_b_base_ptr = __cvta_generic_to_shared(s_b); + + #pragma unroll + for (int k = 0; k < (K_STAGE - 1); ++k) { // 0, 1 + // k * WMMA_K, WMMA_K=16 -> (k << 4) + int load_gmem_a_k = k * (WMMA_K * WARP_TILE_K) + load_smem_a_k; // global col of a + int load_gmem_a_addr = load_gmem_a_m * K + load_gmem_a_k; + int load_gmem_b_k = k * (WMMA_K * WARP_TILE_K) + load_smem_b_k; // global row of b + int load_gmem_b_addr = load_gmem_b_k * N + load_gmem_b_n; + + uint32_t load_smem_a_ptr = ( + smem_a_base_ptr + (k * s_a_stage_offset + + load_smem_a_m * (BK + A_PAD) + + load_smem_a_k) * sizeof(half) + ); + + uint32_t load_smem_b_ptr = ( + smem_b_base_ptr + (k * s_b_stage_offset + + load_smem_b_k * (BN + B_PAD) + + load_smem_b_n) * sizeof(half) + ); + + CP_ASYNC_CG(load_smem_a_ptr, &A[load_gmem_a_addr], 16); + CP_ASYNC_CG(load_smem_b_ptr, &B[load_gmem_b_addr], 16); + CP_ASYNC_CG(load_smem_a_ptr + 16, &A[load_gmem_a_addr + 8], 16); + + CP_ASYNC_COMMIT_GROUP(); + } + + CP_ASYNC_WAIT_GROUP(K_STAGE-2); // s2->0, s3->1, s4->2 + __syncthreads(); + + #pragma unroll + for (int k = (K_STAGE - 1); k < NUM_K_TILES; k++) { + // s2/4 can use bitwise ops but s3 can not, so, we use mod + // ops for all stages kernel. s2: (k + 1)&1, s4: (k + 1)&3 + // s3: (k + 1) % 3 + int smem_sel = (k + 1) % K_STAGE; // s3 k 2->0, k 3->1, k 4->2... + int smem_sel_next = k % K_STAGE; // s3 k 2->2, k 3->0, k 4->1... 
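+    // Worked example (K_STAGE=3): the two selectors walk the three smem buffers as
+    //   k             : 2 3 4 5 6 ...
+    //   smem_sel      : 0 1 2 0 1 ...  (buffer consumed by wmma this iteration)
+    //   smem_sel_next : 2 0 1 2 0 ...  (buffer refilled by cp.async this iteration)
+    // so a buffer written at iteration k is read again (K_STAGE-1) iterations later,
+    // which is why CP_ASYNC_WAIT_GROUP(K_STAGE-2) at the end of the loop is enough.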
+ + // k * WMMA_K, WMMA_K=16 -> (k << 4) + int load_gmem_a_k = k * (WMMA_K * WARP_TILE_K) + load_smem_a_k; // global col of a + int load_gmem_a_addr = load_gmem_a_m * K + load_gmem_a_k; + int load_gmem_b_k = k * (WMMA_K * WARP_TILE_K) + load_smem_b_k; // global row of b + int load_gmem_b_addr = load_gmem_b_k * N + load_gmem_b_n; + + // load stage 2, k start from 2 + uint32_t load_smem_a_ptr = ( + smem_a_base_ptr + (smem_sel_next * s_a_stage_offset + + load_smem_a_m * (BK + A_PAD) + + load_smem_a_k) * sizeof(half) + ); + + uint32_t load_smem_b_ptr = ( + smem_b_base_ptr + (smem_sel_next * s_b_stage_offset + + load_smem_b_k * (BN + B_PAD) + + load_smem_b_n) * sizeof(half) + ); + + CP_ASYNC_CG(load_smem_a_ptr, &A[load_gmem_a_addr], 16); + CP_ASYNC_CG(load_smem_b_ptr, &B[load_gmem_b_addr], 16); + CP_ASYNC_CG(load_smem_a_ptr + 16, &A[load_gmem_a_addr + 8], 16); + + CP_ASYNC_COMMIT_GROUP(); + + for (int warp_k = 0; warp_k < WARP_TILE_K; ++warp_k) { + wmma::fragment A_frag[WARP_TILE_M]; + wmma::fragment B_frag[WARP_TILE_N]; + const int warp_smem_k = warp_k * WMMA_K; // 0,16 + + // compute stage 0 + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + // load 2 tiles -> reg, smem a -> frags a, warp_m 0~3 + int warp_smem_a_m = warp_m * (WMMA_M * WARP_TILE_M) + i * WMMA_M; + half* load_smem_a_frag_ptr = (s_a + smem_sel * s_a_stage_offset + + warp_smem_a_m * (BK + A_PAD) + + warp_smem_k); + wmma::load_matrix_sync(A_frag[i], load_smem_a_frag_ptr, BK + A_PAD); + } + + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + // load 4 tiles -> reg, smem b -> frags b, warp_n 0~2 + int warp_smem_b_n = warp_n * (WMMA_N * WARP_TILE_N) + j * WMMA_N; + half* load_smem_b_frag_ptr = (s_b + smem_sel * s_b_stage_offset + + warp_smem_k * (BN + B_PAD) + + warp_smem_b_n); + wmma::load_matrix_sync(B_frag[j], load_smem_b_frag_ptr, BN + B_PAD); + } + + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + wmma::mma_sync(C_frag[i][j], A_frag[i], B_frag[j], C_frag[i][j]); + } + } + } + + CP_ASYNC_WAIT_GROUP(K_STAGE-2); + __syncthreads(); + } + + // make sure all memory issues ready. + if ((K_STAGE - 2) > 0) { + CP_ASYNC_WAIT_GROUP(0); + __syncthreads(); + } + + // processing last (K_STAGE-1) k iters. 
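+  // Why the extra wait above is needed: CP_ASYNC_WAIT_GROUP(K_STAGE-2) inside the loop
+  // only guarantees that all but the newest (K_STAGE-2) commit groups have landed in
+  // shared memory, which is enough for the buffer selected by smem_sel. Before the tail
+  // iterations below read the remaining (K_STAGE-1) buffers, every outstanding copy
+  // must be complete, hence CP_ASYNC_WAIT_GROUP(0) + __syncthreads(); for K_STAGE=2 the
+  // in-loop wait already left nothing pending, which is what the (K_STAGE-2) > 0 guard
+  // checks.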
+ { + #pragma unroll + for (int k = 0; k < (K_STAGE - 1); k++) { + const int stage_sel = ((NUM_K_TILES - (K_STAGE - 1) + k) % K_STAGE); + + #pragma unroll + for (int warp_k = 0; warp_k < WARP_TILE_K; ++warp_k) { + wmma::fragment A_frag[WARP_TILE_M]; + wmma::fragment B_frag[WARP_TILE_N]; + const int warp_smem_k = warp_k * WMMA_K; // 0,16 + + // compute stage 0 + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + // load 2 tiles -> reg, smem a -> frags a, warp_m 0~3 + int warp_smem_a_m = warp_m * (WMMA_M * WARP_TILE_M) + i * WMMA_M; + half* load_smem_a_frag_ptr = (s_a + stage_sel * s_a_stage_offset + + warp_smem_a_m * (BK + A_PAD) + + warp_smem_k); + wmma::load_matrix_sync(A_frag[i], load_smem_a_frag_ptr, BK + A_PAD); + } + + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + // load 4 tiles -> reg, smem b -> frags b, warp_n 0~2 + int warp_smem_b_n = warp_n * (WMMA_N * WARP_TILE_N) + j * WMMA_N; + half* load_smem_b_frag_ptr = (s_b + stage_sel * s_b_stage_offset + + warp_smem_k * (BN + B_PAD) + + warp_smem_b_n); + wmma::load_matrix_sync(B_frag[j], load_smem_b_frag_ptr, BN + B_PAD); + } + + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + wmma::mma_sync(C_frag[i][j], A_frag[i], B_frag[j], C_frag[i][j]); + } + } + } + } + } + + // finally, store back to C matrix. + #pragma unroll + for (int i = 0; i < WARP_TILE_M; ++i) { + #pragma unroll + for (int j = 0; j < WARP_TILE_N; ++j) { + const int store_gmem_a_m = by * BM + warp_m * (WMMA_M * WARP_TILE_M) + i * WMMA_M; + const int store_gmem_a_n = bx * BN + warp_n * (WMMA_N * WARP_TILE_N) + j * WMMA_N; + wmma::store_matrix_sync(C + store_gmem_a_m * N + store_gmem_a_n, C_frag[i][j], N, + wmma::mem_row_major); + } + } +} + +// TODO: Warp swizzle/permute support ? 
(MMA, not WMMA)
+
+// --------------------- PyTorch bindings for custom kernel -----------------------
+#define STRINGFY(str) #str
+#define TORCH_BINDING_COMMON_EXTENSION(func) \
+  m.def(STRINGFY(func), &func, STRINGFY(func));
+
+#define CHECK_TORCH_TENSOR_DTYPE(T, th_type)                 \
+if(((T).options().dtype() != (th_type))) {                   \
+  std::cout << "Tensor Info:" << (T).options() << std::endl; \
+  throw std::runtime_error("values must be "#th_type);       \
+}
+
+#define CHECK_TORCH_TENSOR_SHAPE(T, S0, S1)           \
+if (((T).size(0) != (S0)) || ((T).size(1) != (S1))) { \
+  throw std::runtime_error("Tensor size mismatch!");  \
+}
+
+// 128x128 warp2x4(32,64) w/o dynamic smem
+#define LAUNCH_161616_STAGE_SWIZZLE_KERNEL(stages, stride)  \
+{                                                           \
+  const int N_SWIZZLE = (N + (stride) - 1) / (stride);      \
+  dim3 block(NUM_THREADS);                                  \
+  dim3 grid((div_ceil(N, BN) + N_SWIZZLE - 1) / N_SWIZZLE,  \
+             div_ceil(M, BM),                               \
+             N_SWIZZLE);                                    \
+  hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel<        \
+    WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N,       \
+    WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD,                 \
+    (stages), true><<<grid, block>>>(                       \
+    reinterpret_cast<half*>(a.data_ptr()),                  \
+    reinterpret_cast<half*>(b.data_ptr()),                  \
+    reinterpret_cast<half*>(c.data_ptr()),                  \
+    M, N, K                                                 \
+  );                                                        \
+}
+
+#define LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(stages)       \
+{                                                           \
+  dim3 block(NUM_THREADS);                                  \
+  dim3 grid(div_ceil(N, BN), div_ceil(M, BM));              \
+  hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_kernel<        \
+    WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N,       \
+    WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD,                 \
+    (stages), false><<<grid, block>>>(                      \
+    reinterpret_cast<half*>(a.data_ptr()),                  \
+    reinterpret_cast<half*>(b.data_ptr()),                  \
+    reinterpret_cast<half*>(c.data_ptr()),                  \
+    M, N, K                                                 \
+  );                                                        \
+}
+
+// 128x128 warp2x4(32,64) stage 2/3/4 w/o block swizzle across N dim, static smem < 48KB
+void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages(
+  torch::Tensor a, torch::Tensor b, torch::Tensor c,
+  int stages, bool swizzle, int swizzle_stride) {
+  CHECK_TORCH_TENSOR_DTYPE(a, torch::kHalf)
+  CHECK_TORCH_TENSOR_DTYPE(b, torch::kHalf)
+  CHECK_TORCH_TENSOR_DTYPE(c, torch::kHalf)
+  const int M = a.size(0);
+  const int K = a.size(1);
+  const int N = b.size(1);
+  CHECK_TORCH_TENSOR_SHAPE(a, M, K)
+  CHECK_TORCH_TENSOR_SHAPE(b, K, N)
+  CHECK_TORCH_TENSOR_SHAPE(c, M, N)
+  constexpr int WMMA_M = 16;
+  constexpr int WMMA_N = 16;
+  constexpr int WMMA_K = 16;
+  constexpr int WMMA_TILE_M = 4;
+  constexpr int WMMA_TILE_N = 2;
+  constexpr int WARP_TILE_M = 2;
+  constexpr int WARP_TILE_N = 4;
+  // s_a 4 ways bank conflicts within warp, after pad 8 -> 4 ways bank conflicts.
+  // s_b 16 ways bank conflicts within warp, after pad 8 -> 8 ways bank conflicts.
+  // s_b 16 ways bank conflicts within warp, after pad 16 -> 4 ways bank conflicts.
+  // so, the best padding policy for s_a and s_b is A_PAD=0/8, B_PAD=16. Thus,
+  // increasing B_PAD consumes ~8x less smem than increasing A_PAD (16 x B_PAD vs 128 x A_PAD).
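+  // Worked numbers for the padding note above (illustrative, for this 128x128 block
+  // with BM=128, BK=16): padding is paid once per smem row, so A_PAD pads BM=128 rows
+  // of s_a while B_PAD pads only BK=16 rows of s_b. Per stage:
+  //   s_a: 128 * (16  + A_PAD) * sizeof(half) -> +2KB    per 8 halves of A_PAD
+  //   s_b:  16 * (128 + B_PAD) * sizeof(half) -> +0.25KB per 8 halves of B_PAD
+  // which is the ~8x difference noted above and why B_PAD=16 is affordable.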
+ constexpr int A_PAD = 0; // 0,8,16 + constexpr int B_PAD = 16; // 0,8,16 + constexpr int NUM_THREADS= ( + WMMA_TILE_M * WMMA_TILE_N * WARP_SIZE); // 2 * 4 * 32 = 256 + constexpr int BM = WMMA_M * WMMA_TILE_M * WARP_TILE_M; + constexpr int BN = WMMA_N * WMMA_TILE_N * WARP_TILE_N; + constexpr int BK = WMMA_K; + // s2: 2*128*(16)*2=8KB, 2*16*(128+16)*2=9KB, ~17KB + // s3: 3*128*(16)*2=12KB, 3*16*(128+16)*2=13.5KB, ~26KB + // s4: 4*128*(16)*2=16KB, 4*16*(128+16)*2=18KB, ~34KB + // s5: 5*128*(16)*2=20KB, 5*16*(128+16)*2=22.5KB, ~43KB + if (swizzle) { + assert(swizzle_stride % 256 == 0); + switch (stages) + { + case 2: // ~17KB + LAUNCH_161616_STAGE_SWIZZLE_KERNEL(2, swizzle_stride); + break; + case 3: // ~26KB + LAUNCH_161616_STAGE_SWIZZLE_KERNEL(3, swizzle_stride); + break; + case 4: // ~34KB + LAUNCH_161616_STAGE_SWIZZLE_KERNEL(4, swizzle_stride); + break; + case 5: // ~43KB + LAUNCH_161616_STAGE_SWIZZLE_KERNEL(5, swizzle_stride); + break; + default: + LAUNCH_161616_STAGE_SWIZZLE_KERNEL(2, swizzle_stride); + break; + } + } else { + switch (stages) + { + case 2: + LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(2); + break; + case 3: + LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(3); + break; + case 4: + LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(4); + break; + default: + LAUNCH_161616_STAGE_NO_SWIZZLE_KERNEL(2); + break; + } + } +} + +// 128x128 warp2x4(32,64) w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... +#define LAUNCH_161616_STAGE_SWIZZLE_DSMEM_KERNEL(stages, stride) \ +{ \ + const int smem_max_size = ( \ + (stages) * BM * (BK + A_PAD) * sizeof(half) + \ + (stages) * BK * (BN + B_PAD) * sizeof(half)); \ + cudaFuncSetAttribute( \ + hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), true>, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + 98304); \ + const int N_SWIZZLE = (N + (stride) - 1) / (stride); \ + dim3 block(NUM_THREADS); \ + dim3 grid((div_ceil(N, BN) + N_SWIZZLE - 1) / N_SWIZZLE, \ + div_ceil(M, BM), \ + N_SWIZZLE); \ + hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), true><<< \ + grid, block, smem_max_size>>>( \ + reinterpret_cast(a.data_ptr()), \ + reinterpret_cast(b.data_ptr()), \ + reinterpret_cast(c.data_ptr()), \ + M, N, K \ + ); \ +} + +#define LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_KERNEL(stages) \ +{ \ + const int smem_max_size = ( \ + (stages) * BM * (BK + A_PAD) * sizeof(half) + \ + (stages) * BK * (BN + B_PAD) * sizeof(half)); \ + cudaFuncSetAttribute( \ + hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), false>,\ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + 98304); \ + dim3 block(NUM_THREADS); \ + dim3 grid(div_ceil(N, BN), div_ceil(M, BM)); \ + hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, A_PAD, B_PAD, (stages), false><<<\ + grid, block, smem_max_size>>>( \ + reinterpret_cast(a.data_ptr()), \ + reinterpret_cast(b.data_ptr()), \ reinterpret_cast(c.data_ptr()), \ M, N, K \ ); \ } -// 128x128 stage 2/3/4 + dynamic smem, w/o block swizzle across N dim +// 128x128 warp2x4(32,64) stage 2/3/4 + dynamic smem, w/o block swizzle across N dim void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem( torch::Tensor a, torch::Tensor b, torch::Tensor c, int 
stages, bool swizzle, int swizzle_stride) { @@ -1220,7 +1701,7 @@ void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem( } } -// 256x256 w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... +// 256x256 warp4x4(64,64,32) w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... #define LAUNCH_161616_STAGE_SWIZZLE_DSMEM_256x256_KERNEL(stages, stride) \ { \ const int smem_max_size = ( \ @@ -1272,7 +1753,7 @@ void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem( ); \ } -// 256x256 stage 2/3/4 + dynamic smem, w/o block swizzle across N dim +// 256x256 warp4x4(64,64,32) stage 2/3/4 + dynamic smem, w/o block swizzle across N dim void hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem( torch::Tensor a, torch::Tensor b, torch::Tensor c, int stages, bool swizzle, int swizzle_stride) { @@ -1343,7 +1824,7 @@ void hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem( } } -// 128x128 warp2x4x2 w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... +// 128x128 warp2x4x2(32,64,32) w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... #define LAUNCH_161616_STAGE_SWIZZLE_DSMEM_K32_KERNEL(stages, stride)\ { \ const int smem_max_size = ( \ @@ -1399,6 +1880,7 @@ void hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem( ); \ } +// 128x128 warp2x4x2(32,64,32) void hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem( torch::Tensor a, torch::Tensor b, torch::Tensor c, int stages, bool swizzle, int swizzle_stride) { @@ -1473,3 +1955,265 @@ void hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem( } } } + +// 128x128 warp2x2x2(32,32,32) w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... +#define LAUNCH_161616_STAGE_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(stages, stride)\ +{ \ + const int smem_max_size = ( \ + (stages) * BM * (BK + A_PAD) * sizeof(half) + \ + (stages) * BK * (BN + B_PAD) * sizeof(half)); \ + cudaFuncSetAttribute( \ + hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), true>, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + 98304); \ + const int N_SWIZZLE = (N + (stride) - 1) / (stride); \ + dim3 block(NUM_THREADS); \ + dim3 grid((div_ceil(N, BN) + N_SWIZZLE - 1) / N_SWIZZLE, \ + div_ceil(M, BM), \ + N_SWIZZLE); \ + hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), true><<< \ + grid, block, smem_max_size>>>( \ + reinterpret_cast(a.data_ptr()), \ + reinterpret_cast(b.data_ptr()), \ + reinterpret_cast(c.data_ptr()), \ + M, N, K \ + ); \ +} + +#define LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(stages) \ +{ \ + const int smem_max_size = ( \ + (stages) * BM * (BK + A_PAD) * sizeof(half) + \ + (stages) * BK * (BN + B_PAD) * sizeof(half)); \ + cudaFuncSetAttribute( \ + hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), false>, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + 98304); \ + dim3 block(NUM_THREADS); \ + dim3 grid(div_ceil(N, BN), div_ceil(M, BM)); \ + hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), false><<< \ + grid, block, smem_max_size>>>( \ + reinterpret_cast(a.data_ptr()), \ + reinterpret_cast(b.data_ptr()), \ + reinterpret_cast(c.data_ptr()), \ + M, N, K \ + ); \ +} + 
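+// The dsmem launch macros above all compute the dynamic smem request as
+//   (stages)*BM*(BK + A_PAD)*sizeof(half) + (stages)*BK*(BN + B_PAD)*sizeof(half).
+// The small helper below is an illustrative sketch (the name is ours; the macros do
+// not use it) that makes the budget explicit. For the warp2x2x2 config that follows
+// (BM=128, BN=128, BK=32, A_PAD=0, B_PAD=16) it evaluates to roughly 34/51/68/85 KB
+// for stages 2/3/4/5, comfortably under the 98304-byte (96KB) cap requested through
+// cudaFuncAttributeMaxDynamicSharedMemorySize.
+constexpr int hgemm_stages_smem_bytes(int stages, int BM, int BN, int BK,
+                                      int A_PAD, int B_PAD) {
+  return stages * BM * (BK + A_PAD) * (int)sizeof(half) +
+         stages * BK * (BN + B_PAD) * (int)sizeof(half);
+}
+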
+void hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem( + torch::Tensor a, torch::Tensor b, torch::Tensor c, + int stages, bool swizzle, int swizzle_stride) { + CHECK_TORCH_TENSOR_DTYPE(a, torch::kHalf) + CHECK_TORCH_TENSOR_DTYPE(b, torch::kHalf) + CHECK_TORCH_TENSOR_DTYPE(c, torch::kHalf) + const int M = a.size(0); + const int K = a.size(1); + const int N = b.size(1); + CHECK_TORCH_TENSOR_SHAPE(a, M, K) + CHECK_TORCH_TENSOR_SHAPE(b, K, N) + CHECK_TORCH_TENSOR_SHAPE(c, M, N) + constexpr int WMMA_M = 16; + constexpr int WMMA_N = 16; + constexpr int WMMA_K = 16; + constexpr int WMMA_TILE_M = 4; + constexpr int WMMA_TILE_N = 4; + constexpr int WARP_TILE_M = 2; + constexpr int WARP_TILE_N = 2; + constexpr int WARP_TILE_K = 2; + // s_a 4 ways bank conflicts within warp, after pad 8 -> 4 ways bank conflicts. + // s_b 16 ways bank conflicts within warp, after pad 8 -> 8 ways bank conflicts. + // s_b 16 ways bank conflicts within warp, after pad 16 -> 4 ways bank conflicts. + // so, the best padding policy for s_a and s_b is A_PAD=0/8, B_PAD=16. Thus, + // improve B_PAD consume 8x~ less smem than A_PAD, 16xB_PAD vs 128xA_PAD. + constexpr int A_PAD = 0; // 0,8,16 + constexpr int B_PAD = 16; // 0,8,16 + constexpr int NUM_THREADS= ( + WMMA_TILE_M * WMMA_TILE_N * WARP_SIZE); // 4 * 4 * 32 = 512 + constexpr int BM = WMMA_M * WMMA_TILE_M * WARP_TILE_M; + constexpr int BN = WMMA_N * WMMA_TILE_N * WARP_TILE_N; + constexpr int BK = WMMA_K * WARP_TILE_K; + + if (swizzle) { + assert(swizzle_stride % 256 == 0); + switch (stages) + { + case 2: + LAUNCH_161616_STAGE_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(2, swizzle_stride); + break; + case 3: + LAUNCH_161616_STAGE_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(3, swizzle_stride); + break; + case 4: + LAUNCH_161616_STAGE_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(4, swizzle_stride); + break; + case 5: + LAUNCH_161616_STAGE_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(5, swizzle_stride); + break; + default: + LAUNCH_161616_STAGE_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(2, swizzle_stride); + break; + } + } else { + switch (stages) + { + case 2: + LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(2); + break; + case 3: + LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(3); + break; + case 4: + LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(4); + break; + case 5: + LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(5); + break; + default: + LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_K32_MMA4x4_KERNEL(2); + break; + } + } +} + +// 256x128 warp4x4(64,64,16) w dynamic smem, 98304=96KB < Ampere, Ada, Hopper ... 
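+// Layout recap (derived from the launcher constants below): WMMA_TILE_M x WMMA_TILE_N
+// = 4x2 warps (256 threads), each warp owning a WARP_TILE_M x WARP_TILE_N = 4x4 grid
+// of 16x16 WMMA tiles, i.e. a 64x64 sub-tile of the 256x128 block; WARP_TILE_K=1 keeps
+// BK at 16. With A_PAD=0, B_PAD=16 the macro smem formula gives roughly
+// 25/37.5/50/62.5 KB for stages 2/3/4/5.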
+#define LAUNCH_161616_STAGE_SWIZZLE_DSMEM_WARP4X4_KERNEL(stages, stride) \ +{ \ + const int smem_max_size = ( \ + (stages) * BM * (BK + A_PAD) * sizeof(half) + \ + (stages) * BK * (BN + B_PAD) * sizeof(half)); \ + cudaFuncSetAttribute( \ + hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), true>, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + 98304); \ + const int N_SWIZZLE = (N + (stride) - 1) / (stride); \ + dim3 block(NUM_THREADS); \ + dim3 grid((div_ceil(N, BN) + N_SWIZZLE - 1) / N_SWIZZLE, \ + div_ceil(M, BM), \ + N_SWIZZLE); \ + hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), true><<< \ + grid, block, smem_max_size>>>( \ + reinterpret_cast(a.data_ptr()), \ + reinterpret_cast(b.data_ptr()), \ + reinterpret_cast(c.data_ptr()), \ + M, N, K \ + ); \ +} + +#define LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_WARP4x4_KERNEL(stages) \ +{ \ + const int smem_max_size = ( \ + (stages) * BM * (BK + A_PAD) * sizeof(half) + \ + (stages) * BK * (BN + B_PAD) * sizeof(half)); \ + cudaFuncSetAttribute( \ + hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), false>, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + 98304); \ + dim3 block(NUM_THREADS); \ + dim3 grid(div_ceil(N, BN), div_ceil(M, BM)); \ + hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem_kernel< \ + WMMA_M, WMMA_N, WMMA_K, WMMA_TILE_M, WMMA_TILE_N, \ + WARP_TILE_M, WARP_TILE_N, WARP_TILE_K, \ + A_PAD, B_PAD, (stages), false><<< \ + grid, block, smem_max_size>>>( \ + reinterpret_cast(a.data_ptr()), \ + reinterpret_cast(b.data_ptr()), \ + reinterpret_cast(c.data_ptr()), \ + M, N, K \ + ); \ +} + +void hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem( + torch::Tensor a, torch::Tensor b, torch::Tensor c, + int stages, bool swizzle, int swizzle_stride) { + CHECK_TORCH_TENSOR_DTYPE(a, torch::kHalf) + CHECK_TORCH_TENSOR_DTYPE(b, torch::kHalf) + CHECK_TORCH_TENSOR_DTYPE(c, torch::kHalf) + const int M = a.size(0); + const int K = a.size(1); + const int N = b.size(1); + CHECK_TORCH_TENSOR_SHAPE(a, M, K) + CHECK_TORCH_TENSOR_SHAPE(b, K, N) + CHECK_TORCH_TENSOR_SHAPE(c, M, N) + constexpr int WMMA_M = 16; + constexpr int WMMA_N = 16; + constexpr int WMMA_K = 16; + constexpr int WMMA_TILE_M = 4; + constexpr int WMMA_TILE_N = 2; + constexpr int WARP_TILE_M = 4; + constexpr int WARP_TILE_N = 4; + constexpr int WARP_TILE_K = 1; + // s_a 4 ways bank conflicts within warp, after pad 8 -> 4 ways bank conflicts. + // s_b 16 ways bank conflicts within warp, after pad 8 -> 8 ways bank conflicts. + // s_b 16 ways bank conflicts within warp, after pad 16 -> 4 ways bank conflicts. + // so, the best padding policy for s_a and s_b is A_PAD=0/8, B_PAD=16. Thus, + // improve B_PAD consume 8x~ less smem than A_PAD, 16xB_PAD vs 128xA_PAD. 
+  constexpr int A_PAD = 0; // 0,8,16
+  constexpr int B_PAD = 16; // 0,8,16
+  constexpr int NUM_THREADS= (
+    WMMA_TILE_M * WMMA_TILE_N * WARP_SIZE); // 4 * 2 * 32 = 256
+  constexpr int BM = WMMA_M * WMMA_TILE_M * WARP_TILE_M;
+  constexpr int BN = WMMA_N * WMMA_TILE_N * WARP_TILE_N;
+  constexpr int BK = WMMA_K * WARP_TILE_K;
+
+  if (swizzle) {
+    assert(swizzle_stride % 256 == 0);
+    switch (stages)
+    {
+    case 2:
+      LAUNCH_161616_STAGE_SWIZZLE_DSMEM_WARP4X4_KERNEL(2, swizzle_stride);
+      break;
+    case 3:
+      LAUNCH_161616_STAGE_SWIZZLE_DSMEM_WARP4X4_KERNEL(3, swizzle_stride);
+      break;
+    case 4:
+      LAUNCH_161616_STAGE_SWIZZLE_DSMEM_WARP4X4_KERNEL(4, swizzle_stride);
+      break;
+    case 5:
+      LAUNCH_161616_STAGE_SWIZZLE_DSMEM_WARP4X4_KERNEL(5, swizzle_stride);
+      break;
+    default:
+      LAUNCH_161616_STAGE_SWIZZLE_DSMEM_WARP4X4_KERNEL(2, swizzle_stride);
+      break;
+    }
+  } else {
+    switch (stages)
+    {
+    case 2:
+      LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_WARP4x4_KERNEL(2);
+      break;
+    case 3:
+      LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_WARP4x4_KERNEL(3);
+      break;
+    case 4:
+      LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_WARP4x4_KERNEL(4);
+      break;
+    case 5:
+      LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_WARP4x4_KERNEL(5);
+      break;
+    default:
+      LAUNCH_161616_STAGE_NO_SWIZZLE_DSMEM_WARP4x4_KERNEL(2);
+      break;
+    }
+  }
+}
\ No newline at end of file
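+
+// How these launchers reach Python: the module registration is not part of this diff,
+// so the block below is only an illustrative sketch built from the
+// TORCH_BINDING_COMMON_EXTENSION macro defined above (the module name is whatever
+// TORCH_EXTENSION_NAME the build supplies).
+//
+// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+//   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages)
+//   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stages_dsmem)
+//   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x4_warp4x4_stages_dsmem)
+//   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_stages_dsmem)
+//   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x4_warp2x2x2_stages_dsmem)
+//   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp4x4_stages_dsmem)
+// }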