Merged

Bench #135

91 commits
823ef7d
FP8 Bench work
micmelesse Mar 18, 2025
a8de016
fp8 seems slower
micmelesse Mar 21, 2025
0ee7031
clean up newer benching code. fp8 is slower
micmelesse Mar 24, 2025
a697a69
output markdown and multiple types
micmelesse Mar 24, 2025
e84be04
bench all supported_dtypes for function by default
micmelesse Mar 24, 2025
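The default dtype sweep this commit describes can be sketched roughly as follows (a hypothetical helper; the names `bench_all_dtypes` and the dtype list are illustrative, not taken from bench.py):

```python
# Hypothetical sketch: bench every supported dtype for a function by default.
SUPPORTED_DTYPES = ["fp16", "bf16", "fp8"]  # illustrative; the real list lives in bench.py

def bench_all_dtypes(bench_fn, dtypes=None):
    """Run bench_fn once per dtype and collect results keyed by dtype."""
    dtypes = SUPPORTED_DTYPES if dtypes is None else dtypes
    return {dtype: bench_fn(dtype) for dtype in dtypes}

# Example: a dummy "benchmark" that just echoes the dtype name.
results = bench_all_dtypes(lambda dtype: dtype.upper())
```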
2149fdf
add dockerignore
micmelesse Mar 25, 2025
1855bb3
need the .git for submodule update
micmelesse Mar 25, 2025
605faae
ignore training data
micmelesse Mar 25, 2025
d929a41
get ready for ck
micmelesse Mar 25, 2025
b39a1de
forward ck bench working
micmelesse Mar 25, 2025
e6f4a46
triton versus ck works
micmelesse Mar 25, 2025
e984f1e
tuned triton perf comp
micmelesse Mar 26, 2025
6aef88c
collect env flags
micmelesse Mar 26, 2025
7cd64d8
bench varlen and kvcache
micmelesse Mar 26, 2025
a15f433
function configs
micmelesse Mar 27, 2025
2cd03e4
show relative percentage diff
micmelesse Mar 27, 2025
22c668e
positive means triton faster, negative means ck is faster
micmelesse Mar 27, 2025
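The sign convention from the two commits above ("show relative percentage diff", positive favors Triton) can be sketched as a hypothetical helper; the actual formula in bench.py may differ:

```python
def relative_diff_pct(triton_ms: float, ck_ms: float) -> float:
    """Percent difference between Triton and CK timings.

    Positive => Triton is faster, negative => CK is faster
    (hypothetical formula matching the stated sign convention).
    """
    return (ck_ms - triton_ms) / ck_ms * 100.0

# Triton at 0.8 ms vs CK at 1.0 ms -> roughly +20%, i.e. Triton faster.
diff = relative_diff_pct(0.8, 1.0)
```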
0524f86
save
micmelesse Mar 27, 2025
5c6a100
add new decode impl with switch flag
micmelesse Mar 28, 2025
27b28b0
batch 1 and nheads 1 seems to work
micmelesse Mar 28, 2025
29e9296
autotune by default
micmelesse Mar 28, 2025
72266d5
simple stride calc in old impl
micmelesse Mar 28, 2025
3998016
fixed bug due to strides are bhsd
micmelesse Mar 28, 2025
78f0612
rename the dim_k
micmelesse Apr 2, 2025
5ab0441
clean up
micmelesse Apr 2, 2025
de6c9e9
old path works
micmelesse Apr 3, 2025
84722a7
rm block ptrs for q
micmelesse Apr 3, 2025
be65b1e
rm block_ptrs for k
micmelesse Apr 3, 2025
51f6650
rm block_ptrs for v
micmelesse Apr 3, 2025
1f09c12
rm block_ptrs from o
micmelesse Apr 3, 2025
7fd4adf
disable debug on bench
micmelesse Apr 3, 2025
1aa1a95
clean up
micmelesse Apr 3, 2025
b551715
clean up names
micmelesse Apr 3, 2025
a65c31d
compute offs_k properly
micmelesse Apr 3, 2025
5c0cb60
pass padded head to reduce kernel
micmelesse Apr 3, 2025
3f840fc
fix o_mask bug
micmelesse Apr 3, 2025
99f8fa1
rm old impl
micmelesse Apr 4, 2025
1d1a908
lambda grid
micmelesse Apr 4, 2025
14342a9
save final
micmelesse Apr 4, 2025
519f026
ignore git stuff
micmelesse Apr 7, 2025
a17da06
add inference params to prefill
micmelesse Apr 7, 2025
da22125
cache seqlens working
micmelesse Apr 7, 2025
497ee9a
most cases work except newkv
micmelesse Apr 8, 2025
16384ed
fix minor bugs when running fwd and bwd
micmelesse Apr 8, 2025
456a68e
check for backend
micmelesse Apr 9, 2025
1e42662
don't ignore .git
micmelesse Apr 9, 2025
53c2ed3
add modes
micmelesse Apr 9, 2025
437b215
bench bwd
micmelesse Apr 9, 2025
923e077
add llama configs
micmelesse Apr 9, 2025
f73d3bd
test fwd impl
micmelesse Apr 9, 2025
3529475
run bwd_impl
micmelesse Apr 9, 2025
528feb7
move fp8 code
micmelesse Apr 10, 2025
1e57dcc
use Decode kernel for kvcache
micmelesse Apr 10, 2025
ec7d089
fix fp8 import bug
micmelesse Apr 10, 2025
c15892b
fix bug
micmelesse Apr 11, 2025
f379dfa
add arch in report
micmelesse Apr 11, 2025
d397d8a
clean up test suite
micmelesse Apr 11, 2025
0af8590
fix fp8 typos
micmelesse Apr 11, 2025
b5fed8f
run ci
micmelesse Apr 11, 2025
2c598b4
add fused kernel
micmelesse Apr 11, 2025
df7ad2f
add one kernel
micmelesse Apr 11, 2025
43fbc22
update ci and readme
micmelesse Apr 11, 2025
202d8ec
report ratios and remove split impl test expand bwd impl test
micmelesse Apr 11, 2025
af74966
use split kernel
micmelesse Apr 11, 2025
009b9e4
get one kernel working
micmelesse Apr 11, 2025
34e85b7
use flag to switch bwd mode
micmelesse Apr 11, 2025
e17cb36
clean up test_ir
micmelesse Apr 14, 2025
85cc686
one kernel has its own copy of the bwd kernels
micmelesse Apr 14, 2025
3f609da
autotune stub
micmelesse Apr 14, 2025
6badc44
pass og metaparams by default
micmelesse Apr 14, 2025
52366a4
add autotune configs
micmelesse Apr 14, 2025
15b1569
add tuning configs
micmelesse Apr 14, 2025
98843cd
update fused kernel code
micmelesse Apr 14, 2025
a75f665
use jingning
micmelesse Apr 15, 2025
a50deff
no auto tune for bwd
micmelesse Apr 15, 2025
b572b5c
simpler varlen branching
micmelesse Apr 15, 2025
011ad15
fix constexpr bug
micmelesse Apr 15, 2025
9fcd9c9
fix varlen fp8
micmelesse Apr 15, 2025
861f1f5
qkv fp8 working
micmelesse Apr 15, 2025
47dd5a0
fp8 qkv varlen green
micmelesse Apr 15, 2025
25b406a
fix bench functions
micmelesse Apr 15, 2025
fa157cc
pick bench functions
micmelesse Apr 15, 2025
05775d2
bench defaults set
micmelesse Apr 16, 2025
44dc9be
fix bug
micmelesse Apr 16, 2025
da3f440
add bench deps
micmelesse Apr 16, 2025
a2472a9
bench env variations
micmelesse Apr 16, 2025
96cb0cd
per backend env configs
micmelesse Apr 16, 2025
7b1f6f7
fix bug
micmelesse Apr 16, 2025
ab28894
add improved fused kernel
micmelesse Apr 16, 2025
03b8df4
fix bug
micmelesse Apr 17, 2025
996f62b
final clean up
micmelesse Apr 17, 2025
14 changes: 14 additions & 0 deletions .dockerignore
@@ -0,0 +1,14 @@
+.eggs
+.gitignore
+build
+dist
+flash_attn.egg-info
+log
+scripts
+training/data
+gpucore*
+bench*.md
+*.pth
+*.html
+*.png
+*.csv
22 changes: 7 additions & 15 deletions .github/workflows/amd_nightly.yml
@@ -50,28 +50,20 @@ jobs:

       - name: Install dependencies for bench and misc
         run: |
-          pip install matplotlib pandas pytest
+          pip install numpy==1.24 matplotlib pandas tabulate

       # FIXME: run the full suite
       - name: AMD Internal Tests
         run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" pytest flash_attn/flash_attn_triton_amd/test.py::test_fp8
+          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0 pytest flash_attn/flash_attn_triton_amd/test.py

-      - name: AMD Bench
-        if: False
+      - name: Flash Attention Tests
         run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=1 python flash_attn/flash_attn_triton_amd/bench.py
+          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0 pytest tests/test_flash_attn_triton_amd.py

-      # run big test suites
-      - name: Flash Attention Tests using Pytorch reference implementation
-        if: False
+      - name: AMD Bench
         run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_REF=1 pytest tests/test_flash_attn_triton_amd.py
+          python flash_attn/flash_attn_triton_amd/bench.py -benchmark_fn flash_attn_func flash_attn_varlen_func flash_attn_with_kvcache

-      - name: Flash Attention Tests
-        run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" pytest tests/test_flash_attn_triton_amd.py

   Nightly-RDNA-AMD:
     runs-on: ${{ matrix.runner }}
     strategy:
@@ -110,4 +102,4 @@ jobs:

       - name: Flash Attention Tests
         run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" pytest tests/test_flash_attn_triton_amd.py::test_flash_attn_output
+          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0 pytest tests/test_flash_attn_triton_amd.py::test_flash_attn_output
20 changes: 6 additions & 14 deletions .github/workflows/amd_tests.yml
@@ -48,24 +48,16 @@ jobs:

       - name: Install dependencies for bench and misc
         run: |
-          pip install matplotlib pandas pytest
+          pip install numpy==1.24 matplotlib pandas tabulate

       # FIXME: run the full suite
       - name: AMD Internal Tests
         run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" pytest flash_attn/flash_attn_triton_amd/test.py::test_fp8
+          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0 pytest flash_attn/flash_attn_triton_amd/test.py

-      - name: AMD Bench
-        if: False
-        run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=1 python flash_attn/flash_attn_triton_amd/bench.py
-
-      # run big test suites
-      - name: Flash Attention Tests using Pytorch reference implementation
-        if: False
+      - name: Flash Attention Tests
         run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_REF=1 pytest tests/test_flash_attn_triton_amd.py
+          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0 pytest tests/test_flash_attn_triton_amd.py

-      - name: Flash Attention Tests
+      - name: AMD Bench
         run: |
-          FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" pytest tests/test_flash_attn_triton_amd.py
+          python flash_attn/flash_attn_triton_amd/bench.py -benchmark_fn flash_attn_func flash_attn_varlen_func flash_attn_with_kvcache
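The CI steps above can be reproduced locally. This is a sketch using only the commands visible in the diff, assuming this PR's repo layout and a ROCm machine with the package installed:

```shell
# Run the test suite with autotune off, then the default bench, as in CI.
export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0 pytest tests/test_flash_attn_triton_amd.py
python flash_attn/flash_attn_triton_amd/bench.py \
  -benchmark_fn flash_attn_func flash_attn_varlen_func flash_attn_with_kvcache
```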
5 changes: 5 additions & 0 deletions .gitignore
@@ -33,6 +33,7 @@ venv
 scripts
 csrc/flash_attn_ck
 .eggs
+log
 *.log
 core.*
 gpucore.*
@@ -42,5 +43,9 @@ gpucore.*
 *.json
 *.txt
 *.pth
 *.md
 training/logs
+training/data
+# ck modules
+csrc/composable_kernel
+csrc/cutlass
5 changes: 5 additions & 0 deletions README.md
@@ -170,6 +170,11 @@ To test that things are working, you can run our tests. These tests take hours s
 FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" pytest tests/test_flash_attn_triton_amd.py
 ```
 
+You can enable autotuning for better performance by setting the `FLASH_ATTENTION_TRITON_AMD_AUTOTUNE="TRUE"` flag:
+```
+FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE="TRUE" python $PATH_TO_CODE
+```
+
 ###### Docker
 You can also use the Dockerfile below which does the above steps on top of the latest rocm/pytorch image.
 ```