Merged
Commits
51 commits
8d5ba3f
A new Triton compiler sans CUDA support.
xinyazhang Jun 18, 2024
003b06e
Fix the compiler for new Triton
xinyazhang Jun 18, 2024
cd9615c
Mitigate compiler bug (https://github.com/ROCm/triton/issues/596)
xinyazhang Jun 18, 2024
77fa1e1
add wheel as another required package.
xinyazhang Jun 18, 2024
579bf82
Port to performant kernel and moving away from block pointers for tl.…
xinyazhang Jun 19, 2024
2a6872a
Fix the off_h_k computation.
xinyazhang Jun 25, 2024
c62c904
Fix writing to encoded_softmax
xinyazhang Jun 25, 2024
8be265d
Submit the Triton kernel as we are testing. All UTs passed
xinyazhang Jun 25, 2024
e150f54
remove debugging output
xinyazhang Jun 25, 2024
713e87a
cpp tuning: Add basic C++ tuning support
xinyazhang Jul 8, 2024
36ca9fe
v2src/flash/attn_fwd: add missing num_head_q and num_head_k
xinyazhang Jul 8, 2024
63cc8fb
Flash API now returns selected psels and copts to extra arguments, if…
xinyazhang Jul 8, 2024
fdd14e1
Implement tune_flash with AOT kernels
xinyazhang Jul 9, 2024
0e988f9
Fix the dropout_mask and add a progressbar to test/tune_flash.py
xinyazhang Jul 9, 2024
c4c201f
Save memory for long seq length
xinyazhang Jul 9, 2024
8528c7a
Update the tuning database for MI200 only GPUs
xinyazhang Jul 10, 2024
1d4fbd0
Remove seqlen_q/k >= 32k rows from the database
xinyazhang Jul 10, 2024
dd6a26b
Fix CMakeLists. Do not pass empty string as cmd argument if GENERATE_…
xinyazhang Jul 10, 2024
98c404c
Return hipErrorSharedObjectSymbolNotFound for untuned cases.
xinyazhang Jul 10, 2024
ec12934
Fix test/test_backward.py
xinyazhang Jul 10, 2024
4513dfe
Fix AUTOTUNE_KEYS for backward kernels.
xinyazhang Jul 10, 2024
350b6bb
tritonsrc: add type annotation 'i32' to num_seqlens, and fix varlen h…
xinyazhang Jul 10, 2024
ece99b8
fix the assignment of .num_head_q/k
xinyazhang Jul 11, 2024
b3f9dab
Add Navi 31/32 compiler options.
xinyazhang Jul 11, 2024
d33cf43
Fix various problems and now most fwd kernel tests passed.
xinyazhang Jul 11, 2024
ad33017
Various fixes to tune_flash
xinyazhang Jul 11, 2024
09583e2
Make zstd quite
xinyazhang Jul 12, 2024
87262c1
Add draft document 'How To Generate Tuning Database.md'
xinyazhang Jul 12, 2024
f95d878
doc -> docs
xinyazhang Jul 12, 2024
a5a3189
Debugging output in bwd kernel
xinyazhang Jul 12, 2024
c091ef3
add num_head_q/k argument to varlen's attention module.
xinyazhang Jul 12, 2024
34ac678
tritonsrc/performance_forward: read env var N_CTX to determine testin…
xinyazhang Jul 12, 2024
22c3197
Reduce the tuning time since there are too many cases to test...
xinyazhang Jul 12, 2024
0b40af3
cpp autotune: x2 num_warps if warp_size == 32
xinyazhang Jul 11, 2024
7457a5a
Navi32: skip autotune configs that takes too long to build
xinyazhang Jul 12, 2024
b7d647c
Add --use_multigpu to test/tune_flash.py for multi-GPU tuning
xinyazhang Jul 12, 2024
e819160
test/tune_flash.py: actually distribute tensor/computing to different…
xinyazhang Jul 12, 2024
b8c702d
Move dev-only packages from requirements.txt into requirements-dev.txt
xinyazhang Jul 12, 2024
b5869a1
tune_flash.py: Fix the slow splice_pipes
xinyazhang Jul 12, 2024
c4f0de5
Fix single GPU script.
xinyazhang Jul 13, 2024
8558d73
Move database accessing to a separate process, and unify the task gen…
xinyazhang Jul 13, 2024
18b56c0
tune_flash: add --json_file, improve --dry_run to report total numbers,
xinyazhang Jul 13, 2024
fcfa3e8
tune_flash: Move the testing to a separate process to avoid segfault.
xinyazhang Jul 14, 2024
f7b1f28
Cache the minesweeping process to avoid creating processes repeatedly
xinyazhang Jul 14, 2024
a150716
Remove 16k from seqlen_q/k, record task id and skipped tests in json
xinyazhang Jul 15, 2024
d93ceef
tune_flash: add --continue_from_json_file
xinyazhang Jul 15, 2024
7d224cd
table_tool: skip result=skipped json objects
xinyazhang Jul 15, 2024
6ef9a40
tuning_database: Update FLASH$attn_fwd for gfx90a and gfx942
xinyazhang Jul 15, 2024
e92f7d1
track aotriton-hyperjump branch in third_party/triton
xinyazhang Jul 15, 2024
7ed9ac6
Fix test/performance_forward.py
xinyazhang Jul 15, 2024
bb1a5e8
Remove old_compile.py
xinyazhang Jul 16, 2024
8 changes: 7 additions & 1 deletion .gitignore
@@ -1,4 +1,10 @@
__pycache__/
build/
*build*/
*.swp
tritonsrc/tune-*.json
*.csv
*.png
1
2
1.*
2.*
1 change: 1 addition & 0 deletions .gitmodules
@@ -1,6 +1,7 @@
[submodule "third_party/triton"]
path = third_party/triton
url = https://github.com/ROCmSoftwarePlatform/triton.git
branch = aotriton-hyperjump
[submodule "third_party/incbin"]
path = third_party/incbin
url = https://github.com/graphitemaster/incbin.git
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -15,6 +15,7 @@ set(AOTRITON_HIPCC_PATH "hipcc" CACHE STRING "Set HIPCC Path")
option(AOTRITON_NO_SHARED "Disable shared object build. Incompatible with AOTRITON_COMPRESS_KERNEL." ON)
option(AOTRITON_NO_PYTHON "Disable python binding build" OFF)
option(AOTRITON_ENABLE_ASAN "Enable Address Sanitizer. Implies -g" OFF)
option(AOTRITON_BUILD_FOR_TUNING "Build all GPU kernels and set -DAOTRITON_BUILD_FOR_TUNING=1 (=0 otherwise)" OFF)
set(TARGET_GPUS "MI200;MI300X" CACHE STRING "Target Architecture (Note here uses Trade names)")
set(AMDHSA_LD_PRELOAD "/opt/rocm/lib/libhsa-runtime64.so" CACHE STRING "Workaround of libamdhip64.so.5: undefined symbol: hsa_amd_memory_async_copy_on_engine")

6 changes: 6 additions & 0 deletions bindings/CMakeLists.txt
@@ -13,3 +13,9 @@ if(AOTRITON_OVERRIDE_ZSTD_LIB)
else()
target_link_libraries(pyaotriton PRIVATE ${ZSTD_TARGET})
endif()
# TODO: Unify build option macros with "interface target+public compile definitions"
if(AOTRITON_BUILD_FOR_TUNING)
target_compile_definitions(pyaotriton PRIVATE -DAOTRITON_BUILD_FOR_TUNING=1)
else(AOTRITON_BUILD_FOR_TUNING)
target_compile_definitions(pyaotriton PRIVATE -DAOTRITON_BUILD_FOR_TUNING=0)
endif(AOTRITON_BUILD_FOR_TUNING)
22 changes: 18 additions & 4 deletions bindings/module.cc
@@ -14,8 +14,18 @@ namespace pyaotriton {
namespace pyaotriton {
namespace v2 {
namespace flash {
using aotriton::v2::flash::ExtraArguments;
void setup_module(py::module_& m) {
m.def("check_gpu", &aotriton::v2::flash::check_gpu, py::arg("stream"));
py::class_<ExtraArguments>(m, "ExtraArguments")
.def(py::init<>())
#if AOTRITON_BUILD_FOR_TUNING
.def_readwrite("force_kernel_index", &ExtraArguments::force_kernel_index)
.def_readonly("total_number_of_kernels", &ExtraArguments::total_number_of_kernels)
.def_readonly("selected_kernel_psels", &ExtraArguments::selected_kernel_psels)
.def_readonly("selected_kernel_copts", &ExtraArguments::selected_kernel_copts)
#endif
;
m.def("attn_fwd",
&aotriton::v2::flash::attn_fwd,
"Flash Attention Forward Pass",
@@ -31,7 +41,8 @@
py::arg("philox_offset"),
py::arg("encoded_softmax"),
py::arg("is_causal"),
py::arg("stream") = nullptr);
py::arg("stream") = nullptr,
py::arg("extargs") = ExtraArguments());
m.def("attn_fwd_compact_varlen",
&aotriton::v2::flash::attn_fwd_compact_varlen,
"Flash Attention Forward Pass, Compact Stored Varlen",
@@ -51,7 +62,8 @@
py::arg("philox_offset"),
py::arg("encoded_softmax"),
py::arg("is_causal"),
py::arg("stream") = nullptr);
py::arg("stream") = nullptr,
py::arg("extargs") = ExtraArguments());
m.def("attn_bwd",
&aotriton::v2::flash::attn_bwd,
"Flash Attention Backward Pass",
@@ -72,7 +84,8 @@
py::arg("philox_seed"),
py::arg("philox_offset"),
py::arg("is_causal"),
py::arg("stream") = nullptr);
py::arg("stream") = nullptr,
py::arg("extargs") = ExtraArguments());
m.def("attn_bwd_compact_varlen",
&aotriton::v2::flash::attn_bwd_compact_varlen,
"Flash Attention Backward Pass, Compact Stored Varlen",
@@ -97,7 +110,8 @@
py::arg("philox_seed"),
py::arg("philox_offset"),
py::arg("is_causal"),
py::arg("stream") = nullptr);
py::arg("stream") = nullptr,
py::arg("extargs") = ExtraArguments());
m.def("debug_fill_dropout_rng",
&aotriton::v2::flash::debug_fill_dropout_rng,
"Flash Attention Debugging Function to get raw RNG numbers used in dropout",
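With the bindings above, a library configured with `-DAOTRITON_BUILD_FOR_TUNING=ON` exposes the tuning-only fields of `ExtraArguments` to Python. The snippet below is a minimal sketch, not part of this PR, of how a caller could force a specific kernel candidate and inspect what was launched; the candidate index is illustrative, and the assumption that the read-only fields are populated after the kernel call is hedged accordingly.

```python
# Minimal sketch (not in this PR): exercise the tuning-only fields exposed above.
# Requires a pyaotriton build configured with -DAOTRITON_BUILD_FOR_TUNING=ON.
from pyaotriton.v2.flash import ExtraArguments

extargs = ExtraArguments()
extargs.force_kernel_index = 2   # hypothetical candidate index; the default -1 presumably leaves selection to the library

# Pass `extargs` as the trailing argument of attn_fwd / attn_bwd / *_compact_varlen.
# After the call returns, the read-only fields are expected to describe the launched kernel:
#   extargs.total_number_of_kernels
#   extargs.selected_kernel_psels
#   extargs.selected_kernel_copts
```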
12 changes: 12 additions & 0 deletions docs/How To Generate Tuning Database.md
@@ -0,0 +1,12 @@
# TL;DR

```
mkdir cpptune_build
cd cpptune_build
cmake .. -DCMAKE_INSTALL_PREFIX=./install_dir -DCMAKE_BUILD_TYPE=Release -DAOTRITON_BUILD_FOR_TUNING=ON -G Ninja
# Optionally only build for one arch
# cmake .. -DCMAKE_INSTALL_PREFIX=./install_dir -DCMAKE_BUILD_TYPE=Release -DAOTRITON_BUILD_FOR_TUNING=ON -DTARGET_GPUS=Navi32 -G Ninja
ninja install
cd ..
PYTHONPATH=cpptune_build/bindings/ python test/tune_flash.py --bias_type 0 --db_file v2python/rules/tuning_database.sqlite3
```
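Before kicking off a long tuning run, it can be worth confirming that the bindings on `PYTHONPATH` really came from a tuning build. The check below is a suggestion, not part of this PR; it relies only on `ExtraArguments` gaining its tuning fields when `AOTRITON_BUILD_FOR_TUNING=1`, and the script name is hypothetical.

```python
# Suggested sanity check (not in this PR); save as check_tuning_build.py (hypothetical name)
# and run: PYTHONPATH=cpptune_build/bindings python check_tuning_build.py
from pyaotriton.v2.flash import ExtraArguments

extargs = ExtraArguments()
# force_kernel_index is only bound when the library was built for tuning.
assert hasattr(extargs, "force_kernel_index"), \
    "pyaotriton was not built with -DAOTRITON_BUILD_FOR_TUNING=ON"
print("tuning build detected")
```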
22 changes: 18 additions & 4 deletions include/aotriton/flash.h
@@ -16,6 +16,16 @@ using T4 = aotriton::TensorView<4>;
using T2 = aotriton::TensorView<2>;
using T1 = aotriton::TensorView<1>;

struct ExtraArguments {
#if AOTRITON_BUILD_FOR_TUNING
// TODO: Move them into a base class since they are common to all kernels
int force_kernel_index = -1;
int total_number_of_kernels = -1;
const char* selected_kernel_psels = nullptr;
const char* selected_kernel_copts = nullptr;
#endif
};

hipError_t
attn_fwd(T4 q, // batch_size x num_heads x seqlen_q x head_size
T4 k, // batch_size x num_heads x seqlen_k x head_size
@@ -29,7 +39,8 @@ attn_fwd(T4 q, // batch_size x num_heads x seqlen_q x head_size
uint64_t philox_offset,
T4 encoded_softmax,
bool is_causal,
aotriton::Stream stream);
aotriton::Stream stream,
ExtraArguments* extargs = nullptr);

hipError_t
attn_fwd_compact_varlen(T4 q, // 1 x num_heads x total_q x head_size, total_q := \sum_{i=0}^{b} s_i
@@ -48,7 +59,8 @@ attn_fwd_compact_varlen(T4 q, // 1 x num_heads x total_q x head_size, total_q :=
uint64_t philox_offset,
T4 encoded_softmax,
bool is_causal,
aotriton::Stream stream);
aotriton::Stream stream,
ExtraArguments* extargs = nullptr);

hipError_t
attn_bwd(T4 q, // batch_size x num_heads x seqlen_q x head_size
@@ -68,7 +80,8 @@ attn_bwd(T4 q, // batch_size x num_heads x seqlen_q x head_size
uint64_t philox_seed,
uint64_t philox_offset,
bool is_causal,
aotriton::Stream stream);
aotriton::Stream stream,
ExtraArguments* extargs = nullptr);

hipError_t
attn_bwd_compact_varlen(T4 q, // 1 x num_heads x total_q x head_size, total_q := \sum_{i=0}^{b}
@@ -92,7 +105,8 @@ attn_bwd_compact_varlen(T4 q, // 1 x num_heads x total_q x head_size, total_q :=
uint64_t philox_seed,
uint64_t philox_offset,
bool is_causal,
aotriton::Stream stream);
aotriton::Stream stream,
ExtraArguments* extargs = nullptr);

hipError_t
debug_fill_dropout_rng(T4 r,
2 changes: 2 additions & 0 deletions include/aotriton/util.h
@@ -37,6 +37,8 @@ enum GpuArch : uint64_t {
GPU_ARCH_UNKNOWN = 0,
GPU_ARCH_AMD_GFX90A = CAT(GpuVendor::kAMD, 0x90a),
GPU_ARCH_AMD_GFX942 = CAT(GpuVendor::kAMD, 0x942),
GPU_ARCH_AMD_GFX1100 = CAT(GpuVendor::kAMD, 0x1100),
GPU_ARCH_AMD_GFX1101 = CAT(GpuVendor::kAMD, 0x1101),
};

template<int Rank>
3 changes: 3 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,3 @@
-r requirements.txt
tqdm
textual
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ packaging
pluggy
numpy
setuptools
wheel
43 changes: 34 additions & 9 deletions test/aotriton_flash.py
@@ -7,8 +7,9 @@
attn_fwd_compact_varlen as fa_forward_compact_varlen,
attn_bwd_compact_varlen as fa_backward_compact_varlen,
debug_fill_dropout_rng as fa_debug_fill_dropout_rng,
ExtraArguments as ExtraArguments,
)
from pyaotriton import T1, T2, T4, DType, Stream
from pyaotriton import T1, T2, T4, DType, Stream, hipError_t

def cast_dtype(dtype):
assert not dtype.is_complex
@@ -37,7 +38,9 @@ def mk_aotensor(q, if_empty_then_like=None):
return klass(q.data_ptr(), tuple(q.size()), q.stride(), cast_dtype(q.dtype))

def attn_fwd(q, k, v, b, sm_scale, M, o,
dropout_p, philox_seed, philox_offset, encoded_softmax, is_causal):
dropout_p, philox_seed, philox_offset, encoded_softmax, is_causal,
extargs=None):
extargs = ExtraArguments() if extargs is None else extargs
err = fa_forward(mk_aotensor(q),
mk_aotensor(k),
mk_aotensor(v),
@@ -50,13 +53,31 @@ def attn_fwd(q, k, v, b, sm_scale, M, o,
int(philox_offset),
mk_aotensor(encoded_softmax, if_empty_then_like=q),
is_causal,
Stream())
print(f'{err=}')
Stream(),
extargs)
# print(f'{err=}')
return err

def ipc_attn_fwd(ipc_to_read, ipc_to_write):
import torch
while True:
tup = ipc_to_read.get()
if tup is None:
break
q, k, v, b, sm_scale, M, o, dropout_p, philox_seed, philox_offset, encoded_softmax, is_causal, force_kernel_index, shard = tup
extargs = ExtraArguments()
extargs.force_kernel_index = force_kernel_index
with torch.cuda.device(shard):
ret = attn_fwd(q, k, v, b, sm_scale, M, o,
dropout_p, philox_seed, philox_offset, encoded_softmax, is_causal,
extargs)
torch.cuda.synchronize()
ipc_to_write.put(ret)

def attn_bwd(q, k, v, b, sm_scale, o, dout, dq, dk, dv, db, L, delta,
dropout_p, philox_seed, philox_offset, is_causal):
b = mk_aotensor(b, if_empty_then_like=q)
print(f'{b=}')
# print(f'{b=}')
err = fa_backward(mk_aotensor(q),
mk_aotensor(k),
mk_aotensor(v),
@@ -75,14 +96,16 @@ def attn_bwd(q, k, v, b, sm_scale, o, dout, dq, dk, dv, db, L, delta,
int(philox_offset),
is_causal,
Stream())
print(f'{err=}')
# print(f'{err=}')
return err

def debug_fill_dropout_rng(R, philox_seed, philox_offset):
err = fa_debug_fill_dropout_rng(mk_aotensor(R),
philox_seed,
philox_offset,
Stream())
print(f'{err=}')
# print(f'debug_fill_dropout_rng {err=}')
return err

def attn_fwd_compact_varlen(q, k, v,
cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
@@ -105,7 +128,8 @@ def attn_fwd_compact_varlen(q, k, v,
mk_aotensor(encoded_softmax, if_empty_then_like=q),
is_causal,
Stream())
print(f'{err=}')
# print(f'{err=}')
return err

def attn_bwd_compact_varlen(q, k, v,
cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
@@ -135,4 +159,5 @@ def attn_bwd_compact_varlen(q, k, v,
int(philox_offset),
is_causal,
Stream())
print(f'{err=}')
# print(f'{err=}')
return err
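The new `ipc_attn_fwd` loop above runs in a worker process and pulls work items from a queue until it sees a `None` sentinel. The sketch below, not taken from this PR, shows one way a parent process might drive it; the tuple layout mirrors the unpacking inside `ipc_attn_fwd`, and the use of `torch.multiprocessing` with the `spawn` start method is an assumption made so each shard gets its own CUDA context.

```python
# Hypothetical driver for ipc_attn_fwd (not in this PR).
import torch.multiprocessing as mp
from aotriton_flash import ipc_attn_fwd

def run_one_case(work_item):
    # work_item must match the unpacking in ipc_attn_fwd:
    # (q, k, v, b, sm_scale, M, o, dropout_p, philox_seed, philox_offset,
    #  encoded_softmax, is_causal, force_kernel_index, shard)
    ctx = mp.get_context('spawn')
    to_worker, from_worker = ctx.Queue(), ctx.Queue()
    worker = ctx.Process(target=ipc_attn_fwd, args=(to_worker, from_worker))
    worker.start()
    to_worker.put(work_item)
    err = from_worker.get()   # hipError_t returned by attn_fwd
    to_worker.put(None)       # sentinel: lets the worker loop exit
    worker.join()
    return err
```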