diff --git a/include/triton/Conversion/TritonGPUToLLVM/Passes.h b/include/triton/Conversion/TritonGPUToLLVM/Passes.h
index b013f26289ce..1c8430bcd262 100644
--- a/include/triton/Conversion/TritonGPUToLLVM/Passes.h
+++ b/include/triton/Conversion/TritonGPUToLLVM/Passes.h
@@ -19,6 +19,7 @@ namespace triton {
 
 namespace gpu {
 std::unique_ptr<OperationPass<ModuleOp>> createAllocateSharedMemoryPass();
+std::unique_ptr<OperationPass<ModuleOp>> createPerfCollectionPass();
 
 } // namespace gpu
 
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Passes.td b/include/triton/Conversion/TritonGPUToLLVM/Passes.td
index 700dcd6b4859..87d2d269162f 100644
--- a/include/triton/Conversion/TritonGPUToLLVM/Passes.td
+++ b/include/triton/Conversion/TritonGPUToLLVM/Passes.td
@@ -7,5 +7,9 @@ def AllocateSharedMemory : Pass<"allocate-shared-memory", "mlir::ModuleOp"> {
     let summary = "Add metadata for shared memory allocation";
     let constructor = "mlir::triton::gpu::createAllocateSharedMemoryPass()";
 }
+def PerfCollection : Pass<"perf-collection", "mlir::ModuleOp"> {
+    let summary = "Collect perf information";
+    let constructor = "mlir::triton::gpu::createPerfCollectionPass()";
+}
 
 #endif
diff --git a/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt b/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
index cca2830b044a..55a55a954657 100644
--- a/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
+++ b/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
@@ -10,6 +10,7 @@ add_triton_library(TritonGPUToLLVM
     MakeRangeOpToLLVM.cpp
     HistogramOpToLLVM.cpp
     AllocateSharedMemory.cpp
+    PerfCollection.cpp
     ReduceOpToLLVM.cpp
     ScanOpToLLVM.cpp
     ConvertLayoutOpToLLVM.cpp
diff --git a/lib/Conversion/TritonGPUToLLVM/PerfCollection.cpp b/lib/Conversion/TritonGPUToLLVM/PerfCollection.cpp
new file mode 100644
index 000000000000..7602a5d4d2ec
--- /dev/null
+++ b/lib/Conversion/TritonGPUToLLVM/PerfCollection.cpp
@@ -0,0 +1,80 @@
+#include "mlir/Pass/Pass.h"
+#include "triton/Analysis/Allocation.h"
+#include "triton/Analysis/Utility.h"
+#include "triton/Conversion/TritonGPUToLLVM/Passes.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+
+using namespace mlir;
+using namespace mlir::triton;
+
+namespace mlir {
+namespace triton {
+#define GEN_PASS_DEF_PERFCOLLECTION
+#include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
+} // namespace triton
+} // namespace mlir
+
+namespace {
+
+/// Module pass that walks every op of every function and emits a remark on
+/// the performance-relevant ones it recognises (layout conversions and dots).
+/// It only reports; it does not modify the IR.
+///
+/// NOTE(review): the template arguments in this file were stripped when the
+/// patch was extracted and have been reconstructed from the nearby comments
+/// and remark texts -- confirm the op/attribute types before merging.
+struct PerfCollection
+    : public mlir::triton::impl::PerfCollectionBase<PerfCollection> {
+  void runOnOperation() override {
+    ModuleOp mod = getOperation();
+
+    mod.walk([&](FunctionOpInterface funcOp) {
+      funcOp.walk([&](Operation *op) {
+        // Go through key operations convert_layout, load/store, dot.
+        // LoadOp, StoreOp, AtomicCASOp, AtomicRMWOp, AsyncCopyGlobalToLocalOp,
+        // AsyncTMACopyGlobalToLocalOp
+        if (isa<triton::gpu::ConvertLayoutOp>(op)) {
+          // TODO: report the size of the conversion and whether it uses smem.
+          op->emitRemark() << "has convertOp with size";
+        } else if (isa<triton::LoadOp>(op)) {
+          // TODO: How can we tell if it's coalesced? We need to wait till
+          // lowering when vectorization is decided.
+        } else if (isa<triton::DotOp>(op)) {
+          // TODO: show the shape of the dot: [M, N, K], also mma version.
+          op->emitRemark() << "has dot";
+        } else if (auto dotOp = dyn_cast<triton::nvidia_gpu::DotAsyncOp>(op)) {
+          // The mma major version comes from the result tensor's encoding;
+          // dyn_cast_or_null tolerates a tensor without an encoding.
+          auto resTy = cast<RankedTensorType>(dotOp->getResult(0).getType());
+          auto dotEnc = dyn_cast_or_null<triton::gpu::NvidiaMmaEncodingAttr>(
+              resTy.getEncoding());
+          if (dotEnc && dotEnc.getVersionMajor() == 3)
+            op->emitRemark() << "has async_dot v3";
+          else
+            op->emitRemark() << "has async_dot";
+        }
+      });
+    });
+  }
+};
+
+} // namespace
+
+namespace mlir {
+
+namespace triton {
+
+namespace gpu {
+
+/// Creates an instance of the PerfCollection pass.
+std::unique_ptr<OperationPass<ModuleOp>> createPerfCollectionPass() {
+  return std::make_unique<PerfCollection>();
+}
+
+} // namespace gpu
+
+} // namespace triton
+
+} // namespace mlir
diff --git a/python/src/passes.cc b/python/src/passes.cc
index 37d6fa7be1fc..94321a9d626e 100644
--- a/python/src/passes.cc
+++ b/python/src/passes.cc
@@ -62,6 +62,7 @@ void init_triton_passes_ttgpuir(py::module &&m) {
                      createReduceDataDuplicationPass);
   ADD_PASS_WRAPPER_0("add_allocate_shared_memory",
                      createAllocateSharedMemoryPass);
+  ADD_PASS_WRAPPER_0("add_perf_collection", createPerfCollectionPass);
 }
 
 void init_triton_passes_convert(py::module &&m) {
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
index f9d7d91d57cb..a08acc04ef09 100644
--- a/third_party/nvidia/backend/compiler.py
+++ b/third_party/nvidia/backend/compiler.py
@@ -197,6 +197,8 @@ def make_llir(src, metadata, options, capability):
         # TritonGPU -> LLVM-IR (MLIR)
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
+        # add a perf collection pass
+        passes.ttgpuir.add_perf_collection(pm)
         nvidia.passes.ttgpuir.add_decompose_unsupported_conversions(pm)
         passes.convert.add_scf_to_cf(pm)
         passes.convert.add_index_to_llvmir(pm)