diff --git a/include/triton/Conversion/TritonGPUToLLVM/Passes.h b/include/triton/Conversion/TritonGPUToLLVM/Passes.h
index b013f26289ce..1c8430bcd262 100644
--- a/include/triton/Conversion/TritonGPUToLLVM/Passes.h
+++ b/include/triton/Conversion/TritonGPUToLLVM/Passes.h
@@ -19,6 +19,7 @@ namespace triton {
 
 namespace gpu {
 std::unique_ptr<OperationPass<ModuleOp>> createAllocateSharedMemoryPass();
+std::unique_ptr<OperationPass<ModuleOp>> createPerfCollectionPass();
 
 } // namespace gpu
 
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Passes.td b/include/triton/Conversion/TritonGPUToLLVM/Passes.td
index 700dcd6b4859..87d2d269162f 100644
--- a/include/triton/Conversion/TritonGPUToLLVM/Passes.td
+++ b/include/triton/Conversion/TritonGPUToLLVM/Passes.td
@@ -7,5 +7,9 @@ def AllocateSharedMemory : Pass<"allocate-shared-memory", "mlir::ModuleOp"> {
     let summary = "Add metadata for shared memory allocation";
     let constructor = "mlir::triton::gpu::createAllocateSharedMemoryPass()";
 }
+def PerfCollection : Pass<"perf-collection", "mlir::ModuleOp"> {
+    let summary = "Collect perf information"; // reported as MLIR remarks
+    let constructor = "mlir::triton::gpu::createPerfCollectionPass()";
+}
 
 #endif
diff --git a/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt b/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
index cca2830b044a..55a55a954657 100644
--- a/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
+++ b/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
@@ -10,6 +10,7 @@ add_triton_library(TritonGPUToLLVM
     MakeRangeOpToLLVM.cpp
     HistogramOpToLLVM.cpp
     AllocateSharedMemory.cpp
+    PerfCollection.cpp
     ReduceOpToLLVM.cpp
     ScanOpToLLVM.cpp
     ConvertLayoutOpToLLVM.cpp
diff --git a/lib/Conversion/TritonGPUToLLVM/PerfCollection.cpp b/lib/Conversion/TritonGPUToLLVM/PerfCollection.cpp
new file mode 100644
index 000000000000..7602a5d4d2ec
--- /dev/null
+++ b/lib/Conversion/TritonGPUToLLVM/PerfCollection.cpp
@@ -0,0 +1,75 @@
+#include "mlir/Pass/Pass.h"
+#include "triton/Analysis/Allocation.h"
+#include "triton/Analysis/Utility.h"
+#include "triton/Conversion/TritonGPUToLLVM/Passes.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+
+using namespace mlir;
+using namespace mlir::triton;
+
+namespace mlir {
+namespace triton {
+#define GEN_PASS_DEF_PERFCOLLECTION
+#include "triton/Conversion/TritonGPUToLLVM/Passes.h.inc"
+} // namespace triton
+} // namespace mlir
+
+namespace {
+
+/// Module pass that walks every function and emits MLIR remarks for the
+/// layout-conversion and dot ops it finds. It only inspects ops.
+struct PerfCollection
+    : public mlir::triton::impl::PerfCollectionBase<PerfCollection> {
+  void runOnOperation() override {
+    ModuleOp mod = getOperation();
+    // Nested walk: visit each function, then every operation inside it.
+    mod.walk([&](FunctionOpInterface funcOp) {
+      funcOp.walk([&](Operation *op) {
+        // Ops of interest: convert_layout, load/store, dot. Not handled yet:
+        // LoadOp, StoreOp, AtomicCASOp, AtomicRMWOp, AsyncCopyGlobalToLocalOp,
+        // AsyncTMACopyGlobalToLocalOp, DotWaitOp.
+        if (isa<mlir::triton::gpu::ConvertLayoutOp>(op)) {
+          // TODO: report the size of the conversion and whether it uses smem.
+          op->emitRemark() << "has convertOp with size";
+        }
+        // TODO(LoadOp): how can we tell if a load is coalesced? We need to
+        // wait till lowering, when vectorization is decided.
+        if (isa<mlir::triton::DotOp>(op)) {
+          // TODO: show the shape of the dot: [M, N, K], also mma version.
+          op->emitRemark() << "has dot";
+        }
+        if (auto dotOp = dyn_cast<mlir::triton::nvidia_gpu::DotAsyncOp>(op)) {
+          // TODO: show the shape of the dot: [M, N, K]. dyn_cast_or_null
+          // tolerates a result tensor type that carries no encoding.
+          auto dotEnc =
+              dyn_cast_or_null<mlir::triton::gpu::NvidiaMmaEncodingAttr>(
+                  cast<RankedTensorType>(dotOp->getResult(0).getType())
+                      .getEncoding());
+          if (dotEnc && dotEnc.getVersionMajor() == 3)
+            op->emitRemark() << "has async_dot v3";
+          else
+            op->emitRemark() << "has async_dot";
+        }
+      });
+    });
+  }
+};
+
+} // namespace
+
+namespace mlir {
+
+namespace triton {
+
+namespace gpu {
+/// Creates the pass; Passes.td names this as PerfCollection's constructor.
+std::unique_ptr<OperationPass<ModuleOp>> createPerfCollectionPass() {
+  return std::make_unique<PerfCollection>();
+}
+
+} // namespace gpu
+
+} // namespace triton
+
+} // namespace mlir
diff --git a/python/src/passes.cc b/python/src/passes.cc
index 37d6fa7be1fc..94321a9d626e 100644
--- a/python/src/passes.cc
+++ b/python/src/passes.cc
@@ -62,6 +62,7 @@ void init_triton_passes_ttgpuir(py::module &&m) {
                      createReduceDataDuplicationPass);
   ADD_PASS_WRAPPER_0("add_allocate_shared_memory",
                      createAllocateSharedMemoryPass);
+  ADD_PASS_WRAPPER_0("add_perf_collection", createPerfCollectionPass);
 }
 
 void init_triton_passes_convert(py::module &&m) {
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
index f9d7d91d57cb..a08acc04ef09 100644
--- a/third_party/nvidia/backend/compiler.py
+++ b/third_party/nvidia/backend/compiler.py
@@ -197,6 +197,8 @@ def make_llir(src, metadata, options, capability):
         # TritonGPU -> LLVM-IR (MLIR)
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
+        # add a perf collection pass
+        passes.ttgpuir.add_perf_collection(pm)
         nvidia.passes.ttgpuir.add_decompose_unsupported_conversions(pm)
         passes.convert.add_scf_to_cf(pm)
         passes.convert.add_index_to_llvmir(pm)