triton-lang · Jokeren · Nov 21, 2023 · Nov 18, 2023 · Nov 18, 2023 · Nov 20, 2023
@@ -110,6 +110,8 @@ class MoveOpAfterLayoutConversion : public mlir::RewritePattern {
       if (isa<triton::LoadOp>(currOp))
         checkOp = true;
       else if (checkOp) {
+        // Bail out if there exists an op after Load that is not FpToFp,
+        // Bitcast, or Arith.
         if (!isa<triton::FpToFpOp, triton::BitcastOp>(currOp) &&
             currOp->getDialect()->getTypeID() !=
                 mlir::TypeID::get<arith::ArithDialect>())
@@ -134,19 +136,22 @@ class MoveOpAfterLayoutConversion : public mlir::RewritePattern {
     // only considers conversions to dot operand
     if (!cvtTy.getEncoding().isa<triton::gpu::DotOperandEncodingAttr>())
       return mlir::failure();
-    auto argTy = cvtArgOp->getOperand(0).getType().cast<RankedTensorType>();
     auto retTy = cvtArgOp->getResult(0).getType().cast<RankedTensorType>();
-    if (!argTy || !retTy)
+    if (!retTy)
       return mlir::failure();
     Type newRetTy = RankedTensorType::get(
         retTy.getShape(), retTy.getElementType(), cvtTy.getEncoding());
-    Type newCvtTy = RankedTensorType::get(
-        retTy.getShape(), argTy.getElementType(), cvtTy.getEncoding());
     int numArgs = cvtArgOp->getNumOperands();
     SmallVector<triton::gpu::ConvertLayoutOp> newCvts(numArgs);
-    for (int i = 0; i < numArgs; i++)
+    for (int i = 0; i < numArgs; i++) {
+      auto argTy = cvtArgOp->getOperand(i).getType().cast<RankedTensorType>();
+      if (!argTy)
+        return mlir::failure();
+      Type newCvtTy = RankedTensorType::get(
+          retTy.getShape(), argTy.getElementType(), cvtTy.getEncoding());
       newCvts[i] = rewriter.create<triton::gpu::ConvertLayoutOp>(
           cvt.getLoc(), newCvtTy, cvtArgOp->getOperand(i));
+    }
     auto newRet = rewriter.clone(*cvtArgOp);
     for (int i = 0; i < numArgs; i++)
       newRet->setOperand(i, newCvts[i]);

@@ -153,3 +153,24 @@ tt.func @mma_v3_reg_operand_A(%arg0: tensor<128x64xf16, #mma>, %arg1: tensor<64x
   tt.return %r : tensor<128x64xf32, #mma>
 }
 }
+
+// -----
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+#mma = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 8]}>
+module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func @a_impl(%pa: tensor<128x128x!tt.ptr<f16, 1>, #blocked>) -> tensor<128x128xf32, #mma> { // Takes a #blocked tensor of f16 pointers and returns the f32 #mma result of a dot.
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> // Zero f32 tensor, used as the third operand of the dot below.
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> // Zero f16 tensor already in the opIdx = 0 dot-operand layout.
+    %cst_3 = arith.constant dense<5> : tensor<128x1xi32, #blocked> // Threshold the row indices are compared against.
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #blocked> // Zero f16 tensor, the "false" value of the select.
+    %tl = tt.load %pa {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x128xf16, #blocked> // Load the f16 tile through %pa.
+    %tr = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> // i32 range with start = 0 and end = 128.
+    %te = tt.expand_dims %tr {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<128x1xi32, #blocked> // Add a trailing unit dimension: 128 -> 128x1.
+    %tc = arith.cmpi slt, %te, %cst_3 : tensor<128x1xi32, #blocked> // Signed "index < 5" comparison, producing a 128x1 i1 tensor.
+    %tb = tt.broadcast %tc : (tensor<128x1xi1, #blocked>) -> tensor<128x128xi1, #blocked> // Broadcast the i1 result to 128x128.
+    %ts = arith.select %tb, %tl, %cst_4 : tensor<128x128xi1, #blocked>, tensor<128x128xf16, #blocked> // Operands differ in element type (i1 condition, f16 values). NOTE(review): presumably the case the per-operand conversion type in MoveOpAfterLayoutConversion targets; confirm.
+    %conv = triton_gpu.convert_layout %ts : (tensor<128x128xf16, #blocked>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> // Convert the select result from #blocked to the opIdx = 1 dot-operand layout.
+    %td = tt.dot %cst_0, %conv, %cst {allowTF32 = true, maxNumImpreciseAcc = 0 : i32} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> // Dot of the zero operand, the converted tile, and the zero f32 tensor.
+    tt.return %td : tensor<128x128xf32, #mma>
+  } // NOTE(review): no FileCheck directives for this function are visible in this hunk; confirm the expected output is verified somewhere.
+}