From 2d5292abd0b3c85d3a82ccd0213f20f0bb553bae Mon Sep 17 00:00:00 2001 From: Neil Dhar Date: Sat, 4 Apr 2026 16:11:43 -0700 Subject: [PATCH] Fix inferDstEncoding for rank 1 reduction `inferDstEncoding` currently tries to unconditionally build a sliced encoding from the source encoding it is given. This is incorrect if the source is rank 1, since we can't take a slice of a rank 1 tensor: the result of such a reduction is a scalar. Return an empty encoding in that case instead. --- lib/Dialect/TritonGPU/Transforms/Utility.cpp | 3 +++ test/Gluon/auto_encoding.mlir | 21 ++++++++++++++++ test/TritonNvidiaGPU/tmem_layouts.mlir | 26 ++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index c8bd342e90e2..d4f641ae162f 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -312,6 +312,9 @@ std::string GraphLayoutMarker::getColor(const Type &type) const { // -------------------------------------------------------------------------- // static Attribute inferDstEncoding(triton::ReduceOp op, Attribute encoding) { + // If the input is rank 1, the output is a scalar value. 
+ if (cast(encoding).getRank() == 1) + return {}; return triton::gpu::SliceEncodingAttr::get( op->getContext(), op.getAxis(), cast(encoding)); diff --git a/test/Gluon/auto_encoding.mlir b/test/Gluon/auto_encoding.mlir index 277346d48ade..68ebcef9598f 100644 --- a/test/Gluon/auto_encoding.mlir +++ b/test/Gluon/auto_encoding.mlir @@ -178,3 +178,24 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num- tt.return %out : tensor<16xi32, #blocked> } } + +// ----- + +#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> + +module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: @infer_reduce_to_scalar + // CHECK-NOT: auto_encoding + // CHECK: "tt.reduce" + // CHECK: tt.return + tt.func public @infer_reduce_to_scalar() -> i32 { + %0 = arith.constant dense<1> : tensor<16xi32, #gluon.auto_encoding> + %1 = gluon.set_auto_layout %0 : tensor<16xi32, #gluon.auto_encoding> -> tensor<16xi32, #blocked> + %2 = "tt.reduce"(%0) <{axis = 0 : i32}> ({ + ^bb0(%lhs: i32, %rhs: i32): + %3 = arith.addi %lhs, %rhs : i32 + tt.reduce.return %3 : i32 + }) : (tensor<16xi32, #gluon.auto_encoding>) -> i32 + tt.return %2 : i32 + } +} diff --git a/test/TritonNvidiaGPU/tmem_layouts.mlir b/test/TritonNvidiaGPU/tmem_layouts.mlir index f3c506e8f5db..9287eabc236c 100644 --- a/test/TritonNvidiaGPU/tmem_layouts.mlir +++ b/test/TritonNvidiaGPU/tmem_layouts.mlir @@ -214,3 +214,29 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ tt.return } } + +// ----- + +#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[32, 0], [64, 0], [0, 32]], block = []}> +#tmem = #ttng.tensor_memory_encoding + +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:100"} { + // CHECK-LABEL: 
@tmem_load_reduce_rank1 + // CHECK: "tt.reduce" + // CHECK: "tt.reduce" + // CHECK: tt.return + tt.func public @tmem_load_reduce_rank1(%arg0: !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory>) -> f32 { + %0 = ttng.tmem_load %arg0 : !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory> -> tensor<128x64xf32, #linear> + %1 = "tt.reduce"(%0) <{axis = 1 : i32}> ({ + ^bb0(%lhs: f32, %rhs: f32): + %2 = arith.addf %lhs, %rhs : f32 + tt.reduce.return %2 : f32 + }) : (tensor<128x64xf32, #linear>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #linear}>> + %3 = "tt.reduce"(%1) <{axis = 0 : i32}> ({ + ^bb0(%lhs: f32, %rhs: f32): + %4 = arith.addf %lhs, %rhs : f32 + tt.reduce.return %4 : f32 + }) : (tensor<128xf32, #ttg.slice<{dim = 1, parent = #linear}>>) -> f32 + tt.return %3 : f32 + } +}