Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/triton/Dialect/Triton/IR/Dialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,13 @@ class DialectInferLayoutInterface
// makes the reshape a "nop", i.e. the same GPU threads contain the same
// elements as before the reshape using legacy layouts. This is not always
// possible (in which case we fall back to using LinearLayouts)
// If allowReorder is set, an existing value in dstEnc is preferred when it
// still yields a non-expensive view.
// In the future we'll always use LinearLayouts
virtual LogicalResult
inferReshapeOpEncoding(ArrayRef<int64_t> srcShape, Attribute srcEnc,
ArrayRef<int64_t> dstShape, Attribute &dstEnc,
bool allowReorder,
std::optional<Location> loc) const = 0;

// Check if two layouts are structurally the same, even if their names are
Expand Down
9 changes: 8 additions & 1 deletion include/triton/Dialect/TritonGPU/IR/Dialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,14 @@ SmallVector<unsigned> getOrderForDotOperand(unsigned opIdx, unsigned rank,
bool isExpensiveCat(CatOp cat, Attribute targetEncoding);

// Return true if a view between the two types cannot be implemented as a no-op.
bool isExpensiveView(Type srcType, Type dstType);
bool isExpensiveView(ArrayRef<int64_t> srcShape, Attribute srcEncoding,
ArrayRef<int64_t> dstShape, Attribute dstEncoding);
// Convenience overload taking whole types instead of shape/encoding pairs.
// Both arguments are unwrapped with cast<RankedTensorType>, so callers are
// expected to pass ranked tensor types; the answer comes from the
// shape/encoding-based isExpensiveView overload.
inline bool isExpensiveView(Type srcType, Type dstType) {
  auto src = cast<RankedTensorType>(srcType);
  auto dst = cast<RankedTensorType>(dstType);
  ArrayRef<int64_t> srcShape = src.getShape();
  ArrayRef<int64_t> dstShape = dst.getShape();
  Attribute srcEncoding = src.getEncoding();
  Attribute dstEncoding = dst.getEncoding();
  return isExpensiveView(srcShape, srcEncoding, dstShape, dstEncoding);
}

// Return a blocked encoding where the shape is distributed contiguously amongst
// the threads, warps, CTAs with 1 element per threads.
Expand Down
2 changes: 1 addition & 1 deletion lib/Dialect/Gluon/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ struct GluonInferLayoutInterface : public triton::DialectInferLayoutInterface {

LogicalResult
inferReshapeOpEncoding(ArrayRef<int64_t> srcShape, Attribute srcEnc,
ArrayRef<int64_t> dstShape, Attribute &dstEnc,
ArrayRef<int64_t> dstShape, Attribute &dstEnc, bool,
std::optional<Location> loc) const override {
return inferAutoEncoding(srcEnc, dstEnc);
}
Expand Down
10 changes: 6 additions & 4 deletions lib/Dialect/Triton/IR/Ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -850,9 +850,10 @@ void ReshapeOp::build(OpBuilder &builder, OperationState &state,
auto srcEnc = srcTy.getEncoding();
Attribute dstEnc;
if (srcEnc) {
auto result = cast<DialectInferLayoutInterface>(&srcEnc.getDialect())
->inferReshapeOpEncoding(srcTy.getShape(), srcEnc, shape,
dstEnc, state.location);
auto result =
cast<DialectInferLayoutInterface>(&srcEnc.getDialect())
->inferReshapeOpEncoding(srcTy.getShape(), srcEnc, shape, dstEnc,
allowReorder, state.location);
assert(succeeded(result));
}
auto dstTy = RankedTensorType::get(shape, srcTy.getElementType(), dstEnc);
Expand Down Expand Up @@ -922,7 +923,8 @@ LogicalResult ReshapeOp::verify() {
auto layoutInterface =
cast<DialectInferLayoutInterface>(&srcEnc.getDialect());
auto result = layoutInterface->inferReshapeOpEncoding(
srcTy.getShape(), srcEnc, dstTy.getShape(), inferredDstEnc, getLoc());
srcTy.getShape(), srcEnc, dstTy.getShape(), inferredDstEnc,
/*allowReorder=*/false, getLoc());
if (failed(result))
return failure();
return layoutInterface->verifyLayoutsAreEqual(
Expand Down
18 changes: 12 additions & 6 deletions lib/Dialect/TritonGPU/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,10 @@ SmallVector<unsigned> getContigPerThread(RankedTensorType type) {
return toLinearEncoding(type).getContigPerThread();
}

bool isExpensiveView(Type srcType, Type dstType) {
auto tensorSrcType = cast<RankedTensorType>(srcType);
auto tensorDstType = cast<RankedTensorType>(dstType);
auto llSrc = toLinearLayout(tensorSrcType);
auto llDst = toLinearLayout(tensorDstType);
bool isExpensiveView(ArrayRef<int64_t> srcShape, Attribute srcEncoding,
ArrayRef<int64_t> dstShape, Attribute dstEncoding) {
auto llSrc = toLinearLayout(srcShape, srcEncoding);
auto llDst = toLinearLayout(dstShape, dstEncoding);
// In case there are replicated values, we need to make sure the new and old
// layouts have matching masks.
for (auto [srcMask, dstMask] :
Expand All @@ -127,7 +126,8 @@ bool isExpensiveView(Type srcType, Type dstType) {
if (srcMask.second != dstMask.second)
return true;
}
return getTotalElemsPerThread(srcType) != getTotalElemsPerThread(dstType);
return getTotalElemsPerThread(srcEncoding, srcShape) !=
getTotalElemsPerThread(dstEncoding, dstShape);
}

/* Utility function used by get.*Order methods of SliceEncodingAttr.
Expand Down Expand Up @@ -3285,11 +3285,17 @@ struct TritonGPUInferLayoutInterface
LogicalResult
inferReshapeOpEncoding(ArrayRef<int64_t> srcShape, Attribute srcEnc,
ArrayRef<int64_t> dstShape, Attribute &dstEnc,
bool allowReorder,
std::optional<Location> loc) const override {
if (product(srcShape) != product(dstShape)) {
return emitOptionalError(loc, "numel of dst shape does not match "
"numel of src shape");
}
// If allowReorder is true, there are multiple valid encodings. Prefer the
// hint if it is set and valid.
if (allowReorder && dstEnc)
if (!isExpensiveView(srcShape, srcEnc, dstShape, dstEnc))
return success();
auto result =
inferReshapeOpLegacyEncoding(srcShape, srcEnc, dstShape, dstEnc);
if (succeeded(result)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,8 @@ bool canBeRemat(Operation *op) {
return false;
if (auto gather = dyn_cast<GatherOp>(op))
return !gather.getEfficientLayout();
if (auto reshape = dyn_cast<ReshapeOp>(op))
return !reshape.getEfficientLayout();

if (isa<scf::WhileOp, scf::ConditionOp>(op))
return false;
Expand Down
27 changes: 13 additions & 14 deletions lib/Dialect/TritonGPU/Transforms/Utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ std::string GraphLayoutMarker::getColor(const Type &type) const {
// -------------------------------------------------------------------------- //

static Attribute inferDstEncoding(triton::ReduceOp op, Attribute encoding) {
// If the input is rank 1, the output is a scalar value.
if (cast<ttg::LayoutEncodingTrait>(encoding).getRank() == 1)
return {};
return triton::gpu::SliceEncodingAttr::get(
op->getContext(), op.getAxis(),
cast<ttg::DistributedEncodingTrait>(encoding));
Expand Down Expand Up @@ -462,26 +465,22 @@ static Attribute inferSrcEncoding(triton::TransposeOpInterface op,
static Attribute inferReshapeOpDstEncoding(ArrayRef<int64_t> srcShape,
Attribute srcEnc,
ArrayRef<int64_t> dstShape,
bool allowReorder) {
// We don't do anything smart to allow-reorder reshapes here. They are
// handled in OptimizeThreadLocality.
if (allowReorder)
return {};

Attribute dstEnc;
Attribute dstEncHint = {},
bool allowReorder = false) {
Attribute dstEnc = dstEncHint;
auto result =
srcEnc.getDialect()
.getRegisteredInterface<triton::DialectInferLayoutInterface>()
->inferReshapeOpEncoding(srcShape, srcEnc, dstShape, dstEnc,
/*loc=*/std::nullopt);
allowReorder, /*loc=*/std::nullopt);
assert(succeeded(result));
return dstEnc;
}

static Attribute inferDstEncoding(triton::ReshapeOp op, Attribute encoding) {
return inferReshapeOpDstEncoding(op.getSrc().getType().getShape(), encoding,
op.getType().getShape(),
op.getAllowReorder());
return inferReshapeOpDstEncoding(
op.getSrc().getType().getShape(), encoding, op.getType().getShape(),
op.getType().getEncoding(), op.getAllowReorder());
}

static Attribute inferDstEncoding(GatherOp op, Attribute encoding) {
Expand All @@ -496,9 +495,9 @@ static Attribute inferSrcEncoding(triton::ReshapeOp op, Attribute encoding) {
// as the encoding of x given the encoding of y in `reshape(y) -> x`. It's an
// invariant of inferReshapeOpNoReorderEncoding that it's symmetric in this
// way.
return inferReshapeOpDstEncoding(op.getType().getShape(), encoding,
op.getSrc().getType().getShape(),
op.getAllowReorder());
return inferReshapeOpDstEncoding(
op.getType().getShape(), encoding, op.getSrc().getType().getShape(),
op.getSrc().getType().getEncoding(), op.getAllowReorder());
}

static bool isSingleValue(Value value) {
Expand Down
21 changes: 21 additions & 0 deletions test/Gluon/auto_encoding.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,24 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
tt.return %out : tensor<16xi32, #blocked>
}
}

// -----

// Test case: a rank-1 tensor carrying #gluon.auto_encoding is reduced along
// axis 0 down to a scalar i32. The expectations below require that the text
// "auto_encoding" no longer appears ahead of the tt.reduce in the output.
#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>

module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @infer_reduce_to_scalar
// CHECK-NOT: auto_encoding
// CHECK: "tt.reduce"
// CHECK: tt.return
tt.func public @infer_reduce_to_scalar() -> i32 {
%0 = arith.constant dense<1> : tensor<16xi32, #gluon.auto_encoding>
// gluon.set_auto_layout turns %0 from #gluon.auto_encoding into #blocked;
// its result %1 is not used by any later op in this function.
%1 = gluon.set_auto_layout %0 : tensor<16xi32, #gluon.auto_encoding> -> tensor<16xi32, #blocked>
// The reduction consumes the auto-encoded %0 directly and yields a scalar,
// so its result type has no encoding at all.
%2 = "tt.reduce"(%0) <{axis = 0 : i32}> ({
^bb0(%lhs: i32, %rhs: i32):
%3 = arith.addi %lhs, %rhs : i32
tt.reduce.return %3 : i32
}) : (tensor<16xi32, #gluon.auto_encoding>) -> i32
tt.return %2 : i32
}
}
35 changes: 35 additions & 0 deletions test/TritonGPU/combine.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -2217,6 +2217,41 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr

// -----

// Two test functions that differ only in the efficient_layout flag on the
// allow_reorder reshape. Each loads a 16-element tensor in #blocked1,
// reshapes it to 8x2 in #blocked4, and converts the result to #blocked3.
#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [0, 1]}>
#blocked3 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked4 = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.slice<{dim = 0, parent = #blocked}>

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
// Without efficient_layout the expectations require that no
// ttg.convert_layout is left between the function label and tt.return.
// CHECK-LABEL: @permuting_reshape_backward_remat
// CHECK-NOT: ttg.convert_layout
// CHECK: tt.return
tt.func public @permuting_reshape_backward_remat(%arg0: !tt.ptr<i32> {tt.divisibility = 16 : i32}) -> tensor<8x2xi32, #blocked3> {
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked1>
%1 = tt.splat %arg0 : !tt.ptr<i32> -> tensor<16x!tt.ptr<i32>, #blocked1>
%2 = tt.addptr %1, %0 : tensor<16x!tt.ptr<i32>, #blocked1>, tensor<16xi32, #blocked1>
%3 = tt.load %2 : tensor<16x!tt.ptr<i32>, #blocked1>
%4 = tt.reshape %3 allow_reorder : tensor<16xi32, #blocked1> -> tensor<8x2xi32, #blocked4>
%5 = ttg.convert_layout %4 : tensor<8x2xi32, #blocked4> -> tensor<8x2xi32, #blocked3>
tt.return %5 : tensor<8x2xi32, #blocked3>
}

// With efficient_layout on the reshape, a ttg.convert_layout is expected to
// remain in the output.
// NOTE(review): presumably because such reshapes are not rematerialized;
// confirm against the pass under test.
// CHECK-LABEL: @permuting_reshape_no_backward_remat_efficient_layout
// CHECK: ttg.convert_layout
// CHECK: tt.return
tt.func public @permuting_reshape_no_backward_remat_efficient_layout(%arg0: !tt.ptr<i32> {tt.divisibility = 16 : i32}) -> tensor<8x2xi32, #blocked3> {
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked1>
%1 = tt.splat %arg0 : !tt.ptr<i32> -> tensor<16x!tt.ptr<i32>, #blocked1>
%2 = tt.addptr %1, %0 : tensor<16x!tt.ptr<i32>, #blocked1>, tensor<16xi32, #blocked1>
%3 = tt.load %2 : tensor<16x!tt.ptr<i32>, #blocked1>
%4 = tt.reshape %3 allow_reorder efficient_layout : tensor<16xi32, #blocked1> -> tensor<8x2xi32, #blocked4>
%5 = ttg.convert_layout %4 : tensor<8x2xi32, #blocked4> -> tensor<8x2xi32, #blocked3>
tt.return %5 : tensor<8x2xi32, #blocked3>
}
}

// -----

#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
#slice1dim1 = #ttg.slice<{dim = 1, parent = #blocked1}>
#blocked2 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
Expand Down
26 changes: 26 additions & 0 deletions test/TritonNvidiaGPU/tmem_layouts.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,29 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
tt.return
}
}

// -----

// Test case: a 128x64 f32 tensor loaded from tensor memory in a #linear
// layout is reduced twice, first along axis 1 to a rank-1 tensor and then
// along axis 0 to a scalar f32. Both tt.reduce ops are expected to be present
// in the output, followed by tt.return.
#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[32, 0], [64, 0], [0, 32]], block = []}>
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 64, colStride = 1>

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:100"} {
// CHECK-LABEL: @tmem_load_reduce_rank1
// CHECK: "tt.reduce"
// CHECK: "tt.reduce"
// CHECK: tt.return
tt.func public @tmem_load_reduce_rank1(%arg0: !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory>) -> f32 {
%0 = ttng.tmem_load %arg0 : !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory> -> tensor<128x64xf32, #linear>
// First reduction: 128x64 -> 128, with a slice (dim = 1) of #linear as the
// result encoding.
%1 = "tt.reduce"(%0) <{axis = 1 : i32}> ({
^bb0(%lhs: f32, %rhs: f32):
%2 = arith.addf %lhs, %rhs : f32
tt.reduce.return %2 : f32
}) : (tensor<128x64xf32, #linear>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #linear}>>
// Second reduction: the rank-1 input collapses to a scalar f32, so the
// result carries no encoding.
%3 = "tt.reduce"(%1) <{axis = 0 : i32}> ({
^bb0(%lhs: f32, %rhs: f32):
%4 = arith.addf %lhs, %rhs : f32
tt.reduce.return %4 : f32
}) : (tensor<128xf32, #ttg.slice<{dim = 1, parent = #linear}>>) -> f32
tt.return %3 : f32
}
}
7 changes: 4 additions & 3 deletions unittest/Dialect/TritonGPU/DialectTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ void testReshape(RankedTensorType srcTy, RankedTensorType dstTy,
ctx, [&](Diagnostic &diag) { diags.push_back(" - " + diag.str()); });
result = inferLayout->inferReshapeOpEncoding(
srcTy.getShape(), srcTy.getEncoding(), dstTy.getShape(), inferredEnc,
UnknownLoc::get(ctx));
/*allowReorder=*/false, UnknownLoc::get(ctx));
}

// We expect the reshape to succeed as long as the inputs have the same
Expand All @@ -164,7 +164,7 @@ void testReshape(RankedTensorType srcTy, RankedTensorType dstTy,
Attribute inferredSrcEnc;
auto result = inferLayout->inferReshapeOpEncoding(
dstTy.getShape(), inferredEnc, srcTy.getShape(), inferredSrcEnc,
UnknownLoc::get(ctx));
/*allowReorder=*/false, UnknownLoc::get(ctx));
EXPECT_TRUE(succeeded(result))
<< "Inverse encoding inference (" << triton::join(dstTy.getShape(), "x")
<< " " << stringifyLLVMType(inferredEnc) << " -> "
Expand Down Expand Up @@ -439,7 +439,8 @@ TEST_F(JoinOpTest, JoinOpLayoutPropagation) {
}
Attribute reshapedEnc;
result = inferLayout->inferReshapeOpEncoding(
transShape, transEnc, newShape, reshapedEnc, std::nullopt);
transShape, transEnc, newShape, reshapedEnc,
/*allowReorder=*/false, std::nullopt);
assert(succeeded(result));
// The layouts should be structurally the same
// but reshapeEnc will likely be a LinearEncodingAttr
Expand Down
Loading