Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 0 additions & 31 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1587,35 +1587,4 @@ def AMDGPU_MakeDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_dma_descriptor">

}

// ODS definition of the `tensor_load_to_lds` op in the AMDGPU dialect.
// The op takes a single TDM descriptor operand (`$desc`), declares no
// results, and is marked with both memory-read and memory-write effects.
def AMDGPU_TensorLoadToLDSOp :
AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
let summary = "Load tensors from global memory to LDS.";
let description = [{
Load tensors of up to five dimensions from global memory to LDS.

This operation was introduced in gfx1250.
}];

// Custom assembly: the descriptor operand, an optional attribute dictionary,
// then `:` and the fully qualified descriptor type.
let assemblyFormat = [{
$desc attr-dict `:` qualified(type($desc))
}];
}

// ODS definition of the `tensor_store_from_lds` op in the AMDGPU dialect.
// The op takes a single TDM descriptor operand (`$desc`), declares no
// results, and is marked with both memory-read and memory-write effects.
def AMDGPU_TensorStoreFromLDSOp :
AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {

let summary = "Store tensors from LDS to global memory.";
let description = [{
Store tensors of up to five dimensions from LDS to global memory.

This operation was introduced in gfx1250.
}];

// Custom assembly: the descriptor operand, an optional attribute dictionary,
// then `:` and the fully qualified descriptor type.
let assemblyFormat = [{
$desc attr-dict `:` qualified(type($desc))
}];
}

#endif // AMDGPU
57 changes: 7 additions & 50 deletions mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3218,6 +3218,11 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<DescriptorOp> {

Location loc = op.getLoc();

IntegerType i32 = rewriter.getI32Type();
[[maybe_unused]] Type v4i32 =
this->typeConverter->convertType(VectorType::get(4, i32));
assert(v4i32 && "expected type conversion to succeed");

SmallVector<Value> consts;
for (int64_t i = 0; i < 8; ++i)
consts.push_back(createI32Constant(rewriter, loc, i));
Expand All @@ -3232,32 +3237,6 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<DescriptorOp> {
}
};

/// Conversion pattern that rewrites an AMDGPU tensor load/store op
/// (`SourceOp`) into the corresponding target op (`TargetOp`).
///
/// The pattern uses the 1:N operand adaptor, so the single descriptor operand
/// of `SourceOp` is seen here as a range of converted values. The first four
/// of those values are forwarded, in order, as the leading operands of
/// `TargetOp`; the cache policy is fixed to 0 and no alias-scope or TBAA
/// metadata is attached.
template <typename SourceOp, typename TargetOp>
struct AMDGPUTensorLoadStoreOpLowering
    : public ConvertOpToLLVMPattern<SourceOp> {
  using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
  using Adaptor = typename ConvertOpToLLVMPattern<SourceOp>::OneToNOpAdaptor;

  /// Chipset the lowering targets; checked against `kGfx1250` before
  /// rewriting.
  Chipset chipset;

  AMDGPUTensorLoadStoreOpLowering(const LLVMTypeConverter &converter,
                                  Chipset chipset)
      : ConvertOpToLLVMPattern<SourceOp>(converter), chipset(chipset) {}

  /// Replaces `op` with a `TargetOp` built from the converted descriptor
  /// values. Emits an op error (and returns it as the result) when the
  /// chipset compares below `kGfx1250`.
  LogicalResult
  matchAndRewrite(SourceOp op, Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Reject chipsets that compare below gfx1250 before touching the IR.
    if (chipset < kGfx1250)
      return op->emitOpError("is only supported on gfx1250");

    // The converted descriptor arrives as several values; name each one so
    // the operand order of the replacement op is explicit.
    ValueRange descriptorGroups = adaptor.getDesc();
    Value group0 = descriptorGroups[0];
    Value group1 = descriptorGroups[1];
    Value group2 = descriptorGroups[2];
    Value group3 = descriptorGroups[3];

    rewriter.replaceOpWithNewOp<TargetOp>(op, group0, group1, group2, group3,
                                          /*cachePolicy=*/0,
                                          /*alias_scopes=*/nullptr,
                                          /*noalias_scopes=*/nullptr,
                                          /*tbaa=*/nullptr);
    return success();
  }
};

struct ConvertAMDGPUToROCDLPass
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
using Base::Base;
Expand Down Expand Up @@ -3327,24 +3306,6 @@ void mlir::populateAMDGPUTypeAndAttributeConversions(
Type i32 = IntegerType::get(type.getContext(), 32);
return typeConverter.convertType(VectorType::get(4, i32));
});
typeConverter.addConversion(
[&](TDMDescriptorType type,
SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
Type i32 = IntegerType::get(type.getContext(), 32);
Type v4i32 = typeConverter.convertType(VectorType::get(4, i32));
Type v8i32 = typeConverter.convertType(VectorType::get(8, i32));
llvm::append_values(result, v4i32, v8i32, v4i32, v4i32);
return success();
});

auto addUnrealizedCast = [](OpBuilder &builder, TypeRange types,
ValueRange inputs,
Location loc) -> SmallVector<Value> {
auto cast = UnrealizedConversionCastOp::create(builder, loc, types, inputs);
return cast.getResults();
};

typeConverter.addTargetMaterialization(addUnrealizedCast);
}

void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
Expand Down Expand Up @@ -3375,11 +3336,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
AMDGPULowerDescriptor<MakeDmaDescriptorOp>,
AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>,
AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
ROCDL::TensorLoadToLDSOp>,
AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
ROCDL::TensorStoreFromLDSOp>>(
converter, chipset);
AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>>(converter,
chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
}
18 changes: 0 additions & 18 deletions mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -773,24 +773,6 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
func.return %descriptor : !amdgpu.tdm_descriptor
}

// Lowering test for amdgpu.tensor_load_to_lds: the !amdgpu.tdm_descriptor
// function argument is expected to be expanded into four values by a
// builtin.unrealized_conversion_cast, and all four are expected to be passed,
// in order, to rocdl.tensor.load.to.lds with cache policy 0.
// CHECK-LABEL: func @tensor_load_to_lds
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor) {
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
func.return
}

// Lowering test for amdgpu.tensor_store_from_lds: the !amdgpu.tdm_descriptor
// function argument is expected to be expanded into four values by a
// builtin.unrealized_conversion_cast, and all four are expected to be passed,
// in order, to rocdl.tensor.store.from.lds with cache policy 0.
// CHECK-LABEL: func @tensor_store_from_lds
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor) {
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
func.return
}

// -----

// CHECK-LABEL: func @make_gather_dma_descriptor
Expand Down