diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 96f5f5c6f1a3f..4865dc13f324b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1587,4 +1587,35 @@ def AMDGPU_MakeDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_dma_descriptor">
 }
 
+def AMDGPU_TensorLoadToLDSOp :
+    AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
+    Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+  let summary = "Load tensors from global memory to LDS.";
+  let description = [{
+    Load tensors of up to five dimensions from global memory to LDS.
+
+    This operation was introduced in gfx1250.
+  }];
+
+  let assemblyFormat = [{
+    $desc attr-dict `:` qualified(type($desc))
+  }];
+}
+
+def AMDGPU_TensorStoreFromLDSOp :
+    AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
+    Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+
+  let summary = "Store tensors from LDS to global memory.";
+  let description = [{
+    Store tensors of up to five dimensions from LDS to global memory.
+
+    This operation was introduced in gfx1250.
+  }];
+
+  let assemblyFormat = [{
+    $desc attr-dict `:` qualified(type($desc))
+  }];
+}
+
 #endif // AMDGPU
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 541bb02d79eae..90009c9722fe3 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3218,11 +3218,6 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<OpType> {
     Location loc = op.getLoc();
 
-    IntegerType i32 = rewriter.getI32Type();
-    [[maybe_unused]] Type v4i32 =
-        this->typeConverter->convertType(VectorType::get(4, i32));
-    assert(v4i32 && "expected type conversion to succeed");
-
     SmallVector<Value> consts;
     for (int64_t i = 0; i < 8; ++i)
       consts.push_back(createI32Constant(rewriter, loc, i));
@@ -3237,6 +3232,32 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<OpType> {
   }
 };
 
+template <typename SourceOp, typename TargetOp>
+struct AMDGPUTensorLoadStoreOpLowering
+    : public ConvertOpToLLVMPattern<SourceOp> {
+  using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
+  using Adaptor = typename ConvertOpToLLVMPattern<SourceOp>::OneToNOpAdaptor;
+  AMDGPUTensorLoadStoreOpLowering(const LLVMTypeConverter &converter,
+                                  Chipset chipset)
+      : ConvertOpToLLVMPattern<SourceOp>(converter), chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(SourceOp op, Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx1250)
+      return op->emitOpError("is only supported on gfx1250");
+
+    ValueRange desc = adaptor.getDesc();
+    rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
+                                          desc[3], /*cachePolicy=*/0,
+                                          /*alias_scopes=*/nullptr,
+                                          /*noalias_scopes=*/nullptr,
+                                          /*tbaa=*/nullptr);
+    return success();
+  }
+};
+
 struct ConvertAMDGPUToROCDLPass
     : public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
   using Base::Base;
@@ -3306,6 +3327,33 @@ void mlir::populateAMDGPUTypeAndAttributeConversions(
         Type i32 = IntegerType::get(type.getContext(), 32);
         return typeConverter.convertType(VectorType::get(4, i32));
       });
+  typeConverter.addConversion(
+      [&](TDMDescriptorType type,
+          SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+        Type i32 = IntegerType::get(type.getContext(), 32);
+        Type v4i32 = typeConverter.convertType(VectorType::get(4, i32));
+        Type v8i32 = typeConverter.convertType(VectorType::get(8, i32));
+        llvm::append_values(result, v4i32, v8i32, v4i32, v4i32);
+        return success();
+      });
+
+  auto addUnrealizedCast = [](OpBuilder &builder, TypeRange types,
+                              ValueRange inputs,
+                              Location loc) -> SmallVector<Value> {
+    // Only create unrealized_conversion_cast for TDMDescriptorType.
+    // All other types, which are not expected, should be
+    // materialized by other target materialization functions.
+    if (inputs.size() != 1)
+      return {};
+
+    if (!isa<TDMDescriptorType>(inputs[0].getType()))
+      return {};
+
+    auto cast = UnrealizedConversionCastOp::create(builder, loc, types, inputs);
+    return cast.getResults();
+  };
+
+  typeConverter.addTargetMaterialization(addUnrealizedCast);
 }
 
 void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
@@ -3336,7 +3384,11 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
                AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
                AMDGPULowerDescriptor<MakeDmaDescriptorOp>,
-               AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>>(converter,
-                                                                 chipset);
+               AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>,
+               AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
+                                               ROCDL::TensorLoadToLDSOp>,
+               AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
+                                               ROCDL::TensorStoreFromLDSOp>>(
+      converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 4979e85785970..e62db9ff571bf 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -773,6 +773,24 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base, %wg_
   func.return %descriptor : !amdgpu.tdm_descriptor
 }
 
+// CHECK-LABEL: func @tensor_load_to_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor) {
+  // CHECK: 
%[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
+  func.return
+}
+
+// CHECK-LABEL: func @tensor_store_from_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor) {
+  // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
+  func.return
+}
+
 // -----
 
 // CHECK-LABEL: func @make_gather_dma_descriptor