diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir index 908e49e178bc..16622d1035e6 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir @@ -232,7 +232,6 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK-LABEL: func.func @contract_to_mfma_32x32x8_mm_mnbatch -// CHECK: %[[INIT:.+]] = arith.constant dense<0.000000e+00> // CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %{{.+}} : vector<64x32xf32> -> vector<2x1x4x1x4x1xf32> // CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %{{.+}} : vector<64x8xf16> -> vector<2x1x1x1x1x4xf16> // CHECK: %[[C_SLICE0:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<4x1x4x1xf32> from vector<2x1x4x1x4x1xf32 @@ -241,15 +240,18 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[C0_CAST:.+]] = vector.shape_cast %[[C_SLICE0]] : vector<4x1x4x1xf32> to vector<16xf32> // CHECK: %[[MFMA0:.+]] = amdgpu.mfma %[[A0_CAST]] * %{{.+}} + %[[C0_CAST]] // CHECK: %[[R0_CAST:.+]] = vector.shape_cast %[[MFMA0]] : vector<16xf32> to vector<4x1x4x1xf32> -// CHECK: %[[C0_INS:.+]] = vector.insert %[[R0_CAST]], %[[INIT]] [0, 0] : vector<4x1x4x1xf32> into vector<2x1x4x1x4x1xf32> // CHECK: %[[C_SLICE1:.+]] = vector.extract %[[C_SIMT]][1, 0] : vector<4x1x4x1xf32> from vector<2x1x4x1x4x1xf32> // CHECK: %[[A_SLICE1:.+]] = vector.extract %[[A_SIMT]][1, 0] : vector<1x1x1x4xf16> from vector<2x1x1x1x1x4xf16> // CHECK: %[[A1_CAST:.+]] = vector.shape_cast %[[A_SLICE1]] : vector<1x1x1x4xf16> to vector<4xf16> // CHECK: %[[C1_CAST:.+]] = vector.shape_cast %[[C_SLICE1]] : vector<4x1x4x1xf32> to vector<16xf32> // CHECK: %[[MFMA1:.+]] = amdgpu.mfma %[[A1_CAST]] * %{{.+}} + %[[C1_CAST]] // CHECK: %[[R1_CAST:.+]] = vector.shape_cast %[[MFMA1]] : 
vector<16xf32> to vector<4x1x4x1xf32> -// CHECK: %[[C1_INS:.+]] = vector.insert %[[R1_CAST]], %[[C0_INS]] [1, 0] : vector<4x1x4x1xf32> into vector<2x1x4x1x4x1xf32> -// CHECK: %[[R:.+]] = iree_vector_ext.to_simd %[[C1_INS]] : vector<2x1x4x1x4x1xf32> -> vector<64x32xf32> +// CHECK: %[[R0:.+]]:16 = vector.to_elements %[[R0_CAST]] : vector<4x1x4x1xf32> +// CHECK: %[[R1:.+]]:16 = vector.to_elements %[[R1_CAST]] : vector<4x1x4x1xf32> +// CHECK: %[[INS:.+]] = vector.from_elements +// CHECK-SAME: %[[R0]]#0, %[[R0]]#1, %[[R0]]#2, %[[R0]]#3, %[[R0]]#4, %[[R0]]#5, %[[R0]]#6, %[[R0]]#7, %[[R0]]#8, %[[R0]]#9, %[[R0]]#10, %[[R0]]#11, %[[R0]]#12, %[[R0]]#13, %[[R0]]#14, %[[R0]]#15 +// CHECK-SAME: %[[R1]]#0, %[[R1]]#1, %[[R1]]#2, %[[R1]]#3, %[[R1]]#4, %[[R1]]#5, %[[R1]]#6, %[[R1]]#7, %[[R1]]#8, %[[R1]]#9, %[[R1]]#10, %[[R1]]#11, %[[R1]]#12, %[[R1]]#13, %[[R1]]#14, %[[R1]]#15 +// CHECK: %[[R:.+]] = iree_vector_ext.to_simd %[[INS]] : vector<2x1x4x1x4x1xf32> -> vector<64x32xf32> // CHECK: return %[[R]] // ----- @@ -403,28 +405,23 @@ builtin.module attributes { transform.with_named_sequence } { } } -// CHECK-LABEL: func.func @contract_to_mfma_32x32x8_mm_mnbatch_order -// CHECK: %[[INIT:.+]] = arith.constant dense<0.000000e+00> : vector<2x3x4x1x4x1xf32> -// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %{{.+}} : vector<64x96xf32> -> vector<2x3x4x1x4x1xf32> -// CHECK: vector.extract %[[C_SIMT]][0, 0] -// CHECK: amdgpu.mfma -// CHECK: %[[INS0:.+]] = vector.insert %{{.+}}, %[[INIT]] [0, 0] -// CHECK: vector.extract %[[C_SIMT]][0, 1] -// CHECK: amdgpu.mfma -// CHECK: %[[INS1:.+]] = vector.insert %{{.+}}, %[[INS0]] [0, 1] -// CHECK: vector.extract %[[C_SIMT]][0, 2] -// CHECK: amdgpu.mfma -// CHECK: %[[INS2:.+]] = vector.insert %{{.+}}, %[[INS1]] [0, 2] -// CHECK: vector.extract %[[C_SIMT]][1, 0] -// CHECK: amdgpu.mfma -// CHECK: %[[INS3:.+]] = vector.insert %{{.+}}, %[[INS2]] [1, 0] -// CHECK: vector.extract %[[C_SIMT]][1, 1] -// CHECK: amdgpu.mfma -// CHECK: %[[INS4:.+]] = vector.insert 
%{{.+}}, %[[INS3]] [1, 1] -// CHECK: vector.extract %[[C_SIMT]][1, 2] -// CHECK: amdgpu.mfma -// CHECK: %[[INS5:.+]] = vector.insert %{{.+}}, %[[INS4]] [1, 2] -// CHECK: iree_vector_ext.to_simd %[[INS5]] +// CHECK-LABEL: func.func @contract_to_mfma_32x32x8_mm_mnbatch_order +// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %{{.+}} : vector<64x96xf32> -> vector<2x3x4x1x4x1xf32> +// CHECK: vector.extract %[[C_SIMT]][0, 0] +// CHECK: amdgpu.mfma +// CHECK: vector.extract %[[C_SIMT]][0, 1] +// CHECK: amdgpu.mfma +// CHECK: vector.extract %[[C_SIMT]][0, 2] +// CHECK: amdgpu.mfma +// CHECK: vector.extract %[[C_SIMT]][1, 0] +// CHECK: amdgpu.mfma +// CHECK: vector.extract %[[C_SIMT]][1, 1] +// CHECK: amdgpu.mfma +// CHECK: vector.extract %[[C_SIMT]][1, 2] +// CHECK: amdgpu.mfma +// CHECK-COUNT-6: vector.to_elements {{.*}} : vector<4x1x4x1xf32> +// CHECK: %[[INS:.+]] = vector.from_elements +// CHECK: iree_vector_ext.to_simd %[[INS]] // ----- @@ -495,15 +492,17 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK-LABEL: func.func @contract_to_mfma_32x32x8_mmt -// CHECK: %[[INIT:.+]] = arith.constant dense<0.000000e+00> : vector<1x2x4x1x4x1xf32> // CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %{{.+}} : vector<64x8xf16> -> vector<2x1x1x1x1x4xf16> // CHECK: vector.extract %[[B_SIMT]][0, 0] // CHECK: amdgpu.mfma -// CHECK: %[[INS0:.+]] = vector.insert %{{.+}}, %[[INIT]] [0, 0] // CHECK: vector.extract %[[B_SIMT]][1, 0] // CHECK: amdgpu.mfma -// CHECK: %[[INS1:.+]] = vector.insert %17, %[[INS0]] [0, 1] -// CHECK: iree_vector_ext.to_simd %[[INS1]] : vector<1x2x4x1x4x1xf32> -> vector<32x64xf32> +// CHECK: %[[R0:.+]]:16 = vector.to_elements %{{.+}} : vector<4x1x4x1xf32> +// CHECK: %[[R1:.+]]:16 = vector.to_elements %{{.+}} : vector<4x1x4x1xf32> +// CHECK: %[[INS:.+]] = vector.from_elements +// CHECK-SAME: %[[R0]]#0, %[[R0]]#1, %[[R0]]#2, %[[R0]]#3, %[[R0]]#4, %[[R0]]#5, %[[R0]]#6, %[[R0]]#7, %[[R0]]#8, %[[R0]]#9, %[[R0]]#10, %[[R0]]#11, %[[R0]]#12, 
%[[R0]]#13, %[[R0]]#14, %[[R0]]#15 +// CHECK-SAME: %[[R1]]#0, %[[R1]]#1, %[[R1]]#2, %[[R1]]#3, %[[R1]]#4, %[[R1]]#5, %[[R1]]#6, %[[R1]]#7, %[[R1]]#8, %[[R1]]#9, %[[R1]]#10, %[[R1]]#11, %[[R1]]#12, %[[R1]]#13, %[[R1]]#14, %[[R1]]#15 +// CHECK: iree_vector_ext.to_simd %[[INS]] : vector<1x2x4x1x4x1xf32> -> vector<32x64xf32> // ----- @@ -838,6 +837,7 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[B_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ> // CHECK: %[[MFMA_1:.*]] = amdgpu.mfma %[[A_CAST_1]] * %[[B_CAST_1]] + %[[MFMA_0]] // CHECK-SAME: {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp = none +// CHECK: %[[MFMA_1_CAST:.*]] = vector.shape_cast %[[MFMA_1]] : vector<4xf32> to vector<1x1x4x1xf32> // CHECK: %[[B_CAST_2:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ> // CHECK: %[[C_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1xf32> to vector<4xf32> // CHECK: %[[MFMA_2:.*]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST_2]] + %[[C_CAST_1]] @@ -846,6 +846,10 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[MFMA_3:.*]] = amdgpu.mfma %[[A_CAST_1]] * %[[B_CAST_3]] + %[[MFMA_2]] // CHECK-SAME: {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp = none // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_3]] : vector<4xf32> to vector<1x1x4x1xf32> -// CHECK: %[[B_OUT:.*]] = vector.insert %[[R_CAST]] +// CHECK: %[[R0:.+]]:4 = vector.to_elements %[[MFMA_1_CAST]] : vector<1x1x4x1xf32> +// CHECK: %[[R1:.+]]:4 = vector.to_elements %[[R_CAST]] : vector<1x1x4x1xf32> +// CHECK: %[[B_OUT:.+]] = vector.from_elements +// CHECK-SAME: %[[R0]]#0, %[[R0]]#1, %[[R0]]#2, %[[R0]]#3 +// CHECK-SAME: %[[R1]]#0, %[[R1]]#1, %[[R1]]#2, %[[R1]]#3 // CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x2x1x1x4x1xf32> -> vector<32x32xf32> // CHECK: return %[[R_SIMD]] diff --git 
a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir index 358b2f51859d..dfb4ff69b530 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir @@ -150,14 +150,12 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK-LABEL: func @inter_subgroup_reduction -// CHECK-DAG: %[[CST1:.+]] = arith.constant dense<0.000000e+00> : vector<2xf32> // Local reduction // CHECK: vector.multi_reduction <maximumf>, %{{.*}}, %{{.*}} [1, 3, 5] : vector<2x1x1x1x1x4xf32> to vector<2x1x1xf32> // Thread reduction // CHECK: %[[THREAD_RED0:.+]] = gpu.subgroup_reduce maximumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32 -// CHECK: %[[THREAD_RED1:.+]] = vector.insert %[[THREAD_RED0]], %[[CST1]] [0] : f32 into vector<2xf32> // CHECK: %[[THREAD_RED2:.+]] = gpu.subgroup_reduce maximumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32 -// CHECK: %[[THREAD_RED3:.+]] = vector.insert %[[THREAD_RED2]], %[[THREAD_RED1]] [1] : f32 into vector<2xf32> +// CHECK: %[[THREAD_RED3:.+]] = vector.from_elements %[[THREAD_RED0]], %[[THREAD_RED2]] : vector<2xf32> // CHECK: %[[THREAD_RED4:.+]] = vector.shape_cast %[[THREAD_RED3]] : vector<2xf32> to vector<2x1x1xf32> // Subgroup reduction // CHECK-DAG: %[[ALLOC:.+]] = memref.alloc() : memref<32x2xf32, #gpu.address_space<workgroup>> @@ -177,11 +175,10 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_simt %{{.*}} : vector<32xf32> -> vector<2x1x1xf32> // CHECK-DAG: %[[DISTR0:.+]] = vector.extract %[[SG_READ0]][0, 0] : f32 from vector<1x1xf32> // CHECK-DAG: %[[RED0:.+]] = gpu.subgroup_reduce maximumf %[[DISTR0]] cluster(size = 2, stride = 16) : (f32) -> f32
-// CHECK-DAG: %[[INS0:.+]] = vector.insert %[[RED0]], %[[CST1]] [0] : f32 into vector<2xf32> // CHECK-DAG: %[[DISTR1:.+]] = vector.extract %[[SG_READ1]][0, 0] : f32 from vector<1x1xf32> // CHECK-DAG: %[[RED1:.+]] = gpu.subgroup_reduce maximumf %[[DISTR1]] cluster(size = 2, stride = 16) : (f32) -> f32 -// CHECK-DAG: %[[INS1:.+]] = vector.insert %[[RED1]], %[[INS0]] [1] : f32 into vector<2xf32> -// CHECK-DAG: %[[CAST:.+]] = vector.shape_cast %[[INS1]] : vector<2xf32> to vector<2x1x1xf32> +// CHECK-DAG: %[[INS:.+]] = vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> +// CHECK-DAG: %[[CAST:.+]] = vector.shape_cast %[[INS]] : vector<2xf32> to vector<2x1x1xf32> // CHECK-DAG: arith.maximumf %[[CAST]], %[[ACC]] : vector<2x1x1xf32> // ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp index 5a005394f308..1b502030850c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp @@ -995,6 +995,7 @@ void ConvertToLLVMPass::runOnOperation() { patterns, /*force32BitVectorIndices=*/false); vector::populateVectorMaskOpLoweringPatterns(patterns); vector::populateVectorShapeCastLoweringPatterns(patterns); + vector::populateVectorFromElementsLoweringPatterns(patterns); // TODO: doubtful that the "default" does what one want here, it is likely // better to use shuffle. 
vector::populateVectorTransposeLoweringPatterns( @@ -1079,6 +1080,7 @@ void ConvertToLLVMPass::runOnOperation() { vector::populateVectorStepLoweringPatterns(patterns); populateVectorToLLVMConversionPatterns(typeConverter, patterns, reassociateFpReductions); + vector::populateVectorFromElementsLoweringPatterns(patterns); ub::populateUBToLLVMConversionPatterns(typeConverter, patterns); vector::populateVectorTransferLoweringPatterns(patterns, /*maxTransferRank=*/1); diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir index a3d9b7ce1c98..e516e3381ad6 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir @@ -35,8 +35,8 @@ module { // CHECK-LABEL: func.func @aligned_generic_pack // CHECK: %[[IN_0:.+]] = vector.broadcast %{{.+}} : vector<16xf32> to vector<16x16xf32> -// CHECK-COUNT-15: %{{.+}} = vector.insert {{.+}} : vector<16xf32> into vector<16x16xf32> -// CHECK: %[[IN_1:.+]] = vector.insert {{.+}} : vector<16xf32> into vector<16x16xf32> +// CHECK-COUNT-16: %{{.+}} = vector.to_elements {{.+}} : vector<16xf32> +// CHECK: %[[IN_1:.+]] = vector.from_elements {{.+}} : vector<16x16xf32> // CHECK: %[[T0:.+]] = arith.addf %[[IN_0]], %[[IN_1]] : vector<16x16xf32> // CHECK: %[[T1:.+]] = arith.minimumf %[[T0]], %{{.+}} : vector<16x16xf32> // CHECK: %[[T2:.+]] = arith.maximumf %[[T1]], %{{.+}} : vector<16x16xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir index 7ad910488284..553631270bf4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir @@ -274,5 +274,5 @@ 
func.func @split_reduction_double_reduction_unsupported() attributes {hal.execut } // CHECK-LABEL: func.func @split_reduction_double_reduction_unsupported() -// CHECK: vector.insert %{{.+}}, %{{.+}} : i32 into vector<4xi32> +// CHECK: vector.from_elements %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : vector<4xi32> // CHECK-NOT: vector.insert %{{.+}}, %{{.+}} : i32 into vector<1xi32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp index 09b4967b499a..760c0d09c6f0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp @@ -106,6 +106,7 @@ struct ConvertToNVVMPass final patterns, options.vectorContractLowering); vector::populateVectorGatherLoweringPatterns(patterns); vector::populateVectorMaskOpLoweringPatterns(patterns); + vector::populateVectorFromElementsLoweringPatterns(patterns); // We currently always use 64 bit indices, thus ensure the bit width of // the mask compare is consistent. 
vector::populateVectorMaskMaterializationPatterns( diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp index 8102e0499d40..b7b3d4848154 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp @@ -172,6 +172,26 @@ static LogicalResult validateDataTypes(Operation *op, return success(); } +/// TODO(hanchung): Delete the pattern once it is upstreamed: +/// https://github.com/llvm/llvm-project/pull/156992 +struct LowerToElementsPattern : public OpRewritePattern<vector::ToElementsOp> { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(vector::ToElementsOp op, + PatternRewriter &rewriter) const override { + VectorType vecType = op.getSource().getType(); + if (vecType.getRank() == 1 || vecType.getNumScalableDims() > 0) { + return failure(); + } + auto vec1DType = + VectorType::get({vecType.getNumElements()}, vecType.getElementType()); + Value shapeCast = rewriter.create<vector::ShapeCastOp>( + op.getLoc(), vec1DType, op.getSource()); + rewriter.replaceOpWithNewOp<vector::ToElementsOp>(op, op.getResultTypes(), + shapeCast); + return success(); + } +}; + /// A pass that replaces all occurrences of GPU device operations with their /// corresponding ROCDL equivalent.
/// @@ -256,6 +276,7 @@ struct ConvertToROCDLPass final vector::populateVectorInterleaveToShufflePatterns(patterns); vector::populateVectorContractLoweringPatterns( patterns, options.vectorContractLowering); + vector::populateVectorFromElementsLoweringPatterns(patterns); vector::populateVectorGatherLoweringPatterns(patterns); vector::populateVectorMaskOpLoweringPatterns(patterns); // We currently always use 64 bit indices, thus ensure the bit width of @@ -269,6 +290,7 @@ struct ConvertToROCDLPass final patterns, options.vectorTransposeLowering); vector::populateVectorTransferLoweringPatterns(patterns); arith::populateExpandBFloat16Patterns(patterns); + patterns.insert<LowerToElementsPattern>(&getContext()); if (failed(applyPatternsGreedily(m, std::move(patterns)))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir index de2a702ec164..3db6cf051de0 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: func @extract_strided_slice_8_elements func.func @extract_strided_slice_8_elements(%input: vector<8xf16>) -> vector<4xf16> { // CHECK-COUNT-4: vector.extract - // CHECK-COUNT-4: vector.insert + // CHECK: vector.from_elements %0 = vector.extract_strided_slice %input {offsets = [1], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16> return %0: vector<4xf16> } @@ -22,9 +22,8 @@ func.func @extract_strided_slice_4_elements(%input: vector<4xf16>) -> vector<2xf // CHECK-LABEL: func @bitcast_16_elements func.func @bitcast_16_elements(%input: vector<16xi8>) -> vector<4xi32> { // CHECK-DAG: %[[CST_I32:.*]] = arith.constant dense<0> : vector<4xi32> - // CHECK-DAG: arith.constant dense<0> : vector<4xi8> // CHECK-COUNT-4: vector.extract - // CHECK-COUNT-4: vector.insert + // CHECK: vector.from_elements //
CHECK: vector.bitcast %{{.*}} : vector<4xi8> to vector<1xi32> // CHECK: vector.insert_strided_slice {{.*}}, %[[CST_I32]] // CHECK-COUNT-3: vector.bitcast @@ -41,28 +40,22 @@ func.func @bitcast_extract_extend_0(%input: vector<1xi32>) -> vector<4xi32> { return %extend : vector<4xi32> } - // CHECK-LABEL: func @bitcast_extract_extend_0 // CHECK-SAME: (%[[INPUT:.+]]: vector<1xi32>) -// CHECK-DAG: %[[ZERO:.+]] = arith.constant dense<0> : vector<4xi32> // CHECK-DAG: %[[MASK:.+]] = arith.constant 15 : i32 // CHECK-DAG: %[[OFF1:.+]] = arith.constant 4 : i32 // CHECK-DAG: %[[OFF2:.+]] = arith.constant 8 : i32 // CHECK-DAG: %[[OFF3:.+]] = arith.constant 12 : i32 // CHECK: %[[BASE:.+]] = vector.extract %[[INPUT]][0] : i32 from vector<1xi32> // CHECK: %[[AND0:.+]] = arith.andi %[[BASE]], %[[MASK]] : i32 -// CHECK: %[[INS0:.+]] = vector.insert %[[AND0]], %[[ZERO]] [0] // CHECK: %[[SHR1:.+]] = arith.shrui %[[BASE]], %[[OFF1]] : i32 // CHECK: %[[AND1:.+]] = arith.andi %[[SHR1]], %[[MASK]] : i32 -// CHECK: %[[INS1:.+]] = vector.insert %[[AND1]], %[[INS0]] [1] // CHECK: %[[SHR2:.+]] = arith.shrui %[[BASE]], %[[OFF2]] : i32 // CHECK: %[[AND2:.+]] = arith.andi %[[SHR2]], %[[MASK]] : i32 -// CHECK: %[[INS2:.+]] = vector.insert %[[AND2]], %[[INS1]] [2] // CHECK: %[[SHR3:.+]] = arith.shrui %[[BASE]], %[[OFF3]] : i32 // CHECK: %[[AND3:.+]] = arith.andi %[[SHR3]], %[[MASK]] : i32 -// CHECK: %[[INS3:.+]] = vector.insert %[[AND3]], %[[INS2]] [3] -// CHECK: return %[[INS3]] : vector<4xi32> - +// CHECK: %[[RES:.+]] = vector.from_elements %[[AND0]], %[[AND1]], %[[AND2]], %[[AND3]] : vector<4xi32> +// CHECK: return %[[RES]] : vector<4xi32> // ----- @@ -75,7 +68,6 @@ func.func @bitcast_extract_extend_1(%input: vector<4xi32>) -> vector<4xi32> { // CHECK-LABEL: func.func @bitcast_extract_extend_1 // CHECK-SAME: (%[[INPUT:.+]]: vector<4xi32>) -// CHECK-DAG: %[[ZERO:.+]] = arith.constant dense<0> : vector<4xi32> // CHECK-DAG: %[[MASK:.+]] = arith.constant 15 : i32 // CHECK-DAG: %[[OFF0:.+]] = 
arith.constant 16 : i32 // CHECK-DAG: %[[OFF1:.+]] = arith.constant 20 : i32 @@ -84,14 +76,11 @@ func.func @bitcast_extract_extend_1(%input: vector<4xi32>) -> vector<4xi32> { // CHECK: %[[BASE:.+]] = vector.extract %[[INPUT]][2] : i32 from vector<4xi32> // CHECK: %[[SHR0:.+]] = arith.shrui %[[BASE]], %[[OFF0]] : i32 // CHECK: %[[AND0:.+]] = arith.andi %[[SHR0]], %[[MASK]] : i32 -// CHECK: %[[INS0:.+]] = vector.insert %[[AND0]], %[[ZERO]] [0] // CHECK: %[[SHR1:.+]] = arith.shrui %[[BASE]], %[[OFF1]] : i32 // CHECK: %[[AND1:.+]] = arith.andi %[[SHR1]], %[[MASK]] : i32 -// CHECK: %[[INS1:.+]] = vector.insert %[[AND1]], %[[INS0]] [1] // CHECK: %[[SHR2:.+]] = arith.shrui %[[BASE]], %[[OFF2]] : i32 // CHECK: %[[AND2:.+]] = arith.andi %[[SHR2]], %[[MASK]] : i32 -// CHECK: %[[INS2:.+]] = vector.insert %[[AND2]], %[[INS1]] [2] // CHECK: %[[SHR3:.+]] = arith.shrui %[[BASE]], %[[OFF3]] : i32 // CHECK: %[[AND3:.+]] = arith.andi %[[SHR3]], %[[MASK]] : i32 -// CHECK: %[[INS3:.+]] = vector.insert %[[AND3]], %[[INS2]] [3] -// CHECK: return %[[INS3]] : vector<4xi32> +// CHECK: %[[RES:.+]] = vector.from_elements %[[AND0]], %[[AND1]], %[[AND2]], %[[AND3]] : vector<4xi32> +// CHECK: return %[[RES]] : vector<4xi32> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir index 299633abdb8f..6b69e75a00da 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir @@ -48,7 +48,6 @@ func.func @transpose_leading_one_dim(%input: tensor<4x1x1xf32>) -> tensor<1x1x4x // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[ZERO:.+]] = ub.poison : vector<4xf32> // CHECK: %[[R0:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]]{{.+}} : 
tensor<4x1x1xf32>, vector<1xf32> // CHECK: %[[R1:.+]] = vector.transfer_read %[[INPUT]][%[[C1]], %[[C0]], %[[C0]]]{{.+}} : tensor<4x1x1xf32>, vector<1xf32> @@ -56,15 +55,12 @@ func.func @transpose_leading_one_dim(%input: tensor<4x1x1xf32>) -> tensor<1x1x4x // CHECK: %[[R3:.+]] = vector.transfer_read %[[INPUT]][%[[C3]], %[[C0]], %[[C0]]]{{.+}} : tensor<4x1x1xf32>, vector<1xf32> // CHECK: %[[E0:.+]] = vector.extract %[[R0]][0] : f32 from vector<1xf32> -// CHECK: %[[I0:.+]] = vector.insert %[[E0]], %[[ZERO]] [0] : f32 into vector<4xf32> // CHECK: %[[E1:.+]] = vector.extract %[[R1]][0] : f32 from vector<1xf32> -// CHECK: %[[I1:.+]] = vector.insert %[[E1]], %[[I0]] [1] : f32 into vector<4xf32> // CHECK: %[[E2:.+]] = vector.extract %[[R2]][0] : f32 from vector<1xf32> -// CHECK: %[[I2:.+]] = vector.insert %[[E2]], %[[I1]] [2] : f32 into vector<4xf32> // CHECK: %[[E3:.+]] = vector.extract %[[R3]][0] : f32 from vector<1xf32> -// CHECK: %[[I3:.+]] = vector.insert %[[E3]], %[[I2]] [3] : f32 into vector<4xf32> +// CHECK: %[[RES:.+]] = vector.from_elements %[[E0]], %[[E1]], %[[E2]], %[[E3]] : vector<4xf32> -// CHECK: %[[W:.+]] = vector.transfer_write %[[I3]], %{{.+}} +// CHECK: %[[W:.+]] = vector.transfer_write %[[RES]], %{{.+}} // CHECK: return %[[W]] : tensor<1x1x4xf32> // ----- @@ -93,8 +89,6 @@ func.func @transpose_add(%lhs: tensor<4x2xf32>, %rhs: tensor<2xf32>) -> tensor<2 // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[OINIT:.+]] = ub.poison : vector<4xf32> - // CHECK: %[[LHS0:.+]] = vector.transfer_read %[[LHS]][%[[C0]], %[[C0]]]{{.+}} : tensor<4x2xf32>, vector<2xf32> // CHECK: %[[LHS1:.+]] = vector.transfer_read %[[LHS]][%[[C1]], %[[C0]]]{{.+}} : tensor<4x2xf32>, vector<2xf32> // CHECK: %[[LHS2:.+]] = vector.transfer_read %[[LHS]][%[[C2]], %[[C0]]]{{.+}} : tensor<4x2xf32>, vector<2xf32> @@ -107,24 +101,18 @@ func.func @transpose_add(%lhs: tensor<4x2xf32>, %rhs: tensor<2xf32>) -> tensor<2 // CHECK: 
%[[ADD3:.+]] = arith.addf %[[LHS3]], %[[RHS0]] // CHECK: %[[E0:.+]] = vector.extract %[[ADD0]][0] -// CHECK: %[[I0:.+]] = vector.insert %[[E0]], %[[OINIT]] [0] // CHECK: %[[E1:.+]] = vector.extract %[[ADD1]][0] -// CHECK: %[[I1:.+]] = vector.insert %[[E1]], %[[I0]] [1] // CHECK: %[[E2:.+]] = vector.extract %[[ADD2]][0] -// CHECK: %[[I2:.+]] = vector.insert %[[E2]], %[[I1]] [2] // CHECK: %[[E3:.+]] = vector.extract %[[ADD3]][0] -// CHECK: %[[I3:.+]] = vector.insert %[[E3]], %[[I2]] [3] +// CHECK: %[[R0:.+]] = vector.from_elements %[[E0]], %[[E1]], %[[E2]], %[[E3]] : vector<4xf32> // CHECK: %[[E4:.+]] = vector.extract %[[ADD0]][1] -// CHECK: %[[I4:.+]] = vector.insert %[[E4]], %[[OINIT]] [0] // CHECK: %[[E5:.+]] = vector.extract %[[ADD1]][1] -// CHECK: %[[I5:.+]] = vector.insert %[[E5]], %[[I4]] [1] // CHECK: %[[E6:.+]] = vector.extract %[[ADD2]][1] -// CHECK: %[[I6:.+]] = vector.insert %[[E6]], %[[I5]] [2] // CHECK: %[[E7:.+]] = vector.extract %[[ADD3]][1] -// CHECK: %[[I7:.+]] = vector.insert %[[E7]], %[[I6]] [3] +// CHECK: %[[R1:.+]] = vector.from_elements %[[E4]], %[[E5]], %[[E6]], %[[E7]] : vector<4xf32> -// CHECK: %[[W0:.+]] = vector.transfer_write %[[I3]], %{{.+}}[%[[C0]], %[[C0]]] -// CHECK: %[[W1:.+]] = vector.transfer_write %[[I7]], %[[W0]][%[[C1]], %[[C0]]] +// CHECK: %[[W0:.+]] = vector.transfer_write %[[R0]], %{{.+}}[%[[C0]], %[[C0]]] +// CHECK: %[[W1:.+]] = vector.transfer_write %[[R1]], %[[W0]][%[[C1]], %[[C0]]] // CHECK: return %[[W1]] // ----- @@ -146,5 +134,5 @@ func.func @transpose_nd(%input: tensor<2x4x2x1x1xf32>) -> tensor<2x2x1x1x4xf32> // CHECK-LABEL: func @transpose_nd // CHECK-SAME: (%[[INPUT:.+]]: tensor<2x4x2x1x1xf32>) // CHECK-COUNT-16: vector.transfer_read %[[INPUT]]{{.+}} : tensor<2x4x2x1x1xf32>, vector<1xf32> -// CHECK-COUNT-16: vector.insert {{.+}} : f32 into vector<4xf32> +// CHECK-COUNT-4: vector.from_elements {{.+}} : vector<4xf32> // CHECK-COUNT-4: vector.transfer_write {{.+}} : vector<4xf32>, tensor<2x2x1x1x4xf32> diff --git 
a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir index f077d7a5a2a1..67e4126e41c9 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir @@ -39,26 +39,22 @@ func.func @vector_gather(%arg0: memref<16x1082x1922xi8>, %index_vec: vector<16xi // CHECK-LABEL: func.func @vector_gather // CHECK-SAME: %[[ARG0:.+]]: memref<16x1082x1922xi8> // CHECK-SAME: %[[INDEX_VEC:.+]]: vector<16xindex> -// CHECK-DAG: %[[SLICE_INIT:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-DAG: %[[INIT:.+]] = arith.constant dense<0> : vector<16xi8> // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[IND0:.+]] = vector.extract %[[INDEX_VEC]][0] : index from vector<16xindex> // CHECK: %[[LOAD0:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND0]]] : memref<16x1082x1922xi8>, vector<1xi8> // CHECK: %[[EXTRACT0:.+]] = vector.extract %[[LOAD0]][0] : i8 from vector<1xi8> -// CHECK: %[[INSERT0:.+]] = vector.insert %[[EXTRACT0]], %[[SLICE_INIT]] [0] : i8 into vector<4xi8> // CHECK: %[[IND1:.+]] = vector.extract %[[INDEX_VEC]][1] : index from vector<16xindex> // CHECK: %[[LOAD1:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND1]]] : memref<16x1082x1922xi8>, vector<1xi8> // CHECK: %[[EXTRACT1:.+]] = vector.extract %[[LOAD1]][0] : i8 from vector<1xi8> -// CHECK: %[[INSERT1:.+]] = vector.insert %[[EXTRACT1]], %[[INSERT0]] [1] : i8 into vector<4xi8> // CHECK: %[[IND2:.+]] = vector.extract %[[INDEX_VEC]][2] : index from vector<16xindex> // CHECK: %[[LOAD2:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND2]]] : memref<16x1082x1922xi8>, vector<1xi8> // CHECK: %[[EXTRACT2:.+]] = vector.extract %[[LOAD2]][0] : i8 from vector<1xi8> -// CHECK: %[[INSERT2:.+]] = vector.insert %[[EXTRACT2]], %[[INSERT1]] [2] : i8 into vector<4xi8> // CHECK: %[[IND3:.+]] = vector.extract %[[INDEX_VEC]][3] : index from 
vector<16xindex> // CHECK: %[[LOAD3:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND3]]] : memref<16x1082x1922xi8>, vector<1xi8> // CHECK: %[[EXTRACT3:.+]] = vector.extract %[[LOAD3]][0] : i8 from vector<1xi8> -// CHECK: %[[INSERT3:.+]] = vector.insert %[[EXTRACT3]], %[[INSERT2]] [3] : i8 into vector<4xi8> +// CHECK: %[[VEC:.+]] = vector.from_elements %[[EXTRACT0]], %[[EXTRACT1]], %[[EXTRACT2]], %[[EXTRACT3]] : vector<4xi8> -// CHECK: vector.insert_strided_slice %[[INSERT3]], %[[INIT]] {offsets = [0], strides = [1]} : vector<4xi8> into vector<16xi8> +// CHECK: vector.insert_strided_slice %[[VEC]], %[[INIT]] {offsets = [0], strides = [1]} : vector<4xi8> into vector<16xi8> // CHECK-12: vector.load %[[ARG0]][%[[C0]], %[[C0]], %{{.*}}] : memref<16x1082x1922xi8>, vector<1xi8> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir index 8fb51a5f9cc2..2494d9b41f04 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir @@ -251,14 +251,11 @@ func.func @scalarize_vector_transfer_op(%arg: vector<3xf32>) -> (vector<3xf32>) // CHECK-DAG: %[[INDEX0:.+]] = arith.constant 3 : index // CHECK-DAG: %[[INDEX1:.+]] = arith.constant 4 : index // CHECK-DAG: %[[INDEX2:.+]] = arith.constant 5 : index - // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<3xf32> // CHECK: %[[ELEM0:.+]] = memref.load %{{.+}}[%[[INDEX0]]] - // CHECK: %[[V0:.+]] = vector.insert %[[ELEM0]], %[[CST]] [0] : f32 into vector<3xf32> // CHECK: %[[ELEM1:.+]] = memref.load %{{.+}}[%[[INDEX1]]] - // CHECK: %[[V1:.+]] = vector.insert %[[ELEM1]], %[[V0]] [1] : f32 into vector<3xf32> // CHECK: %[[ELEM2:.+]] = memref.load %{{.+}}[%[[INDEX2]]] - // CHECK: %[[V2:.+]] = vector.insert %[[ELEM2]], %[[V1]] [2] : f32 into vector<3xf32> + // CHECK: %[[V2:.+]] = vector.from_elements %[[ELEM0]], 
%[[ELEM1]], %[[ELEM2]] : vector<3xf32> // CHECK: %[[EXT_0:.+]] = vector.extract %{{.*}}[0] : f32 from vector<3xf32> // CHECK: memref.store %[[EXT_0]], %{{.*}}[%[[INDEX0]]] : memref<20xf32> // CHECK: %[[EXT_1:.+]] = vector.extract %{{.*}}[1] : f32 from vector<3xf32> @@ -283,19 +280,15 @@ func.func @scalarize_non_minor_identity_transfer_read(%memory: memref<4x2x4xi32> return %0: vector<4xi32> } -// CHECK: %[[INIT:.+]] = arith.constant dense<0> : vector<4xi32> // CHECK: %[[LD0:.+]] = memref.load %[[MEM]][%[[I1]], %[[I2]], %[[I3]]] : memref<4x2x4xi32> -// CHECK: %[[INSERT0:.+]] = vector.insert %[[LD0]], %[[INIT]] [0] : i32 into vector<4xi32> // CHECK: %[[IDX1:.+]] = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%[[I1]]] // CHECK: %[[LD1:.+]] = memref.load %[[MEM]][%[[IDX1]], %[[I2]], %[[I3]]] : memref<4x2x4xi32> -// CHECK: %[[INSERT1:.+]] = vector.insert %[[LD1]], %[[INSERT0]] [1] : i32 into vector<4xi32> // CHECK: %[[IDX2:.+]] = affine.apply affine_map<()[s0] -> (s0 + 2)>()[%[[I1]]] // CHECK: %[[LD2:.+]] = memref.load %[[MEM]][%[[IDX2]], %[[I2]], %[[I3]]] : memref<4x2x4xi32> -// CHECK: %[[INSERT2:.+]] = vector.insert %[[LD2]], %[[INSERT1]] [2] : i32 into vector<4xi32> // CHECK: %[[IDX3:.+]] = affine.apply affine_map<()[s0] -> (s0 + 3)>()[%[[I1]]] // CHECK: %[[LD3:.+]] = memref.load %[[MEM]][%[[IDX3]], %[[I2]], %[[I3]]] : memref<4x2x4xi32> -// CHECK: %[[INSERT3:.+]] = vector.insert %[[LD3]], %[[INSERT2]] [3] : i32 into vector<4xi32> -// CHECK: return %[[INSERT3]] +// CHECK: %[[RES:.+]] = vector.from_elements %[[LD0]], %[[LD1]], %[[LD2]], %[[LD3]] : vector<4xi32> +// CHECK: return %[[RES]] // ----- @@ -451,21 +444,17 @@ func.func @scalarize_vector_load_op(%i: index) -> vector<4xi32> { return %1: vector<4xi32> } -// CHECK: %[[INIT:.+]] = arith.constant dense<0> : vector<4xi32> // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan // CHECK: %[[LD0:.+]] = memref.load %[[SUBSPAN]][%[[C0]], %[[ARG0]]] : memref<10x10xi32> 
-// CHECK: %[[INSERT0:.+]] = vector.insert %[[LD0]], %[[INIT]] [0] : i32 into vector<4xi32> // CHECK: %[[IDX1:.+]] = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%[[ARG0]]] // CHECK: %[[LD1:.+]] = memref.load %[[SUBSPAN]][%[[C0]], %[[IDX1]]] : memref<10x10xi32> -// CHECK: %[[INSERT1:.+]] = vector.insert %[[LD1]], %[[INSERT0]] [1] : i32 into vector<4xi32> // CHECK: %[[IDX2:.+]] = affine.apply affine_map<()[s0] -> (s0 + 2)>()[%[[ARG0]]] // CHECK: %[[LD2:.+]] = memref.load %[[SUBSPAN]][%[[C0]], %[[IDX2]]] : memref<10x10xi32> -// CHECK: %[[INSERT2:.+]] = vector.insert %[[LD2]], %[[INSERT1]] [2] : i32 into vector<4xi32> // CHECK: %[[IDX3:.+]] = affine.apply affine_map<()[s0] -> (s0 + 3)>()[%[[ARG0]]] // CHECK: %[[LD3:.+]] = memref.load %[[SUBSPAN]][%[[C0]], %[[IDX3]]] : memref<10x10xi32> -// CHECK: %[[INSERT3:.+]] = vector.insert %[[LD3]], %[[INSERT2]] [3] : i32 into vector<4xi32> -// CHECK: return %[[INSERT3]] +// CHECK: %[[RES:.+]] = vector.from_elements %[[LD0]], %[[LD1]], %[[LD2]], %[[LD3]] : vector<4xi32> +// CHECK: return %[[RES]] // ----- @@ -668,7 +657,6 @@ func.func @scalarize_masked_vector_transfer_op(%arg: vector<3xf32>, %mask: vecto } // CHECK-LABEL: func.func @scalarize_masked_vector_transfer_op -// CHECK-DAG: %[[INIT:.+]] = arith.constant dense<0.000000e+00> : vector<3xf32> // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index @@ -682,14 +670,13 @@ func.func @scalarize_masked_vector_transfer_op(%arg: vector<3xf32>, %mask: vecto // CHECK: } else { // CHECK: scf.yield %[[PAD]] : f32 // CHECK: } -// CHECK: vector.insert %[[MASK_LD0]], %[[INIT]] [0] : f32 into vector<3xf32> // CHECK: vector.extract %{{.*}}[1] : i1 from vector<3xi1> // CHECK: scf.if %{{.*}} -> (f32) { // CHECK: memref.load %{{.*}}[%[[C4]]] : memref<20xf32> // CHECK: vector.extract %{{.*}}[2] : i1 from vector<3xi1> // CHECK: scf.if %{{.*}} -> (f32) { // CHECK: memref.load %{{.*}}[%[[C5]]] : 
memref<20xf32> -// CHECK: %[[MASK_TR:.+]] = vector.insert {{.*}} [2] : f32 into vector<3xf32> +// CHECK: %[[READ:.+]] = vector.from_elements {{.*}} : vector<3xf32> /// Transfer write. // CHECK: scf.if %[[MB0]] { @@ -704,7 +691,7 @@ func.func @scalarize_masked_vector_transfer_op(%arg: vector<3xf32>, %mask: vecto // CHECK: %[[E2:.+]] = vector.extract {{.*}}[2] : f32 from vector<3xf32> // CHECK: memref.store %[[E2]], %{{.*}}[%[[C5]]] : memref<20xf32> // CHECK: } -// CHECK: return %[[MASK_TR]] : vector<3xf32> +// CHECK: return %[[READ]] : vector<3xf32> // ----- @@ -723,7 +710,6 @@ func.func @extract_vector_transfer_read_mask_bits(%arg: vector<3xf32>, %index: i // CHECK-LABEL: func.func @extract_vector_transfer_read_mask_bits // CHECK-SAME: %{{.*}}: vector<3xf32>, %[[MASK_SIZE:.+]]: index -// CHECK-DAG: %[[INIT:.+]] = arith.constant dense<0.000000e+00> : vector<3xf32> // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index @@ -739,10 +725,10 @@ func.func @extract_vector_transfer_read_mask_bits(%arg: vector<3xf32>, %index: i // CHECK: } else { // CHECK: scf.yield %[[PAD]] : f32 // CHECK: } -// CHECK: vector.insert %[[MASK_LD0]], %[[INIT]] [0] : f32 into vector<3xf32> // CHECK: %[[MB1:.+]] = arith.cmpi sgt, %[[MASK_SIZE]], %[[C1]] : index // CHECK: scf.if %[[MB1]] -> (f32) { // CHECK: memref.load %{{.*}}[%[[C4]]] : memref<20xf32> // CHECK: %[[MB2:.+]] = arith.cmpi sgt, %[[MASK_SIZE]], %[[C2]] : index // CHECK: scf.if %[[MB2]] -> (f32) { // CHECK: memref.load %{{.*}}[%[[C5]]] : memref<20xf32> +// CHECK: vector.from_elements {{.+}} : vector<3xf32> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir index 833440145ff1..2c7f4e4fe496 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir +++ 
b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir @@ -285,9 +285,7 @@ func.func @matmul_4x4x4_i8_to_i32_dot_prod(%lhs: tensor<4x4xi8>, %rhs : tensor<4 // CHECK-SAME: (%[[LHS:.+]]: tensor<4x4xi8>, %[[RHS:.+]]: tensor<4x4xi8>) // CHECK-DAG: %[[PV:.+]] = ub.poison : i8 // CHECK-DAG: %[[C0I32:.+]] = arith.constant 0 : i32 -// CHECK-DAG: %[[V4I8:.+]] = ub.poison : vector<4xi8> // CHECK-DAG: %[[V4I32:.+]] = arith.constant dense<0> : vector<4xi32> -// CHECK-DAG: %[[V1I32:.+]] = arith.constant dense<0> : vector<1xi32> // CHECK-DAG: %[[IDX0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[IDX1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[IDX2:.+]] = arith.constant 2 : index @@ -301,15 +299,11 @@ func.func @matmul_4x4x4_i8_to_i32_dot_prod(%lhs: tensor<4x4xi8>, %rhs : tensor<4 // CHECK-NEXT: %[[RHS2:.+]] = vector.transfer_read %[[RHS]][%[[IDX2]], %[[IDX0]]], %[[PV]] // CHECK-NEXT: %[[RHS3:.+]] = vector.transfer_read %[[RHS]][%[[IDX3]], %[[IDX0]]], %[[PV]] // CHECK: %[[EXTR0:.+]] = vector.extract %[[RHS0]][0] -// CHECK-NEXT: %[[INS0:.+]] = vector.insert %[[EXTR0]], %[[V4I8]] [0] // CHECK-NEXT: %[[EXTR1:.+]] = vector.extract %[[RHS1]][0] -// CHECK-NEXT: %[[INS1:.+]] = vector.insert %[[EXTR1]], %[[INS0]] [1] // CHECK-NEXT: %[[EXTR2:.+]] = vector.extract %[[RHS2]][0] -// CHECK-NEXT: %[[INS2:.+]] = vector.insert %[[EXTR2]], %[[INS1]] [2] // CHECK-NEXT: %[[EXTR3:.+]] = vector.extract %[[RHS3]][0] -// CHECK-NEXT: %[[COL0:.+]] = vector.insert %[[EXTR3]], %[[INS2]] [3] +// CHECK: %[[COL0:.+]] = vector.from_elements %[[EXTR0]], %[[EXTR1]], %[[EXTR2]], %[[EXTR3]] : vector<4xi8> // CHECK: %[[DOT0:.+]] = spirv.SDotAccSat %[[LHS0]], %[[COL0]], %[[C0I32]] -// CHECK-NEXT: %[[RES0:.+]] = vector.insert %[[DOT0]], %[[V1I32]] [0] // CHECK-COUNT-15: spirv.SDotAccSat // // CHECK-COUNT-16: vector.insert_strided_slice {{.+}} : vector<1xi32> into vector<4xi32> diff --git a/third_party/llvm-project b/third_party/llvm-project index 8243dfcde024..8b88014f2ddd 160000 --- 
a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit 8243dfcde024285fc888c927908ba8af2318aecd +Subproject commit 8b88014f2dddce669f714872dd8495348422469d