also reorder types for swmmac, dst before src types
mbrkusanin committed Jan 22, 2024
1 parent 780eeed commit 8ee840b
Showing 5 changed files with 64 additions and 61 deletions.
51 changes: 27 additions & 24 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -18329,6 +18329,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
// the form:
// D = A * B + C
// We need to specify one type for matrices AB and one for matrices CD.
+ // Sparse matrix operations can have different types for A and B as well as
+ // an additional type for sparsity index.
+ // Destination type should be put before types used for source operands.
SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
// On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
// There is no need for the variable opsel argument, so always set it to
@@ -18341,14 +18344,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
@@ -18357,7 +18360,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
LLVM_FALLTHROUGH;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
@@ -18366,111 +18369,111 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
LLVM_FALLTHROUGH;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
- ArgsForMatchingMatrixTypes = {4, 1};
+ ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
- ArgsForMatchingMatrixTypes = {4, 1};
+ ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
- ArgsForMatchingMatrixTypes = {2, 0};
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
- ArgsForMatchingMatrixTypes = {4, 1};
+ ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
- ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
- ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
- ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
- ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
break;
}
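
For context, the snippet below is a rough sketch of how these index lists are consumed further down in EmitAMDGPUBuiltinExpr; it is an assumption based on the surrounding code and is not part of this diff. The types of the arguments named by ArgsForMatchingMatrixTypes are collected in the listed order and used to instantiate the overloaded intrinsic, so putting the destination (CD) index first is what places the destination type ahead of the source types in the mangled name.

  // Sketch only: variable names and exact structure are assumptions for illustration.
  SmallVector<llvm::Value *, 6> Args;
  for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
    Args.push_back(EmitScalarExpr(E->getArg(I)));

  // Collect the overload types in the order given above: destination type first,
  // then the source operand types (and, for swmmac, the sparsity index type).
  SmallVector<llvm::Type *, 4> ArgTypes;
  for (unsigned ArgIdx : ArgsForMatchingMatrixTypes)
    ArgTypes.push_back(Args[ArgIdx]->getType());

  llvm::Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
  return Builder.CreateCall(F, Args);

With the swmmac entries now listed as {CD, A, B, Index}, the mangled names in the updated tests below carry the destination type first, e.g. llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16.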
22 changes: 11 additions & 11 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
@@ -15,7 +15,7 @@ typedef __bf16 v16bf __attribute__((ext_vector_type(16)));

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -26,7 +26,7 @@ void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8bf16.v16bf16.v8f32.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8bf16.v16bf16.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -37,7 +37,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8bf a, v16bf b,

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -48,7 +48,7 @@ void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8bf16.v16bf16.v8bf16.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8bf16.v8bf16.v16bf16.i16(<8 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -59,7 +59,7 @@ void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8bf* out, v8bf a, v16bf b

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v2i32.v4i32.v8i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -70,7 +70,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.i32.v2i32.v8i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -81,7 +81,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v2i32.v4i32.v8i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -92,7 +92,7 @@ void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -103,7 +103,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b,

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -114,7 +114,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b,

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
@@ -125,7 +125,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b,

// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(
// CHECK-GFX1200-NEXT: entry:
- // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v2i32.v4i32.v8f32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+ // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1200-NEXT: ret void
//
